-rw-r--r--.bcachefs_revision2
-rw-r--r--.editorconfig34
-rw-r--r--.github/dependabot.yml6
-rw-r--r--.github/workflows/build-packages.yml90
-rw-r--r--.github/workflows/nix-flake-update.yml21
-rw-r--r--.github/workflows/nix-flake.yml22
-rw-r--r--.gitignore25
-rw-r--r--Cargo.lock714
-rw-r--r--Cargo.toml37
-rw-r--r--INSTALL19
-rw-r--r--INSTALL.md88
-rw-r--r--Makefile288
-rw-r--r--Makefile.compiler89
-rw-r--r--README12
-rw-r--r--README.md37
l---------bcachefs1
-rw-r--r--bcachefs.8472
-rw-r--r--bcachefs.c200
-rw-r--r--bch_bindgen/.gitignore15
-rw-r--r--bch_bindgen/Cargo.toml21
-rw-r--r--bch_bindgen/build.rs162
-rw-r--r--bch_bindgen/src/bcachefs.rs90
-rw-r--r--bch_bindgen/src/bkey.rs138
-rw-r--r--bch_bindgen/src/btree.rs237
-rw-r--r--bch_bindgen/src/errcode.rs40
-rw-r--r--bch_bindgen/src/fs.rs28
-rw-r--r--bch_bindgen/src/keyutils.rs6
-rw-r--r--bch_bindgen/src/keyutils_wrapper.h1
-rw-r--r--bch_bindgen/src/lib.rs215
-rw-r--r--bch_bindgen/src/libbcachefs_wrapper.h37
-rw-r--r--bch_bindgen/src/opts.rs35
-rw-r--r--bch_bindgen/src/sb_io.rs49
-rw-r--r--build.rs21
-rw-r--r--c_src/bcachefs.c164
-rw-r--r--c_src/cmd_assemble.c (renamed from cmd_assemble.c)0
-rw-r--r--c_src/cmd_attr.c119
-rw-r--r--c_src/cmd_counters.c51
-rw-r--r--c_src/cmd_data.c127
-rw-r--r--c_src/cmd_device.c647
-rw-r--r--c_src/cmd_dump.c182
-rw-r--r--c_src/cmd_format.c435
-rw-r--r--c_src/cmd_fs.c544
-rw-r--r--c_src/cmd_fsck.c348
-rw-r--r--c_src/cmd_fusemount.c1314
-rw-r--r--c_src/cmd_key.c (renamed from cmd_key.c)39
-rw-r--r--c_src/cmd_kill_btree_node.c140
-rw-r--r--c_src/cmd_list_journal.c306
-rw-r--r--c_src/cmd_migrate.c426
-rw-r--r--c_src/cmd_option.c168
-rw-r--r--c_src/cmd_run.c (renamed from cmd_run.c)0
-rw-r--r--c_src/cmd_version.c (renamed from cmd_version.c)2
-rw-r--r--c_src/cmds.h (renamed from cmds.h)31
-rw-r--r--c_src/config.h (renamed from config.h)0
-rw-r--r--c_src/crypto.c (renamed from crypto.c)48
-rw-r--r--c_src/crypto.h (renamed from crypto.h)2
-rw-r--r--c_src/libbcachefs.c754
-rw-r--r--c_src/libbcachefs.h300
-rw-r--r--c_src/posix_to_bcachefs.c461
-rw-r--r--c_src/posix_to_bcachefs.h54
-rw-r--r--c_src/qcow2.c (renamed from qcow2.c)17
-rw-r--r--c_src/qcow2.h (renamed from qcow2.h)0
-rw-r--r--c_src/tools-util.c (renamed from tools-util.c)428
-rw-r--r--c_src/tools-util.h (renamed from tools-util.h)121
-rw-r--r--ccan/compiler/compiler.h2
-rw-r--r--ccan/darray/LICENSE17
-rw-r--r--ccan/darray/_info57
-rw-r--r--ccan/darray/darray.h355
-rw-r--r--cmd_data.c48
-rw-r--r--cmd_debug.c332
-rw-r--r--cmd_device.c434
-rw-r--r--cmd_format.c372
-rw-r--r--cmd_fs.c159
-rw-r--r--cmd_fsck.c70
-rw-r--r--cmd_migrate.c852
-rw-r--r--debian/bcachefs-tools.dirs2
-rw-r--r--debian/bcachefs-tools.postinst14
-rw-r--r--debian/bcachefs-tools.postrm14
-rw-r--r--debian/bcachefs.triggers1
-rw-r--r--debian/changelog176
-rw-r--r--debian/compat1
-rw-r--r--debian/control36
-rw-r--r--debian/copyright87
-rw-r--r--debian/gbp.conf2
-rwxr-xr-xdebian/rules34
-rw-r--r--debian/watch6
-rw-r--r--default.nix41
-rw-r--r--doc/bcachefs-principles-of-operation.tex1244
-rw-r--r--doc/bcachefs.5.rst.tmpl120
-rwxr-xr-xdoc/macro2rst.py85
-rw-r--r--doc/opts_macro.h12
-rw-r--r--flake.lock156
-rw-r--r--flake.nix212
-rwxr-xr-xfsck.bcachefs4
-rw-r--r--fsck/.gitignore2
-rw-r--r--fsck/bcachefsck@.service.in98
-rw-r--r--fsck/bcachefsck_all.in481
-rw-r--r--fsck/bcachefsck_all.service.in84
-rw-r--r--fsck/bcachefsck_all.timer16
-rw-r--r--fsck/bcachefsck_all_fail.service.in71
-rwxr-xr-xfsck/bcachefsck_fail63
-rw-r--r--fsck/bcachefsck_fail@.service.in75
-rw-r--r--fsck/system-bcachefsck.slice30
-rw-r--r--include/asm/page.h0
-rw-r--r--include/crypto/chacha.h (renamed from include/crypto/chacha20.h)6
-rw-r--r--include/crypto/sha2.h115
-rw-r--r--include/crypto/skcipher.h34
-rw-r--r--include/linux/atomic.h99
-rw-r--r--include/linux/backing-dev-defs.h0
-rw-r--r--include/linux/bio.h72
-rw-r--r--include/linux/bit_spinlock.h79
-rw-r--r--include/linux/bitmap.h32
-rw-r--r--include/linux/bitops.h24
-rw-r--r--include/linux/bitrev.h85
-rw-r--r--include/linux/blk_types.h56
-rw-r--r--include/linux/blkdev.h99
-rw-r--r--include/linux/bsearch.h32
-rw-r--r--include/linux/bug.h53
-rw-r--r--include/linux/bvec.h4
-rw-r--r--include/linux/closure.h165
-rw-r--r--include/linux/compiler.h17
-rw-r--r--include/linux/console.h5
-rw-r--r--include/linux/cpumask.h2
-rw-r--r--include/linux/crc64.h11
-rw-r--r--include/linux/debugfs.h197
-rw-r--r--include/linux/errname.h11
-rw-r--r--include/linux/freezer.h5
-rw-r--r--include/linux/fs_parser.h20
-rw-r--r--include/linux/generic-radix-tree.h314
-rw-r--r--include/linux/jiffies.h10
-rw-r--r--include/linux/kernel.h152
-rw-r--r--include/linux/kmemleak.h121
-rw-r--r--include/linux/kobject.h39
-rw-r--r--include/linux/kref.h138
-rw-r--r--include/linux/lglock.h18
-rw-r--r--include/linux/list.h50
-rw-r--r--include/linux/list_nulls.h36
-rw-r--r--include/linux/llist.h100
-rw-r--r--include/linux/lockdep.h9
-rw-r--r--include/linux/log2.h200
-rw-r--r--include/linux/lz4.h5
-rw-r--r--include/linux/math.h171
-rw-r--r--include/linux/mempool.h112
-rw-r--r--include/linux/min_heap.h236
-rw-r--r--include/linux/minmax.h272
-rw-r--r--include/linux/mm.h33
-rw-r--r--include/linux/module.h4
-rw-r--r--include/linux/moduleparam.h7
-rw-r--r--include/linux/overflow.h345
-rw-r--r--include/linux/page.h10
-rw-r--r--include/linux/path.h20
-rw-r--r--include/linux/percpu-rwsem.h32
-rw-r--r--include/linux/percpu.h2
-rw-r--r--include/linux/poison.h47
-rw-r--r--include/linux/posix_acl_xattr.h4
-rw-r--r--include/linux/prandom.h33
-rw-r--r--include/linux/preempt.h15
-rw-r--r--include/linux/prefetch.h3
-rw-r--r--include/linux/pretty-printers.h10
-rw-r--r--include/linux/printk.h32
-rw-r--r--include/linux/random.h32
-rw-r--r--include/linux/rcupdate.h33
-rw-r--r--include/linux/refcount.h352
-rw-r--r--include/linux/rhashtable-types.h135
-rw-r--r--include/linux/rhashtable.h1209
-rw-r--r--include/linux/rwsem.h1
-rw-r--r--include/linux/scatterlist.h4
-rw-r--r--include/linux/sched.h50
-rw-r--r--include/linux/sched/debug.h0
-rw-r--r--include/linux/sched/mm.h126
-rw-r--r--include/linux/sched/signal.h11
-rw-r--r--include/linux/sched/task_stack.h0
-rw-r--r--include/linux/semaphore.h43
-rw-r--r--include/linux/seq_buf.h153
-rw-r--r--include/linux/seq_file.h4
-rw-r--r--include/linux/seqlock.h530
-rw-r--r--include/linux/shrinker.h11
-rw-r--r--include/linux/siphash.h145
-rw-r--r--include/linux/slab.h206
-rw-r--r--include/linux/sort.h15
-rw-r--r--include/linux/spinlock.h61
-rw-r--r--include/linux/spinlock_types.h65
-rw-r--r--include/linux/srcu.h69
-rw-r--r--include/linux/string.h4
-rw-r--r--include/linux/string_choices.h85
-rw-r--r--include/linux/string_helpers.h20
-rw-r--r--include/linux/stringify.h12
-rw-r--r--include/linux/swap.h7
-rw-r--r--include/linux/sysfs.h9
-rw-r--r--include/linux/thread_with_file.h15
-rw-r--r--include/linux/thread_with_file_types.h0
-rw-r--r--include/linux/time.h0
-rw-r--r--include/linux/time64.h20
-rw-r--r--include/linux/types.h23
-rw-r--r--include/linux/unaligned.h1
-rw-r--r--include/linux/unaligned/access_ok.h67
-rw-r--r--include/linux/unaligned/be_memmove.h36
-rw-r--r--include/linux/unaligned/le_memmove.h36
-rw-r--r--include/linux/unaligned/memmove.h45
-rw-r--r--include/linux/uuid.h20
-rw-r--r--include/linux/vmalloc.h41
-rw-r--r--include/linux/wait.h12
-rw-r--r--include/linux/workqueue.h2
-rw-r--r--include/linux/xattr.h18
-rw-r--r--include/linux/xxhash.h259
-rw-r--r--include/linux/zstd.h449
-rw-r--r--include/linux/zstd_errors.h77
-rw-r--r--include/linux/zutil.h108
-rw-r--r--include/trace/events/bcachefs.h529
-rw-r--r--include/trace/events/lock.h144
-rw-r--r--include/uapi/linux/magic.h107
-rw-r--r--include/uapi/linux/uuid.h53
-rw-r--r--include/uapi/linux/xattr.h4
-rwxr-xr-xinitramfs/hook10
-rw-r--r--libbcachefs.c824
-rw-r--r--libbcachefs.h214
-rw-r--r--libbcachefs/acl.c288
-rw-r--r--libbcachefs/acl.h24
-rw-r--r--libbcachefs/alloc.c2205
-rw-r--r--libbcachefs/alloc.h141
-rw-r--r--libbcachefs/alloc_background.c2567
-rw-r--r--libbcachefs/alloc_background.h361
-rw-r--r--libbcachefs/alloc_background_format.h95
-rw-r--r--libbcachefs/alloc_foreground.c1733
-rw-r--r--libbcachefs/alloc_foreground.h240
-rw-r--r--libbcachefs/alloc_types.h161
-rw-r--r--libbcachefs/backpointers.c981
-rw-r--r--libbcachefs/backpointers.h187
-rw-r--r--libbcachefs/bbpos.h37
-rw-r--r--libbcachefs/bbpos_types.h18
-rw-r--r--libbcachefs/bcachefs.h909
-rw-r--r--libbcachefs/bcachefs_format.h1555
-rw-r--r--libbcachefs/bcachefs_ioctl.h215
-rw-r--r--libbcachefs/bkey.c477
-rw-r--r--libbcachefs/bkey.h655
-rw-r--r--libbcachefs/bkey_buf.h61
-rw-r--r--libbcachefs/bkey_cmp.h129
-rw-r--r--libbcachefs/bkey_methods.c536
-rw-r--r--libbcachefs/bkey_methods.h175
-rw-r--r--libbcachefs/bkey_sort.c214
-rw-r--r--libbcachefs/bkey_sort.h54
-rw-r--r--libbcachefs/bkey_types.h239
-rw-r--r--libbcachefs/bset.c1355
-rw-r--r--libbcachefs/bset.h387
-rw-r--r--libbcachefs/btree_cache.c1551
-rw-r--r--libbcachefs/btree_cache.h124
-rw-r--r--libbcachefs/btree_gc.c1853
-rw-r--r--libbcachefs/btree_gc.h87
-rw-r--r--libbcachefs/btree_gc_types.h34
-rw-r--r--libbcachefs/btree_io.c2755
-rw-r--r--libbcachefs/btree_io.h285
-rw-r--r--libbcachefs/btree_iter.c4243
-rw-r--r--libbcachefs/btree_iter.h1035
-rw-r--r--libbcachefs/btree_journal_iter.c806
-rw-r--r--libbcachefs/btree_journal_iter.h102
-rw-r--r--libbcachefs/btree_journal_iter_types.h36
-rw-r--r--libbcachefs/btree_key_cache.c820
-rw-r--r--libbcachefs/btree_key_cache.h60
-rw-r--r--libbcachefs/btree_key_cache_types.h34
-rw-r--r--libbcachefs/btree_locking.c887
-rw-r--r--libbcachefs/btree_locking.h437
-rw-r--r--libbcachefs/btree_node_scan.c559
-rw-r--r--libbcachefs/btree_node_scan.h11
-rw-r--r--libbcachefs/btree_node_scan_types.h32
-rw-r--r--libbcachefs/btree_trans_commit.c1117
-rw-r--r--libbcachefs/btree_types.h741
-rw-r--r--libbcachefs/btree_update.c898
-rw-r--r--libbcachefs/btree_update.h471
-rw-r--r--libbcachefs/btree_update_interior.c3235
-rw-r--r--libbcachefs/btree_update_interior.h324
-rw-r--r--libbcachefs/btree_update_leaf.c749
-rw-r--r--libbcachefs/btree_write_buffer.c854
-rw-r--r--libbcachefs/btree_write_buffer.h106
-rw-r--r--libbcachefs/btree_write_buffer_types.h59
-rw-r--r--libbcachefs/buckets.c1855
-rw-r--r--libbcachefs/buckets.h411
-rw-r--r--libbcachefs/buckets_types.h111
-rw-r--r--libbcachefs/buckets_waiting_for_journal.c175
-rw-r--r--libbcachefs/buckets_waiting_for_journal.h15
-rw-r--r--libbcachefs/buckets_waiting_for_journal_types.h23
-rw-r--r--libbcachefs/chardev.c475
-rw-r--r--libbcachefs/chardev.h3
-rw-r--r--libbcachefs/checksum.c693
-rw-r--r--libbcachefs/checksum.h116
-rw-r--r--libbcachefs/clock.c144
-rw-r--r--libbcachefs/clock.h32
-rw-r--r--libbcachefs/clock_types.h9
-rw-r--r--libbcachefs/compress.c480
-rw-r--r--libbcachefs/compress.h56
-rw-r--r--libbcachefs/darray.c38
-rw-r--r--libbcachefs/darray.h103
-rw-r--r--libbcachefs/data_update.c761
-rw-r--r--libbcachefs/data_update.h55
-rw-r--r--libbcachefs/debug.c889
-rw-r--r--libbcachefs/debug.h38
-rw-r--r--libbcachefs/dirent.c597
-rw-r--r--libbcachefs/dirent.h72
-rw-r--r--libbcachefs/dirent_format.h42
-rw-r--r--libbcachefs/disk_accounting.c1010
-rw-r--r--libbcachefs/disk_accounting.h272
-rw-r--r--libbcachefs/disk_accounting_format.h167
-rw-r--r--libbcachefs/disk_accounting_types.h19
-rw-r--r--libbcachefs/disk_groups.c349
-rw-r--r--libbcachefs/disk_groups.h44
-rw-r--r--libbcachefs/disk_groups_format.h21
-rw-r--r--libbcachefs/disk_groups_types.h18
-rw-r--r--libbcachefs/ec.c2510
-rw-r--r--libbcachefs/ec.h271
-rw-r--r--libbcachefs/ec_format.h26
-rw-r--r--libbcachefs/ec_types.h42
-rw-r--r--libbcachefs/errcode.c71
-rw-r--r--libbcachefs/errcode.h316
-rw-r--r--libbcachefs/error.c546
-rw-r--r--libbcachefs/error.h235
-rw-r--r--libbcachefs/extent_update.c173
-rw-r--r--libbcachefs/extent_update.h12
-rw-r--r--libbcachefs/extents.c3153
-rw-r--r--libbcachefs/extents.h892
-rw-r--r--libbcachefs/extents_format.h284
-rw-r--r--libbcachefs/extents_types.h26
-rw-r--r--libbcachefs/eytzinger.c305
-rw-r--r--libbcachefs/eytzinger.h153
-rw-r--r--libbcachefs/fifo.h25
-rw-r--r--libbcachefs/fs-common.c631
-rw-r--r--libbcachefs/fs-common.h47
-rw-r--r--libbcachefs/fs-io-buffered.c1101
-rw-r--r--libbcachefs/fs-io-buffered.h27
-rw-r--r--libbcachefs/fs-io-direct.c725
-rw-r--r--libbcachefs/fs-io-direct.h16
-rw-r--r--libbcachefs/fs-io-pagecache.c823
-rw-r--r--libbcachefs/fs-io-pagecache.h176
-rw-r--r--libbcachefs/fs-io.c3180
-rw-r--r--libbcachefs/fs-io.h175
-rw-r--r--libbcachefs/fs-ioctl.c586
-rw-r--r--libbcachefs/fs-ioctl.h74
-rw-r--r--libbcachefs/fs.c2554
-rw-r--r--libbcachefs/fs.h170
-rw-r--r--libbcachefs/fsck.c3931
-rw-r--r--libbcachefs/fsck.h17
-rw-r--r--libbcachefs/inode.c1498
-rw-r--r--libbcachefs/inode.h282
-rw-r--r--libbcachefs/inode_format.h167
-rw-r--r--libbcachefs/io.c1853
-rw-r--r--libbcachefs/io.h139
-rw-r--r--libbcachefs/io_misc.c542
-rw-r--r--libbcachefs/io_misc.h34
-rw-r--r--libbcachefs/io_read.c1320
-rw-r--r--libbcachefs/io_read.h172
-rw-r--r--libbcachefs/io_types.h145
-rw-r--r--libbcachefs/io_write.c1707
-rw-r--r--libbcachefs/io_write.h109
-rw-r--r--libbcachefs/io_write_types.h97
-rw-r--r--libbcachefs/journal.c1893
-rw-r--r--libbcachefs/journal.h272
-rw-r--r--libbcachefs/journal_io.c2413
-rw-r--r--libbcachefs/journal_io.h70
-rw-r--r--libbcachefs/journal_reclaim.c945
-rw-r--r--libbcachefs/journal_reclaim.h68
-rw-r--r--libbcachefs/journal_sb.c232
-rw-r--r--libbcachefs/journal_sb.h24
-rw-r--r--libbcachefs/journal_seq_blacklist.c428
-rw-r--r--libbcachefs/journal_seq_blacklist.h23
-rw-r--r--libbcachefs/journal_seq_blacklist_format.h15
-rw-r--r--libbcachefs/journal_types.h261
-rw-r--r--libbcachefs/keylist.c28
-rw-r--r--libbcachefs/keylist.h11
-rw-r--r--libbcachefs/keylist_types.h1
-rw-r--r--libbcachefs/logged_ops.c118
-rw-r--r--libbcachefs/logged_ops.h20
-rw-r--r--libbcachefs/logged_ops_format.h30
-rw-r--r--libbcachefs/lru.c202
-rw-r--r--libbcachefs/lru.h56
-rw-r--r--libbcachefs/lru_format.h25
-rw-r--r--libbcachefs/lz4.h7
-rw-r--r--libbcachefs/lz4_decompress.c277
-rw-r--r--libbcachefs/mean_and_variance.c173
-rw-r--r--libbcachefs/mean_and_variance.h203
-rw-r--r--libbcachefs/migrate.c207
-rw-r--r--libbcachefs/migrate.h1
-rw-r--r--libbcachefs/move.c1466
-rw-r--r--libbcachefs/move.h164
-rw-r--r--libbcachefs/move_types.h26
-rw-r--r--libbcachefs/movinggc.c543
-rw-r--r--libbcachefs/movinggc.h10
-rw-r--r--libbcachefs/nocow_locking.c144
-rw-r--r--libbcachefs/nocow_locking.h50
-rw-r--r--libbcachefs/nocow_locking_types.h20
-rw-r--r--libbcachefs/opts.c710
-rw-r--r--libbcachefs/opts.h722
-rw-r--r--libbcachefs/printbuf.c509
-rw-r--r--libbcachefs/printbuf.h282
-rw-r--r--libbcachefs/quota.c485
-rw-r--r--libbcachefs/quota.h26
-rw-r--r--libbcachefs/quota_format.h47
-rw-r--r--libbcachefs/quota_types.h7
-rw-r--r--libbcachefs/rcu_pending.c652
-rw-r--r--libbcachefs/rcu_pending.h27
-rw-r--r--libbcachefs/rebalance.c761
-rw-r--r--libbcachefs/rebalance.h20
-rw-r--r--libbcachefs/rebalance_format.h53
-rw-r--r--libbcachefs/rebalance_types.h31
-rw-r--r--libbcachefs/recovery.c1243
-rw-r--r--libbcachefs/recovery.h5
-rw-r--r--libbcachefs/recovery_passes.c316
-rw-r--r--libbcachefs/recovery_passes.h18
-rw-r--r--libbcachefs/recovery_passes_types.h74
-rw-r--r--libbcachefs/reflink.c839
-rw-r--r--libbcachefs/reflink.h86
-rw-r--r--libbcachefs/reflink_format.h36
-rw-r--r--libbcachefs/replicas.c1126
-rw-r--r--libbcachefs/replicas.h92
-rw-r--r--libbcachefs/replicas_format.h36
-rw-r--r--libbcachefs/replicas_types.h11
-rw-r--r--libbcachefs/sb-clean.c336
-rw-r--r--libbcachefs/sb-clean.h16
-rw-r--r--libbcachefs/sb-counters.c99
-rw-r--r--libbcachefs/sb-counters.h16
-rw-r--r--libbcachefs/sb-counters_format.h98
-rw-r--r--libbcachefs/sb-downgrade.c438
-rw-r--r--libbcachefs/sb-downgrade.h12
-rw-r--r--libbcachefs/sb-downgrade_format.h17
-rw-r--r--libbcachefs/sb-errors.c176
-rw-r--r--libbcachefs/sb-errors.h21
-rw-r--r--libbcachefs/sb-errors_format.h333
-rw-r--r--libbcachefs/sb-errors_types.h15
-rw-r--r--libbcachefs/sb-members.c532
-rw-r--r--libbcachefs/sb-members.h367
-rw-r--r--libbcachefs/sb-members_format.h121
-rw-r--r--libbcachefs/sb-members_types.h21
-rw-r--r--libbcachefs/seqmutex.h45
-rw-r--r--libbcachefs/siphash.c5
-rw-r--r--libbcachefs/siphash.h1
-rw-r--r--libbcachefs/six.c1009
-rw-r--r--libbcachefs/six.h454
-rw-r--r--libbcachefs/snapshot.c1823
-rw-r--r--libbcachefs/snapshot.h264
-rw-r--r--libbcachefs/snapshot_format.h36
-rw-r--r--libbcachefs/str_hash.h372
-rw-r--r--libbcachefs/subvolume.c691
-rw-r--r--libbcachefs/subvolume.h91
-rw-r--r--libbcachefs/subvolume_format.h35
-rw-r--r--libbcachefs/subvolume_types.h38
-rw-r--r--libbcachefs/super-io.c1433
-rw-r--r--libbcachefs/super-io.h141
-rw-r--r--libbcachefs/super.c1923
-rw-r--r--libbcachefs/super.h204
-rw-r--r--libbcachefs/super_types.h48
-rw-r--r--libbcachefs/sysfs.c1005
-rw-r--r--libbcachefs/sysfs.h15
-rw-r--r--libbcachefs/tests.c811
-rw-r--r--libbcachefs/tests.h3
-rw-r--r--libbcachefs/thread_with_file.c492
-rw-r--r--libbcachefs/thread_with_file.h81
-rw-r--r--libbcachefs/thread_with_file_types.h20
-rw-r--r--libbcachefs/time_stats.c179
-rw-r--r--libbcachefs/time_stats.h160
-rw-r--r--libbcachefs/trace.c13
-rw-r--r--libbcachefs/trace.h1903
-rw-r--r--libbcachefs/two_state_shared_lock.c8
-rw-r--r--libbcachefs/two_state_shared_lock.h58
-rw-r--r--libbcachefs/util.c937
-rw-r--r--libbcachefs/util.h660
-rw-r--r--libbcachefs/varint.c129
-rw-r--r--libbcachefs/varint.h11
-rw-r--r--libbcachefs/vstructs.h19
-rw-r--r--libbcachefs/xattr.c527
-rw-r--r--libbcachefs/xattr.h22
-rw-r--r--libbcachefs/xattr_format.h19
-rw-r--r--linux/atomic64.c15
-rw-r--r--linux/bio.c214
-rw-r--r--linux/bitrev.c37
-rw-r--r--linux/blkdev.c305
-rw-r--r--linux/closure.c178
-rw-r--r--linux/crc64.c56
-rw-r--r--linux/crc64table.h135
-rw-r--r--linux/crypto/chacha20_generic.c12
-rw-r--r--linux/fs.c4
-rw-r--r--linux/fs_parser.c36
-rw-r--r--linux/generic-radix-tree.c229
-rw-r--r--linux/int_sqrt.c71
-rw-r--r--linux/kstrtox.c17
-rw-r--r--linux/kthread.c24
-rw-r--r--linux/llist.c16
-rw-r--r--linux/mempool.c554
-rw-r--r--linux/preempt.c37
-rw-r--r--linux/ratelimit.c69
-rw-r--r--linux/rhashtable.c1081
-rw-r--r--linux/sched.c14
-rw-r--r--linux/semaphore.c191
-rw-r--r--linux/seq_buf.c152
-rw-r--r--linux/shrinker.c141
-rw-r--r--linux/siphash.c552
-rw-r--r--linux/sort.c333
-rw-r--r--linux/string.c80
-rw-r--r--linux/string_helpers.c130
-rw-r--r--linux/timer.c37
-rw-r--r--linux/vsprintf.c21
-rw-r--r--linux/wait.c5
-rw-r--r--linux/workqueue.c67
-rw-r--r--linux/xxhash.c500
-rw-r--r--linux/zstd_compress_module.c157
-rw-r--r--linux/zstd_decompress_module.c103
-rwxr-xr-xmake-release-tarball.sh60
-rwxr-xr-xmkfs.bcachefs4
-rwxr-xr-xmount.bcachefs.sh50
-rw-r--r--nix/fetchnix.nix48
-rw-r--r--nix/nixpkgs.json5
-rw-r--r--nix/nixpkgs.nix9
-rwxr-xr-xnix/update-nixpkgs.sh32
-rw-r--r--packaging/README32
-rw-r--r--packaging/bcachefs-tools.spec75
-rw-r--r--packaging/userspace-rcu.spec238
-rw-r--r--raid/COPYING339
-rw-r--r--raid/check.c185
-rw-r--r--raid/combo.h155
-rw-r--r--raid/cpu.h331
-rw-r--r--raid/gf.h137
-rw-r--r--raid/helper.c94
-rw-r--r--raid/helper.h43
-rw-r--r--raid/int.c556
-rw-r--r--raid/internal.h274
-rw-r--r--raid/intz.c119
-rw-r--r--raid/memory.c154
-rw-r--r--raid/memory.h96
-rw-r--r--raid/module.c473
-rw-r--r--raid/raid.c586
-rw-r--r--raid/raid.h229
-rw-r--r--raid/tables.c14696
-rw-r--r--raid/tag.c145
-rw-r--r--raid/test.c452
-rw-r--r--raid/test.h68
-rw-r--r--raid/x86.c2452
-rw-r--r--raid/x86z.c255
-rw-r--r--rustfmt.toml5
-rw-r--r--src/bcachefs.rs121
-rw-r--r--src/commands/completions.rs18
-rw-r--r--src/commands/list.rs208
-rw-r--r--src/commands/mod.rs52
-rw-r--r--src/commands/mount.rs390
-rw-r--r--src/commands/subvolume.rs103
-rw-r--r--src/key.rs255
-rw-r--r--src/logging.rs48
-rw-r--r--src/wrappers/handle.rs131
-rw-r--r--src/wrappers/mod.rs1
-rw-r--r--udev/64-bcachefs.rules13
 544 files changed, 132501 insertions(+), 41141 deletions(-)
diff --git a/.bcachefs_revision b/.bcachefs_revision
index dddb0443..bbe1be9a 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-eab3b355cf6fcabbf07d7a9032c68e95cab37ad0
+bc01863fb6eff06f7b028e4c5cd8850d21d7f10d
diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 00000000..9270938d
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,34 @@
+root = true
+
+[*]
+charset = utf-8
+end_of_line = lf
+indent_size = 8
+indent_style = tab
+insert_final_newline = true
+max_line_length = 80
+tab_width = 8
+trim_trailing_whitespace = true
+
+[.git/**]
+max_line_length = 72
+
+[*.nix]
+indent_style = space
+indent_size = 2
+
+[*.y{,a}ml]
+indent_size = 2
+indent_style = space
+
+[Cargo.toml]
+indent_style = space
+indent_size = 4
+
+[*.rs]
+indent_style = space
+indent_size = 4
+
+[*.sh]
+indent_size = 4
+indent_style = space
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 00000000..5ace4600
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,6 @@
+version: 2
+updates:
+ - package-ecosystem: "github-actions"
+ directory: "/"
+ schedule:
+ interval: "weekly"
diff --git a/.github/workflows/build-packages.yml b/.github/workflows/build-packages.yml
new file mode 100644
index 00000000..052f366a
--- /dev/null
+++ b/.github/workflows/build-packages.yml
@@ -0,0 +1,90 @@
+on: [push]
+
+name: build
+
+jobs:
+
+ deb:
+ name: bcachefs-tools-deb
+ strategy:
+ matrix:
+ os: [ubuntu-22.04, ubuntu-24.04]
+ runs-on: ${{ matrix.os }}
+ steps:
+ - uses: actions/checkout@v4
+ - name: Disable initramfs update
+ run: sudo sed -i 's/yes/no/g' /etc/initramfs-tools/update-initramfs.conf
+ - name: Disable man-db update
+ run: sudo rm -f /var/lib/man-db/auto-update
+ - name: Install build-deps
+ run: |
+ sudo apt-get update && sudo apt-get -y --no-install-recommends install \
+ valgrind \
+ equivs devscripts
+ grep -q 22.04 /etc/os-release && sed -i -e 's/ systemd-dev,/ systemd,/g' debian/control
+ mk-build-deps
+ sudo apt install ./bcachefs-tools-build-deps_*.deb
+ - name: Setup Rust
+ run: |
+ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal
+ echo "$HOME/.cargo/bin" >> $GITHUB_PATH
+ - name: Make deb
+ run: |
+ make -j`nproc` deb
+ mkdir dist && mv ../*.deb ./dist/
+ - name: Upload deb
+ uses: actions/upload-artifact@v4
+ with:
+ name: bcachefs-tools-deb_${{ matrix.os }}
+ path: dist
+
+ rpm:
+ name: bcachefs-tools-rpm
+ runs-on: ubuntu-latest
+ container:
+ image: docker.io/fedora:latest
+ steps:
+ - uses: actions/checkout@v4
+ - name: Install build essentials
+ run: dnf install -y nodejs make rpmdevtools yum-utils
+ - name: Setup Rust
+ run: |
+ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal
+ echo "$HOME/.cargo/bin" >> $GITHUB_PATH
+ - name: Install build-deps
+ run: dnf builddep -y packaging/bcachefs-tools.spec
+ - name: Make rpm
+ run: |
+ make -j`nproc` rpm
+ mv ${HOME}/rpmbuild ./
+ - name: Upload rpm
+ uses: actions/upload-artifact@v4
+ with:
+ name: bcachefs-tools-rpm
+ path: rpmbuild
+
+ msrv:
+ name: bcachefs-tools-msrv
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ - name: Disable initramfs update
+ run: sudo sed -i 's/yes/no/g' /etc/initramfs-tools/update-initramfs.conf
+ - name: Disable man-db update
+ run: sudo rm -f /var/lib/man-db/auto-update
+ - name: Install build-deps
+ run: |
+ sudo apt-get update && sudo apt-get -y --no-install-recommends install pkg-config libaio-dev libblkid-dev \
+ libkeyutils-dev liblz4-dev libsodium-dev liburcu-dev libzstd-dev \
+ uuid-dev zlib1g-dev valgrind libudev-dev python3-docutils libclang-dev
+ - name: Extract MSRV
+ run: |
+ MSRV=$(cargo metadata --format-version 1 --no-deps |
+ jq -r '.packages[] | select(.name == "bcachefs-tools") | .rust_version')
+ echo "MSRV=$MSRV" >> $GITHUB_ENV
+ - name: Install Rust ${{ env.MSRV }} toolchain
+ run: |
+ rustup install --profile minimal ${{ env.MSRV }}
+ - name: Make
+ run: |
+ CARGO_TOOLCHAIN_VERSION=${{ env.MSRV }} make -j`nproc`
diff --git a/.github/workflows/nix-flake-update.yml b/.github/workflows/nix-flake-update.yml
new file mode 100644
index 00000000..2b65c329
--- /dev/null
+++ b/.github/workflows/nix-flake-update.yml
@@ -0,0 +1,21 @@
+name: update-flake-lock
+on:
+ workflow_dispatch: # allows manual triggering
+ schedule:
+ - cron: '0 0 1 * *' # Run monthly
+ push:
+ paths:
+ - 'flake.nix'
+jobs:
+ lockfile:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+ - name: Install Nix
+ uses: cachix/install-nix-action@v27
+ with:
+ extra_nix_config: |
+ access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
+ - name: Update flake.lock
+ uses: DeterminateSystems/update-flake-lock@v21
diff --git a/.github/workflows/nix-flake.yml b/.github/workflows/nix-flake.yml
new file mode 100644
index 00000000..4dfb6453
--- /dev/null
+++ b/.github/workflows/nix-flake.yml
@@ -0,0 +1,22 @@
+name: "Nix-Tests"
+on:
+ pull_request:
+ push:
+jobs:
+ nix-flake-check:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ - uses: cachix/install-nix-action@v27
+ with:
+ extra_nix_config: |
+ experimental-features = nix-command flakes
+ access-tokens = github.com=${{ secrets.GITHUB_TOKEN }}
+ - uses: cachix/cachix-action@v15
+ with:
+ name: bcachefs-tools
+ # If you chose API tokens for write access OR if you have a private cache
+ authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
+ - run: nix flake show
+ - run: nix flake check --print-build-logs
+ - run: nix build --print-build-logs
diff --git a/.gitignore b/.gitignore
index 7e4ee7a3..2accefbb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,32 @@
/result
-bcachefs
+bcachefs.5
.*
*.o
+*.so
*.d
*.a
tags
+TAGS
cscope*
bcachefs-tools
+compile_commands.json
+
+# dot-files that we don't want to ignore
+!.gitignore
+!.github/dependabot.yml
+!.github/workflows/
+!.editorconfig
+
+bcachefs-principles-of-operation.*
+
+bch_bindgen/Cargo.lock
+
+# will have compiled files and executables
+debug/
+target/
+
+# These are backup files generated by rustfmt
+**/*.rs.bk
+
+# MSVC Windows builds of rustc generate these, which store debugging information
+*.pdb
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 00000000..28bb2eee
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,714 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "aho-corasick"
+version = "1.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "anstream"
+version = "0.6.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle"
+version = "1.0.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1"
+
+[[package]]
+name = "anstyle-parse"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eb47de1e80c2b463c735db5b217a0ddc39d612e7ac9e2e96a5aed1f57616c1cb"
+dependencies = [
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle-query"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d36fc52c7f6c869915e99412912f22093507da8d9e942ceaf66fe4b7c14422a"
+dependencies = [
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8"
+dependencies = [
+ "anstyle",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "anyhow"
+version = "1.0.89"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6"
+
+[[package]]
+name = "bcachefs-tools"
+version = "1.12.0"
+dependencies = [
+ "anyhow",
+ "bch_bindgen",
+ "clap",
+ "clap_complete",
+ "either",
+ "env_logger",
+ "errno 0.2.8",
+ "libc",
+ "log",
+ "owo-colors",
+ "rustix",
+ "strum",
+ "strum_macros",
+ "udev",
+ "uuid",
+ "zeroize",
+]
+
+[[package]]
+name = "bch_bindgen"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "bindgen",
+ "bitfield",
+ "bitflags 1.3.2",
+ "paste",
+ "pkg-config",
+ "uuid",
+]
+
+[[package]]
+name = "bindgen"
+version = "0.69.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
+dependencies = [
+ "bitflags 2.6.0",
+ "cexpr",
+ "clang-sys",
+ "itertools",
+ "lazy_static",
+ "lazycell",
+ "log",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "regex",
+ "rustc-hash",
+ "shlex",
+ "syn",
+ "which",
+]
+
+[[package]]
+name = "bitfield"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2d7e60934ceec538daadb9d8432424ed043a904d8e0243f3c6446bce549a46ac"
+
+[[package]]
+name = "bitflags"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
+
+[[package]]
+name = "bitflags"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
+
+[[package]]
+name = "cc"
+version = "1.1.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2e80e3b6a3ab07840e1cae9b0666a63970dc28e8ed5ffbcdacbfc760c281bfc1"
+dependencies = [
+ "shlex",
+]
+
+[[package]]
+name = "cexpr"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
+dependencies = [
+ "nom",
+]
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "clang-sys"
+version = "1.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
+dependencies = [
+ "glob",
+ "libc",
+ "libloading",
+]
+
+[[package]]
+name = "clap"
+version = "4.5.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8"
+dependencies = [
+ "clap_builder",
+ "clap_derive",
+]
+
+[[package]]
+name = "clap_builder"
+version = "4.5.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "clap_lex",
+ "strsim",
+ "terminal_size",
+]
+
+[[package]]
+name = "clap_complete"
+version = "4.5.33"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9646e2e245bf62f45d39a0f3f36f1171ad1ea0d6967fd114bca72cb02a8fcdfb"
+dependencies = [
+ "clap",
+]
+
+[[package]]
+name = "clap_derive"
+version = "4.5.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "clap_lex"
+version = "0.7.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
+
+[[package]]
+name = "colorchoice"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3fd119d74b830634cea2a0f58bbd0d54540518a14397557951e79340abc28c0"
+
+[[package]]
+name = "either"
+version = "1.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
+
+[[package]]
+name = "env_logger"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4cd405aab171cb85d6735e5c8d9db038c17d3ca007a4d2c25f337935c3d90580"
+dependencies = [
+ "log",
+]
+
+[[package]]
+name = "errno"
+version = "0.2.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1"
+dependencies = [
+ "errno-dragonfly",
+ "libc",
+ "winapi",
+]
+
+[[package]]
+name = "errno"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba"
+dependencies = [
+ "libc",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "errno-dragonfly"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
+dependencies = [
+ "cc",
+ "libc",
+]
+
+[[package]]
+name = "glob"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
+
+[[package]]
+name = "heck"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+
+[[package]]
+name = "home"
+version = "0.5.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5"
+dependencies = [
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
+
+[[package]]
+name = "itertools"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "lazy_static"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
+
+[[package]]
+name = "lazycell"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
+
+[[package]]
+name = "libc"
+version = "0.2.159"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5"
+
+[[package]]
+name = "libloading"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4"
+dependencies = [
+ "cfg-if",
+ "windows-targets",
+]
+
+[[package]]
+name = "libudev-sys"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c8469b4a23b962c1396b9b451dda50ef5b283e8dd309d69033475fa9b334324"
+dependencies = [
+ "libc",
+ "pkg-config",
+]
+
+[[package]]
+name = "linux-raw-sys"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
+
+[[package]]
+name = "log"
+version = "0.4.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
+
+[[package]]
+name = "memchr"
+version = "2.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
+
+[[package]]
+name = "minimal-lexical"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
+
+[[package]]
+name = "nom"
+version = "7.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
+dependencies = [
+ "memchr",
+ "minimal-lexical",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.20.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
+
+[[package]]
+name = "owo-colors"
+version = "4.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fb37767f6569cd834a413442455e0f066d0d522de8630436e2a1761d9726ba56"
+
+[[package]]
+name = "paste"
+version = "1.0.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
+
+[[package]]
+name = "pkg-config"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2"
+
+[[package]]
+name = "prettyplease"
+version = "0.2.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "479cf940fbbb3426c32c5d5176f62ad57549a0bb84773423ba8be9d089f5faba"
+dependencies = [
+ "proc-macro2",
+ "syn",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b3e4daa0dcf6feba26f985457cdf104d4b4256fc5a09547140f3631bb076b19a"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "regex"
+version = "1.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
+
+[[package]]
+name = "rustc-hash"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
+
+[[package]]
+name = "rustix"
+version = "0.38.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8acb788b847c24f28525660c4d7758620a7210875711f79e7f663cc152726811"
+dependencies = [
+ "bitflags 2.6.0",
+ "errno 0.3.9",
+ "libc",
+ "linux-raw-sys",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "rustversion"
+version = "1.0.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6"
+
+[[package]]
+name = "shlex"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
+
+[[package]]
+name = "strsim"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
+
+[[package]]
+name = "strum"
+version = "0.26.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06"
+dependencies = [
+ "strum_macros",
+]
+
+[[package]]
+name = "strum_macros"
+version = "0.26.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "rustversion",
+ "syn",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.79"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "terminal_size"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4f599bd7ca042cfdf8f4512b277c02ba102247820f9d9d4a9f521f496751a6ef"
+dependencies = [
+ "rustix",
+ "windows-sys 0.59.0",
+]
+
+[[package]]
+name = "udev"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ebdbbd670373442a12fe9ef7aeb53aec4147a5a27a00bbc3ab639f08f48191a"
+dependencies = [
+ "libc",
+ "libudev-sys",
+ "pkg-config",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
+
+[[package]]
+name = "utf8parse"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
+
+[[package]]
+name = "uuid"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314"
+
+[[package]]
+name = "which"
+version = "4.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7"
+dependencies = [
+ "either",
+ "home",
+ "once_cell",
+ "rustix",
+]
+
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.59.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+dependencies = [
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
+
+[[package]]
+name = "zeroize"
+version = "1.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde"
+dependencies = [
+ "zeroize_derive",
+]
+
+[[package]]
+name = "zeroize_derive"
+version = "1.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 00000000..25a42e50
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,37 @@
+[package]
+name = "bcachefs-tools"
+version = "1.12.0"
+authors = ["Yuxuan Shui <yshuiv7@gmail.com>", "Kayla Firestack <dev@kaylafire.me>", "Kent Overstreet <kent.overstreet@linux.dev>" ]
+edition = "2021"
+rust-version = "1.77"
+
+[[bin]]
+name = "bcachefs"
+path = "src/bcachefs.rs"
+
+[features]
+fuse = []
+
+[dependencies]
+log = { version = "0.4", features = ["std"] }
+clap = { version = "4.0.32", features = ["derive", "wrap_help"] }
+clap_complete = "4.3.2"
+anyhow = "1.0"
+libc = "0.2.69"
+udev = "0.7.0"
+uuid = "1.2.2"
+errno = "0.2"
+either = "1.5"
+bch_bindgen = { path = "bch_bindgen" }
+strum = { version = "0.26", features = ["derive"] }
+strum_macros = "0.26"
+zeroize = { version = "1", features = ["std", "zeroize_derive"] }
+rustix = { version = "0.38.34", features = ["termios"] }
+owo-colors = "4"
+
+[dependencies.env_logger]
+version = "0.10"
+default-features = false
+
+[profile.release]
+strip = "none"
diff --git a/INSTALL b/INSTALL
deleted file mode 100644
index c20b7d95..00000000
--- a/INSTALL
+++ /dev/null
@@ -1,19 +0,0 @@
-
-Dependencies:
-
- * libattr1
- * libblkid
- * libuuid
- * libscrypt
- * libsodium
- * libkeyutils
- * liburcu
- * pkg-config
- * zlib1g
-
-On debian, you can install these with
- apt install -y pkg-config libblkid-dev uuid-dev libscrypt-dev libsodium-dev
- libkeyutils-dev liburcu-dev zlib1g-dev libzstd-dev libattr1-dev
- libaio-dev liblz4-dev
-
-Then, just make && make install
diff --git a/INSTALL.md b/INSTALL.md
new file mode 100644
index 00000000..6aef12c5
--- /dev/null
+++ b/INSTALL.md
@@ -0,0 +1,88 @@
+Getting started
+---------------
+
+Build dependencies:
+
+ * libaio
+ * libblkid
+ * libclang
+ * libkeyutils
+ * liblz4
+ * libsodium
+ * liburcu
+ * libuuid
+ * libzstd
+ * pkg-config
+ * valgrind
+ * zlib1g
+
+In addition a recent Rust toolchain is required (rustc, cargo), either by using
+[rustup](https://rustup.rs/) or make sure to use a distribution where a recent
+enough rustc is available. Please check `rust-version` in `Cargo.toml` to see
+the minimum supported Rust version (MSRV).
+
+``` shell
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path
+```
+
+Debian (Bullseye or later) and Ubuntu (20.04 or later): you can install these with
+
+``` shell
+apt install -y pkg-config libaio-dev libblkid-dev libkeyutils-dev \
+ liblz4-dev libsodium-dev liburcu-dev libzstd-dev \
+ uuid-dev zlib1g-dev valgrind libudev-dev udev git build-essential \
+ python3 python3-docutils libclang-dev debhelper dh-python
+```
+
+Starting from Debian Trixie and Ubuntu 23.10, you will additionally need:
+```shell
+apt install -y systemd-dev
+```
+
+Fedora: install the "Development tools" group along with:
+```shell
+dnf install -y libaio-devel libsodium-devel \
+ libblkid-devel libzstd-devel zlib-devel userspace-rcu-devel \
+ lz4-devel libuuid-devel valgrind-devel keyutils-libs-devel \
+ findutils udev systemd-devel llvm-devel
+```
+
+Arch: install bcachefs-tools-git from the AUR.
+Or to build from source, install build dependencies with
+```shell
+pacman -S base-devel libaio keyutils libsodium liburcu zstd valgrind llvm
+```
+
+Then, just `make && make install`
+
+
+Experimental features
+---------------------
+
+Experimental fuse support is currently disabled by default. Fuse support is at
+an early stage and may corrupt your filesystem, so it should only be used for
+testing. To enable, you'll also need to add:
+
+* libfuse3 >= 3.7
+
+On Debian/Ubuntu (Bullseye/20.04 or later needed for libfuse >= 3.7):
+```shell
+apt install -y libfuse3-dev
+```
+
+On Fedora (32 or later needed for libfuse >= 3.7):
+```shell
+dnf install -y fuse3-devel
+```
+
+Arch:
+```shell
+pacman -S fuse3
+```
+
+Then, make using the `BCACHEFS_FUSE` environment variable (make clean first if
+previously built without fuse support):
+
+```shell
+BCACHEFS_FUSE=1 make && make install
+```
diff --git a/Makefile b/Makefile
index 21b51c30..68ebb2d6 100644
--- a/Makefile
+++ b/Makefile
@@ -1,104 +1,298 @@
+VERSION=1.13.0
-PREFIX=/usr/local
+PREFIX?=/usr/local
+LIBEXECDIR?=$(PREFIX)/libexec
+PKG_CONFIG?=pkg-config
INSTALL=install
-CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall \
+LN=ln
+.DEFAULT_GOAL=all
+
+ifeq ("$(origin V)", "command line")
+ BUILD_VERBOSE = $(V)
+endif
+ifndef BUILD_VERBOSE
+ BUILD_VERBOSE = 0
+endif
+
+ifeq ($(BUILD_VERBOSE),1)
+ Q =
+ CARGO_CLEAN_ARGS = --verbose
+else
+ Q = @
+ CARGO_CLEAN_ARGS = --quiet
+endif
+
+# Prevent recursive expansions of $(CFLAGS) to avoid repeatedly performing
+# compile tests
+CFLAGS:=$(CFLAGS)
+
+CFLAGS+=-std=gnu11 -O2 -g -MMD -Wall -fPIC \
-Wno-pointer-sign \
+ -Wno-deprecated-declarations \
-fno-strict-aliasing \
- -I. -Iinclude \
+ -fno-delete-null-pointer-checks \
+ -I. -Ic_src -Iinclude -Iraid \
-D_FILE_OFFSET_BITS=64 \
-D_GNU_SOURCE \
-D_LGPL_SOURCE \
-DRCU_MEMBARRIER \
-DZSTD_STATIC_LINKING_ONLY \
+ -DFUSE_USE_VERSION=35 \
-DNO_BCACHEFS_CHARDEV \
-DNO_BCACHEFS_FS \
-DNO_BCACHEFS_SYSFS \
-DVERSION_STRING='"$(VERSION)"' \
+ -D__SANE_USERSPACE_TYPES__ \
$(EXTRA_CFLAGS)
-LDFLAGS+=$(CFLAGS)
-
-VERSION?=$(shell git describe --dirty 2>/dev/null || echo 0.1-nogit)
-CC_VERSION=$(shell $(CC) -v 2>&1|grep -E '(gcc|clang) version')
+# Intentionally not doing the above to $(LDFLAGS) because we rely on
+# recursive expansion here (CFLAGS is not yet completely built by this line)
+LDFLAGS+=$(CFLAGS) $(EXTRA_LDFLAGS)
-ifneq (,$(findstring gcc,$(CC_VERSION)))
- CFLAGS+=-Wno-unused-but-set-variable
+ifdef CARGO_TOOLCHAIN_VERSION
+ CARGO_TOOLCHAIN = +$(CARGO_TOOLCHAIN_VERSION)
endif
-ifneq (,$(findstring clang,$(CC_VERSION)))
- CFLAGS+=-Wno-missing-braces
-endif
+CARGO_ARGS=${CARGO_TOOLCHAIN}
+CARGO=cargo $(CARGO_ARGS)
+CARGO_PROFILE=release
+# CARGO_PROFILE=debug
-ifdef D
- CFLAGS+=-Werror
- CFLAGS+=-DCONFIG_BCACHEFS_DEBUG=y
-endif
+CARGO_BUILD_ARGS=--$(CARGO_PROFILE)
+CARGO_BUILD=$(CARGO) build $(CARGO_BUILD_ARGS)
-PKGCONFIG_LIBS="blkid uuid liburcu libsodium zlib liblz4 libzstd"
+CARGO_CLEAN=$(CARGO) clean $(CARGO_CLEAN_ARGS)
-CFLAGS+=`pkg-config --cflags ${PKGCONFIG_LIBS}`
-LDLIBS+=`pkg-config --libs ${PKGCONFIG_LIBS}`
+include Makefile.compiler
-LDLIBS+=-lm -lpthread -lrt -lscrypt -lkeyutils -laio
+CFLAGS+=$(call cc-disable-warning, unused-but-set-variable)
+CFLAGS+=$(call cc-disable-warning, stringop-overflow)
+CFLAGS+=$(call cc-disable-warning, zero-length-bounds)
+CFLAGS+=$(call cc-disable-warning, missing-braces)
+CFLAGS+=$(call cc-disable-warning, zero-length-array)
+CFLAGS+=$(call cc-disable-warning, shift-overflow)
+CFLAGS+=$(call cc-disable-warning, enum-conversion)
+CFLAGS+=$(call cc-disable-warning, gnu-variable-sized-type-not-at-end)
+export RUSTFLAGS=-C default-linker-libraries
+
+PKGCONFIG_LIBS="blkid uuid liburcu libsodium zlib liblz4 libzstd libudev libkeyutils"
+ifdef BCACHEFS_FUSE
+ PKGCONFIG_LIBS+="fuse3 >= 3.7"
+ CFLAGS+=-DBCACHEFS_FUSE
+ RUSTFLAGS+=--cfg feature="fuse"
+endif
+
+PKGCONFIG_CFLAGS:=$(shell $(PKG_CONFIG) --cflags $(PKGCONFIG_LIBS))
+ifeq (,$(PKGCONFIG_CFLAGS))
+ $(error pkg-config error, command: $(PKG_CONFIG) --cflags $(PKGCONFIG_LIBS))
+endif
+PKGCONFIG_LDLIBS:=$(shell $(PKG_CONFIG) --libs $(PKGCONFIG_LIBS))
+ifeq (,$(PKGCONFIG_LDLIBS))
+ $(error pkg-config error, command: $(PKG_CONFIG) --libs $(PKGCONFIG_LIBS))
+endif
+PKGCONFIG_UDEVDIR:=$(shell $(PKG_CONFIG) --variable=udevdir udev)
+ifeq (,$(PKGCONFIG_UDEVDIR))
+ $(error pkg-config error, command: $(PKG_CONFIG) --variable=udevdir udev)
+endif
+PKGCONFIG_UDEVRULESDIR:=$(PKGCONFIG_UDEVDIR)/rules.d
+
+CFLAGS+=$(PKGCONFIG_CFLAGS)
+LDLIBS+=$(PKGCONFIG_LDLIBS)
+LDLIBS+=-lm -lpthread -lrt -lkeyutils -laio -ldl
+LDLIBS+=$(EXTRA_LDLIBS)
ifeq ($(PREFIX),/usr)
- ROOT_SBINDIR=/sbin
+ ROOT_SBINDIR?=/sbin
INITRAMFS_DIR=$(PREFIX)/share/initramfs-tools
else
- ROOT_SBINDIR=$(PREFIX)/sbin
+ ROOT_SBINDIR?=$(PREFIX)/sbin
INITRAMFS_DIR=/etc/initramfs-tools
endif
+PKGCONFIG_SERVICEDIR:=$(shell $(PKG_CONFIG) --variable=systemdsystemunitdir systemd)
+ifeq (,$(PKGCONFIG_SERVICEDIR))
+ $(warning skipping systemd integration)
+else
+BCACHEFSCK_ARGS=-f -n
+systemd_libexecfiles=\
+ fsck/bcachefsck_fail \
+ fsck/bcachefsck_all
+
+systemd_services=\
+ fsck/bcachefsck_fail@.service \
+ fsck/bcachefsck@.service \
+ fsck/system-bcachefsck.slice \
+ fsck/bcachefsck_all_fail.service \
+ fsck/bcachefsck_all.service \
+ fsck/bcachefsck_all.timer
+
+built_scripts+=\
+ fsck/bcachefsck_fail@.service \
+ fsck/bcachefsck@.service \
+ fsck/bcachefsck_all_fail.service \
+ fsck/bcachefsck_all \
+ fsck/bcachefsck_all.service
+
+%.service: %.service.in
+ @echo " [SED] $@"
+ $(Q)sed -e "s|@libexecdir@|$(LIBEXECDIR)|g" \
+ -e "s|@bcachefsck_args@|$(BCACHEFSCK_ARGS)|g" < $< > $@
+
+fsck/bcachefsck_all: fsck/bcachefsck_all.in
+ @echo " [SED] $@"
+ $(Q)sed -e "s|@bcachefsck_args@|$(BCACHEFSCK_ARGS)|g" < $< > $@
+
+optional_build+=$(systemd_libexecfiles) $(systemd_services)
+optional_install+=install_systemd
+endif # PKGCONFIG_SERVICEDIR
+
.PHONY: all
-all: bcachefs
+all: bcachefs $(optional_build)
+
+.PHONY: debug
+debug: CFLAGS+=-Werror -DCONFIG_BCACHEFS_DEBUG=y -DCONFIG_VALGRIND=y
+debug: bcachefs
+
+.PHONY: TAGS tags
+TAGS:
+ ctags -e -R .
+
+tags:
+ ctags -R .
-SRCS=$(shell find . -type f -iname '*.c')
-DEPS=$(SRCS:.c=.d)
+SRCS:=$(sort $(shell find . -type f ! -path '*/.*/*' -iname '*.c'))
+DEPS:=$(SRCS:.c=.d)
-include $(DEPS)
-OBJS=$(SRCS:.c=.o)
-bcachefs: $(OBJS)
+OBJS:=$(SRCS:.c=.o)
+
+%.o: %.c
+ @echo " [CC] $@"
+ $(Q)$(CC) $(CPPFLAGS) $(CFLAGS) -c -o $@ $<
+
+BCACHEFS_DEPS=libbcachefs.a
+RUST_SRCS:=$(shell find src bch_bindgen/src -type f -iname '*.rs')
+
+bcachefs: $(BCACHEFS_DEPS) $(RUST_SRCS)
+ $(Q)$(CARGO_BUILD)
+
+libbcachefs.a: $(OBJS)
+ @echo " [AR] $@"
+ $(Q)$(AR) -rc $@ $+
# If the version string differs from the last build, update the last version
ifneq ($(VERSION),$(shell cat .version 2>/dev/null))
.PHONY: .version
endif
.version:
- echo '$(VERSION)' > $@
+ @echo " [VERS] $@"
+ $(Q)echo '$(VERSION)' > $@
# Rebuild the 'version' command any time the version string changes
cmd_version.o : .version
.PHONY: install
-install: bcachefs
- mkdir -p $(DESTDIR)$(ROOT_SBINDIR)
- mkdir -p $(DESTDIR)$(PREFIX)/share/man/man8/
- $(INSTALL) -m0755 bcachefs $(DESTDIR)$(ROOT_SBINDIR)
- $(INSTALL) -m0755 fsck.bcachefs $(DESTDIR)$(ROOT_SBINDIR)
- $(INSTALL) -m0755 mkfs.bcachefs $(DESTDIR)$(ROOT_SBINDIR)
- $(INSTALL) -m0755 -D initramfs/hook $(DESTDIR)$(INITRAMFS_DIR)/hooks/bcachefs
- echo "copy_exec $(ROOT_SBINDIR)/bcachefs /sbin/bcachefs" >> $(DESTDIR)$(INITRAMFS_DIR)/hooks/bcachefs
- $(INSTALL) -m0755 -D initramfs/script $(DESTDIR)$(INITRAMFS_DIR)/scripts/local-premount/bcachefs
- $(INSTALL) -m0644 bcachefs.8 $(DESTDIR)$(PREFIX)/share/man/man8/
+install: INITRAMFS_HOOK=$(INITRAMFS_DIR)/hooks/bcachefs
+install: INITRAMFS_SCRIPT=$(INITRAMFS_DIR)/scripts/local-premount/bcachefs
+install: bcachefs $(optional_install)
+ $(INSTALL) -m0755 -D target/release/bcachefs -t $(DESTDIR)$(ROOT_SBINDIR)
+ $(INSTALL) -m0644 -D bcachefs.8 -t $(DESTDIR)$(PREFIX)/share/man/man8/
+ $(INSTALL) -m0755 -D initramfs/script $(DESTDIR)$(INITRAMFS_SCRIPT)
+ $(INSTALL) -m0755 -D initramfs/hook $(DESTDIR)$(INITRAMFS_HOOK)
+ $(INSTALL) -m0644 -D udev/64-bcachefs.rules -t $(DESTDIR)$(PKGCONFIG_UDEVRULESDIR)/
+ $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mkfs.bcachefs
+ $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/fsck.bcachefs
+ $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mount.bcachefs
+ $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mkfs.fuse.bcachefs
+ $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/fsck.fuse.bcachefs
+ $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mount.fuse.bcachefs
+
+ sed -i '/^# Note: make install replaces/,$$d' $(DESTDIR)$(INITRAMFS_HOOK)
+ echo "copy_exec $(ROOT_SBINDIR)/bcachefs /sbin/bcachefs" >> $(DESTDIR)$(INITRAMFS_HOOK)
+ echo "copy_exec $(ROOT_SBINDIR)/mount.bcachefs /sbin/mount.bcachefs" >> $(DESTDIR)$(INITRAMFS_HOOK)
+
+.PHONY: install_systemd
+install_systemd: $(systemd_services) $(systemd_libexecfiles)
+ $(INSTALL) -m0755 -D $(systemd_libexecfiles) -t $(DESTDIR)$(LIBEXECDIR)
+ $(INSTALL) -m0644 -D $(systemd_services) -t $(DESTDIR)$(PKGCONFIG_SERVICEDIR)
.PHONY: clean
clean:
- $(RM) bcachefs $(OBJS) $(DEPS)
+ @echo "Cleaning all"
+ $(Q)$(RM) libbcachefs.a c_src/libbcachefs.a .version *.tar.xz $(OBJS) $(DEPS) $(DOCGENERATED)
+ $(Q)$(CARGO_CLEAN)
+ $(Q)$(RM) -f $(built_scripts)
.PHONY: deb
deb: all
-# --unsigned-source --unsigned-changes --no-pre-clean --build=binary
-# --diff-ignore --tar-ignore
debuild -us -uc -nc -b -i -I
-.PHONE: update-bcachefs-sources
+.PHONY: rpm
+rpm: clean
+ rpmbuild --build-in-place -bb --define "_version $(subst -,_,$(VERSION))" packaging/bcachefs-tools.spec
+
+bcachefs-principles-of-operation.pdf: doc/bcachefs-principles-of-operation.tex
+ pdflatex doc/bcachefs-principles-of-operation.tex
+ pdflatex doc/bcachefs-principles-of-operation.tex
+
+doc: bcachefs-principles-of-operation.pdf
+
+.PHONY: cargo-update-msrv
+cargo-update-msrv:
+ cargo +nightly generate-lockfile -Zmsrv-policy
+ cargo +nightly generate-lockfile --manifest-path bch_bindgen/Cargo.toml -Zmsrv-policy
+
+.PHONY: update-bcachefs-sources
update-bcachefs-sources:
git rm -rf --ignore-unmatch libbcachefs
+ test -d libbcachefs || mkdir libbcachefs
cp $(LINUX_DIR)/fs/bcachefs/*.[ch] libbcachefs/
- cp $(LINUX_DIR)/include/trace/events/bcachefs.h include/trace/events/
- echo `cd $(LINUX_DIR); git rev-parse HEAD` > .bcachefs_revision
- git add libbcachefs/*.[ch] include/trace/events/bcachefs.h .bcachefs_revision
+ git add libbcachefs/*.[ch]
+ git rm -f libbcachefs/mean_and_variance_test.c
+ cp $(LINUX_DIR)/include/linux/closure.h include/linux/
+ git add include/linux/closure.h
+ cp $(LINUX_DIR)/lib/closure.c linux/
+ git add linux/closure.c
+ cp $(LINUX_DIR)/include/linux/xxhash.h include/linux/
+ git add include/linux/xxhash.h
+ cp $(LINUX_DIR)/lib/xxhash.c linux/
+ git add linux/xxhash.c
+ cp $(LINUX_DIR)/include/linux/list_nulls.h include/linux/
+ git add include/linux/list_nulls.h
+ cp $(LINUX_DIR)/include/linux/poison.h include/linux/
+ git add include/linux/poison.h
+ cp $(LINUX_DIR)/include/linux/generic-radix-tree.h include/linux/
+ git add include/linux/generic-radix-tree.h
+ cp $(LINUX_DIR)/lib/generic-radix-tree.c linux/
+ git add linux/generic-radix-tree.c
+ cp $(LINUX_DIR)/include/linux/kmemleak.h include/linux/
+ git add include/linux/kmemleak.h
+ cp $(LINUX_DIR)/lib/math/int_sqrt.c linux/
+ git add linux/int_sqrt.c
+ cp $(LINUX_DIR)/scripts/Makefile.compiler ./
+ git add Makefile.compiler
+ $(RM) libbcachefs/*.mod.c
+ git -C $(LINUX_DIR) rev-parse HEAD | tee .bcachefs_revision
+ git add .bcachefs_revision
-.PHONE: update-commit-bcachefs-sources
+
+.PHONY: update-commit-bcachefs-sources
update-commit-bcachefs-sources: update-bcachefs-sources
- git commit -m "Update bcachefs sources to `cd $(LINUX_DIR); git show --oneline --no-patch`"
+ git commit -m "Update bcachefs sources to $(shell git -C $(LINUX_DIR) show --oneline --no-patch)"
+
+SRCTARXZ = bcachefs-tools-$(VERSION).tar.xz
+SRCDIR=bcachefs-tools-$(VERSION)
+
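+# Source tarball assembled from the git-tracked files recorded in .gitcensus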
+.PHONY: tarball
+tarball: $(SRCTARXZ)
+
+$(SRCTARXZ) : .gitcensus
+ $(Q)tar --transform "s,^,$(SRCDIR)/," -Jcf $(SRCDIR).tar.xz \
+ `cat .gitcensus`
+ @echo Wrote: $@
+
+.PHONY: .gitcensus
+.gitcensus:
+ $(Q)if test -d .git; then \
+ git ls-files > .gitcensus; \
+ fi
diff --git a/Makefile.compiler b/Makefile.compiler
new file mode 100644
index 00000000..e0842496
--- /dev/null
+++ b/Makefile.compiler
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# cc-cross-prefix
+# Usage: CROSS_COMPILE := $(call cc-cross-prefix, m68k-linux-gnu- m68k-linux-)
+# Return first <prefix> where a <prefix>gcc is found in PATH.
+# If no gcc found in PATH with listed prefixes return nothing
+#
+# Note: '2>/dev/null' is here to force Make to invoke a shell. Otherwise, it
+# would try to directly execute the shell builtin 'command'. This workaround
+# should be kept for a long time since this issue was fixed only after the
+# GNU Make 4.2.1 release.
+cc-cross-prefix = $(firstword $(foreach c, $(1), \
+ $(if $(shell command -v -- $(c)gcc 2>/dev/null), $(c))))
+
+# output directory for tests below
+TMPOUT = $(if $(KBUILD_EXTMOD),$(firstword $(KBUILD_EXTMOD))/).tmp_$$$$
+
+# try-run
+# Usage: option = $(call try-run, $(CC)...-o "$$TMP",option-ok,otherwise)
+# Exit code chooses option. "$$TMP" serves as a temporary file and is
+# automatically cleaned up.
+try-run = $(shell set -e; \
+ TMP=$(TMPOUT)/tmp; \
+ trap "rm -rf $(TMPOUT)" EXIT; \
+ mkdir -p $(TMPOUT); \
+ if ($(1)) >/dev/null 2>&1; \
+ then echo "$(2)"; \
+ else echo "$(3)"; \
+ fi)
+
+# as-option
+# Usage: aflags-y += $(call as-option,-Wa$(comma)-isa=foo,)
+
+as-option = $(call try-run,\
+ $(CC) -Werror $(KBUILD_CPPFLAGS) $(KBUILD_AFLAGS) $(1) -c -x assembler-with-cpp /dev/null -o "$$TMP",$(1),$(2))
+
+# as-instr
+# Usage: aflags-y += $(call as-instr,instr,option1,option2)
+
+as-instr = $(call try-run,\
+ printf "%b\n" "$(1)" | $(CC) -Werror $(CLANG_FLAGS) $(KBUILD_AFLAGS) -Wa$(comma)--fatal-warnings -c -x assembler-with-cpp -o "$$TMP" -,$(2),$(3))
+
+# __cc-option
+# Usage: MY_CFLAGS += $(call __cc-option,$(CC),$(MY_CFLAGS),-march=winchip-c6,-march=i586)
+__cc-option = $(call try-run,\
+ $(1) -Werror $(2) $(3) -c -x c /dev/null -o "$$TMP",$(3),$(4))
+
+# cc-option
+# Usage: cflags-y += $(call cc-option,-march=winchip-c6,-march=i586)
+
+cc-option = $(call __cc-option, $(CC),\
+ $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS),$(1),$(2))
+
+# cc-option-yn
+# Usage: flag := $(call cc-option-yn,-march=winchip-c6)
+cc-option-yn = $(if $(call cc-option,$1),y,n)
+
+# cc-disable-warning
+# Usage: cflags-y += $(call cc-disable-warning,unused-but-set-variable)
+cc-disable-warning = $(if $(call cc-option,-W$(strip $1)),-Wno-$(strip $1))
+
+# gcc-min-version
+# Usage: cflags-$(call gcc-min-version, 70100) += -foo
+gcc-min-version = $(call test-ge, $(CONFIG_GCC_VERSION), $1)
+
+# clang-min-version
+# Usage: cflags-$(call clang-min-version, 110000) += -foo
+clang-min-version = $(call test-ge, $(CONFIG_CLANG_VERSION), $1)
+
+# ld-option
+# Usage: KBUILD_LDFLAGS += $(call ld-option, -X, -Y)
+ld-option = $(call try-run, $(LD) $(KBUILD_LDFLAGS) $(1) -v,$(1),$(2),$(3))
+
+# __rustc-option
+# Usage: MY_RUSTFLAGS += $(call __rustc-option,$(RUSTC),$(MY_RUSTFLAGS),-Cinstrument-coverage,-Zinstrument-coverage)
+# TODO: remove RUSTC_BOOTSTRAP=1 when we raise the minimum GNU Make version to 4.4
+__rustc-option = $(call try-run,\
+ echo '#![allow(missing_docs)]#![feature(no_core)]#![no_core]' | RUSTC_BOOTSTRAP=1\
+ $(1) --sysroot=/dev/null $(filter-out --sysroot=/dev/null,$(2)) $(3)\
+ --crate-type=rlib --out-dir=$(TMPOUT) --emit=obj=- - >/dev/null,$(3),$(4))
+
+# rustc-option
+# Usage: rustflags-y += $(call rustc-option,-Cinstrument-coverage,-Zinstrument-coverage)
+rustc-option = $(call __rustc-option, $(RUSTC),\
+ $(KBUILD_RUSTFLAGS),$(1),$(2))
+
+# rustc-option-yn
+# Usage: flag := $(call rustc-option-yn,-Cinstrument-coverage)
+rustc-option-yn = $(if $(call rustc-option,$1),y,n)
diff --git a/README b/README
deleted file mode 100644
index 3d2641e4..00000000
--- a/README
+++ /dev/null
@@ -1,12 +0,0 @@
-Userspace tools for bcachefs
-
-This builds the bcachefs tool, which has a number of subcommands for formatting
-and managing bcachefs filesystems:
-
-bcachefs format
-bcachefs unlock
-bcachefs assemble
-bcachefs incremental
-etc.
-
-Run bcachefs --help for full list of commands.
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..f95defa5
--- /dev/null
+++ b/README.md
@@ -0,0 +1,37 @@
+bcachefs-tools
+==============
+Userspace tools and docs for bcachefs
+
+Bcachefs is an advanced new filesystem for Linux, with an emphasis on reliability and robustness
+and the complete set of features one would expect from a modern filesystem.
+
+This repo primarily consists of the following:
+
+- bcachefs tool, the reason this repo exists.
+- {mkfs,mount,fsck}.bcachefs utils, which are just wrappers that call the corresponding subcommands
+  in the main tool
+- docs in the form of man-pages and a user manual
+
+Please refer to the main site for [getting started](https://bcachefs.org/#Getting_started).
+An in-depth user manual is also available on the [official website](https://bcachefs.org/#Documentation).
+
+Version semantics
+-----------------
+
+The tools rely on an expected disk format structure, which is reflected by your current kernel version.
+The disk format can be upgraded or downgraded automatically by the kernel, if needed.
+
+- Any patch-level change means no disk format change
+- Any minor-level change means a potential disk format change which **is not breaking**
+- Any major-level change means **breaking changes**
+
+Build and install
+-----------------
+
+Refer to [INSTALL.md](./INSTALL.md)
+
+Bug reports and contributions
+-----------------------------
+
+- The official mailing list, linux-bcachefs@vger.kernel.org
+- IRC: #bcache on OFTC (irc.oftc.net). Note, however, that messages there can easily be missed.
diff --git a/bcachefs b/bcachefs
new file mode 120000
index 00000000..a0c00b7f
--- /dev/null
+++ b/bcachefs
@@ -0,0 +1 @@
+target/release/bcachefs \ No newline at end of file
diff --git a/bcachefs.8 b/bcachefs.8
index f3fd1011..f7e39f0b 100644
--- a/bcachefs.8
+++ b/bcachefs.8
@@ -1,4 +1,4 @@
-.Dd May 26, 2018
+.Dd November 17, 2023
.Dt BCACHEFS 8 SMM
.Os
.Sh NAME
@@ -20,30 +20,26 @@ which are documented in detail below:
Format one or a list of devices with bcachefs data structures.
.It Ic show-super
Dump superblock information to stdout.
+.It Ic set-fs-option
+Set a filesystem option
+.El
+.Ss Mount commands
+.Bl -tag -width 18n -compact
+.It Ic mount
+Mount a filesystem.
.El
.Ss Repair commands
.Bl -tag -width 18n -compact
.It Ic fsck
Check an existing filesystem for errors.
.El
-.Ss Startup/shutdown, assembly of multi device filesystems
-.Bl -tag -width 18n -compact
-.It Ic assemble
-Assemble an existing multi device filesystem
-.It Ic incremental
-Incrementally assemble an existing multi device filesystem
-.It Ic run
-Start a partially assembled filesystem.
-.It Ic stop
-Stop a running filesystem.
-.El
.Ss Commands for managing a running filesystem
.Bl -tag -width 18n -compact
.It Ic fs usage
Show disk usage
.El
.Ss Commands for managing devices within a running filesystem
-.Bl -tag -width 18n -compact
+.Bl -tag -width 22n -compact
.It Ic device add
Add a new device to an existing filesystem
.It Ic device remove
@@ -58,11 +54,24 @@ Migrate data off of a specific device
Mark a device as failed
.It Ic device resize
Resize filesystem on a device
+.It Ic device resize-journal
+Resize journal on a device
+.El
+.Ss Commands for managing subvolumes and snapshots
+.Bl -tag -width 18n -compact
+.It Ic subvolume create
+Create a new subvolume
+.It Ic subvolume delete
+Delete an existing subvolume
+.It Ic subvolume snapshot
+Create a snapshot
.El
.Ss Commands for managing filesystem data
.Bl -tag -width 18n -compact
.It Ic data rereplicate
Rereplicate degraded data
+.It Ic data job
+Kick off low level data jobs
.El
.Ss Commands for encryption
.Bl -tag -width 18n -compact
@@ -80,17 +89,30 @@ Migrate an existing filesystem to bcachefs, in place
.It Ic migrate-superblock
Add default superblock, after bcachefs migrate
.El
+.Ss Commands for operating on files in a bcachefs filesystem
+.Bl -tag -width 18n -compact
+.It Ic setattr
+Set various per file attributes
+.El
.Ss Commands for debugging
.Bl -tag -width 18n -compact
.It Ic dump
Dump filesystem metadata to a qcow2 image
.It Ic list
List filesystem metadata in textual form
+.It Ic list_journal
+List contents of journal
+.El
+.Ss FUSE commands
+.Bl -tag -width 18n -compact
+.It Ic fusemount
+Mount a filesystem via FUSE
.El
.Ss Miscellaneous commands
.Bl -tag -width 18n -compact
.It Ic version
Display the version of the invoked bcachefs tool
+.It Ic completions
+Generate shell completions
.El
.Sh Superblock commands
.Bl -tag -width Ds
@@ -99,25 +121,82 @@ Format one or a list of devices with bcachefs data structures.
You need to do this before you create a volume.
.Pp
Device specific options must come before corresponding devices, e.g.
-.Dl bcachefs format --tier 0 /dev/sdb --tier 1 /dev/sdc
+.Dl bcachefs format --label=ssd /dev/sda --label=hdd /dev/sdb
.Bl -tag -width Ds
-.It Fl b , Fl -block Ns = Ns Ar size
+.It Fl -block_size Ns = Ns Ar size
block size, in bytes (e.g. 4k)
-.It Fl -btree_node Ns = Ns Ar size
+.It Fl -btree_node_size Ns = Ns Ar size
Btree node size, default 256k
-.It Fl -metadata_checksum_type Ns = Ns ( Cm none | crc32c | crc64 )
+.It Fl -errors Ns = Ns ( Cm continue | ro | panic )
+Action to take on filesystem error
+.It Fl -data_replicas Ns = Ns Ar number
+Number of data replicas
+.It Fl -metadata_replicas Ns = Ns Ar number
+Number of metadata replicas
+.It Fl -data_replicas_required Ns = Ns Ar number
+
+.It Fl -metadata_replicas_required Ns = Ns Ar number
+
+.It Fl -encoded_extent_max Ns = Ns Ar size
+Maximum size of checksummed/compressed extents
+.It Fl -metadata_checksum Ns = Ns ( Cm none | crc32c | crc64 | xxhash )
Set metadata checksum type (default:
.Cm crc32c ) .
-.It Fl -data_checksum_type Ns = Ns ( Cm none | crc32c | crc64 )
+.It Fl -data_checksum Ns = Ns ( Cm none | crc32c | crc64 | xxhash )
Set data checksum type (default:
.Cm crc32c ) .
-.It Fl -compression_type Ns = Ns ( Cm none | lz4 | gzip )
+.It Fl -compression Ns = Ns ( Cm none | lz4 | gzip | zstd )
Set compression type (default:
.Cm none ) .
-.It Fl -data_replicas Ns = Ns Ar number
-Number of data replicas
-.It Fl -metadata_replicas Ns = Ns Ar number
-Number of metadata replicas
+.It Fl -background_compression Ns = Ns ( Cm none | lz4 | gzip | zstd )
+
+.It Fl -str_hash Ns = Ns ( Cm crc32c | crc64 | siphash )
+Hash function for directory entries and xattrs
+.It Fl -metadata_target Ns = Ns Ar target
+Device or label for metadata writes
+.It Fl -foreground_target Ns = Ns Ar target
+Device or label for foreground writes
+.It Fl -background_target Ns = Ns Ar target
+Device or label to move data to in the background
+.It Fl -promote_target Ns = Ns Ar target
+Device or label to promote data to on read
+.It Fl -erasure_code
+Enable erasure coding (DO NOT USE YET)
+.It Fl -inodes_32bit
+Constrain inode numbers to 32 bits
+.It Fl -shard_inode_numbers
+Shard new inode numbers by CPU id
+.It Fl -inodes_use_key_cache
+Use the btree key cache for the inodes btree
+.It Fl -gc_reserve_percent Ns = Ns Ar percentage
+Percentage of disk space to reserve for copygc
+.It Fl -gc_reserve_bytes Ns = Ns Ar bytes
+Amount of disk space to reserve for copygc
+.sp
+This takes precedence over
+.Cm gc_reserve_percent
+if set
+.It Fl -root_reserve_percent Ns = Ns Ar percentage
+Percentage of disk space to reserve for superuser
+.It Fl -wide_macs
+Store the full 128 bits of cryptographic MACs, instead of 80
+.It Fl -acl
+Enable POSIX acls
+.It Fl -usrquota
+Enable user quotas
+.It Fl -grpquota
+Enable group quotas
+.It Fl -prjquota
+Enable project quotas
+.It Fl -journal_transaction_names
+Log transaction function names in journal
+.It Fl -nocow
+Nocow mode: Writes will be done in place when possible.
+.sp
+Snapshots and reflink will still cause writes to be COW.
+.sp
+This flag implicitly disables data checksumming, compression and
+encryption.
.It Fl -replicas Ns = Ns Ar number
Sets both data and metadata replicas
.It Fl -encrypted
@@ -125,22 +204,20 @@ Enable whole filesystem encryption (chacha20/poly1305);
passphrase will be prompted for.
.It Fl -no_passphrase
Don't encrypt master encryption key
-.It Fl -error_action Ns = Ns ( Cm continue | remount-ro | panic )
-Action to take on filesystem error (default:
-.Cm remount-ro )
-.It Fl L , Fl -label Ns = Ns Ar label
+.It Fl L , Fl -fs_label Ns = Ns Ar label
Create the filesystem with the specified
.Ar label
.It Fl U , -uuid Ns = Ns Ar uuid
Create the filesystem with the specified
.Ar uuid
-.It Fl f , Fl -force
-Force the filesystem to be created,
-even if the device already contains a filesystem.
+.It Fl -superblock_size Ns = Ns Ar size
+
.El
.Pp
Device specific options:
.Bl -tag -width Ds
+.It Fl -discard
+Enable discard/TRIM support
.It Fl -fs_size Ns = Ns Ar size
Create the filesystem using
.Ar size
@@ -148,16 +225,20 @@ bytes on the subsequent device.
.It Fl -bucket Ns = Ns Ar size
Specifies the bucket size;
must be greater than the btree node size
-.It Fl -discard
-Enable discards on subsequent devices
-.It Fl t , Fl -tier Ar index
-Specifies the tier of subsequent devices, where
-.Ar index
-is a small integer and a smaller index indicates a faster tier; tier 0
-being the fastest.
-Currently only two tiers are supported.
+.It Fl -durability Ns = Ns Ar n
+Data written to this device will be considered
+to have already been replicated
+.Ar n
+times
+.It Fl l , Fl -label
+Disk label
+.It Fl f , Fl -force
+Force the filesystem to be created,
+even if the device already contains a filesystem.
.It Fl q , Fl -quiet
Only print errors
+.It Fl v , Fl -verbose
+Verbose filesystem initialization
.El
.It Nm Ic show-super Oo Ar options Oc Ar device
Dump superblock information to stdout.
@@ -167,6 +248,169 @@ List of sections to print
.It Fl l , Fl -layout
Print superblock layout
.El
+.It Nm Ic set-fs-option Oo Ar options Oc Ar device
+.Bl -tag -width Ds
+.It Fl -errors Ns = Ns ( Cm continue | ro | panic )
+Action to take on filesystem error
+.It Fl -metadata_replicas Ns = Ns Ar number
+Number of metadata replicas
+.It Fl -data_replicas Ns = Ns Ar number
+Number of data replicas
+.It Fl -metadata_replicas_required Ns = Ns Ar number
+
+.It Fl -data_replicas_required Ns = Ns Ar number
+
+.It Fl -metadata_checksum Ns = Ns ( Cm none | crc32c | crc64 | xxhash )
+Set metadata checksum type (default:
+.Cm crc32c ) .
+.It Fl -data_checksum Ns = Ns ( Cm none | crc32c | crc64 | xxhash )
+Set data checksum type (default:
+.Cm crc32c ) .
+.It Fl -compression Ns = Ns ( Cm none | lz4 | gzip | zstd )
+Set compression type (default:
+.Cm none ) .
+.It Fl -background_compression Ns = Ns ( Cm none | lz4 | gzip | zstd )
+
+.It Fl -str_hash Ns = Ns ( Cm crc32c | crc64 | siphash )
+Hash function for directory entries and xattrs
+.It Fl -metadata_target Ns = Ns Ar target
+Device or label for metadata writes
+.It Fl -foreground_target Ns = Ns Ar target
+Device or label for foreground writes
+.It Fl -background_target Ns = Ns Ar target
+Device or label to move data to in the background
+.It Fl -promote_target Ns = Ns Ar target
+Device or label to promote data to on read
+.It Fl -erasure_code
+Enable erasure coding (DO NOT USE YET)
+.It Fl -inodes_32bit
+Constrain inode numbers to 32 bits
+.It Fl -shard_inode_numbers
+Shard new inode numbers by CPU id
+.It Fl -inodes_use_key_cache
+Use the btree key cache for the inodes btree
+.It Fl -gc_reserve_percent Ns = Ns Ar percentage
+Percentage of disk space to reserve for copygc
+.It Fl -gc_reserve_bytes Ns = Ns Ar bytes
+Amount of disk space to reserve for copygc
+.sp
+This takes precedence over
+.Cm gc_reserve_percent
+if set
+.It Fl -root_reserve_percent Ns = Ns Ar percentage
+Percentage of disk space to reserve for superuser
+.It Fl -wide_macs
+Store the full 128 bits of cryptographic MACs, instead of 80
+.It Fl -acl
+Enable POSIX acls
+.It Fl -usrquota
+Enable user quotas
+.It Fl -grpquota
+Enable group quotas
+.It Fl -prjquota
+Enable project quotas
+.It Fl -degraded
+Allow mounting in degraded mode
+.It Fl -very_degraded
+Allow mounting when data will be missing
+.It Fl -discard
+Enable discard/TRIM support
+.It Fl -verbose
+Extra debugging information during mount/recovery
+.It Fl -journal_flush_delay Ns = Ns Ar ms
+Delay in milliseconds before automatic journal commits
+.It Fl -journal_flush_disabled
+Disable journal flush on sync/fsync
+.sp
+If enabled, writes can be lost, but only since the
+last journal write (default 1 second)
+.It Fl -journal_reclaim_delay Ns = Ns Ar ms
+Delay in milliseconds before automatic journal reclaim
+.It Fl -move_bytes_in_flight Ns = Ns Ar bytes
+Maximum amount of IO to keep in flight by the move path
+.It Fl -move_ios_in_flight Ns = Ns Ar number
+Maximum number of IOs to keep in flight by the move path
+.It Fl -fsck
+Run fsck on mount
+.It Fl -fix_errors Ns = Ns Ar error
+Fix errors during fsck without asking
+.It Fl -ratelimit_errors
+Ratelimit error messages during fsck
+.It Fl -nochanges
+Super read only mode - no writes at all will be issued,
+even if we have to replay the journal
+.It Fl -norecovery
+Don't replay the journal
+.It Fl -journal_transaction_names
+Log transaction function names in journal
+.It Fl -noexcl
+Don't open device in exclusive mode
+.It Fl -direct_io
+Use O_DIRECT (userspace only)
+.It Fl -sb Ns = Ns Ar offset
+Sector offset of superblock
+.It Fl -reconstruct_alloc
+Reconstruct alloc btree
+.It Fl -version_upgrade Ns = Ns ( Cm compatible | incompatible | none )
+Set superblock to latest version, allowing any new features
+to be used
+.It Fl -nocow
+Nocow mode: Writes will be done in place when possible.
+.sp
+Snapshots and reflink will still cause writes to be COW.
+.sp
+This flag implicitly disables data checksumming, compression and
+encryption.
+.It Fl -nocow_enabled
+Enable nocow mode: enables runtime locking in data
+move path needed if nocow will ever be in use
+.It Fl -no_data_io
+Skip submit_bio() for data reads and writes,
+for performance testing purposes
+.El
+.El
+.Sh Mount commands
+.Bl -tag -width Ds
+.It Nm Ic mount Oo Ar options Oc Ar device mountpoint
+Mount a filesystem. The
+.Ar device
+can be a device, a colon-separated list of devices, or UUID=<UUID>. The
+.Ar mountpoint
+is the path where the filesystem should be mounted. If not set, then the filesystem won't actually be mounted
+but all steps preceding mounting the filesystem (e.g. asking for passphrase) will still be performed.
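+.Pp
+For example (device names are illustrative):
+.Dl bcachefs mount /dev/sda1:/dev/sdb1 /mnt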
+.Pp
+The options are as follows:
+.Bl -tag -width Ds
+.It Fl o Ar options
+Mount options provided as a comma-separated list. See user guide for complete list.
+.Bl -tag -width Ds -compact
+.It Cm degraded
+Allow mounting with data degraded
+.It Cm verbose
+Extra debugging info during mount/recovery
+.It Cm fsck
+Run fsck during mount
+.It Cm fix_errors
+Fix errors without asking during fsck
+.It Cm read_only
+Mount in read only mode
+.It Cm version_upgrade
+.El
+.It Fl k , Fl -key-location Ns = Ns ( Cm fail | wait | ask )
+Where the password should be loaded from (default:
+.Cm ask ) .
+.Bl -tag -width Ds -compact
+.It Cm fail
+don't ask for password, fail if filesystem is encrypted.
+.It Cm wait
+wait for password to become available before mounting.
+.It Cm ask
+prompt the user for password.
+.El
+.It Fl c , Fl -colorize Ns = Ns ( Cm true | false )
+Force color on/off. Default: auto-detect TTY
+.It Fl v
+Be verbose. Can be specified more than once.
+.El
.El
.Sh Repair commands
.Bl -tag -width Ds
@@ -181,27 +425,20 @@ Don't repair, only check for errors
Assume "yes" to all questions
.It Fl f
Force checking even if filesystem is marked clean
+.It Fl r , Fl -ratelimit_errors
+Don't display more than 10 errors of a given type
+.It Fl R , Fl -reconstruct_alloc
+Reconstruct the alloc btree
.It Fl v
Be verbose
.El
.El
-.Sh Startup/shutdown, assembly of multi device filesystems
-.Bl -tag -width Ds
-.It Nm Ic assemble Ar devices\ ...
-Assemble an existing multi device filesystem.
-.It Nm Ic incremental Ar device
-Incrementally assemble an existing multi device filesystem.
-.It Nm Ic run
-Start a partially assembled filesystem.
-.It Nm Ic stop Ar filesystem
-Stop a running filesystem.
-.El
.Sh Commands for managing a running filesystem
.Bl -tag -width Ds
.It Nm Ic fs Ic usage Oo Ar options Oc Op Ar filesystem
Show disk usage.
.Bl -tag -width Ds
-.It Fl h
+.It Fl h , Fl -human-readable
Print human readable sizes.
.El
.El
@@ -216,8 +453,8 @@ Size of filesystem on device
Set bucket size
.It Fl -discard
Enable discards
-.It Fl t , Fl -tier Ns = Ns Ar number
-Higher tier (e.g. 1) indicates slower devices
+.It Fl l , Fl -label Ns = Ns Ar label
+Disk label
.It Fl f , Fl -force
Use device even if it appears to already be formatted
.El
@@ -226,7 +463,7 @@ Remove a device from a filesystem
.Bl -tag -width Ds
.It Fl f , Fl -force
Force removal, even if some data couldn't be migrated
-.It Fl -force-metadata
+.It Fl F , Fl -force-metadata
Force removal, even if some metadata couldn't be migrated
.El
.It Nm Ic device Ic online Ar device
@@ -239,24 +476,74 @@ Force, if data redundancy will be degraded
.El
.It Nm Ic device Ic evacuate Ar device
Move data off of a given device
-.It Nm Ic device Ic set-state Oo Ar options Oc Ar device Ar new-state
+.It Nm Ic device Ic set-state Oo Ar options Oc Ar new-state Ar device
.Bl -tag -width Ds
+.It Ar new-state Ns = Ns ( Ar rw | ro | failed | spare )
.It Fl f , Fl -force
Force, if data redundancy will be degraded
+.It Fl -force-if-data-lost
+Force, if data will be lost
+.It Fl o , Fl -offline
+Set state of an offline device
.El
.It Nm Ic device Ic resize Ar device Op Ar size
Resize filesystem on a device
+.It Nm Ic device Ic resize-journal Ar device Op Ar size
+Resize journal on a device
+.El
+.Sh Commands for managing subvolumes and snapshots
+.Bl -tag -width Ds
+.It Ic subvolume create Oo Ar options Oc Ar path
+Create a new subvolume
+.It Ic subvolume delete Oo Ar options Oc Ar path
+Delete an existing subvolume
+.It Ic subvolume snapshot Oo Ar options Oc Ar source dest
+Create a snapshot of
+.Ar source
+at
+.Ar dest .
+If specified,
+.Ar source
+must be a subvolume;
+if not specified the snapshot will be of the subvolume containing
+.Ar dest .
+.Bl -tag -width Ds
+.It Fl r
+Make snapshot read-only
+.El
.El
.Sh Commands for managing filesystem data
.Bl -tag -width Ds
-.It Nm Ic device Ic rereplicate Ar filesystem
+.It Nm Ic data Ic rereplicate Ar filesystem
Walks existing data in a filesystem,
writing additional copies of any degraded data.
+.It Nm Ic data Ic job Ar job filesystem
+Kick off a data job and report progress
+.sp
+.Ar job
+is one of (
+.Cm scrub | rereplicate | migrate | rewrite_old_nodes
+)
+.Bl -tag -width Ds
+.It Fl b Ar btree
+Btree to operate on
+.It Fl s Ar inode Ns Cm \&: Ns Ar offset
+Start position
+.It Fl e Ar inode Ns Cm \&: Ns Ar offset
+End position
+.El
.El
.Sh Commands for encryption
.Bl -tag -width Ds
.It Nm Ic unlock Ar device
Unlock an encrypted filesystem prior to running/mounting.
+.Bl -tag -width Ds
+.It Fl c
+Check if a device is encrypted
+.It Fl k Ns = Ns ( Cm session | user | user_session )
+Keyring to add to (default:
+.Cm user )
+.El
.It Nm Ic set-passphrase Ar devices\ ...
Change passphrase on an existing (unmounted) filesystem.
.It Nm Ic remove-passphrase Ar devices\ ...
@@ -285,6 +572,36 @@ Device to create superblock for
Offset of existing superblock
.El
.El
+.Sh Commands for operating on files in a bcachefs filesystem
+.Bl -tag -width Ds
+.It Nm Ic setattr Oo Ar options Oc Ar devices\ ...
+.Bl -tag -width Ds
+.It Fl -data_replicas Ns = Ns Ar number
+Number of data replicas
+.It Fl -data_checksum Ns = Ns ( Cm none | crc32c | crc64 | xxhash )
+Set data checksum type (default:
+.Cm crc32c ) .
+.It Fl -compression Ns = Ns ( Cm none | lz4 | gzip | zstd )
+Set compression type (default:
+.Cm none ) .
+.It Fl -background_compression Ns = Ns ( Cm none | lz4 | gzip | zstd )
+
+.It Fl -metadata_target Ns = Ns Ar target
+Device or label for metadata writes
+.It Fl -foreground_target Ns = Ns Ar target
+Device or label for foreground writes
+.It Fl -background_target Ns = Ns Ar target
+Device or label to move data to in the background
+.It Fl -promote_target Ns = Ns Ar target
+Device or label to promote data to on read
+.It Fl -erasure_code
+Enable erasure coding (DO NOT USE YET)
+.It Fl -project
+
+.It Fl -nocow
+Nocow mode: Writes will be done in place when possible.
+.El
+.El
.Sh Commands for debugging
These commands work on offline, unmounted filesystems.
.Bl -tag -width Ds
@@ -293,30 +610,61 @@ Dump filesystem metadata
.Bl -tag -width Ds
.It Fl o Ar output
Required flag: Output qcow2 image(s)
-.It Fl f
+.It Fl f , Fl -force
Force; overwrite when needed
+.It Fl -nojournal
+Don't dump entire journal, just dirty entries
.El
.It Nm Ic list Oo Ar options Oc Ar devices\ ...
List filesystem metadata to stdout
.Bl -tag -width Ds
.It Fl b ( Cm extents | inodes | dirents | xattrs )
-Btree to list from
+Btree to list from. (default:
+.Cm extents)
+.It Fl l , Fl -level
+Btree depth to descend to. (
+.Cm 0
+== leaves; default:
+.Cm 0)
.It Fl s Ar inode Ns Cm \&: Ns Ar offset
Start position to list from
.It Fl e Ar inode Ns Cm \&: Ns Ar offset
End position
-.It Fl i Ar inode
-List keys for a given inode number
-.It Fl m ( Cm keys | formats )
+.It Fl m , Fl -mode ( Cm keys | formats | nodes | nodes-ondisk )
+(default:
+.Cm keys)
.It Fl f
-Force fsck
+Check (fsck) the filesystem first
+.It Fl c , Fl -colorize Ns = Ns ( Cm true | false )
+Force color on/off. Default: auto-detect TTY
.It Fl v
Verbose mode
-List mode
.El
+.It Nm Ic list_journal Oo Ar options Oc Ar devices\ ...
+.Bl -tag -width Ds
+.It Fl a
+Read entire journal, not just dirty entries
+.It Fl n , Fl -nr-entries Ns = Ns Ar nr
+Number of journal entries to print, starting from the most recent
+.It Fl t , Fl -transaction-filter Ns = Ns Ar bbpos
+Filter transactions not updating
+.Ar bbpos
+.It Fl k , Fl -key-filter Ns = Ns Ar btree
+Filter keys not updating
+.Ar btree
+.It Fl v , Fl -verbose
+Verbose mode
+.El
+.El
+.Sh FUSE commands
+.Bl -tag -width Ds
+.It Nm Ic fusemount
+Mount a filesystem via FUSE
.El
.Sh Miscellaneous commands
.Bl -tag -width Ds
+.It Nm Ic completions Ar shell
+Generate shell completions
.It Nm Ic version
Display the version of the invoked bcachefs tool
.El
diff --git a/bcachefs.c b/bcachefs.c
deleted file mode 100644
index 910e0b16..00000000
--- a/bcachefs.c
+++ /dev/null
@@ -1,200 +0,0 @@
-/*
- * Authors: Kent Overstreet <kent.overstreet@gmail.com>
- * Gabriel de Perthuis <g2p.code@gmail.com>
- * Jacob Malevich <jam@datera.io>
- *
- * GPLv2
- */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <ctype.h>
-#include <errno.h>
-#include <inttypes.h>
-#include <limits.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <stdbool.h>
-#include <stdint.h>
-#include <string.h>
-#include <sys/ioctl.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include "cmds.h"
-
-static void usage(void)
-{
- puts("bcachefs - tool for managing bcachefs filesystems\n"
- "usage: bcachefs <command> [<args>]\n"
- "\n"
- "Superblock commands:\n"
- " format Format a new filesystem\n"
- " show-super Dump superblock information to stdout\n"
- "\n"
- "Repair:\n"
- " fsck Check an existing filesystem for errors\n"
- "\n"
- "Startup/shutdown, assembly of multi device filesystems:\n"
-#if 0
- " assemble Assemble an existing multi device filesystem\n"
- " incremental Incrementally assemble an existing multi device filesystem\n"
- " run Start a partially assembled filesystem\n"
- " stop Stop a running filesystem\n"
-#endif
- "\n"
- "Commands for managing a running filesystem:\n"
- " fs usage Show disk usage\n"
- "\n"
- "Commands for managing devices within a running filesystem:\n"
- " device add Add a new device to an existing filesystem\n"
- " device remove Remove a device from an existing filesystem\n"
- " device online Re-add an existing member to a filesystem\n"
- " device offline Take a device offline, without removing it\n"
- " device evacuate Migrate data off of a specific device\n"
- " device set-state Mark a device as failed\n"
- " device resize Resize filesystem on a device\n"
- "\n"
- "Commands for managing filesystem data:\n"
- " data rereplicate Rereplicate degraded data\n"
- "\n"
- "Encryption:\n"
- " unlock Unlock an encrypted filesystem prior to running/mounting\n"
- " set-passphrase Change passphrase on an existing (unmounted) filesystem\n"
- " remove-passphrase Remove passphrase on an existing (unmounted) filesystem\n"
- "\n"
- "Migrate:\n"
- " migrate Migrate an existing filesystem to bcachefs, in place\n"
- " migrate-superblock Add default superblock, after bcachefs migrate\n"
- "\n"
- "Debug:\n"
- "These commands work on offline, unmounted filesystems\n"
- " dump Dump filesystem metadata to a qcow2 image\n"
- " list List filesystem metadata in textual form\n"
- "\n"
- "Miscellaneous:\n"
- " version Display the version of the invoked bcachefs tool\n");
-}
-
-static char *full_cmd;
-
-static char *pop_cmd(int *argc, char *argv[])
-{
- if (*argc < 2) {
- printf("%s: missing command\n", argv[0]);
- usage();
- exit(EXIT_FAILURE);
- }
-
- char *cmd = argv[1];
- memmove(&argv[1], &argv[2], *argc * sizeof(argv[0]));
- (*argc)--;
-
- full_cmd = mprintf("%s %s", full_cmd, cmd);
- return cmd;
-}
-
-static int fs_cmds(int argc, char *argv[])
-{
- char *cmd = pop_cmd(&argc, argv);
-
- if (!strcmp(cmd, "usage"))
- return cmd_fs_usage(argc, argv);
-
- usage();
- return 0;
-}
-
-static int device_cmds(int argc, char *argv[])
-{
- char *cmd = pop_cmd(&argc, argv);
-
- if (!strcmp(cmd, "add"))
- return cmd_device_add(argc, argv);
- if (!strcmp(cmd, "remove"))
- return cmd_device_remove(argc, argv);
- if (!strcmp(cmd, "online"))
- return cmd_device_online(argc, argv);
- if (!strcmp(cmd, "offline"))
- return cmd_device_offline(argc, argv);
- if (!strcmp(cmd, "evacuate"))
- return cmd_device_evacuate(argc, argv);
- if (!strcmp(cmd, "set-state"))
- return cmd_device_set_state(argc, argv);
- if (!strcmp(cmd, "resize"))
- return cmd_device_resize(argc, argv);
-
- usage();
- return 0;
-}
-
-static int data_cmds(int argc, char *argv[])
-{
- char *cmd = pop_cmd(&argc, argv);
-
- if (!strcmp(cmd, "rereplicate"))
- return cmd_data_rereplicate(argc, argv);
-
- usage();
- return 0;
-}
-
-int main(int argc, char *argv[])
-{
- full_cmd = argv[0];
-
- setvbuf(stdout, NULL, _IOLBF, 0);
-
- char *cmd = pop_cmd(&argc, argv);
-
- if (!strcmp(cmd, "version"))
- return cmd_version(argc, argv);
- if (!strcmp(cmd, "format"))
- return cmd_format(argc, argv);
- if (!strcmp(cmd, "show-super"))
- return cmd_show_super(argc, argv);
-
- if (!strcmp(cmd, "fsck"))
- return cmd_fsck(argc, argv);
-
-#if 0
- if (!strcmp(cmd, "assemble"))
- return cmd_assemble(argc, argv);
- if (!strcmp(cmd, "incremental"))
- return cmd_incremental(argc, argv);
- if (!strcmp(cmd, "run"))
- return cmd_run(argc, argv);
- if (!strcmp(cmd, "stop"))
- return cmd_stop(argc, argv);
-#endif
-
- if (!strcmp(cmd, "fs"))
- return fs_cmds(argc, argv);
-
- if (!strcmp(cmd, "device"))
- return device_cmds(argc, argv);
-
- if (!strcmp(cmd, "data"))
- return data_cmds(argc, argv);
-
- if (!strcmp(cmd, "unlock"))
- return cmd_unlock(argc, argv);
- if (!strcmp(cmd, "set-passphrase"))
- return cmd_set_passphrase(argc, argv);
- if (!strcmp(cmd, "remove-passphrase"))
- return cmd_remove_passphrase(argc, argv);
-
- if (!strcmp(cmd, "migrate"))
- return cmd_migrate(argc, argv);
- if (!strcmp(cmd, "migrate-superblock"))
- return cmd_migrate_superblock(argc, argv);
-
- if (!strcmp(cmd, "dump"))
- return cmd_dump(argc, argv);
- if (!strcmp(cmd, "list"))
- return cmd_list(argc, argv);
-
- printf("Unknown command %s\n", cmd);
- usage();
- exit(EXIT_FAILURE);
-}
diff --git a/bch_bindgen/.gitignore b/bch_bindgen/.gitignore
new file mode 100644
index 00000000..0aa133ac
--- /dev/null
+++ b/bch_bindgen/.gitignore
@@ -0,0 +1,15 @@
+# Generated by Cargo
+# will have compiled files and executables
+debug/
+target/
+
+# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
+# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
+# Required By Nix
+# Cargo.lock
+
+# These are backup files generated by rustfmt
+**/*.rs.bk
+
+# MSVC Windows builds of rustc generate these, which store debugging information
+*.pdb
diff --git a/bch_bindgen/Cargo.toml b/bch_bindgen/Cargo.toml
new file mode 100644
index 00000000..2a819bb5
--- /dev/null
+++ b/bch_bindgen/Cargo.toml
@@ -0,0 +1,21 @@
+[package]
+name = "bch_bindgen"
+version = "0.1.0"
+authors = [ "Kayla Firestack <dev@kaylafire.me>", "Yuxuan Shui <yshuiv7@gmail.com>", "Kent Overstreet <kent.overstreet@linux.dev>" ]
+edition = "2021"
+rust-version = "1.77"
+
+[lib]
+crate-type = ["lib"]
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+anyhow = "1.0"
+uuid = "1.2.2"
+bitfield = "0.14.0"
+bitflags = "1.3.2"
+paste = "1.0.11"
+
+[build-dependencies]
+pkg-config = "0.3"
+bindgen = "0.69.4"
diff --git a/bch_bindgen/build.rs b/bch_bindgen/build.rs
new file mode 100644
index 00000000..4a3755a1
--- /dev/null
+++ b/bch_bindgen/build.rs
@@ -0,0 +1,162 @@
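+// ParseCallbacks hook that strips the "Fix753_" prefix from constants re-exported
+// by the wrapper header; this is the common workaround for rust-bindgen issue #753
+// (macro constants that bindgen cannot expand directly).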
+#[derive(Debug)]
+pub struct Fix753 {}
+impl bindgen::callbacks::ParseCallbacks for Fix753 {
+ fn item_name(&self, original_item_name: &str) -> Option<String> {
+ Some(original_item_name.trim_start_matches("Fix753_").to_owned())
+ }
+}
+
+fn main() {
+ use std::path::PathBuf;
+
+ println!("cargo:rerun-if-changed=src/libbcachefs_wrapper.h");
+
+ let out_dir: PathBuf = std::env::var_os("OUT_DIR")
+ .expect("ENV Var 'OUT_DIR' Expected")
+ .into();
+ let top_dir: PathBuf = std::env::var_os("CARGO_MANIFEST_DIR")
+ .expect("ENV Var 'CARGO_MANIFEST_DIR' Expected")
+ .into();
+
+ let urcu = pkg_config::probe_library("liburcu").expect("Failed to find urcu lib");
+ let bindings = bindgen::builder()
+ .formatter(bindgen::Formatter::Prettyplease)
+ .header(
+ top_dir
+ .join("src")
+ .join("libbcachefs_wrapper.h")
+ .display()
+ .to_string(),
+ )
+ .clang_args(
+ urcu.include_paths
+ .iter()
+ .map(|p| format!("-I{}", p.display())),
+ )
+ .clang_arg("-I..")
+ .clang_arg("-I../c_src")
+ .clang_arg("-I../include")
+ .clang_arg("-DZSTD_STATIC_LINKING_ONLY")
+ .clang_arg("-DNO_BCACHEFS_FS")
+ .clang_arg("-D_GNU_SOURCE")
+ .clang_arg("-DRUST_BINDGEN")
+ .clang_arg("-fkeep-inline-functions")
+ .derive_debug(true)
+ .derive_default(true)
+ .layout_tests(true)
+ .default_enum_style(bindgen::EnumVariation::Rust {
+ non_exhaustive: true,
+ })
+ .allowlist_function("bcachefs_usage")
+ .allowlist_function("raid_init")
+ .allowlist_function("cmd_.*")
+ .allowlist_function(".*_cmds")
+ .allowlist_function(".*bch2_.*")
+ .allowlist_function("bcache_fs_open")
+ .allowlist_function("bcache_fs_close")
+ .allowlist_function("bio_.*")
+ .allowlist_function("derive_passphrase")
+ .allowlist_function("request_key")
+ .allowlist_function("add_key")
+ .allowlist_function("keyctl_search")
+ .allowlist_function("match_string")
+ .allowlist_function("printbuf.*")
+ .blocklist_type("rhash_lock_head")
+ .blocklist_type("srcu_struct")
+ .blocklist_type("bch_ioctl_data.*")
+ .allowlist_var("BCH_.*")
+ .allowlist_var("KEY_SPEC_.*")
+ .allowlist_var("Fix753_.*")
+ .allowlist_var("bch.*")
+ .allowlist_var("__bch2.*")
+ .allowlist_var("__BTREE_ITER.*")
+ .allowlist_var("BTREE_ITER.*")
+ .blocklist_item("bch2_bkey_ops")
+ .allowlist_type("bch_.*")
+ .allowlist_type("fsck_err_opts")
+ .rustified_enum("fsck_err_opts")
+ .allowlist_type("nonce")
+ .no_debug("bch_replicas_padded")
+ .newtype_enum("bch_kdf_types")
+ .rustified_enum("bch_key_types")
+ .opaque_type("gendisk")
+ .opaque_type("gc_stripe")
+ .opaque_type("open_bucket.*")
+ .opaque_type("replicas_delta_list")
+ .no_copy("btree_trans")
+ .no_copy("printbuf")
+ .no_partialeq("bkey")
+ .no_partialeq("bpos")
+ .generate_inline_functions(true)
+ .parse_callbacks(Box::new(Fix753 {}))
+ .generate()
+ .expect("BindGen Generation Failiure: [libbcachefs_wrapper]");
+
+ std::fs::write(
+ out_dir.join("bcachefs.rs"),
+ packed_and_align_fix(bindings.to_string()),
+ )
+ .expect("Writing to output file failed for: `bcachefs.rs`");
+
+ let keyutils = pkg_config::probe_library("libkeyutils").expect("Failed to find keyutils lib");
+ let bindings = bindgen::builder()
+ .header(
+ top_dir
+ .join("src")
+ .join("keyutils_wrapper.h")
+ .display()
+ .to_string(),
+ )
+ .clang_args(
+ keyutils
+ .include_paths
+ .iter()
+ .map(|p| format!("-I{}", p.display())),
+ )
+ .generate()
+ .expect("BindGen Generation Failiure: [Keyutils]");
+ bindings
+ .write_to_file(out_dir.join("keyutils.rs"))
+ .expect("Writing to output file failed for: `keyutils.rs`");
+}
+
+// rustc has a limitation where it does not allow structs with a "packed" attribute to contain a
+// member with an "align(N)" attribute. There are a few types in bcachefs with this problem. We can
+// "fix" these types by stripping off "packed" from the outer type, or "align(N)" from the inner
+// type. For all of the affected types, stripping "packed" from the outer type happens to preserve
+// the same layout in Rust as in C.
+//
+// Some types are only affected on architectures where the natural alignment of u64
+// is 4 instead of 8, for example i686 or ppc64: struct bch_csum and struct bch_sb_layout have
+// "align(8)" added on such architectures. These types are included by several "packed" types:
+// - bch_extent_crc128
+// - jset
+// - btree_node_entry
+// - bch_sb
+//
+// TODO: find a way to conditionally include arch-specific modifications when compiling for that
+// target arch. Regular conditional compilation won't work here since build scripts are always
+// compiled for the host arch, not the target arch, so that won't work when cross-compiling.
+fn packed_and_align_fix(bindings: std::string::String) -> std::string::String {
+ bindings
+ .replace(
+ "#[repr(C, packed(8))]\npub struct btree_node {",
+ "#[repr(C, align(8))]\npub struct btree_node {",
+ )
+ .replace(
+ "#[repr(C, packed(8))]\n#[derive(Debug, Default, Copy, Clone)]\npub struct bch_extent_crc128 {",
+ "#[repr(C, align(8))]\n#[derive(Debug, Default, Copy, Clone)]\npub struct bch_extent_crc128 {",
+ )
+ .replace(
+ "#[repr(C, packed(8))]\npub struct jset {",
+ "#[repr(C, align(8))]\npub struct jset {",
+ )
+ .replace(
+ "#[repr(C, packed(8))]\npub struct btree_node_entry {",
+ "#[repr(C, align(8))]\npub struct btree_node_entry {",
+ )
+ .replace(
+ "#[repr(C, packed(8))]\npub struct bch_sb {",
+ "#[repr(C, align(8))]\npub struct bch_sb {",
+ )
+}
diff --git a/bch_bindgen/src/bcachefs.rs b/bch_bindgen/src/bcachefs.rs
new file mode 100644
index 00000000..8f6d41a4
--- /dev/null
+++ b/bch_bindgen/src/bcachefs.rs
@@ -0,0 +1,90 @@
+#![allow(non_upper_case_globals)]
+#![allow(non_camel_case_types)]
+#![allow(non_snake_case)]
+#![allow(unused)]
+
+use crate::c;
+
+include!(concat!(env!("OUT_DIR"), "/bcachefs.rs"));
+
+use bitfield::bitfield;
+bitfield! {
+ pub struct bch_scrypt_flags(u64);
+ pub N, _: 15, 0;
+ pub R, _: 31, 16;
+ pub P, _: 47, 32;
+}
+bitfield! {
+ pub struct bch_crypt_flags(u64);
+ pub TYPE, _: 4, 0;
+}
+use std::mem::offset_of;
+impl bch_sb_field_crypt {
+ pub fn scrypt_flags(&self) -> Option<bch_scrypt_flags> {
+ use std::convert::TryInto;
+ match bch_kdf_types(bch_crypt_flags(self.flags).TYPE().try_into().ok()?) {
+ bch_kdf_types::BCH_KDF_SCRYPT => Some(bch_scrypt_flags(self.kdf_flags)),
+ _ => None,
+ }
+ }
+ pub fn key(&self) -> &bch_encrypted_key {
+ &self.key
+ }
+}
+impl PartialEq for bch_sb {
+ fn eq(&self, other: &Self) -> bool {
+ self.magic.b == other.magic.b
+ && self.user_uuid.b == other.user_uuid.b
+ && self.block_size == other.block_size
+ && self.version == other.version
+ && self.uuid.b == other.uuid.b
+ && self.seq == other.seq
+ }
+}
+
+impl bch_sb {
+ pub fn crypt(&self) -> Option<&bch_sb_field_crypt> {
+ unsafe {
+ let ptr = bch2_sb_field_get_id(
+ self as *const _ as *mut _,
+ bch_sb_field_type::BCH_SB_FIELD_crypt,
+ ) as *const u8;
+ if ptr.is_null() {
+ None
+ } else {
+ let offset = offset_of!(bch_sb_field_crypt, field);
+ Some(&*((ptr.sub(offset)) as *const _))
+ }
+ }
+ }
+ pub fn uuid(&self) -> uuid::Uuid {
+ uuid::Uuid::from_bytes(self.user_uuid.b)
+ }
+
+ pub fn number_of_devices(&self) -> u32 {
+ unsafe { c::bch2_sb_nr_devices(self) }
+ }
+
+ /// Get the nonce used to encrypt the superblock
+ pub fn nonce(&self) -> nonce {
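+        // Derived from the first eight bytes of the superblock UUID, interpreted
+        // as two little-endian u32 words; the first two words of the nonce stay zero.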
+ let [a, b, c, d, e, f, g, h, _rest @ ..] = self.uuid.b;
+ let dword1 = u32::from_le_bytes([a, b, c, d]);
+ let dword2 = u32::from_le_bytes([e, f, g, h]);
+ nonce {
+ d: [0, 0, dword1, dword2],
+ }
+ }
+}
+impl bch_sb_handle {
+ pub fn sb(&self) -> &bch_sb {
+ unsafe { &*self.sb }
+ }
+
+ pub fn bdev(&self) -> &block_device {
+ unsafe { &*self.bdev }
+ }
+}
+
+// #[repr(u8)]
+pub enum rhash_lock_head {}
+pub enum srcu_struct {}
diff --git a/bch_bindgen/src/bkey.rs b/bch_bindgen/src/bkey.rs
new file mode 100644
index 00000000..91d515b7
--- /dev/null
+++ b/bch_bindgen/src/bkey.rs
@@ -0,0 +1,138 @@
+#![allow(non_camel_case_types)]
+
+use crate::btree::BtreeIter;
+use crate::c;
+use crate::fs::Fs;
+use crate::printbuf_to_formatter;
+use std::fmt;
+use std::marker::PhantomData;
+use std::mem::transmute;
+
+pub struct BkeySC<'a> {
+ pub k: &'a c::bkey,
+ pub v: &'a c::bch_val,
+ pub(crate) iter: PhantomData<&'a mut BtreeIter<'a>>,
+}
+
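+/// Borrowed, typed view of a bkey value: most variants wrap a reference to the
+/// corresponding C value struct, while valueless key types (deleted, whiteout,
+/// error, set) are unit variants.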
+pub enum BkeyValC<'a> {
+ deleted,
+ whiteout,
+ error,
+ cookie(&'a c::bch_cookie),
+ hash_whiteout(&'a c::bch_hash_whiteout),
+ btree_ptr(&'a c::bch_btree_ptr),
+ extent(&'a c::bch_extent),
+ reservation(&'a c::bch_reservation),
+ inode(&'a c::bch_inode),
+ inode_generation(&'a c::bch_inode_generation),
+ dirent(&'a c::bch_dirent),
+ xattr(&'a c::bch_xattr),
+ alloc(&'a c::bch_alloc),
+ quota(&'a c::bch_quota),
+ stripe(&'a c::bch_stripe),
+ reflink_p(&'a c::bch_reflink_p),
+ reflink_v(&'a c::bch_reflink_v),
+ inline_data(&'a c::bch_inline_data),
+ btree_ptr_v2(&'a c::bch_btree_ptr_v2),
+ indirect_inline_data(&'a c::bch_indirect_inline_data),
+ alloc_v2(&'a c::bch_alloc_v2),
+ subvolume(&'a c::bch_subvolume),
+ snapshot(&'a c::bch_snapshot),
+ inode_v2(&'a c::bch_inode_v2),
+ alloc_v3(&'a c::bch_alloc_v3),
+ set,
+ lru(&'a c::bch_lru),
+ alloc_v4(&'a c::bch_alloc_v4),
+ backpointer(&'a c::bch_backpointer),
+ inode_v3(&'a c::bch_inode_v3),
+ bucket_gens(&'a c::bch_bucket_gens),
+ snapshot_tree(&'a c::bch_snapshot_tree),
+ logged_op_truncate(&'a c::bch_logged_op_truncate),
+ logged_op_finsert(&'a c::bch_logged_op_finsert),
+ accounting(&'a c::bch_accounting),
+}
+
+impl<'a, 'b> BkeySC<'a> {
+ unsafe fn to_raw(&self) -> c::bkey_s_c {
+ c::bkey_s_c {
+ k: self.k,
+ v: self.v,
+ }
+ }
+
+ pub fn to_text(&'a self, fs: &'b Fs) -> BkeySCToText<'a, 'b> {
+ BkeySCToText { k: self, fs }
+ }
+
+ pub fn v(&'a self) -> BkeyValC<'a> {
+ unsafe {
+ let ty: c::bch_bkey_type = transmute(self.k.type_ as u32);
+
+ use c::bch_bkey_type::*;
+ use BkeyValC::*;
+ match ty {
+ KEY_TYPE_deleted => deleted,
+ KEY_TYPE_whiteout => whiteout,
+ KEY_TYPE_error => error,
+ KEY_TYPE_cookie => cookie(transmute(self.v)),
+ KEY_TYPE_hash_whiteout => hash_whiteout(transmute(self.v)),
+ KEY_TYPE_btree_ptr => btree_ptr(transmute(self.v)),
+ KEY_TYPE_extent => extent(transmute(self.v)),
+ KEY_TYPE_reservation => reservation(transmute(self.v)),
+ KEY_TYPE_inode => inode(transmute(self.v)),
+ KEY_TYPE_inode_generation => inode_generation(transmute(self.v)),
+ KEY_TYPE_dirent => dirent(transmute(self.v)),
+ KEY_TYPE_xattr => xattr(transmute(self.v)),
+ KEY_TYPE_alloc => alloc(transmute(self.v)),
+ KEY_TYPE_quota => quota(transmute(self.v)),
+ KEY_TYPE_stripe => stripe(transmute(self.v)),
+ KEY_TYPE_reflink_p => reflink_p(transmute(self.v)),
+ KEY_TYPE_reflink_v => reflink_v(transmute(self.v)),
+ KEY_TYPE_inline_data => inline_data(transmute(self.v)),
+ KEY_TYPE_btree_ptr_v2 => btree_ptr_v2(transmute(self.v)),
+ KEY_TYPE_indirect_inline_data => indirect_inline_data(transmute(self.v)),
+ KEY_TYPE_alloc_v2 => alloc_v2(transmute(self.v)),
+ KEY_TYPE_subvolume => subvolume(transmute(self.v)),
+ KEY_TYPE_snapshot => snapshot(transmute(self.v)),
+ KEY_TYPE_inode_v2 => inode_v2(transmute(self.v)),
+                KEY_TYPE_alloc_v3 => alloc_v3(transmute(self.v)),
+ KEY_TYPE_set => set,
+ KEY_TYPE_lru => lru(transmute(self.v)),
+ KEY_TYPE_alloc_v4 => alloc_v4(transmute(self.v)),
+ KEY_TYPE_backpointer => backpointer(transmute(self.v)),
+ KEY_TYPE_inode_v3 => inode_v3(transmute(self.v)),
+ KEY_TYPE_bucket_gens => bucket_gens(transmute(self.v)),
+ KEY_TYPE_snapshot_tree => snapshot_tree(transmute(self.v)),
+ KEY_TYPE_logged_op_truncate => logged_op_truncate(transmute(self.v)),
+ KEY_TYPE_logged_op_finsert => logged_op_finsert(transmute(self.v)),
+ KEY_TYPE_accounting => accounting(transmute(self.v)),
+ KEY_TYPE_MAX => unreachable!(),
+ }
+ }
+ }
+}
+
+impl<'a> From<&'a c::bkey_i> for BkeySC<'a> {
+ fn from(k: &'a c::bkey_i) -> Self {
+ BkeySC {
+ k: &k.k,
+ v: &k.v,
+ iter: PhantomData,
+ }
+ }
+}
+
+pub struct BkeySCToText<'a, 'b> {
+ k: &'a BkeySC<'a>,
+ fs: &'b Fs,
+}
+
+impl<'a, 'b> fmt::Display for BkeySCToText<'a, 'b> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ unsafe {
+ printbuf_to_formatter(f, |buf| {
+ c::bch2_bkey_val_to_text(buf, self.fs.raw, self.k.to_raw())
+ })
+ }
+ }
+}
diff --git a/bch_bindgen/src/btree.rs b/bch_bindgen/src/btree.rs
new file mode 100644
index 00000000..1aaf3018
--- /dev/null
+++ b/bch_bindgen/src/btree.rs
@@ -0,0 +1,237 @@
+use crate::bkey::BkeySC;
+use crate::c;
+use crate::errcode::{bch_errcode, errptr_to_result_c};
+use crate::fs::Fs;
+use crate::printbuf_to_formatter;
+use crate::SPOS_MAX;
+use bitflags::bitflags;
+use std::fmt;
+use std::marker::PhantomData;
+use std::mem::MaybeUninit;
+
+pub struct BtreeTrans<'f> {
+ raw: *mut c::btree_trans,
+ fs: PhantomData<&'f Fs>,
+}
+
+impl<'f> BtreeTrans<'f> {
+ pub fn new(fs: &'f Fs) -> BtreeTrans<'f> {
+ unsafe {
+ BtreeTrans {
+ raw: &mut *c::__bch2_trans_get(fs.raw, 0),
+ fs: PhantomData,
+ }
+ }
+ }
+}
+
+impl<'f> Drop for BtreeTrans<'f> {
+ fn drop(&mut self) {
+ unsafe { c::bch2_trans_put(&mut *self.raw) }
+ }
+}
+
+bitflags! {
+ pub struct BtreeIterFlags: u16 {
+ const SLOTS = c::btree_iter_update_trigger_flags::BTREE_ITER_slots as u16;
+ const INTENT = c::btree_iter_update_trigger_flags::BTREE_ITER_intent as u16;
+ const PREFETCH = c::btree_iter_update_trigger_flags::BTREE_ITER_prefetch as u16;
+ const IS_EXTENTS = c::btree_iter_update_trigger_flags::BTREE_ITER_is_extents as u16;
+ const NOT_EXTENTS = c::btree_iter_update_trigger_flags::BTREE_ITER_not_extents as u16;
+ const CACHED = c::btree_iter_update_trigger_flags::BTREE_ITER_cached as u16;
+ const KEY_CACHED = c::btree_iter_update_trigger_flags::BTREE_ITER_with_key_cache as u16;
+ const WITH_UPDATES = c::btree_iter_update_trigger_flags::BTREE_ITER_with_updates as u16;
+ const WITH_JOURNAL = c::btree_iter_update_trigger_flags::BTREE_ITER_with_journal as u16;
+ const SNAPSHOT_FIELD = c::btree_iter_update_trigger_flags::BTREE_ITER_snapshot_field as u16;
+ const ALL_SNAPSHOTS = c::btree_iter_update_trigger_flags::BTREE_ITER_all_snapshots as u16;
+ const FILTER_SNAPSHOTS = c::btree_iter_update_trigger_flags::BTREE_ITER_filter_snapshots as u16;
+ const NOPRESERVE = c::btree_iter_update_trigger_flags::BTREE_ITER_nopreserve as u16;
+ const CACHED_NOFILL = c::btree_iter_update_trigger_flags::BTREE_ITER_cached_nofill as u16;
+ const KEY_CACHE_FILL = c::btree_iter_update_trigger_flags::BTREE_ITER_key_cache_fill as u16;
+ }
+}
+
+pub struct BtreeIter<'t> {
+ raw: c::btree_iter,
+ trans: PhantomData<&'t BtreeTrans<'t>>,
+}
+
+impl<'t> BtreeIter<'t> {
+ pub fn new(
+ trans: &'t BtreeTrans<'t>,
+ btree: c::btree_id,
+ pos: c::bpos,
+ flags: BtreeIterFlags,
+ ) -> BtreeIter<'t> {
+ unsafe {
+ let mut iter: MaybeUninit<c::btree_iter> = MaybeUninit::uninit();
+
+ c::bch2_trans_iter_init_outlined(
+ trans.raw,
+ iter.as_mut_ptr(),
+ btree,
+ pos,
+ flags.bits as u32,
+ );
+
+ BtreeIter {
+ raw: iter.assume_init(),
+ trans: PhantomData,
+ }
+ }
+ }
+
+ pub fn peek_max<'i>(&'i mut self, end: c::bpos) -> Result<Option<BkeySC<'i>>, bch_errcode> {
+ unsafe {
+ let k = c::bch2_btree_iter_peek_max(&mut self.raw, end);
+ errptr_to_result_c(k.k).map(|_| {
+ if !k.k.is_null() {
+ Some(BkeySC {
+ k: &*k.k,
+ v: &*k.v,
+ iter: PhantomData,
+ })
+ } else {
+ None
+ }
+ })
+ }
+ }
+
+ pub fn peek(&mut self) -> Result<Option<BkeySC>, bch_errcode> {
+ self.peek_max(SPOS_MAX)
+ }
+
+ pub fn peek_and_restart(&mut self) -> Result<Option<BkeySC>, bch_errcode> {
+ unsafe {
+ let k = c::bch2_btree_iter_peek_and_restart_outlined(&mut self.raw);
+
+ errptr_to_result_c(k.k).map(|_| {
+ if !k.k.is_null() {
+ Some(BkeySC {
+ k: &*k.k,
+ v: &*k.v,
+ iter: PhantomData,
+ })
+ } else {
+ None
+ }
+ })
+ }
+ }
+
+ pub fn advance(&mut self) {
+ unsafe {
+ c::bch2_btree_iter_advance(&mut self.raw);
+ }
+ }
+}
+
+impl<'t> Drop for BtreeIter<'t> {
+ fn drop(&mut self) {
+ unsafe { c::bch2_trans_iter_exit(self.raw.trans, &mut self.raw) }
+ }
+}
+
+pub struct BtreeNodeIter<'t> {
+ raw: c::btree_iter,
+ trans: PhantomData<&'t BtreeTrans<'t>>,
+}
+
+impl<'t> BtreeNodeIter<'t> {
+ pub fn new(
+ trans: &'t BtreeTrans<'t>,
+ btree: c::btree_id,
+ pos: c::bpos,
+ locks_want: u32,
+ depth: u32,
+ flags: BtreeIterFlags,
+ ) -> BtreeNodeIter<'t> {
+ unsafe {
+ let mut iter: MaybeUninit<c::btree_iter> = MaybeUninit::uninit();
+ c::bch2_trans_node_iter_init(
+ trans.raw,
+ iter.as_mut_ptr(),
+ btree,
+ pos,
+ locks_want,
+ depth,
+ flags.bits as u32,
+ );
+
+ BtreeNodeIter {
+ raw: iter.assume_init(),
+ trans: PhantomData,
+ }
+ }
+ }
+
+ pub fn peek<'i>(&'i mut self) -> Result<Option<&'i c::btree>, bch_errcode> {
+ unsafe {
+ let b = c::bch2_btree_iter_peek_node(&mut self.raw);
+ errptr_to_result_c(b).map(|b| if !b.is_null() { Some(&*b) } else { None })
+ }
+ }
+
+ pub fn peek_and_restart<'i>(&'i mut self) -> Result<Option<&'i c::btree>, bch_errcode> {
+ unsafe {
+ let b = c::bch2_btree_iter_peek_node_and_restart(&mut self.raw);
+ errptr_to_result_c(b).map(|b| if !b.is_null() { Some(&*b) } else { None })
+ }
+ }
+
+ pub fn advance<'i>(&'i mut self) {
+ unsafe {
+ c::bch2_btree_iter_next_node(&mut self.raw);
+ }
+ }
+
+ pub fn next<'i>(&'i mut self) -> Result<Option<&'i c::btree>, bch_errcode> {
+ unsafe {
+ let b = c::bch2_btree_iter_next_node(&mut self.raw);
+ errptr_to_result_c(b).map(|b| if !b.is_null() { Some(&*b) } else { None })
+ }
+ }
+}
+
+impl<'t> Drop for BtreeNodeIter<'t> {
+ fn drop(&mut self) {
+ unsafe { c::bch2_trans_iter_exit(self.raw.trans, &mut self.raw) }
+ }
+}
+
+impl<'b, 'f> c::btree {
+ pub fn to_text(&'b self, fs: &'f Fs) -> BtreeNodeToText<'b, 'f> {
+ BtreeNodeToText { b: &self, fs }
+ }
+
+ pub fn ondisk_to_text(&'b self, fs: &'f Fs) -> BtreeNodeOndiskToText<'b, 'f> {
+ BtreeNodeOndiskToText { b: &self, fs }
+ }
+}
+
+pub struct BtreeNodeToText<'b, 'f> {
+ b: &'b c::btree,
+ fs: &'f Fs,
+}
+
+impl<'b, 'f> fmt::Display for BtreeNodeToText<'b, 'f> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ printbuf_to_formatter(f, |buf| unsafe {
+ c::bch2_btree_node_to_text(buf, self.fs.raw, self.b)
+ })
+ }
+}
+
+pub struct BtreeNodeOndiskToText<'b, 'f> {
+ b: &'b c::btree,
+ fs: &'f Fs,
+}
+
+impl<'b, 'f> fmt::Display for BtreeNodeOndiskToText<'b, 'f> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ printbuf_to_formatter(f, |buf| unsafe {
+ c::bch2_btree_node_ondisk_to_text(buf, self.fs.raw, self.b)
+ })
+ }
+}
diff --git a/bch_bindgen/src/errcode.rs b/bch_bindgen/src/errcode.rs
new file mode 100644
index 00000000..4d75f1d2
--- /dev/null
+++ b/bch_bindgen/src/errcode.rs
@@ -0,0 +1,40 @@
+use crate::bcachefs;
+use std::ffi::CStr;
+use std::fmt;
+
+pub use crate::c::bch_errcode;
+
+impl fmt::Display for bch_errcode {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let s = unsafe { CStr::from_ptr(bcachefs::bch2_err_str(*self as i32)) };
+ write!(f, "{:?}", s)
+ }
+}
+
+/* Can we make a function generic over ptr constness? */
+
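+// These follow the kernel's ERR_PTR()/PTR_ERR() convention: pointer values in the
+// last 4095 bytes of the address space encode a negative error code rather than a
+// valid pointer.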
+pub fn errptr_to_result<T>(p: *mut T) -> Result<*mut T, bch_errcode> {
+ let addr = p as usize;
+ let max_err: isize = -4096;
+ if addr > max_err as usize {
+ let addr = addr as i32;
+ let err: bch_errcode = unsafe { std::mem::transmute(-addr) };
+ Err(err)
+ } else {
+ Ok(p)
+ }
+}
+
+pub fn errptr_to_result_c<T>(p: *const T) -> Result<*const T, bch_errcode> {
+ let addr = p as usize;
+ let max_err: isize = -4096;
+ if addr > max_err as usize {
+ let addr = addr as i32;
+ let err: bch_errcode = unsafe { std::mem::transmute(-addr) };
+ Err(err)
+ } else {
+ Ok(p)
+ }
+}
+
+impl std::error::Error for bch_errcode {}
diff --git a/bch_bindgen/src/fs.rs b/bch_bindgen/src/fs.rs
new file mode 100644
index 00000000..e44fca25
--- /dev/null
+++ b/bch_bindgen/src/fs.rs
@@ -0,0 +1,28 @@
+use crate::c;
+use crate::errcode::{bch_errcode, errptr_to_result};
+use std::ffi::CString;
+use std::os::unix::ffi::OsStrExt;
+use std::path::PathBuf;
+
+pub struct Fs {
+ pub raw: *mut c::bch_fs,
+}
+
+impl Fs {
+ pub fn open(devs: &Vec<PathBuf>, opts: c::bch_opts) -> Result<Fs, bch_errcode> {
+ let devs: Vec<_> = devs
+ .iter()
+ .map(|i| CString::new(i.as_os_str().as_bytes()).unwrap().into_raw())
+ .collect();
+
+ let ret = unsafe { c::bch2_fs_open(devs[..].as_ptr(), devs.len() as u32, opts) };
+
+ errptr_to_result(ret).map(|fs| Fs { raw: fs })
+ }
+}
+
+impl Drop for Fs {
+ fn drop(&mut self) {
+ unsafe { c::bch2_fs_stop(self.raw) }
+ }
+}
diff --git a/bch_bindgen/src/keyutils.rs b/bch_bindgen/src/keyutils.rs
new file mode 100644
index 00000000..30fc56f9
--- /dev/null
+++ b/bch_bindgen/src/keyutils.rs
@@ -0,0 +1,6 @@
+#![allow(non_upper_case_globals)]
+#![allow(non_camel_case_types)]
+#![allow(non_snake_case)]
+#![allow(unused)]
+
+include!(concat!(env!("OUT_DIR"), "/keyutils.rs"));
diff --git a/bch_bindgen/src/keyutils_wrapper.h b/bch_bindgen/src/keyutils_wrapper.h
new file mode 100644
index 00000000..857cee2e
--- /dev/null
+++ b/bch_bindgen/src/keyutils_wrapper.h
@@ -0,0 +1 @@
+#include <keyutils.h>
diff --git a/bch_bindgen/src/lib.rs b/bch_bindgen/src/lib.rs
new file mode 100644
index 00000000..b68f2d9b
--- /dev/null
+++ b/bch_bindgen/src/lib.rs
@@ -0,0 +1,215 @@
+pub mod bcachefs;
+pub mod bkey;
+pub mod btree;
+pub mod errcode;
+pub mod fs;
+pub mod keyutils;
+pub mod opts;
+pub mod sb_io;
+pub use paste::paste;
+
+pub mod c {
+ pub use crate::bcachefs::*;
+}
+
+use c::bpos as Bpos;
+
+pub const fn spos(inode: u64, offset: u64, snapshot: u32) -> Bpos {
+ Bpos {
+ inode,
+ offset,
+ snapshot,
+ }
+}
+
+pub const fn pos(inode: u64, offset: u64) -> Bpos {
+ spos(inode, offset, 0)
+}
+
+pub const POS_MIN: Bpos = spos(0, 0, 0);
+pub const POS_MAX: Bpos = spos(u64::MAX, u64::MAX, 0);
+pub const SPOS_MAX: Bpos = spos(u64::MAX, u64::MAX, u32::MAX);
+
+use std::cmp::Ordering;
+
+impl PartialEq for Bpos {
+ fn eq(&self, other: &Self) -> bool {
+ self.cmp(other) == Ordering::Equal
+ }
+}
+
+impl Eq for Bpos {}
+
+impl PartialOrd for Bpos {
+ fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+ Some(self.cmp(other))
+ }
+}
+
+impl Ord for Bpos {
+ fn cmp(&self, other: &Self) -> Ordering {
+ let l_inode = self.inode;
+ let r_inode = other.inode;
+ let l_offset = self.offset;
+ let r_offset = other.offset;
+ let l_snapshot = self.snapshot;
+ let r_snapshot = other.snapshot;
+
+ l_inode
+ .cmp(&r_inode)
+ .then(l_offset.cmp(&r_offset))
+ .then(l_snapshot.cmp(&r_snapshot))
+ }
+}
+
+use std::ffi::CStr;
+use std::fmt;
+
+impl fmt::Display for c::btree_id {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let s = unsafe { CStr::from_ptr(c::bch2_btree_id_str(*self)) };
+ let s = s.to_str().unwrap();
+ write!(f, "{}", s)
+ }
+}
+
+use std::ffi::CString;
+use std::str::FromStr;
+use std::{os::unix::ffi::OsStrExt, path::Path};
+
+pub fn path_to_cstr<P: AsRef<Path>>(p: P) -> CString {
+ CString::new(p.as_ref().as_os_str().as_bytes()).unwrap()
+}
+
+use std::error::Error;
+
+#[derive(Debug)]
+pub enum BchToolsErr {
+ InvalidBtreeId,
+ InvalidBkeyType,
+ InvalidBpos,
+}
+
+impl fmt::Display for BchToolsErr {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match self {
+ BchToolsErr::InvalidBtreeId => write!(f, "invalid btree id"),
+ BchToolsErr::InvalidBkeyType => write!(f, "invalid bkey type"),
+ BchToolsErr::InvalidBpos => write!(f, "invalid bpos"),
+ }
+ }
+}
+
+impl Error for BchToolsErr {}
+
+impl FromStr for c::btree_id {
+ type Err = BchToolsErr;
+
+ fn from_str(s: &str) -> Result<Self, Self::Err> {
+ let s = CString::new(s).unwrap();
+ let p = s.as_ptr();
+
+ let v = unsafe {
+ c::match_string(
+ c::__bch2_btree_ids[..].as_ptr(),
+ (-(1 as isize)) as usize,
+ p,
+ )
+ };
+ if v >= 0 {
+ Ok(unsafe { std::mem::transmute(v) })
+ } else {
+ Err(BchToolsErr::InvalidBtreeId)
+ }
+ }
+}
+
+impl FromStr for c::bch_bkey_type {
+ type Err = BchToolsErr;
+
+ fn from_str(s: &str) -> Result<Self, Self::Err> {
+ let s = CString::new(s).unwrap();
+ let p = s.as_ptr();
+
+ let v = unsafe {
+ c::match_string(c::bch2_bkey_types[..].as_ptr(), (-(1 as isize)) as usize, p)
+ };
+ if v >= 0 {
+ Ok(unsafe { std::mem::transmute(v) })
+ } else {
+ Err(BchToolsErr::InvalidBkeyType)
+ }
+ }
+}
+
+impl c::printbuf {
+ fn new() -> c::printbuf {
+ let mut buf: c::printbuf = Default::default();
+
+ buf.set_heap_allocated(true);
+ buf
+ }
+}
+
+impl Drop for c::printbuf {
+ fn drop(&mut self) {
+ unsafe { c::bch2_printbuf_exit(self) }
+ }
+}
+
+impl fmt::Display for Bpos {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let mut buf = c::printbuf::new();
+
+ unsafe { c::bch2_bpos_to_text(&mut buf, *self) };
+
+ let s = unsafe { CStr::from_ptr(buf.buf) };
+ let s = s.to_str().unwrap();
+ write!(f, "{}", s)
+ }
+}
+
+impl FromStr for c::bpos {
+ type Err = BchToolsErr;
+
+ fn from_str(s: &str) -> Result<Self, Self::Err> {
+ if s == "POS_MIN" {
+ return Ok(POS_MIN);
+ }
+
+ if s == "POS_MAX" {
+ return Ok(POS_MAX);
+ }
+
+ if s == "SPOS_MAX" {
+ return Ok(SPOS_MAX);
+ }
+
+ let mut fields = s.split(':');
+ let ino_str = fields.next().ok_or(BchToolsErr::InvalidBpos)?;
+ let off_str = fields.next().ok_or(BchToolsErr::InvalidBpos)?;
+ let snp_str = fields.next();
+
+ let ino: u64 = ino_str.parse().map_err(|_| BchToolsErr::InvalidBpos)?;
+ let off: u64 = off_str.parse().map_err(|_| BchToolsErr::InvalidBpos)?;
+ let snp: u32 = snp_str.map(|s| s.parse().ok()).flatten().unwrap_or(0);
+
+ Ok(c::bpos {
+ inode: ino,
+ offset: off,
+ snapshot: snp,
+ })
+ }
+}
+
+pub fn printbuf_to_formatter<F>(f: &mut fmt::Formatter<'_>, func: F) -> fmt::Result
+where
+ F: Fn(*mut c::printbuf),
+{
+ let mut buf = c::printbuf::new();
+
+ func(&mut buf);
+
+ let s = unsafe { CStr::from_ptr(buf.buf) };
+ f.write_str(&s.to_string_lossy())
+}
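A standalone sketch exercising the Bpos helpers above (illustration only, not part of the patch); ordering compares inode, then offset, then snapshot:

use bch_bindgen::c::bpos;
use bch_bindgen::{pos, spos, POS_MAX};
use std::str::FromStr;

fn main() {
    let a = bpos::from_str("4096:0").unwrap();        // snapshot defaults to 0
    let b = spos(4096, 8, u32::MAX);

    assert_eq!(a, pos(4096, 0));
    assert!(a < b);                                   // same inode, smaller offset
    assert_eq!(bpos::from_str("POS_MAX").unwrap(), POS_MAX);
}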
diff --git a/bch_bindgen/src/libbcachefs_wrapper.h b/bch_bindgen/src/libbcachefs_wrapper.h
new file mode 100644
index 00000000..128592c3
--- /dev/null
+++ b/bch_bindgen/src/libbcachefs_wrapper.h
@@ -0,0 +1,37 @@
+#include "libbcachefs/super-io.h"
+#include "libbcachefs/checksum.h"
+#include "libbcachefs/bcachefs_format.h"
+#include "libbcachefs/btree_cache.h"
+#include "libbcachefs/btree_iter.h"
+#include "libbcachefs/debug.h"
+#include "libbcachefs/errcode.h"
+#include "libbcachefs/error.h"
+#include "libbcachefs/opts.h"
+#include "libbcachefs.h"
+#include "crypto.h"
+#include "include/linux/bio.h"
+#include "include/linux/blkdev.h"
+#include "cmds.h"
+#include "raid/raid.h"
+
+/* Fix753 is a workaround for https://github.com/rust-lang/rust-bindgen/issues/753:
+ * function-like macros are not expanded by bindgen, so e.g. ioctl numbers are
+ * silently dropped from the generated bindings.
+ *
+ * To avoid this, use `MARK_FIX_753` to force the synthesis of your macro constant.
+ * It will appear in Rust under its proper name, not Fix753_{name}.
+ */
+
+/* MARK_FIX_753: force generation of a macro constant in Rust
+ *
+ * @type_name - the type of the constant
+ * @req_name - the name the constant will have on the Rust side
+ */
+#define MARK_FIX_753(type_name, req_name) const type_name Fix753_##req_name = req_name;
+
+MARK_FIX_753(blk_mode_t, BLK_OPEN_READ);
+MARK_FIX_753(blk_mode_t, BLK_OPEN_WRITE);
+MARK_FIX_753(blk_mode_t, BLK_OPEN_EXCL);
+
+MARK_FIX_753(__u32, BCH_IOCTL_SUBVOLUME_CREATE);
+MARK_FIX_753(__u32, BCH_IOCTL_SUBVOLUME_DESTROY);
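On the Rust side of the build, the `Fix753_` prefix is stripped back off with a bindgen `ParseCallbacks` hook; a minimal sketch of that hook follows (the `Fix753` struct name and the builder wiring are assumptions, not part of this header):

#[derive(Debug)]
struct Fix753;

impl bindgen::callbacks::ParseCallbacks for Fix753 {
    // Rename "Fix753_FOO" back to plain "FOO" in the generated bindings.
    fn item_name(&self, name: &str) -> Option<String> {
        name.strip_prefix("Fix753_").map(str::to_owned)
    }
}

// Wiring it up when generating the bindings, e.g.:
// bindgen::Builder::default()
//     .header("src/libbcachefs_wrapper.h")
//     .parse_callbacks(Box::new(Fix753))
//     .generate()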
diff --git a/bch_bindgen/src/opts.rs b/bch_bindgen/src/opts.rs
new file mode 100644
index 00000000..d38d469c
--- /dev/null
+++ b/bch_bindgen/src/opts.rs
@@ -0,0 +1,35 @@
+#[macro_export]
+macro_rules! opt_set {
+ ($opts:ident, $n:ident, $v:expr) => {
+ bch_bindgen::paste! {
+ $opts.$n = $v;
+ $opts.[<set_ $n _defined>](1)
+ }
+ };
+}
+
+#[macro_export]
+macro_rules! opt_defined {
+ ($opts:ident, $n:ident) => {
+ bch_bindgen::paste! {
+ $opts.[< $n _defined>]()
+ }
+ };
+}
+
+#[macro_export]
+macro_rules! opt_get {
+ ($opts:ident, $n:ident) => {
+ if bch_bindgen::opt_defined!($opts, $n) == 0 {
+ bch_bindgen::paste! {
+ unsafe {
+ bch_bindgen::bcachefs::bch2_opts_default.$n
+ }
+ }
+ } else {
+ bch_bindgen::paste! {
+ $opts.$n
+ }
+ }
+ };
+}
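A sketch of how a consumer crate uses these macros; `nochanges` is assumed to be one of the generated `bch_opts` fields, and `bch_opts` is assumed to derive `Default`:

use bch_bindgen::{opt_defined, opt_get, opt_set};

fn main() {
    let mut opts: bch_bindgen::c::bch_opts = Default::default();

    // Not set yet: opt_get! falls back to bch2_opts_default.
    assert_eq!(opt_defined!(opts, nochanges), 0);

    opt_set!(opts, nochanges, 1);
    assert_eq!(opt_defined!(opts, nochanges), 1);
    assert_eq!(opt_get!(opts, nochanges), 1);
}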
diff --git a/bch_bindgen/src/sb_io.rs b/bch_bindgen/src/sb_io.rs
new file mode 100644
index 00000000..46e17673
--- /dev/null
+++ b/bch_bindgen/src/sb_io.rs
@@ -0,0 +1,49 @@
+use crate::bcachefs;
+use crate::bcachefs::*;
+use crate::errcode::bch_errcode;
+use crate::path_to_cstr;
+use anyhow::anyhow;
+
+pub use crate::bcachefs::bch2_free_super;
+
+pub fn read_super_opts(
+ path: &std::path::Path,
+ mut opts: bch_opts,
+) -> anyhow::Result<bch_sb_handle> {
+ let path = path_to_cstr(path);
+ let mut sb = std::mem::MaybeUninit::zeroed();
+
+ let ret =
+ unsafe { crate::bcachefs::bch2_read_super(path.as_ptr(), &mut opts, sb.as_mut_ptr()) };
+
+ if ret != 0 {
+ let err: bch_errcode = unsafe { ::std::mem::transmute(ret) };
+ Err(anyhow!(err))
+ } else {
+ Ok(unsafe { sb.assume_init() })
+ }
+}
+
+pub fn read_super(path: &std::path::Path) -> anyhow::Result<bch_sb_handle> {
+ let opts = bcachefs::bch_opts::default();
+ read_super_opts(path, opts)
+}
+
+pub fn read_super_silent(
+ path: &std::path::Path,
+ mut opts: bch_opts,
+) -> anyhow::Result<bch_sb_handle> {
+ let path = path_to_cstr(path);
+ let mut sb = std::mem::MaybeUninit::zeroed();
+
+ let ret = unsafe {
+ crate::bcachefs::bch2_read_super_silent(path.as_ptr(), &mut opts, sb.as_mut_ptr())
+ };
+
+ if ret != 0 {
+ let err: bch_errcode = unsafe { ::std::mem::transmute(ret) };
+ Err(anyhow!(err))
+ } else {
+ Ok(unsafe { sb.assume_init() })
+ }
+}
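Combining the option macros with these helpers, a hypothetical read-only superblock probe (the device path and the `nochanges` option name are illustrative only):

use bch_bindgen::{opt_set, sb_io};
use std::path::Path;

fn probe_super(dev: &str) -> anyhow::Result<()> {
    let mut opts: bch_bindgen::c::bch_opts = Default::default();
    opt_set!(opts, nochanges, 1);                    // don't write anything back

    let mut sb = sb_io::read_super_opts(Path::new(dev), opts)?;
    println!("found a bcachefs superblock on {}", dev);
    // sb.sb is the raw *mut bch_sb; inspect fields here as needed.
    unsafe { sb_io::bch2_free_super(&mut sb) };
    Ok(())
}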
diff --git a/build.rs b/build.rs
new file mode 100644
index 00000000..25f4f5be
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,21 @@
+fn main() {
+ println!("cargo:rustc-link-search=.");
+ println!("cargo:rerun-if-changed=libbcachefs.a");
+ println!("cargo:rustc-link-lib=static:+whole-archive=bcachefs");
+
+ println!("cargo:rustc-link-lib=urcu");
+ println!("cargo:rustc-link-lib=zstd");
+ println!("cargo:rustc-link-lib=blkid");
+ println!("cargo:rustc-link-lib=uuid");
+ println!("cargo:rustc-link-lib=sodium");
+ println!("cargo:rustc-link-lib=z");
+ println!("cargo:rustc-link-lib=lz4");
+ println!("cargo:rustc-link-lib=zstd");
+ println!("cargo:rustc-link-lib=udev");
+ println!("cargo:rustc-link-lib=keyutils");
+ println!("cargo:rustc-link-lib=aio");
+
+ if std::env::var("BCACHEFS_FUSE").is_ok() {
+ println!("cargo:rustc-link-lib=fuse3");
+ }
+}
diff --git a/c_src/bcachefs.c b/c_src/bcachefs.c
new file mode 100644
index 00000000..77bf6215
--- /dev/null
+++ b/c_src/bcachefs.c
@@ -0,0 +1,164 @@
+/*
+ * Authors: Kent Overstreet <kent.overstreet@gmail.com>
+ * Gabriel de Perthuis <g2p.code@gmail.com>
+ * Jacob Malevich <jam@datera.io>
+ *
+ * GPLv2
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <raid/raid.h>
+
+#include "cmds.h"
+
+void bcachefs_usage(void)
+{
+ puts("bcachefs - tool for managing bcachefs filesystems\n"
+ "usage: bcachefs <command> [<args>]\n"
+ "\n"
+ "Superblock commands:\n"
+ " format Format a new filesystem\n"
+ " show-super Dump superblock information to stdout\n"
+ " set-fs-option Set a filesystem option\n"
+ " reset-counters Reset all counters on an unmounted device\n"
+ "\n"
+ "Mount:\n"
+ " mount Mount a filesystem\n"
+ "\n"
+ "Repair:\n"
+ " fsck Check an existing filesystem for errors\n"
+ "\n"
+#if 0
+ "Startup/shutdown, assembly of multi device filesystems:\n"
+ " assemble Assemble an existing multi device filesystem\n"
+ " incremental Incrementally assemble an existing multi device filesystem\n"
+ " run Start a partially assembled filesystem\n"
+ " stop Stop a running filesystem\n"
+ "\n"
+#endif
+ "Commands for managing a running filesystem:\n"
+ " fs usage Show disk usage\n"
+ "\n"
+ "Commands for managing devices within a running filesystem:\n"
+ " device add Add a new device to an existing filesystem\n"
+ " device remove Remove a device from an existing filesystem\n"
+ " device online Re-add an existing member to a filesystem\n"
+ " device offline Take a device offline, without removing it\n"
+ " device evacuate Migrate data off of a specific device\n"
+ " device set-state Mark a device as failed\n"
+ " device resize Resize filesystem on a device\n"
+ " device resize-journal Resize journal on a device\n"
+ "\n"
+ "Commands for managing subvolumes and snapshots:\n"
+ " subvolume create Create a new subvolume\n"
+ " subvolume delete Delete an existing subvolume\n"
+ " subvolume snapshot Create a snapshot\n"
+ "\n"
+ "Commands for managing filesystem data:\n"
+ " data rereplicate Rereplicate degraded data\n"
+ " data job Kick off low level data jobs\n"
+ "\n"
+ "Encryption:\n"
+ " unlock Unlock an encrypted filesystem prior to running/mounting\n"
+ " set-passphrase Change passphrase on an existing (unmounted) filesystem\n"
+ " remove-passphrase Remove passphrase on an existing (unmounted) filesystem\n"
+ "\n"
+ "Migrate:\n"
+ " migrate Migrate an existing filesystem to bcachefs, in place\n"
+ " migrate-superblock Add default superblock, after bcachefs migrate\n"
+ "\n"
+ "Commands for operating on files in a bcachefs filesystem:\n"
+ " set-file-option Set various attributes on files or directories\n"
+ "\n"
+ "Debug:\n"
+ "These commands work on offline, unmounted filesystems\n"
+ " dump Dump filesystem metadata to a qcow2 image\n"
+ " list List filesystem metadata in textual form\n"
+ " list_journal List contents of journal\n"
+ "\n"
+ "FUSE:\n"
+ " fusemount Mount a filesystem via FUSE\n"
+ "\n"
+ "Miscellaneous:\n"
+ " completions Generate shell completions\n"
+ " version Display the version of the invoked bcachefs tool\n");
+}
+
+static char *pop_cmd(int *argc, char *argv[])
+{
+ char *cmd = argv[1];
+ if (!(*argc < 2))
+ memmove(&argv[1], &argv[2], (*argc - 2) * sizeof(argv[0]));
+ (*argc)--;
+ argv[*argc] = NULL;
+
+ return cmd;
+}
+
+int fs_cmds(int argc, char *argv[])
+{
+ char *cmd = pop_cmd(&argc, argv);
+
+ if (argc < 1) {
+ bcachefs_usage();
+ exit(EXIT_FAILURE);
+ }
+ if (!strcmp(cmd, "usage"))
+ return cmd_fs_usage(argc, argv);
+
+ return 0;
+}
+
+int device_cmds(int argc, char *argv[])
+{
+ char *cmd = pop_cmd(&argc, argv);
+
+ if (argc < 1)
+ return device_usage();
+ if (!strcmp(cmd, "add"))
+ return cmd_device_add(argc, argv);
+ if (!strcmp(cmd, "remove"))
+ return cmd_device_remove(argc, argv);
+ if (!strcmp(cmd, "online"))
+ return cmd_device_online(argc, argv);
+ if (!strcmp(cmd, "offline"))
+ return cmd_device_offline(argc, argv);
+ if (!strcmp(cmd, "evacuate"))
+ return cmd_device_evacuate(argc, argv);
+ if (!strcmp(cmd, "set-state"))
+ return cmd_device_set_state(argc, argv);
+ if (!strcmp(cmd, "resize"))
+ return cmd_device_resize(argc, argv);
+ if (!strcmp(cmd, "resize-journal"))
+ return cmd_device_resize_journal(argc, argv);
+
+ return 0;
+}
+
+int data_cmds(int argc, char *argv[])
+{
+ char *cmd = pop_cmd(&argc, argv);
+
+ if (argc < 1)
+ return data_usage();
+ if (!strcmp(cmd, "rereplicate"))
+ return cmd_data_rereplicate(argc, argv);
+ if (!strcmp(cmd, "job"))
+ return cmd_data_job(argc, argv);
+
+ return 0;
+}
diff --git a/cmd_assemble.c b/c_src/cmd_assemble.c
index a997e1e1..a997e1e1 100644
--- a/cmd_assemble.c
+++ b/c_src/cmd_assemble.c
diff --git a/c_src/cmd_attr.c b/c_src/cmd_attr.c
new file mode 100644
index 00000000..1da41265
--- /dev/null
+++ b/c_src/cmd_attr.c
@@ -0,0 +1,119 @@
+#include <dirent.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/xattr.h>
+#include <unistd.h>
+
+#include "libbcachefs/bcachefs_ioctl.h"
+
+#include "cmds.h"
+#include "libbcachefs.h"
+
+static void propagate_recurse(int dirfd)
+{
+ DIR *dir = fdopendir(dirfd);
+ struct dirent *d;
+
+ if (!dir) {
+ fprintf(stderr, "fdopendir() error: %m\n");
+ return;
+ }
+
+ while ((errno = 0), (d = readdir(dir))) {
+ if (!strcmp(d->d_name, ".") ||
+ !strcmp(d->d_name, ".."))
+ continue;
+
+ int ret = ioctl(dirfd, BCHFS_IOC_REINHERIT_ATTRS,
+ d->d_name);
+ if (ret < 0) {
+ fprintf(stderr, "error propagating attributes to %s: %m\n",
+ d->d_name);
+ continue;
+ }
+
+ if (!ret) /* did no work */
+ continue;
+
+ struct stat st = xfstatat(dirfd, d->d_name,
+ AT_SYMLINK_NOFOLLOW);
+ if (!S_ISDIR(st.st_mode))
+ continue;
+
+ int fd = openat(dirfd, d->d_name, O_RDONLY);
+ if (fd < 0) {
+ fprintf(stderr, "error opening %s: %m\n", d->d_name);
+ continue;
+ }
+ propagate_recurse(fd);
+ close(fd);
+ }
+
+ if (errno)
+ die("readdir error: %m");
+}
+
+static void do_setattr(char *path, struct bch_opt_strs opts)
+{
+ unsigned i;
+
+ for (i = 0; i < bch2_opts_nr; i++) {
+ if (!opts.by_id[i])
+ continue;
+
+ char *n = mprintf("bcachefs.%s", bch2_opt_table[i].attr.name);
+
+ if (setxattr(path, n, opts.by_id[i], strlen(opts.by_id[i]), 0))
+ die("setxattr error: %m");
+
+ free(n);
+ }
+
+ struct stat st = xstat(path);
+ if (!S_ISDIR(st.st_mode))
+ return;
+
+ int dirfd = open(path, O_RDONLY);
+ if (dirfd < 0)
+ die("error opening %s: %m", path);
+
+ propagate_recurse(dirfd);
+ close(dirfd);
+}
+
+static void setattr_usage(void)
+{
+ puts("bcachefs set-file-option - set attributes on files in a bcachefs filesystem\n"
+ "Usage: bcachefs set-file-option [OPTIONS]... <files>\n"
+ "\n"
+ "Options:");
+
+ bch2_opts_usage(OPT_INODE);
+ puts(" -h Display this help and exit\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+int cmd_setattr(int argc, char *argv[])
+{
+ struct bch_opt_strs opts =
+ bch2_cmdline_opts_get(&argc, argv, OPT_INODE);
+ unsigned i;
+
+ for (i = 1; i < argc; i++)
+ if (argv[i][0] == '-') {
+ printf("invalid option %s\n", argv[i]);
+ setattr_usage();
+ exit(EXIT_FAILURE);
+ }
+
+ if (argc <= 1)
+ die("Please supply one or more files");
+
+ for (i = 1; i < argc; i++)
+ do_setattr(argv[i], opts);
+ bch2_opt_strs_free(&opts);
+
+ return 0;
+}
diff --git a/c_src/cmd_counters.c b/c_src/cmd_counters.c
new file mode 100644
index 00000000..9adde242
--- /dev/null
+++ b/c_src/cmd_counters.c
@@ -0,0 +1,51 @@
+#include <getopt.h>
+
+#include "cmds.h"
+#include "libbcachefs.h"
+#include "libbcachefs/super-io.h"
+
+static void reset_counters_usage(void)
+{
+ puts("bcachefs reset-counters \n"
+ "Usage: bcachefs reset-counters device\n"
+ "\n"
+ "Options:\n"
+ " -h, --help display this help and exit\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+ exit(EXIT_SUCCESS);
+}
+
+int cmd_reset_counters(int argc, char *argv[])
+{
+ static const struct option longopts[] = {
+ { "help", 0, NULL, 'h' },
+ { NULL }
+ };
+ int opt;
+
+ while ((opt = getopt_long(argc, argv, "h", longopts, NULL)) != -1)
+ switch (opt) {
+ case 'h':
+ reset_counters_usage();
+ break;
+ }
+ args_shift(optind);
+
+ char *dev = arg_pop();
+ if (!dev)
+ die("please supply a device");
+ if (argc)
+ die("too many arguments");
+
+ struct bch_opts opts = bch2_opts_empty();
+ struct bch_sb_handle sb;
+ int ret = bch2_read_super(dev, &opts, &sb);
+ if (ret)
+ die("Error opening %s: %s", dev, bch2_err_str(ret));
+
+ bch2_sb_field_resize(&sb, counters, 0);
+
+ bch2_super_write(sb.bdev->bd_fd, sb.sb);
+ bch2_free_super(&sb);
+ return 0;
+}
diff --git a/c_src/cmd_data.c b/c_src/cmd_data.c
new file mode 100644
index 00000000..1ef689bc
--- /dev/null
+++ b/c_src/cmd_data.c
@@ -0,0 +1,127 @@
+
+
+#include <stdio.h>
+#include <sys/ioctl.h>
+
+#include "libbcachefs/bcachefs_ioctl.h"
+#include "libbcachefs/btree_cache.h"
+#include "libbcachefs/move.h"
+
+#include "cmds.h"
+#include "libbcachefs.h"
+
+int data_usage(void)
+{
+ puts("bcachefs data - manage filesystem data\n"
+ "Usage: bcachefs data <CMD> [OPTIONS]\n"
+ "\n"
+ "Commands:\n"
+ " rereplicate Rereplicate degraded data\n"
+ " job Kick off low level data jobs\n"
+ "\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+ return 0;
+}
+
+static void data_rereplicate_usage(void)
+{
+ puts("bcachefs data rereplicate\n"
+ "Usage: bcachefs data rereplicate filesystem\n"
+ "\n"
+ "Walks existing data in a filesystem, writing additional copies\n"
+ "of any degraded data\n"
+ "\n"
+ "Options:\n"
+ " -h, --help display this help and exit\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+ exit(EXIT_SUCCESS);
+}
+
+int cmd_data_rereplicate(int argc, char *argv[])
+{
+ int opt;
+
+ while ((opt = getopt(argc, argv, "h")) != -1)
+ switch (opt) {
+ case 'h':
+ data_rereplicate_usage();
+ }
+ args_shift(optind);
+
+ char *fs_path = arg_pop();
+ if (!fs_path)
+ die("Please supply a filesystem");
+
+ if (argc)
+ die("too many arguments");
+
+ return bchu_data(bcache_fs_open(fs_path), (struct bch_ioctl_data) {
+ .op = BCH_DATA_OP_rereplicate,
+ .start_btree = 0,
+ .start_pos = POS_MIN,
+ .end_btree = BTREE_ID_NR,
+ .end_pos = POS_MAX,
+ });
+}
+
+static void data_job_usage(void)
+{
+ puts("bcachefs data job\n"
+	     "Usage: bcachefs data job [job] filesystem\n"
+ "\n"
+ "Kick off a data job and report progress\n"
+ "\n"
+ "job: one of scrub, rereplicate, migrate, rewrite_old_nodes, or drop_extra_replicas\n"
+ "\n"
+ "Options:\n"
+ " -b btree btree to operate on\n"
+ " -s inode:offset start position\n"
+ " -e inode:offset end position\n"
+ " -h, --help display this help and exit\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+ exit(EXIT_SUCCESS);
+}
+
+int cmd_data_job(int argc, char *argv[])
+{
+ struct bch_ioctl_data op = {
+ .start_btree = 0,
+ .start_pos = POS_MIN,
+ .end_btree = BTREE_ID_NR,
+ .end_pos = POS_MAX,
+ };
+ int opt;
+
+	while ((opt = getopt(argc, argv, "b:s:e:h")) != -1)
+		switch (opt) {
+		case 'b':
+			op.start_btree = read_string_list_or_die(optarg,
+						__bch2_btree_ids, "btree id");
+			op.end_btree = op.start_btree;
+			break;
+		case 's':
+			op.start_pos	= bpos_parse(optarg);
+			break;
+		case 'e':
+			op.end_pos	= bpos_parse(optarg);
+			break;
+		case 'h':
+			data_job_usage();
+		}
+ args_shift(optind);
+
+ char *job = arg_pop();
+ if (!job)
+ die("please specify which type of job");
+
+ op.op = read_string_list_or_die(job, bch2_data_ops_strs, "bad job type");
+
+ char *fs_path = arg_pop();
+ if (!fs_path)
+ fs_path = ".";
+
+ if (argc)
+ die("too many arguments");
+
+ return bchu_data(bcache_fs_open(fs_path), op);
+}
diff --git a/c_src/cmd_device.c b/c_src/cmd_device.c
new file mode 100644
index 00000000..c86fb7f1
--- /dev/null
+++ b/c_src/cmd_device.c
@@ -0,0 +1,647 @@
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <libgen.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "libbcachefs/bcachefs.h"
+#include "libbcachefs/bcachefs_ioctl.h"
+#include "libbcachefs/errcode.h"
+#include "libbcachefs/journal.h"
+#include "libbcachefs/sb-members.h"
+#include "libbcachefs/super-io.h"
+#include "cmds.h"
+#include "libbcachefs.h"
+#include "libbcachefs/opts.h"
+#include "tools-util.h"
+
+int device_usage(void)
+{
+ puts("bcachefs device - manage devices within a running filesystem\n"
+ "Usage: bcachefs device <CMD> [OPTION]\n"
+ "\n"
+ "Commands:\n"
+ " add add a new device to an existing filesystem\n"
+ " remove remove a device from an existing filesystem\n"
+ " online re-add an existing member to a filesystem\n"
+ " offline take a device offline, without removing it\n"
+ " evacuate migrate data off a specific device\n"
+ " set-state mark a device as failed\n"
+ " resize resize filesystem on a device\n"
+ " resize-journal resize journal on a device\n"
+ "\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+ return 0;
+}
+
+static void device_add_usage(void)
+{
+ puts("bcachefs device add - add a device to an existing filesystem\n"
+ "Usage: bcachefs device add [OPTION]... filesystem device\n"
+ "\n"
+ "Options:\n"
+ " -S, --fs_size=size Size of filesystem on device\n"
+ " -B, --bucket=size Bucket size\n"
+ " -D, --discard Enable discards\n"
+ " -l, --label=label Disk label\n"
+ " -f, --force Use device even if it appears to already be formatted\n"
+ " -h, --help Display this help and exit\n"
+ "\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+int cmd_device_add(int argc, char *argv[])
+{
+ static const struct option longopts[] = {
+ { "fs_size", required_argument, NULL, 'S' },
+ { "bucket", required_argument, NULL, 'B' },
+ { "discard", no_argument, NULL, 'D' },
+ { "label", required_argument, NULL, 'l' },
+ { "force", no_argument, NULL, 'f' },
+ { "help", no_argument, NULL, 'h' },
+ { NULL }
+ };
+ struct format_opts format_opts = format_opts_default();
+ struct dev_opts dev_opts = dev_opts_default();
+ bool force = false;
+ int opt;
+
+ while ((opt = getopt_long(argc, argv, "S:B:Dl:fh",
+ longopts, NULL)) != -1)
+ switch (opt) {
+ case 'S':
+ if (bch2_strtoull_h(optarg, &dev_opts.size))
+ die("invalid filesystem size");
+ break;
+ case 'B':
+ if (bch2_strtoull_h(optarg, &dev_opts.bucket_size))
+ die("bad bucket_size %s", optarg);
+ break;
+ case 'D':
+ dev_opts.discard = true;
+ break;
+ case 'l':
+ dev_opts.label = strdup(optarg);
+ break;
+ case 'f':
+ force = true;
+ break;
+ case 'h':
+ device_add_usage();
+ exit(EXIT_SUCCESS);
+ }
+ args_shift(optind);
+
+ char *fs_path = arg_pop();
+ if (!fs_path)
+ die("Please supply a filesystem");
+
+ dev_opts.path = arg_pop();
+ if (!dev_opts.path)
+ die("Please supply a device");
+
+ if (argc)
+ die("too many arguments");
+
+ struct bchfs_handle fs = bcache_fs_open(fs_path);
+
+ int ret = open_for_format(&dev_opts, force);
+ if (ret)
+ die("Error opening %s: %s", dev_opts.path, strerror(-ret));
+
+ struct bch_opt_strs fs_opt_strs;
+ memset(&fs_opt_strs, 0, sizeof(fs_opt_strs));
+
+ struct bch_opts fs_opts = bch2_parse_opts(fs_opt_strs);
+
+ opt_set(fs_opts, block_size,
+ read_file_u64(fs.sysfs_fd, "options/block_size"));
+ opt_set(fs_opts, btree_node_size,
+ read_file_u64(fs.sysfs_fd, "options/btree_node_size"));
+
+ struct bch_sb *sb = bch2_format(fs_opt_strs,
+ fs_opts,
+ format_opts,
+ &dev_opts, 1);
+ free(sb);
+ bchu_disk_add(fs, dev_opts.path);
+ return 0;
+}
+
+static void device_remove_usage(void)
+{
+ puts("bcachefs device_remove - remove a device from a filesystem\n"
+ "Usage:\n"
+ " bcachefs device remove <device>|<devid> <path>\n"
+ "\n"
+ "Options:\n"
+ " -f, --force Force removal, even if some data\n"
+ " couldn't be migrated\n"
+ " -F, --force-metadata Force removal, even if some metadata\n"
+ " couldn't be migrated\n"
+ " -h, --help display this help and exit\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+ exit(EXIT_SUCCESS);
+}
+
+int cmd_device_remove(int argc, char *argv[])
+{
+ static const struct option longopts[] = {
+ { "by-id", 0, NULL, 'i' },
+ { "force", 0, NULL, 'f' },
+ { "force-metadata", 0, NULL, 'F' },
+ { "help", 0, NULL, 'h' },
+ { NULL }
+ };
+ struct bchfs_handle fs;
+ bool by_id = false;
+ int opt, flags = BCH_FORCE_IF_DEGRADED, dev_idx;
+
+	while ((opt = getopt_long(argc, argv, "fFh", longopts, NULL)) != -1)
+ switch (opt) {
+ case 'f':
+ flags |= BCH_FORCE_IF_DATA_LOST;
+ break;
+ case 'F':
+ flags |= BCH_FORCE_IF_METADATA_LOST;
+ break;
+ case 'h':
+ device_remove_usage();
+ }
+ args_shift(optind);
+
+ char *dev_str = arg_pop();
+ if (!dev_str)
+ die("Please supply a device");
+
+ char *end;
+ dev_idx = strtoul(dev_str, &end, 10);
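+	/* if the whole argument parsed as a number, treat it as a device index */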
+ if (*dev_str && !*end)
+ by_id = true;
+
+ char *fs_path = arg_pop();
+ if (fs_path) {
+ fs = bcache_fs_open(fs_path);
+
+ if (!by_id) {
+ dev_idx = bchu_dev_path_to_idx(fs, dev_str);
+ if (dev_idx < 0)
+ die("%s does not seem to be a member of %s",
+ dev_str, fs_path);
+ }
+ } else if (!by_id) {
+ fs = bchu_fs_open_by_dev(dev_str, &dev_idx);
+ } else {
+ die("Filesystem path required when specifying device by id");
+ }
+
+ bchu_disk_remove(fs, dev_idx, flags);
+ return 0;
+}
+
+static void device_online_usage(void)
+{
+ puts("bcachefs device online - readd a device to a running filesystem\n"
+ "Usage: bcachefs device online [OPTION]... device\n"
+ "\n"
+ "Options:\n"
+ " -h, --help Display this help and exit\n"
+ "\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+int cmd_device_online(int argc, char *argv[])
+{
+ int opt;
+
+ while ((opt = getopt(argc, argv, "h")) != -1)
+ switch (opt) {
+ case 'h':
+ device_online_usage();
+ exit(EXIT_SUCCESS);
+ }
+ args_shift(optind);
+
+ char *dev = arg_pop();
+ if (!dev)
+ die("Please supply a device");
+
+ if (argc)
+ die("too many arguments");
+
+ int dev_idx;
+ struct bchfs_handle fs = bchu_fs_open_by_dev(dev, &dev_idx);
+ bchu_disk_online(fs, dev);
+ return 0;
+}
+
+static void device_offline_usage(void)
+{
+ puts("bcachefs device offline - take a device offline, without removing it\n"
+ "Usage: bcachefs device offline [OPTION]... device\n"
+ "\n"
+ "Options:\n"
+ " -f, --force Force, if data redundancy will be degraded\n"
+ " -h, --help Display this help and exit\n"
+ "\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+int cmd_device_offline(int argc, char *argv[])
+{
+ static const struct option longopts[] = {
+ { "force", 0, NULL, 'f' },
+ { NULL }
+ };
+ int opt, flags = 0;
+
+ while ((opt = getopt_long(argc, argv, "fh",
+ longopts, NULL)) != -1)
+ switch (opt) {
+ case 'f':
+ flags |= BCH_FORCE_IF_DEGRADED;
+ break;
+ case 'h':
+ device_offline_usage();
+ exit(EXIT_SUCCESS);
+ }
+ args_shift(optind);
+
+ char *dev = arg_pop();
+ if (!dev)
+ die("Please supply a device");
+
+ if (argc)
+ die("too many arguments");
+
+ int dev_idx;
+ struct bchfs_handle fs = bchu_fs_open_by_dev(dev, &dev_idx);
+ bchu_disk_offline(fs, dev_idx, flags);
+ return 0;
+}
+
+static void device_evacuate_usage(void)
+{
+ puts("bcachefs device evacuate - move data off of a given device\n"
+ "Usage: bcachefs device evacuate [OPTION]... device\n"
+ "\n"
+ "Options:\n"
+ " -h, --help Display this help and exit\n"
+ "\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+int cmd_device_evacuate(int argc, char *argv[])
+{
+ int opt;
+
+ while ((opt = getopt(argc, argv, "h")) != -1)
+ switch (opt) {
+ case 'h':
+ device_evacuate_usage();
+ exit(EXIT_SUCCESS);
+ }
+ args_shift(optind);
+
+ char *dev_path = arg_pop();
+ if (!dev_path)
+ die("Please supply a device");
+
+ if (argc)
+ die("too many arguments");
+
+ int dev_idx;
+ struct bchfs_handle fs = bchu_fs_open_by_dev(dev_path, &dev_idx);
+
+ struct bch_ioctl_dev_usage_v2 *u = bchu_dev_usage(fs, dev_idx);
+
+ if (u->state == BCH_MEMBER_STATE_rw) {
+ printf("Setting %s readonly\n", dev_path);
+ bchu_disk_set_state(fs, dev_idx, BCH_MEMBER_STATE_ro, 0);
+ }
+
+ free(u);
+
+ return bchu_data(fs, (struct bch_ioctl_data) {
+ .op = BCH_DATA_OP_migrate,
+ .start_btree = 0,
+ .start_pos = POS_MIN,
+ .end_btree = BTREE_ID_NR,
+ .end_pos = POS_MAX,
+ .migrate.dev = dev_idx,
+ });
+}
+
+static void device_set_state_usage(void)
+{
+ puts("bcachefs device set-state\n"
+ "Usage: bcachefs device set-state <new-state> <device>|<devid> <path>\n"
+ "\n"
+ "<new-state>: one of rw, ro, failed or spare\n"
+ "<path>: path to mounted filesystem, optional unless specifying device by id\n"
+ "\n"
+ "Options:\n"
+ " -f, --force Force, if data redundancy will be degraded\n"
+ " --force-if-data-lost Force, if data will be lost\n"
+ " -o, --offline Set state of an offline device\n"
+ " -h, --help display this help and exit\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+ exit(EXIT_SUCCESS);
+}
+
+int cmd_device_set_state(int argc, char *argv[])
+{
+ static const struct option longopts[] = {
+ { "force", 0, NULL, 'f' },
+ { "force-if-data-lost", 0, NULL, 'F' },
+ { "offline", 0, NULL, 'o' },
+ { "help", 0, NULL, 'h' },
+ { NULL }
+ };
+ struct bchfs_handle fs;
+ bool by_id = false;
+ int opt, flags = 0, dev_idx;
+ bool offline = false;
+
+ while ((opt = getopt_long(argc, argv, "foh", longopts, NULL)) != -1)
+ switch (opt) {
+ case 'f':
+ flags |= BCH_FORCE_IF_DEGRADED;
+ break;
+ case 'F':
+ flags |= BCH_FORCE_IF_DEGRADED;
+ flags |= BCH_FORCE_IF_LOST;
+ break;
+ case 'o':
+ offline = true;
+ break;
+ case 'h':
+ device_set_state_usage();
+ }
+ args_shift(optind);
+
+ char *new_state_str = arg_pop();
+ if (!new_state_str)
+ die("Please supply a device state");
+
+ unsigned new_state = read_string_list_or_die(new_state_str,
+ bch2_member_states, "device state");
+
+ char *dev_str = arg_pop();
+ if (!dev_str)
+ die("Please supply a device");
+
+ char *end;
+ dev_idx = strtoul(dev_str, &end, 10);
+ if (*dev_str && !*end)
+ by_id = true;
+
+ if (offline) {
+ struct bch_opts opts = bch2_opts_empty();
+ struct bch_sb_handle sb = { NULL };
+
+ if (by_id)
+ die("Cannot specify offline device by id");
+
+ int ret = bch2_read_super(dev_str, &opts, &sb);
+ if (ret)
+ die("error opening %s: %s", dev_str, bch2_err_str(ret));
+
+ struct bch_member *m = bch2_members_v2_get_mut(sb.sb, sb.sb->dev_idx);
+
+ SET_BCH_MEMBER_STATE(m, new_state);
+
+ le64_add_cpu(&sb.sb->seq, 1);
+
+ bch2_super_write(sb.bdev->bd_fd, sb.sb);
+ ret = fsync(sb.bdev->bd_fd);
+ if (ret)
+			fprintf(stderr, "error writing superblock: fsync error (%m)\n");
+ bch2_free_super(&sb);
+ return ret;
+ }
+
+ char *fs_path = arg_pop();
+ if (fs_path) {
+ fs = bcache_fs_open(fs_path);
+
+ if (!by_id) {
+ dev_idx = bchu_dev_path_to_idx(fs, dev_str);
+ if (dev_idx < 0)
+ die("%s does not seem to be a member of %s",
+ dev_str, fs_path);
+ }
+ } else if (!by_id) {
+ fs = bchu_fs_open_by_dev(dev_str, &dev_idx);
+ } else {
+ die("Filesystem path required when specifying device by id");
+ }
+
+ bchu_disk_set_state(fs, dev_idx, new_state, flags);
+
+ return 0;
+}
+
+static void device_resize_usage(void)
+{
+ puts("bcachefs device resize \n"
+ "Usage: bcachefs device resize device [ size ]\n"
+ "\n"
+ "Options:\n"
+ " -h, --help display this help and exit\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+ exit(EXIT_SUCCESS);
+}
+
+int cmd_device_resize(int argc, char *argv[])
+{
+ static const struct option longopts[] = {
+ { "help", 0, NULL, 'h' },
+ { NULL }
+ };
+ u64 size;
+ int opt;
+
+ while ((opt = getopt_long(argc, argv, "h", longopts, NULL)) != -1)
+ switch (opt) {
+ case 'h':
+ device_resize_usage();
+ }
+ args_shift(optind);
+
+ char *dev = arg_pop();
+ if (!dev)
+ die("Please supply a device to resize");
+
+ int dev_fd = xopen(dev, O_RDONLY);
+
+ char *size_arg = arg_pop();
+ if (!size_arg)
+ size = get_size(dev_fd);
+ else if (bch2_strtoull_h(size_arg, &size))
+ die("invalid size");
+
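+	/* convert the requested size from bytes to 512-byte sectors */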
+ size >>= 9;
+
+ if (argc)
+ die("Too many arguments");
+
+ struct stat dev_stat = xfstat(dev_fd);
+
+ struct mntent *mount = dev_to_mount(dev);
+ if (mount) {
+ if (!S_ISBLK(dev_stat.st_mode))
+ die("%s is mounted but isn't a block device?!", dev);
+
+ printf("Doing online resize of %s\n", dev);
+
+ struct bchfs_handle fs = bcache_fs_open(mount->mnt_dir);
+
+ unsigned idx = bchu_disk_get_idx(fs, dev_stat.st_rdev);
+
+ struct bch_sb *sb = bchu_read_super(fs, -1);
+ if (idx >= sb->nr_devices)
+ die("error reading superblock: dev idx >= sb->nr_devices");
+
+ struct bch_member m = bch2_sb_member_get(sb, idx);
+
+ u64 nbuckets = size / le16_to_cpu(m.bucket_size);
+
+ if (nbuckets < le64_to_cpu(m.nbuckets))
+ die("Shrinking not supported yet");
+
+ printf("resizing %s to %llu buckets\n", dev, nbuckets);
+ bchu_disk_resize(fs, idx, nbuckets);
+ } else {
+ printf("Doing offline resize of %s\n", dev);
+
+ struct bch_fs *c = bch2_fs_open(&dev, 1, bch2_opts_empty());
+ if (IS_ERR(c))
+ die("error opening %s: %s", dev, bch2_err_str(PTR_ERR(c)));
+
+ struct bch_dev *resize = NULL;
+
+ for_each_online_member(c, ca) {
+ if (resize)
+ die("confused: more than one online device?");
+ resize = ca;
+ percpu_ref_get(&resize->io_ref);
+ }
+
+ u64 nbuckets = size / le16_to_cpu(resize->mi.bucket_size);
+
+ if (nbuckets < le64_to_cpu(resize->mi.nbuckets))
+ die("Shrinking not supported yet");
+
+ printf("resizing %s to %llu buckets\n", dev, nbuckets);
+ int ret = bch2_dev_resize(c, resize, nbuckets);
+ if (ret)
+ fprintf(stderr, "resize error: %s\n", bch2_err_str(ret));
+
+ percpu_ref_put(&resize->io_ref);
+ bch2_fs_stop(c);
+ }
+ return 0;
+}
+
+static void device_resize_journal_usage(void)
+{
+ puts("bcachefs device resize-journal \n"
+ "Usage: bcachefs device resize-journal device size\n"
+ "\n"
+ "Options:\n"
+ " -h, --help display this help and exit\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+ exit(EXIT_SUCCESS);
+}
+
+int cmd_device_resize_journal(int argc, char *argv[])
+{
+ static const struct option longopts[] = {
+ { "help", 0, NULL, 'h' },
+ { NULL }
+ };
+ u64 size;
+ int opt;
+
+ while ((opt = getopt_long(argc, argv, "h", longopts, NULL)) != -1)
+ switch (opt) {
+ case 'h':
+ device_resize_journal_usage();
+ }
+ args_shift(optind);
+
+ char *dev = arg_pop();
+ if (!dev)
+ die("Please supply a device");
+
+ int dev_fd = xopen(dev, O_RDONLY);
+
+ char *size_arg = arg_pop();
+ if (!size_arg)
+ die("Please supply a journal size");
+ else if (bch2_strtoull_h(size_arg, &size))
+ die("invalid size");
+
+ size >>= 9;
+
+ if (argc)
+ die("Too many arguments");
+
+ struct stat dev_stat = xfstat(dev_fd);
+
+ struct mntent *mount = dev_to_mount(dev);
+ if (mount) {
+ if (!S_ISBLK(dev_stat.st_mode))
+ die("%s is mounted but isn't a block device?!", dev);
+
+ struct bchfs_handle fs = bcache_fs_open(mount->mnt_dir);
+
+ unsigned idx = bchu_disk_get_idx(fs, dev_stat.st_rdev);
+
+ struct bch_sb *sb = bchu_read_super(fs, -1);
+ if (idx >= sb->nr_devices)
+ die("error reading superblock: dev idx >= sb->nr_devices");
+
+ struct bch_member m = bch2_sb_member_get(sb, idx);
+
+ u64 nbuckets = size / le16_to_cpu(m.bucket_size);
+
+ printf("resizing journal on %s to %llu buckets\n", dev, nbuckets);
+ bchu_disk_resize_journal(fs, idx, nbuckets);
+ } else {
+ printf("%s is offline - starting:\n", dev);
+
+ struct bch_fs *c = bch2_fs_open(&dev, 1, bch2_opts_empty());
+ if (IS_ERR(c))
+ die("error opening %s: %s", dev, bch2_err_str(PTR_ERR(c)));
+
+ struct bch_dev *resize = NULL;
+
+ for_each_online_member(c, ca) {
+ if (resize)
+ die("confused: more than one online device?");
+ resize = ca;
+ percpu_ref_get(&resize->io_ref);
+ }
+
+ u64 nbuckets = size / le16_to_cpu(resize->mi.bucket_size);
+
+ printf("resizing journal on %s to %llu buckets\n", dev, nbuckets);
+ int ret = bch2_set_nr_journal_buckets(c, resize, nbuckets);
+ if (ret)
+ fprintf(stderr, "resize error: %s\n", bch2_err_str(ret));
+
+ percpu_ref_put(&resize->io_ref);
+ bch2_fs_stop(c);
+ }
+ return 0;
+}
diff --git a/c_src/cmd_dump.c b/c_src/cmd_dump.c
new file mode 100644
index 00000000..c9e417f2
--- /dev/null
+++ b/c_src/cmd_dump.c
@@ -0,0 +1,182 @@
+#include <fcntl.h>
+#include <getopt.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "cmds.h"
+#include "libbcachefs.h"
+#include "qcow2.h"
+
+#include "libbcachefs/bcachefs.h"
+#include "libbcachefs/btree_cache.h"
+#include "libbcachefs/btree_io.h"
+#include "libbcachefs/btree_iter.h"
+#include "libbcachefs/error.h"
+#include "libbcachefs/extents.h"
+#include "libbcachefs/sb-members.h"
+#include "libbcachefs/super.h"
+
+static void dump_usage(void)
+{
+ puts("bcachefs dump - dump filesystem metadata\n"
+ "Usage: bcachefs dump [OPTION]... <devices>\n"
+ "\n"
+ "Options:\n"
+ " -o output Output qcow2 image(s)\n"
+ " -f, --force Force; overwrite when needed\n"
+ " --nojournal Don't dump entire journal, just dirty entries\n"
+ " -h, --help Display this help and exit\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+static void dump_node(struct bch_fs *c, struct bch_dev *ca, struct bkey_s_c k, ranges *data)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->dev == ca->dev_idx)
+ range_add(data, ptr->offset << 9, c->opts.btree_node_size);
+}
+
+static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd,
+ bool entire_journal)
+{
+ struct bch_sb *sb = ca->disk_sb.sb;
+ ranges data = { 0 };
+ unsigned i;
+ int ret;
+
+ /* Superblock: */
+ range_add(&data, BCH_SB_LAYOUT_SECTOR << 9,
+ sizeof(struct bch_sb_layout));
+
+ for (i = 0; i < sb->layout.nr_superblocks; i++)
+ range_add(&data,
+ le64_to_cpu(sb->layout.sb_offset[i]) << 9,
+ vstruct_bytes(sb));
+
+ /* Journal: */
+ for (i = 0; i < ca->journal.nr; i++)
+ if (entire_journal ||
+ ca->journal.bucket_seq[i] >= c->journal.last_seq_ondisk) {
+ u64 bucket = ca->journal.buckets[i];
+
+ range_add(&data,
+ bucket_bytes(ca) * bucket,
+ bucket_bytes(ca));
+ }
+
+ /* Btree: */
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ struct btree_trans *trans = bch2_trans_get(c);
+
+ ret = __for_each_btree_node(trans, iter, i, POS_MIN, 0, 1, 0, b, ({
+ struct btree_node_iter iter;
+ struct bkey u;
+ struct bkey_s_c k;
+
+ for_each_btree_node_key_unpack(b, k, &iter, &u)
+ dump_node(c, ca, k, &data);
+ 0;
+ }));
+
+ if (ret)
+ die("error %s walking btree nodes", bch2_err_str(ret));
+
+ struct btree *b = bch2_btree_id_root(c, i)->b;
+ if (!btree_node_fake(b))
+ dump_node(c, ca, bkey_i_to_s_c(&b->key), &data);
+
+ bch2_trans_put(trans);
+ }
+
+ qcow2_write_image(ca->disk_sb.bdev->bd_fd, fd, &data,
+ max_t(unsigned, c->opts.btree_node_size / 8, block_bytes(c)));
+ darray_exit(&data);
+}
+
+int cmd_dump(int argc, char *argv[])
+{
+ static const struct option longopts[] = {
+ { "force", no_argument, NULL, 'f' },
+ { "nojournal", no_argument, NULL, 'j' },
+ { "verbose", no_argument, NULL, 'v' },
+ { "help", no_argument, NULL, 'h' },
+ { NULL }
+ };
+ struct bch_opts opts = bch2_opts_empty();
+ char *out = NULL;
+ unsigned nr_devices = 0;
+ bool force = false, entire_journal = true;
+ int fd, opt;
+
+ opt_set(opts, direct_io, false);
+ opt_set(opts, noexcl, true);
+ opt_set(opts, read_only, true);
+ opt_set(opts, nochanges, true);
+ opt_set(opts, norecovery, true);
+ opt_set(opts, degraded, true);
+ opt_set(opts, very_degraded, true);
+ opt_set(opts, errors, BCH_ON_ERROR_continue);
+ opt_set(opts, fix_errors, FSCK_FIX_no);
+
+ while ((opt = getopt_long(argc, argv, "o:fvh",
+ longopts, NULL)) != -1)
+ switch (opt) {
+ case 'o':
+ out = optarg;
+ break;
+ case 'f':
+ force = true;
+ break;
+ case 'j':
+ entire_journal = false;
+ break;
+ case 'v':
+ opt_set(opts, verbose, true);
+ break;
+ case 'h':
+ dump_usage();
+ exit(EXIT_SUCCESS);
+ }
+ args_shift(optind);
+
+ if (!out)
+ die("Please supply output filename");
+
+ if (!argc)
+ die("Please supply device(s) to check");
+
+ struct bch_fs *c = bch2_fs_open(argv, argc, opts);
+ if (IS_ERR(c))
+ die("error opening devices: %s", bch2_err_str(PTR_ERR(c)));
+
+ down_read(&c->state_lock);
+
+ for_each_online_member(c, ca)
+ nr_devices++;
+
+ BUG_ON(!nr_devices);
+
+ for_each_online_member(c, ca) {
+ int flags = O_WRONLY|O_CREAT|O_TRUNC;
+
+ if (!force)
+ flags |= O_EXCL;
+
+ char *path = nr_devices > 1
+ ? mprintf("%s.%u.qcow2", out, ca->dev_idx)
+ : mprintf("%s.qcow2", out);
+ fd = xopen(path, flags, 0600);
+ free(path);
+
+ dump_one_device(c, ca, fd, entire_journal);
+ close(fd);
+ }
+
+ up_read(&c->state_lock);
+
+ bch2_fs_stop(c);
+ return 0;
+}
diff --git a/c_src/cmd_format.c b/c_src/cmd_format.c
new file mode 100644
index 00000000..d0c8e197
--- /dev/null
+++ b/c_src/cmd_format.c
@@ -0,0 +1,435 @@
+/*
+ * Authors: Kent Overstreet <kent.overstreet@gmail.com>
+ * Gabriel de Perthuis <g2p.code@gmail.com>
+ * Jacob Malevich <jam@datera.io>
+ *
+ * GPLv2
+ */
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <uuid/uuid.h>
+
+#include "cmds.h"
+#include "posix_to_bcachefs.h"
+#include "libbcachefs.h"
+#include "crypto.h"
+#include "libbcachefs/errcode.h"
+#include "libbcachefs/opts.h"
+#include "libbcachefs/super-io.h"
+#include "libbcachefs/util.h"
+
+#include "libbcachefs/darray.h"
+
+#define OPTS \
+x(0, replicas, required_argument) \
+x(0, encrypted, no_argument) \
+x(0, no_passphrase, no_argument) \
+x('L', fs_label, required_argument) \
+x('U', uuid, required_argument) \
+x(0, fs_size, required_argument) \
+x(0, superblock_size, required_argument) \
+x(0, bucket_size, required_argument) \
+x('l', label, required_argument) \
+x(0, discard, no_argument) \
+x(0, data_allowed, required_argument) \
+x(0, durability, required_argument) \
+x(0, version, required_argument) \
+x(0, no_initialize, no_argument) \
+x(0, source, required_argument) \
+x('f', force, no_argument) \
+x('q', quiet, no_argument) \
+x('v', verbose, no_argument) \
+x('h', help, no_argument)
+
+static void usage(void)
+{
+ puts("bcachefs format - create a new bcachefs filesystem on one or more devices\n"
+ "Usage: bcachefs format [OPTION]... <devices>\n"
+ "\n"
+ "Options:");
+
+ bch2_opts_usage(OPT_FORMAT);
+
+ puts(
+ " --replicas=# Sets both data and metadata replicas\n"
+ " --encrypted Enable whole filesystem encryption (chacha20/poly1305)\n"
+ " --no_passphrase Don't encrypt master encryption key\n"
+ " -L, --fs_label=label\n"
+ " -U, --uuid=uuid\n"
+ " --superblock_size=size\n"
+ " --source=path Initialize the bcachefs filesystem from this root directory\n"
+ "\n"
+ "Device specific options:");
+
+ bch2_opts_usage(OPT_DEVICE);
+
+ puts(" -l, --label=label Disk label\n"
+ "\n"
+ " -f, --force\n"
+ " -q, --quiet Only print errors\n"
+ " -v, --verbose Verbose filesystem initialization\n"
+ " -h, --help Display this help and exit\n"
+ "\n"
+ "Device specific options must come before corresponding devices, e.g.\n"
+ " bcachefs format --label cache /dev/sdb /dev/sdc\n"
+ "\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+enum {
+ O_no_opt = 1,
+#define x(shortopt, longopt, arg) O_##longopt,
+ OPTS
+#undef x
+};
+
+#define x(shortopt, longopt, arg) { \
+ .name = #longopt, \
+ .has_arg = arg, \
+ .flag = NULL, \
+ .val = O_##longopt, \
+},
+static const struct option format_opts[] = {
+ OPTS
+ { NULL }
+};
+#undef x
+
+u64 read_flag_list_or_die(char *opt, const char * const list[],
+ const char *msg)
+{
+ u64 v = bch2_read_flag_list(opt, list);
+ if (v == (u64) -1)
+ die("Bad %s %s", msg, opt);
+
+ return v;
+}
+
+void build_fs(struct bch_fs *c, const char *src_path)
+{
+ struct copy_fs_state s = {};
+ int src_fd = xopen(src_path, O_RDONLY|O_NOATIME);
+ struct stat stat = xfstat(src_fd);
+
+ if (!S_ISDIR(stat.st_mode))
+ die("%s is not a directory", src_path);
+
+ copy_fs(c, src_fd, src_path, &s);
+}
+
+int cmd_format(int argc, char *argv[])
+{
+ DARRAY(struct dev_opts) devices = { 0 };
+ DARRAY(char *) device_paths = { 0 };
+ struct format_opts opts = format_opts_default();
+ struct dev_opts dev_opts = dev_opts_default();
+ bool force = false, no_passphrase = false, quiet = false, initialize = true, verbose = false;
+ bool unconsumed_dev_option = false;
+ unsigned v;
+ int opt;
+
+ struct bch_opt_strs fs_opt_strs =
+ bch2_cmdline_opts_get(&argc, argv, OPT_FORMAT);
+ struct bch_opts fs_opts = bch2_parse_opts(fs_opt_strs);
+
+ if (getenv("BCACHEFS_KERNEL_ONLY"))
+ initialize = false;
+
+ while ((opt = getopt_long(argc, argv,
+				   "-L:l:U:g:fqhv",
+ format_opts,
+ NULL)) != -1)
+ switch (opt) {
+ case O_replicas:
+ if (kstrtouint(optarg, 10, &v) ||
+ !v ||
+ v > BCH_REPLICAS_MAX)
+ die("invalid replicas");
+
+ opt_set(fs_opts, metadata_replicas, v);
+ opt_set(fs_opts, data_replicas, v);
+ break;
+ case O_source:
+ opts.source = optarg;
+ break;
+ case O_encrypted:
+ opts.encrypted = true;
+ break;
+ case O_no_passphrase:
+ no_passphrase = true;
+ break;
+ case O_fs_label:
+ case 'L':
+ opts.label = optarg;
+ break;
+ case O_uuid:
+ case 'U':
+ if (uuid_parse(optarg, opts.uuid.b))
+ die("Bad uuid");
+ break;
+ case O_force:
+ case 'f':
+ force = true;
+ break;
+ case O_fs_size:
+ if (bch2_strtoull_h(optarg, &dev_opts.size))
+ die("invalid filesystem size");
+ unconsumed_dev_option = true;
+ break;
+ case O_superblock_size:
+ if (bch2_strtouint_h(optarg, &opts.superblock_size))
+				die("invalid superblock size");
+
+ opts.superblock_size >>= 9;
+ break;
+ case O_bucket_size:
+ if (bch2_strtoull_h(optarg, &dev_opts.bucket_size))
+ die("bad bucket_size %s", optarg);
+ unconsumed_dev_option = true;
+ break;
+ case O_label:
+ case 'l':
+ dev_opts.label = optarg;
+ unconsumed_dev_option = true;
+ break;
+ case O_discard:
+ dev_opts.discard = true;
+ unconsumed_dev_option = true;
+ break;
+ case O_data_allowed:
+ dev_opts.data_allowed =
+ read_flag_list_or_die(optarg,
+ __bch2_data_types, "data type");
+ unconsumed_dev_option = true;
+ break;
+ case O_durability:
+ if (kstrtouint(optarg, 10, &dev_opts.durability) ||
+ dev_opts.durability > BCH_REPLICAS_MAX)
+ die("invalid durability");
+ unconsumed_dev_option = true;
+ break;
+ case O_version:
+ if (kstrtouint(optarg, 10, &opts.version))
+ die("invalid version");
+ break;
+ case O_no_initialize:
+ initialize = false;
+ break;
+ case O_no_opt:
+ darray_push(&device_paths, optarg);
+ dev_opts.path = optarg;
+ darray_push(&devices, dev_opts);
+ dev_opts.size = 0;
+ unconsumed_dev_option = false;
+ break;
+ case O_quiet:
+ case 'q':
+ quiet = true;
+ break;
+		case 'v':
+			verbose = true;
+			break;
+ case O_help:
+ case 'h':
+ usage();
+ exit(EXIT_SUCCESS);
+ break;
+ case '?':
+ exit(EXIT_FAILURE);
+ break;
+ }
+
+ if (unconsumed_dev_option)
+ die("Options for devices apply to subsequent devices; got a device option with no device");
+
+ if (opts.version != bcachefs_metadata_version_current)
+ initialize = false;
+
+ if (!devices.nr)
+ die("Please supply a device");
+
+ if (opts.encrypted && !no_passphrase) {
+ opts.passphrase = read_passphrase_twice("Enter passphrase: ");
+ initialize = false;
+ }
+
+ darray_for_each(devices, dev) {
+ int ret = open_for_format(dev, force);
+ if (ret)
+ die("Error opening %s: %s", dev_opts.path, strerror(-ret));
+ }
+
+ struct bch_sb *sb =
+ bch2_format(fs_opt_strs,
+ fs_opts,
+ opts,
+ devices.data, devices.nr);
+ bch2_opt_strs_free(&fs_opt_strs);
+
+ if (!quiet) {
+ struct printbuf buf = PRINTBUF;
+
+ buf.human_readable_units = true;
+
+ bch2_sb_to_text(&buf, sb, false, 1 << BCH_SB_FIELD_members_v2);
+ printf("%s", buf.buf);
+
+ printbuf_exit(&buf);
+ }
+ free(sb);
+
+ if (opts.passphrase) {
+ memzero_explicit(opts.passphrase, strlen(opts.passphrase));
+ free(opts.passphrase);
+ }
+
+ darray_exit(&devices);
+
+ /* don't skip initialization when we have to build an image from a source */
+ if (opts.source && !initialize) {
+		printf("Warning: forcing initialization because --source was supplied\n");
+		initialize = true;
+ }
+
+ if (initialize) {
+ struct bch_opts mount_opts = bch2_opts_empty();
+
+
+ opt_set(mount_opts, verbose, verbose);
+
+ /*
+ * Start the filesystem once, to allocate the journal and create
+ * the root directory:
+ */
+ struct bch_fs *c = bch2_fs_open(device_paths.data,
+ device_paths.nr,
+ mount_opts);
+ if (IS_ERR(c))
+ die("error opening %s: %s", device_paths.data[0],
+ bch2_err_str(PTR_ERR(c)));
+
+ if (opts.source) {
+ build_fs(c, opts.source);
+ }
+
+
+ bch2_fs_stop(c);
+ }
+
+ darray_exit(&device_paths);
+
+ return 0;
+}
+
+static void show_super_usage(void)
+{
+ puts("bcachefs show-super \n"
+ "Usage: bcachefs show-super [OPTION].. device\n"
+ "\n"
+ "Options:\n"
+ " -f, --fields=(fields) list of sections to print\n"
+	     "      --field-only=field       print superblock section only, no header\n"
+ " -l, --layout print superblock layout\n"
+ " -h, --help display this help and exit\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+ exit(EXIT_SUCCESS);
+}
+
+int cmd_show_super(int argc, char *argv[])
+{
+ static const struct option longopts[] = {
+ { "fields", 1, NULL, 'f' },
+ { "field-only", 1, NULL, 'F' },
+ { "layout", 0, NULL, 'l' },
+ { "help", 0, NULL, 'h' },
+ { NULL }
+ };
+ unsigned fields = 0;
+ int field_only = -1;
+ bool print_layout = false;
+ bool print_default_fields = true;
+ int opt;
+
+ while ((opt = getopt_long(argc, argv, "f:lh", longopts, NULL)) != -1)
+ switch (opt) {
+ case 'f':
+ fields = !strcmp(optarg, "all")
+ ? ~0
+ : read_flag_list_or_die(optarg,
+ bch2_sb_fields, "superblock field");
+ print_default_fields = false;
+ break;
+ case 'F':
+ field_only = read_string_list_or_die(optarg,
+ bch2_sb_fields, "superblock field");
+ print_default_fields = false;
+ break;
+ case 'l':
+ print_layout = true;
+ break;
+ case 'h':
+ show_super_usage();
+ break;
+ }
+ args_shift(optind);
+
+ char *dev = arg_pop();
+ if (!dev)
+ die("please supply a device");
+ if (argc)
+ die("too many arguments");
+
+ struct bch_opts opts = bch2_opts_empty();
+
+ opt_set(opts, noexcl, true);
+ opt_set(opts, nochanges, true);
+
+ struct bch_sb_handle sb;
+ int ret = bch2_read_super(dev, &opts, &sb);
+ if (ret)
+ die("Error opening %s: %s", dev, bch2_err_str(ret));
+
+ if (print_default_fields) {
+ fields |= bch2_sb_field_get(sb.sb, members_v2)
+ ? 1 << BCH_SB_FIELD_members_v2
+ : 1 << BCH_SB_FIELD_members_v1;
+ fields |= 1 << BCH_SB_FIELD_errors;
+ }
+
+ struct printbuf buf = PRINTBUF;
+
+ buf.human_readable_units = true;
+
+ if (field_only >= 0) {
+ struct bch_sb_field *f = bch2_sb_field_get_id(sb.sb, field_only);
+
+ if (f)
+ __bch2_sb_field_to_text(&buf, sb.sb, f);
+ } else {
+ printbuf_tabstop_push(&buf, 44);
+
+ char *model = fd_to_dev_model(sb.bdev->bd_fd);
+ prt_str(&buf, "Device:");
+ prt_tab(&buf);
+ prt_str(&buf, model);
+ prt_newline(&buf);
+ free(model);
+
+ bch2_sb_to_text(&buf, sb.sb, print_layout, fields);
+ }
+ printf("%s", buf.buf);
+
+ bch2_free_super(&sb);
+ printbuf_exit(&buf);
+ return 0;
+}
diff --git a/c_src/cmd_fs.c b/c_src/cmd_fs.c
new file mode 100644
index 00000000..82eeceff
--- /dev/null
+++ b/c_src/cmd_fs.c
@@ -0,0 +1,544 @@
+#include <getopt.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+
+#include <uuid/uuid.h>
+
+#include "linux/sort.h"
+#include "linux/rcupdate.h"
+
+#include "libbcachefs/bcachefs_ioctl.h"
+#include "libbcachefs/buckets.h"
+#include "libbcachefs/disk_accounting.h"
+#include "libbcachefs/opts.h"
+#include "libbcachefs/super-io.h"
+
+#include "cmds.h"
+#include "libbcachefs.h"
+
+#include "libbcachefs/darray.h"
+
+static void __dev_usage_type_to_text(struct printbuf *out,
+ enum bch_data_type type,
+ unsigned bucket_size,
+ u64 buckets, u64 sectors, u64 frag)
+{
+ bch2_prt_data_type(out, type);
+ prt_char(out, ':');
+ prt_tab(out);
+
+ prt_units_u64(out, sectors << 9);
+ prt_tab_rjust(out);
+
+ prt_printf(out, "%llu", buckets);
+ prt_tab_rjust(out);
+
+ if (frag) {
+ prt_units_u64(out, frag << 9);
+ prt_tab_rjust(out);
+ }
+ prt_newline(out);
+}
+
+static void dev_usage_type_to_text(struct printbuf *out,
+ struct bch_ioctl_dev_usage_v2 *u,
+ enum bch_data_type type)
+{
+ u64 sectors = 0;
+ switch (type) {
+ case BCH_DATA_free:
+ case BCH_DATA_need_discard:
+ case BCH_DATA_need_gc_gens:
+ /* sectors are 0 for these types so calculate sectors for them */
+ sectors = u->d[type].buckets * u->bucket_size;
+ break;
+ default:
+ sectors = u->d[type].sectors;
+ }
+
+ __dev_usage_type_to_text(out, type,
+ u->bucket_size,
+ u->d[type].buckets,
+ sectors,
+ u->d[type].fragmented);
+}
+
+static void dev_usage_to_text(struct printbuf *out,
+ struct bchfs_handle fs,
+ struct dev_name *d)
+{
+ struct bch_ioctl_dev_usage_v2 *u = bchu_dev_usage(fs, d->idx);
+
+ prt_newline(out);
+ prt_printf(out, "%s (device %u):", d->label ?: "(no label)", d->idx);
+ prt_tab(out);
+ prt_str(out, d->dev ?: "(device not found)");
+ prt_tab_rjust(out);
+
+ prt_str(out, bch2_member_states[u->state]);
+ prt_tab_rjust(out);
+
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+ prt_tab(out);
+
+ prt_str(out, "data");
+ prt_tab_rjust(out);
+
+ prt_str(out, "buckets");
+ prt_tab_rjust(out);
+
+ prt_str(out, "fragmented");
+ prt_tab_rjust(out);
+
+ prt_newline(out);
+
+ for (unsigned i = 0; i < u->nr_data_types; i++)
+ dev_usage_type_to_text(out, u, i);
+
+ prt_str(out, "capacity:");
+ prt_tab(out);
+
+ prt_units_u64(out, (u->nr_buckets * u->bucket_size) << 9);
+ prt_tab_rjust(out);
+ prt_printf(out, "%llu", u->nr_buckets);
+ prt_tab_rjust(out);
+
+ printbuf_indent_sub(out, 2);
+
+ prt_newline(out);
+ free(u);
+}
+
+static int dev_by_label_cmp(const void *_l, const void *_r)
+{
+ const struct dev_name *l = _l, *r = _r;
+
+ return (l->label && r->label
+ ? strcmp(l->label, r->label) : 0) ?:
+ (l->dev && r->dev
+ ? strcmp(l->dev, r->dev) : 0) ?:
+ cmp_int(l->idx, r->idx);
+}
+
+static struct dev_name *dev_idx_to_name(dev_names *dev_names, unsigned idx)
+{
+ darray_for_each(*dev_names, dev)
+ if (dev->idx == idx)
+ return dev;
+ return NULL;
+}
+
+static void devs_usage_to_text(struct printbuf *out,
+ struct bchfs_handle fs,
+ dev_names dev_names)
+{
+ sort(dev_names.data, dev_names.nr,
+ sizeof(dev_names.data[0]), dev_by_label_cmp, NULL);
+
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 20);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 14);
+
+ darray_for_each(dev_names, dev)
+ dev_usage_to_text(out, fs, dev);
+
+ darray_for_each(dev_names, dev) {
+ free(dev->dev);
+ free(dev->label);
+ }
+}
+
+static void persistent_reserved_to_text(struct printbuf *out,
+ unsigned nr_replicas, s64 sectors)
+{
+ if (!sectors)
+ return;
+
+ prt_str(out, "reserved:");
+ prt_tab(out);
+ prt_printf(out, "%u/%u ", 1, nr_replicas);
+ prt_tab(out);
+ prt_str(out, "[] ");
+ prt_units_u64(out, sectors << 9);
+ prt_tab_rjust(out);
+ prt_newline(out);
+}
+
+static void replicas_usage_to_text(struct printbuf *out,
+ const struct bch_replicas_entry_v1 *r,
+ s64 sectors,
+ dev_names *dev_names)
+{
+ if (!sectors)
+ return;
+
+ char devs[4096], *d = devs;
+ *d++ = '[';
+
+ unsigned durability = 0;
+
+ for (unsigned i = 0; i < r->nr_devs; i++) {
+ unsigned dev_idx = r->devs[i];
+ struct dev_name *dev = dev_idx_to_name(dev_names, dev_idx);
+
+ durability += dev ? dev->durability : 0;
+
+ if (i)
+ *d++ = ' ';
+
+ d += dev && dev->dev
+ ? sprintf(d, "%s", dev->dev)
+ : sprintf(d, "%u", dev_idx);
+ }
+ *d++ = ']';
+ *d++ = '\0';
+
+ bch2_prt_data_type(out, r->data_type);
+ prt_char(out, ':');
+ prt_tab(out);
+
+ prt_printf(out, "%u/%u ", r->nr_required, r->nr_devs);
+ prt_tab(out);
+
+ prt_printf(out, "%u ", durability);
+ prt_tab(out);
+
+ prt_printf(out, "%s ", devs);
+ prt_tab(out);
+
+ prt_units_u64(out, sectors << 9);
+ prt_tab_rjust(out);
+ prt_newline(out);
+}
+
+#define for_each_usage_replica(_u, _r) \
+ for (_r = (_u)->replicas; \
+ _r != (void *) (_u)->replicas + (_u)->replica_entries_bytes;\
+ _r = replicas_usage_next(_r), \
+ BUG_ON((void *) _r > (void *) (_u)->replicas + (_u)->replica_entries_bytes))
+
+typedef DARRAY(struct bkey_i_accounting *) darray_accounting_p;
+
+static int accounting_p_cmp(const void *_l, const void *_r)
+{
+ const struct bkey_i_accounting * const *l = _l;
+ const struct bkey_i_accounting * const *r = _r;
+
+ struct bpos lp = (*l)->k.p, rp = (*r)->k.p;
+
+ bch2_bpos_swab(&lp);
+ bch2_bpos_swab(&rp);
+ return bpos_cmp(lp, rp);
+}
+
+static void accounting_sort(darray_accounting_p *sorted,
+ struct bch_ioctl_query_accounting *in)
+{
+ for (struct bkey_i_accounting *a = in->accounting;
+ a < (struct bkey_i_accounting *) ((u64 *) in->accounting + in->accounting_u64s);
+ a = bkey_i_to_accounting(bkey_next(&a->k_i)))
+ if (darray_push(sorted, a))
+ die("memory allocation failure");
+
+ sort(sorted->data, sorted->nr, sizeof(sorted->data[0]), accounting_p_cmp, NULL);
+}
+
+static int fs_usage_v1_to_text(struct printbuf *out,
+ struct bchfs_handle fs,
+ dev_names dev_names)
+{
+ struct bch_ioctl_query_accounting *a =
+ bchu_fs_accounting(fs,
+ BIT(BCH_DISK_ACCOUNTING_persistent_reserved)|
+ BIT(BCH_DISK_ACCOUNTING_replicas)|
+ BIT(BCH_DISK_ACCOUNTING_compression)|
+ BIT(BCH_DISK_ACCOUNTING_btree)|
+ BIT(BCH_DISK_ACCOUNTING_rebalance_work));
+ if (!a)
+ return -1;
+
+ darray_accounting_p a_sorted = {};
+
+ accounting_sort(&a_sorted, a);
+
+ prt_str(out, "Filesystem: ");
+ pr_uuid(out, fs.uuid.b);
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 20);
+ printbuf_tabstop_push(out, 16);
+
+ prt_str(out, "Size:");
+ prt_tab(out);
+ prt_units_u64(out, a->capacity << 9);
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ prt_str(out, "Used:");
+ prt_tab(out);
+ prt_units_u64(out, a->used << 9);
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ prt_str(out, "Online reserved:");
+ prt_tab(out);
+ prt_units_u64(out, a->online_reserved << 9);
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+
+ printbuf_tabstop_push(out, 16);
+ prt_str(out, "Data type");
+ prt_tab(out);
+
+ printbuf_tabstop_push(out, 16);
+ prt_str(out, "Required/total");
+ prt_tab(out);
+
+ printbuf_tabstop_push(out, 14);
+ prt_str(out, "Durability");
+ prt_tab(out);
+
+ printbuf_tabstop_push(out, 14);
+ prt_str(out, "Devices");
+ prt_newline(out);
+
+ printbuf_tabstop_push(out, 14);
+
+ unsigned prev_type = 0;
+
+ darray_for_each(a_sorted, i) {
+ struct bkey_i_accounting *a = *i;
+
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, a->k.p);
+
+ bool new_type = acc_k.type != prev_type;
+ prev_type = acc_k.type;
+
+ switch (acc_k.type) {
+ case BCH_DISK_ACCOUNTING_persistent_reserved:
+ persistent_reserved_to_text(out,
+ acc_k.persistent_reserved.nr_replicas,
+ a->v.d[0]);
+ break;
+ case BCH_DISK_ACCOUNTING_replicas:
+ replicas_usage_to_text(out, &acc_k.replicas, a->v.d[0], &dev_names);
+ break;
+ case BCH_DISK_ACCOUNTING_compression:
+ if (new_type) {
+ prt_printf(out, "\nCompression:\n");
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 12);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 24);
+ prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n");
+ }
+
+ u64 nr_extents = a->v.d[0];
+ u64 sectors_uncompressed = a->v.d[1];
+ u64 sectors_compressed = a->v.d[2];
+
+ bch2_prt_compression_type(out, acc_k.compression.type);
+ prt_tab(out);
+
+ prt_human_readable_u64(out, sectors_compressed << 9);
+ prt_tab_rjust(out);
+
+ prt_human_readable_u64(out, sectors_uncompressed << 9);
+ prt_tab_rjust(out);
+
+ prt_human_readable_u64(out, nr_extents
+ ? div_u64(sectors_uncompressed << 9, nr_extents)
+ : 0);
+ prt_tab_rjust(out);
+ prt_newline(out);
+ break;
+ case BCH_DISK_ACCOUNTING_btree:
+ if (new_type) {
+ prt_printf(out, "\nBtree usage:\n");
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 12);
+ printbuf_tabstop_push(out, 16);
+ }
+ prt_printf(out, "%s:\t", bch2_btree_id_str(acc_k.btree.id));
+ prt_units_u64(out, a->v.d[0] << 9);
+ prt_tab_rjust(out);
+ prt_newline(out);
+ break;
+ case BCH_DISK_ACCOUNTING_rebalance_work:
+ if (new_type)
+ prt_printf(out, "\nPending rebalance work:\n");
+ prt_units_u64(out, a->v.d[0] << 9);
+ prt_newline(out);
+ break;
+ }
+ }
+
+ darray_exit(&a_sorted);
+ free(a);
+ return 0;
+}
+
+static void fs_usage_v0_to_text(struct printbuf *out,
+ struct bchfs_handle fs,
+ dev_names dev_names)
+{
+ struct bch_ioctl_fs_usage *u = bchu_fs_usage(fs);
+
+ prt_str(out, "Filesystem: ");
+ pr_uuid(out, fs.uuid.b);
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 20);
+ printbuf_tabstop_push(out, 16);
+
+ prt_str(out, "Size:");
+ prt_tab(out);
+ prt_units_u64(out, u->capacity << 9);
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ prt_str(out, "Used:");
+ prt_tab(out);
+ prt_units_u64(out, u->used << 9);
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ prt_str(out, "Online reserved:");
+ prt_tab(out);
+ prt_units_u64(out, u->online_reserved << 9);
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+
+ printbuf_tabstop_push(out, 16);
+ prt_str(out, "Data type");
+ prt_tab(out);
+
+ printbuf_tabstop_push(out, 16);
+ prt_str(out, "Required/total");
+ prt_tab(out);
+
+ printbuf_tabstop_push(out, 14);
+ prt_str(out, "Durability");
+ prt_tab(out);
+
+ printbuf_tabstop_push(out, 14);
+ prt_str(out, "Devices");
+ prt_newline(out);
+
+ printbuf_tabstop_push(out, 14);
+
+ for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++)
+ persistent_reserved_to_text(out, i, u->persistent_reserved[i]);
+
+ struct bch_replicas_usage *r;
+
+ for_each_usage_replica(u, r)
+ if (r->r.data_type < BCH_DATA_user)
+ replicas_usage_to_text(out, &r->r, r->sectors, &dev_names);
+
+ for_each_usage_replica(u, r)
+ if (r->r.data_type == BCH_DATA_user &&
+ r->r.nr_required <= 1)
+ replicas_usage_to_text(out, &r->r, r->sectors, &dev_names);
+
+ for_each_usage_replica(u, r)
+ if (r->r.data_type == BCH_DATA_user &&
+ r->r.nr_required > 1)
+ replicas_usage_to_text(out, &r->r, r->sectors, &dev_names);
+
+ for_each_usage_replica(u, r)
+ if (r->r.data_type > BCH_DATA_user)
+ replicas_usage_to_text(out, &r->r, r->sectors, &dev_names);
+
+ free(u);
+}
+
+static void fs_usage_to_text(struct printbuf *out, const char *path)
+{
+ struct bchfs_handle fs = bcache_fs_open(path);
+
+ dev_names dev_names = bchu_fs_get_devices(fs);
+
+ if (!fs_usage_v1_to_text(out, fs, dev_names))
+ goto devs;
+
+ fs_usage_v0_to_text(out, fs, dev_names);
+devs:
+ devs_usage_to_text(out, fs, dev_names);
+
+ darray_exit(&dev_names);
+
+ bcache_fs_close(fs);
+}
+
+static void fs_usage_usage(void)
+{
+ puts("bcachefs fs usage - display detailed filesystem usage\n"
+ "Usage: bcachefs fs usage [OPTION]... <mountpoint>\n"
+ "\n"
+ "Options:\n"
+ " -h, --human-readable Human readable units\n"
+ " -H, --help Display this help and exit\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+int cmd_fs_usage(int argc, char *argv[])
+{
+ static const struct option longopts[] = {
+ { "help", no_argument, NULL, 'H' },
+ { "human-readable", no_argument, NULL, 'h' },
+ { NULL }
+ };
+ bool human_readable = false;
+ struct printbuf buf = PRINTBUF;
+ char *fs;
+ int opt;
+
+ while ((opt = getopt_long(argc, argv, "h",
+ longopts, NULL)) != -1)
+ switch (opt) {
+ case 'h':
+ human_readable = true;
+ break;
+ case 'H':
+ fs_usage_usage();
+ exit(EXIT_SUCCESS);
+ default:
+ fs_usage_usage();
+ exit(EXIT_FAILURE);
+ }
+ args_shift(optind);
+
+ if (!argc) {
+ printbuf_reset(&buf);
+ buf.human_readable_units = human_readable;
+ fs_usage_to_text(&buf, ".");
+ printf("%s", buf.buf);
+ } else {
+ while ((fs = arg_pop())) {
+ printbuf_reset(&buf);
+ buf.human_readable_units = human_readable;
+ fs_usage_to_text(&buf, fs);
+ printf("%s", buf.buf);
+ }
+ }
+
+ printbuf_exit(&buf);
+ return 0;
+}
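
The dev_by_label_cmp() comparator added in cmd_fs.c above chains three comparisons with the GNU `a ?: b` shorthand so devices sort by label, then by device node name, then by member index. Below is a minimal standalone illustration of the same fallback pattern using plain qsort(); it is not part of the patch, and the struct and sample values are made up for the example.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct dev {
	const char *label;	/* may be NULL */
	const char *node;	/* may be NULL */
	unsigned    idx;
};

static int cmp_uint(unsigned l, unsigned r)
{
	return (l > r) - (l < r);
}

/* Compare by label, then node name, then index - the same fallback order
 * as dev_by_label_cmp(), written without the GNU ?: shorthand. */
static int dev_cmp(const void *_l, const void *_r)
{
	const struct dev *l = _l, *r = _r;
	int c;

	c = l->label && r->label ? strcmp(l->label, r->label) : 0;
	if (c)
		return c;
	c = l->node && r->node ? strcmp(l->node, r->node) : 0;
	if (c)
		return c;
	return cmp_uint(l->idx, r->idx);
}

int main(void)
{
	struct dev devs[] = {
		{ "ssd", "/dev/sdb", 1 },
		{ "hdd", "/dev/sdc", 2 },
		{ "hdd", "/dev/sda", 0 },
	};

	qsort(devs, 3, sizeof(devs[0]), dev_cmp);

	for (unsigned i = 0; i < 3; i++)
		printf("%s %s %u\n", devs[i].label, devs[i].node, devs[i].idx);
	return 0;
}
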
diff --git a/c_src/cmd_fsck.c b/c_src/cmd_fsck.c
new file mode 100644
index 00000000..2ea51ff2
--- /dev/null
+++ b/c_src/cmd_fsck.c
@@ -0,0 +1,348 @@
+
+#include <errno.h>
+#include <getopt.h>
+#include <sys/uio.h>
+#include <unistd.h>
+#include "cmds.h"
+#include "libbcachefs/error.h"
+#include "libbcachefs.h"
+#include "libbcachefs/super.h"
+#include "libbcachefs/super-io.h"
+#include "tools-util.h"
+
+static void fsck_usage(void)
+{
+ puts("bcachefs fsck - filesystem check and repair\n"
+ "Usage: bcachefs fsck [OPTION]... <devices>\n"
+ "\n"
+ "Options:\n"
+ " -p Automatic repair (no questions)\n"
+ " -n Don't repair, only check for errors\n"
+ " -y Assume \"yes\" to all questions\n"
+ " -f Force checking even if filesystem is marked clean\n"
+ " -r, --ratelimit_errors Don't display more than 10 errors of a given type\n"
+ " -R, --reconstruct_alloc Reconstruct the alloc btree\n"
+ " -k, --kernel Use the in-kernel fsck implementation\n"
+ " -v Be verbose\n"
+ " -h, --help Display this help and exit\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+static void setnonblocking(int fd)
+{
+ int flags = fcntl(fd, F_GETFL);
+ if (fcntl(fd, F_SETFL, flags|O_NONBLOCK))
+ die("fcntl error: %m");
+}
+
+static int do_splice(int rfd, int wfd)
+{
+ char buf[4096], *b = buf;
+
+ int r = read(rfd, buf, sizeof(buf));
+ if (r < 0 && errno == EAGAIN)
+ return 0;
+ if (r < 0)
+ return r;
+ if (!r)
+ return 1;
+ do {
+ ssize_t w = write(wfd, b, r);
+ if (w < 0)
+ die("%s: write error: %m", __func__);
+ r -= w;
+ b += w;
+ } while (r);
+ return 0;
+}
+
+static int splice_fd_to_stdinout(int fd)
+{
+ setnonblocking(STDIN_FILENO);
+ setnonblocking(fd);
+
+ bool stdin_closed = false;
+
+ while (true) {
+ fd_set fds;
+
+ FD_ZERO(&fds);
+ FD_SET(fd, &fds);
+ if (!stdin_closed)
+ FD_SET(STDIN_FILENO, &fds);
+
+ if (select(fd + 1, &fds, NULL, NULL, NULL) < 0)
+ die("select error: %m");
+
+ int r = do_splice(fd, STDOUT_FILENO);
+ if (r < 0)
+ return r;
+ if (r)
+ break;
+
+ r = do_splice(STDIN_FILENO, fd);
+ if (r < 0)
+ return r;
+ if (r)
+ stdin_closed = true;
+ }
+
+ return close(fd);
+}
+
+static int fsck_online(const char *dev_path, const char *opt_str)
+{
+ int dev_idx;
+ struct bchfs_handle fs = bchu_fs_open_by_dev(dev_path, &dev_idx);
+
+ struct bch_ioctl_fsck_online fsck = {
+ .opts = (unsigned long) opt_str
+ };
+
+ int fsck_fd = ioctl(fs.ioctl_fd, BCH_IOCTL_FSCK_ONLINE, &fsck);
+ if (fsck_fd < 0)
+ die("BCH_IOCTL_FSCK_ONLINE error: %s", bch2_err_str(errno));
+
+ return splice_fd_to_stdinout(fsck_fd);
+}
+
+static void append_opt(struct printbuf *out, const char *opt)
+{
+ if (out->pos)
+ prt_char(out, ',');
+ prt_str(out, opt);
+}
+
+static bool should_use_kernel_fsck(darray_str devs)
+{
+ system("modprobe bcachefs");
+
+ unsigned kernel_version = !access("/sys/module/bcachefs/parameters/version", R_OK)
+ ? read_file_u64(AT_FDCWD, "/sys/module/bcachefs/parameters/version")
+ : 0;
+
+ if (!kernel_version)
+ return false;
+
+ if (kernel_version == bcachefs_metadata_version_current)
+ return false;
+
+ struct bch_opts opts = bch2_opts_empty();
+ opt_set(opts, nostart, true);
+ opt_set(opts, noexcl, true);
+ opt_set(opts, nochanges, true);
+ opt_set(opts, read_only, true);
+
+ struct bch_fs *c = bch2_fs_open(devs.data, devs.nr, opts);
+ if (IS_ERR(c))
+ return false;
+
+ bool ret = ((bcachefs_metadata_version_current < kernel_version &&
+ kernel_version <= c->sb.version) ||
+ (c->sb.version <= kernel_version &&
+ kernel_version < bcachefs_metadata_version_current));
+
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "fsck binary is version ");
+ bch2_version_to_text(&buf, bcachefs_metadata_version_current);
+ prt_str(&buf, " but filesystem is ");
+ bch2_version_to_text(&buf, c->sb.version);
+ prt_str(&buf, " and kernel is ");
+ bch2_version_to_text(&buf, kernel_version);
+ prt_str(&buf, ", using kernel fsck\n");
+
+ printf("%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ bch2_fs_stop(c);
+
+ return ret;
+}
+
+static bool is_blockdev(const char *path)
+{
+ struct stat s;
+ if (stat(path, &s))
+ return true;
+ return S_ISBLK(s.st_mode);
+}
+
+static void loopdev_free(const char *path)
+{
+ char *cmd = mprintf("losetup -d %s", path);
+ system(cmd);
+ free(cmd);
+}
+
+static char *loopdev_alloc(const char *path)
+{
+ char *cmd = mprintf("losetup --show -f %s", path);
+ FILE *f = popen(cmd, "r");
+ free(cmd);
+ if (!f) {
+ fprintf(stderr, "error executing losetup: %m\n");
+ return NULL;
+ }
+
+ char *line = NULL;
+ size_t n = 0;
+ getline(&line, &n, f);
+ int ret = pclose(f);
+ if (ret) {
+ fprintf(stderr, "error executing losetup: %i\n", ret);
+ free(line);
+ return NULL;
+ }
+
+ strim(line);
+ return line;
+}
+
+int cmd_fsck(int argc, char *argv[])
+{
+ static const struct option longopts[] = {
+ { "ratelimit_errors", no_argument, NULL, 'r' },
+ { "reconstruct_alloc", no_argument, NULL, 'R' },
+ { "kernel", no_argument, NULL, 'k' },
+ { "no-kernel", no_argument, NULL, 'K' },
+ { "help", no_argument, NULL, 'h' },
+ { NULL }
+ };
+ int kernel = -1; /* unset */
+ int opt, ret = 0;
+ struct printbuf opts_str = PRINTBUF;
+
+ if (getenv("BCACHEFS_KERNEL_ONLY"))
+ kernel = true;
+
+ append_opt(&opts_str, "degraded");
+ append_opt(&opts_str, "fsck");
+ append_opt(&opts_str, "fix_errors=ask");
+ append_opt(&opts_str, "read_only");
+
+ while ((opt = getopt_long(argc, argv,
+ "apynfo:rRkKvh",
+ longopts, NULL)) != -1)
+ switch (opt) {
+ case 'a': /* outdated alias for -p */
+ case 'p':
+ case 'y':
+ append_opt(&opts_str, "fix_errors=yes");
+ break;
+ case 'n':
+ append_opt(&opts_str, "nochanges");
+ append_opt(&opts_str, "fix_errors=no");
+ break;
+ case 'f':
+ /* force check, even if filesystem marked clean: */
+ break;
+ case 'o':
+ append_opt(&opts_str, optarg);
+ break;
+ case 'r':
+ append_opt(&opts_str, "ratelimit_errors");
+ break;
+ case 'R':
+ append_opt(&opts_str, "reconstruct_alloc");
+ break;
+ case 'k':
+ kernel = true;
+ break;
+ case 'K':
+ kernel = false;
+ break;
+ case 'v':
+ append_opt(&opts_str, "verbose");
+ break;
+ case 'h':
+ fsck_usage();
+ exit(16);
+ }
+ args_shift(optind);
+
+ if (!argc) {
+ fprintf(stderr, "Please supply device(s) to check\n");
+ exit(8);
+ }
+
+ darray_str devs = get_or_split_cmdline_devs(argc, argv);
+
+ darray_for_each(devs, i)
+ if (dev_mounted(*i)) {
+ printf("Running fsck online\n");
+ return fsck_online(*i, opts_str.buf);
+ }
+
+ int kernel_probed = kernel;
+ if (kernel_probed < 0)
+ kernel_probed = should_use_kernel_fsck(devs);
+
+ struct bch_opts opts = bch2_opts_empty();
+ struct printbuf parse_later = PRINTBUF;
+
+ if (kernel_probed) {
+ darray_str loopdevs = {};
+ int fsck_fd = -1;
+
+ printf("Running in-kernel offline fsck\n");
+ struct bch_ioctl_fsck_offline *fsck = calloc(sizeof(*fsck) + sizeof(u64) * devs.nr, 1);
+
+ fsck->opts = (unsigned long)opts_str.buf;
+ darray_for_each(devs, i) {
+ if (is_blockdev(*i)) {
+ fsck->devs[i - devs.data] = (unsigned long) *i;
+ } else {
+ char *l = loopdev_alloc(*i);
+ if (!l)
+ goto kernel_fsck_err;
+ darray_push(&loopdevs, l);
+ fsck->devs[i - devs.data] = (unsigned long) l;
+ }
+ }
+ fsck->nr_devs = devs.nr;
+
+ int ctl_fd = bcachectl_open();
+ fsck_fd = ioctl(ctl_fd, BCH_IOCTL_FSCK_OFFLINE, fsck);
+kernel_fsck_err:
+ free(fsck);
+
+ darray_for_each(loopdevs, i)
+ loopdev_free(*i);
+ darray_exit(&loopdevs);
+
+ if (fsck_fd < 0 && kernel < 0)
+ goto userland_fsck;
+
+ if (fsck_fd < 0)
+ die("BCH_IOCTL_FSCK_OFFLINE error: %s", bch2_err_str(errno));
+
+ ret = splice_fd_to_stdinout(fsck_fd);
+ } else {
+userland_fsck:
+ printf("Running userspace offline fsck\n");
+ ret = bch2_parse_mount_opts(NULL, &opts, &parse_later, opts_str.buf);
+ if (ret)
+ return ret;
+
+ struct bch_fs *c = bch2_fs_open(devs.data, devs.nr, opts);
+ if (IS_ERR(c))
+ exit(8);
+
+ if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
+ fprintf(stderr, "%s: errors fixed\n", c->name);
+ ret |= 1;
+ }
+ if (test_bit(BCH_FS_error, &c->flags)) {
+ fprintf(stderr, "%s: still has errors\n", c->name);
+ ret |= 4;
+ }
+
+ bch2_fs_stop(c);
+ }
+
+ printbuf_exit(&opts_str);
+ return ret;
+}
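
The userspace path in cmd_fsck() above composes its exit status as a bitmask in the style of fsck(8): bit 0 (1) when errors were corrected, bit 2 (4) when errors remain, 8 when fsck itself could not run (bad arguments, filesystem would not open), and 16 for a usage error. The sketch below only illustrates how a caller might interpret such a status; the constant names are invented for the example and do not come from the tree.

#include <stdio.h>

#define FSCK_ERRORS_CORRECTED   1   /* ret |= 1: errors were fixed */
#define FSCK_ERRORS_UNCORRECTED 4   /* ret |= 4: filesystem still has errors */
#define FSCK_OPERATIONAL_ERROR  8   /* exit(8): could not open/check */
#define FSCK_USAGE_ERROR        16  /* exit(16): bad invocation */

static void report_fsck_status(int status)
{
	if (status == 0)
		printf("clean\n");
	if (status & FSCK_ERRORS_CORRECTED)
		printf("errors were corrected\n");
	if (status & FSCK_ERRORS_UNCORRECTED)
		printf("errors remain, manual attention needed\n");
	if (status & FSCK_OPERATIONAL_ERROR)
		printf("fsck itself failed to run\n");
}

int main(void)
{
	report_fsck_status(1 | 4);	/* e.g. fixed some errors, others remain */
	return 0;
}
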
diff --git a/c_src/cmd_fusemount.c b/c_src/cmd_fusemount.c
new file mode 100644
index 00000000..e5674b42
--- /dev/null
+++ b/c_src/cmd_fusemount.c
@@ -0,0 +1,1314 @@
+#ifdef BCACHEFS_FUSE
+
+#include <errno.h>
+#include <float.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <sys/statvfs.h>
+
+#include <fuse_lowlevel.h>
+
+#include "cmds.h"
+#include "libbcachefs.h"
+#include "tools-util.h"
+
+#include "libbcachefs/bcachefs.h"
+#include "libbcachefs/alloc_foreground.h"
+#include "libbcachefs/btree_iter.h"
+#include "libbcachefs/buckets.h"
+#include "libbcachefs/dirent.h"
+#include "libbcachefs/errcode.h"
+#include "libbcachefs/error.h"
+#include "libbcachefs/fs-common.h"
+#include "libbcachefs/inode.h"
+#include "libbcachefs/io_read.h"
+#include "libbcachefs/io_write.h"
+#include "libbcachefs/opts.h"
+#include "libbcachefs/super.h"
+
+/* mode_to_type(): */
+#include "libbcachefs/fs.h"
+
+#include <linux/dcache.h>
+
+/* XXX cut and pasted from fsck.c */
+#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+
+/* used by write_aligned function for waiting on bch2_write closure */
+struct write_aligned_op_t {
+ struct closure cl;
+
+ /* must be last: */
+ struct bch_write_op op;
+};
+
+
+static inline subvol_inum map_root_ino(u64 ino)
+{
+ return (subvol_inum) { 1, ino == 1 ? 4096 : ino };
+}
+
+static inline u64 unmap_root_ino(u64 ino)
+{
+ return ino == 4096 ? 1 : ino;
+}
+
+static struct stat inode_to_stat(struct bch_fs *c,
+ struct bch_inode_unpacked *bi)
+{
+ return (struct stat) {
+ .st_ino = unmap_root_ino(bi->bi_inum),
+ .st_size = bi->bi_size,
+ .st_mode = bi->bi_mode,
+ .st_uid = bi->bi_uid,
+ .st_gid = bi->bi_gid,
+ .st_nlink = bch2_inode_nlink_get(bi),
+ .st_rdev = bi->bi_dev,
+ .st_blksize = block_bytes(c),
+ .st_blocks = bi->bi_sectors,
+ .st_atim = bch2_time_to_timespec(c, bi->bi_atime),
+ .st_mtim = bch2_time_to_timespec(c, bi->bi_mtime),
+ .st_ctim = bch2_time_to_timespec(c, bi->bi_ctime),
+ };
+}
+
+static struct fuse_entry_param inode_to_entry(struct bch_fs *c,
+ struct bch_inode_unpacked *bi)
+{
+ return (struct fuse_entry_param) {
+ .ino = unmap_root_ino(bi->bi_inum),
+ .generation = bi->bi_generation,
+ .attr = inode_to_stat(c, bi),
+ .attr_timeout = DBL_MAX,
+ .entry_timeout = DBL_MAX,
+ };
+}
+
+static void bcachefs_fuse_init(void *arg, struct fuse_conn_info *conn)
+{
+ if (conn->capable & FUSE_CAP_WRITEBACK_CACHE) {
+ fuse_log(FUSE_LOG_DEBUG, "fuse_init: activating writeback\n");
+ conn->want |= FUSE_CAP_WRITEBACK_CACHE;
+ } else
+ fuse_log(FUSE_LOG_DEBUG, "fuse_init: writeback not capable\n");
+
+ //conn->want |= FUSE_CAP_POSIX_ACL;
+}
+
+static void bcachefs_fuse_destroy(void *arg)
+{
+ struct bch_fs *c = arg;
+
+ bch2_fs_stop(c);
+}
+
+static void bcachefs_fuse_lookup(fuse_req_t req, fuse_ino_t dir_ino,
+ const char *name)
+{
+ subvol_inum dir = map_root_ino(dir_ino);
+ struct bch_fs *c = fuse_req_userdata(req);
+ struct bch_inode_unpacked bi;
+ struct qstr qstr = QSTR(name);
+ subvol_inum inum;
+ int ret;
+
+ fuse_log(FUSE_LOG_DEBUG, "fuse_lookup(dir=%llu name=%s)\n",
+ dir.inum, name);
+
+ ret = bch2_inode_find_by_inum(c, dir, &bi);
+ if (ret) {
+ fuse_reply_err(req, -ret);
+ return;
+ }
+
+ struct bch_hash_info hash_info = bch2_hash_info_init(c, &bi);
+
+ ret = bch2_dirent_lookup(c, dir, &hash_info, &qstr, &inum);
+ if (ret) {
+ struct fuse_entry_param e = {
+ .attr_timeout = DBL_MAX,
+ .entry_timeout = DBL_MAX,
+ };
+ fuse_reply_entry(req, &e);
+ return;
+ }
+
+ ret = bch2_inode_find_by_inum(c, inum, &bi);
+ if (ret)
+ goto err;
+
+ fuse_log(FUSE_LOG_DEBUG, "fuse_lookup ret(inum=%llu)\n",
+ bi.bi_inum);
+
+ struct fuse_entry_param e = inode_to_entry(c, &bi);
+ fuse_reply_entry(req, &e);
+ return;
+err:
+ fuse_log(FUSE_LOG_DEBUG, "fuse_lookup error %i\n", ret);
+ fuse_reply_err(req, -ret);
+}
+
+static void bcachefs_fuse_getattr(fuse_req_t req, fuse_ino_t ino,
+ struct fuse_file_info *fi)
+{
+ subvol_inum inum = map_root_ino(ino);
+ struct bch_fs *c = fuse_req_userdata(req);
+ struct bch_inode_unpacked bi;
+ struct stat attr;
+
+ fuse_log(FUSE_LOG_DEBUG, "fuse_getattr(inum=%llu)\n", inum.inum);
+
+ int ret = bch2_inode_find_by_inum(c, inum, &bi);
+ if (ret) {
+ fuse_log(FUSE_LOG_DEBUG, "fuse_getattr error %i\n", ret);
+ fuse_reply_err(req, -ret);
+ return;
+ }
+
+ fuse_log(FUSE_LOG_DEBUG, "fuse_getattr success\n");
+
+ attr = inode_to_stat(c, &bi);
+ fuse_reply_attr(req, &attr, DBL_MAX);
+}
+
+static void bcachefs_fuse_setattr(fuse_req_t req, fuse_ino_t ino,
+ struct stat *attr, int to_set,
+ struct fuse_file_info *fi)
+{
+ struct bch_fs *c = fuse_req_userdata(req);
+ struct bch_inode_unpacked inode_u;
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ u64 now;
+ int ret;
+
+ subvol_inum inum = map_root_ino(ino);
+
+ fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_setattr(%llu, %x)\n", inum.inum, to_set);
+
+ trans = bch2_trans_get(c);
+retry:
+ bch2_trans_begin(trans);
+ now = bch2_current_time(c);
+
+ ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent);
+ if (ret)
+ goto err;
+
+ if (to_set & FUSE_SET_ATTR_MODE)
+ inode_u.bi_mode = attr->st_mode;
+ if (to_set & FUSE_SET_ATTR_UID)
+ inode_u.bi_uid = attr->st_uid;
+ if (to_set & FUSE_SET_ATTR_GID)
+ inode_u.bi_gid = attr->st_gid;
+ if (to_set & FUSE_SET_ATTR_SIZE)
+ inode_u.bi_size = attr->st_size;
+ if (to_set & FUSE_SET_ATTR_ATIME)
+ inode_u.bi_atime = timespec_to_bch2_time(c, attr->st_atim);
+ if (to_set & FUSE_SET_ATTR_MTIME)
+ inode_u.bi_mtime = timespec_to_bch2_time(c, attr->st_mtim);
+ if (to_set & FUSE_SET_ATTR_ATIME_NOW)
+ inode_u.bi_atime = now;
+ if (to_set & FUSE_SET_ATTR_MTIME_NOW)
+ inode_u.bi_mtime = now;
+ /* TODO: CTIME? */
+
+ ret = bch2_inode_write(trans, &iter, &inode_u) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ if (ret == -EINTR)
+ goto retry;
+
+ bch2_trans_put(trans);
+
+ if (!ret) {
+ *attr = inode_to_stat(c, &inode_u);
+ fuse_reply_attr(req, attr, DBL_MAX);
+ } else {
+ fuse_reply_err(req, -ret);
+ }
+}
+
+static int do_create(struct bch_fs *c, subvol_inum dir,
+ const char *name, mode_t mode, dev_t rdev,
+ struct bch_inode_unpacked *new_inode)
+{
+ struct qstr qstr = QSTR(name);
+ struct bch_inode_unpacked dir_u;
+ uid_t uid = 0;
+ gid_t gid = 0;
+
+ bch2_inode_init_early(c, new_inode);
+
+ return bch2_trans_commit_do(c, NULL, NULL, 0,
+ bch2_create_trans(trans,
+ dir, &dir_u,
+ new_inode, &qstr,
+ uid, gid, mode, rdev, NULL, NULL,
+ (subvol_inum) { 0 }, 0));
+}
+
+static void bcachefs_fuse_mknod(fuse_req_t req, fuse_ino_t dir_ino,
+ const char *name, mode_t mode,
+ dev_t rdev)
+{
+ subvol_inum dir = map_root_ino(dir_ino);
+ struct bch_fs *c = fuse_req_userdata(req);
+ struct bch_inode_unpacked new_inode;
+ int ret;
+
+ fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_mknod(%llu, %s, %x, %x)\n",
+ dir.inum, name, mode, rdev);
+
+ ret = do_create(c, dir, name, mode, rdev, &new_inode);
+ if (ret)
+ goto err;
+
+ struct fuse_entry_param e = inode_to_entry(c, &new_inode);
+ fuse_reply_entry(req, &e);
+ return;
+err:
+ fuse_reply_err(req, -ret);
+}
+
+static void bcachefs_fuse_mkdir(fuse_req_t req, fuse_ino_t dir,
+ const char *name, mode_t mode)
+{
+ fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_mkdir(%llu, %s, %x)\n",
+ dir, name, mode);
+
+ BUG_ON(mode & S_IFMT);
+
+ mode |= S_IFDIR;
+ bcachefs_fuse_mknod(req, dir, name, mode, 0);
+}
+
+static void bcachefs_fuse_unlink(fuse_req_t req, fuse_ino_t dir_ino,
+ const char *name)
+{
+ struct bch_fs *c = fuse_req_userdata(req);
+ struct bch_inode_unpacked dir_u, inode_u;
+ struct qstr qstr = QSTR(name);
+ subvol_inum dir = map_root_ino(dir_ino);
+
+ fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_unlink(%llu, %s)\n", dir.inum, name);
+
+ int ret = bch2_trans_commit_do(c, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc,
+ bch2_unlink_trans(trans, dir, &dir_u,
+ &inode_u, &qstr, false));
+
+ fuse_reply_err(req, -ret);
+}
+
+static void bcachefs_fuse_rmdir(fuse_req_t req, fuse_ino_t dir,
+ const char *name)
+{
+ fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_rmdir(%llu, %s)\n", dir, name);
+
+ bcachefs_fuse_unlink(req, dir, name);
+}
+
+static void bcachefs_fuse_rename(fuse_req_t req,
+ fuse_ino_t src_dir_ino, const char *srcname,
+ fuse_ino_t dst_dir_ino, const char *dstname,
+ unsigned flags)
+{
+ struct bch_fs *c = fuse_req_userdata(req);
+ struct bch_inode_unpacked dst_dir_u, src_dir_u;
+ struct bch_inode_unpacked src_inode_u, dst_inode_u;
+ struct qstr src_name = QSTR(srcname);
+ struct qstr dst_name = QSTR(dstname);
+ subvol_inum src_dir = map_root_ino(src_dir_ino);
+ subvol_inum dst_dir = map_root_ino(dst_dir_ino);
+ int ret;
+
+ fuse_log(FUSE_LOG_DEBUG,
+ "bcachefs_fuse_rename(%llu, %s, %llu, %s, %x)\n",
+ src_dir.inum, srcname, dst_dir.inum, dstname, flags);
+
+ /* XXX handle overwrites */
+ ret = bch2_trans_commit_do(c, NULL, NULL, 0,
+ bch2_rename_trans(trans,
+ src_dir, &src_dir_u,
+ dst_dir, &dst_dir_u,
+ &src_inode_u, &dst_inode_u,
+ &src_name, &dst_name,
+ BCH_RENAME));
+
+ fuse_reply_err(req, -ret);
+}
+
+static void bcachefs_fuse_link(fuse_req_t req, fuse_ino_t ino,
+ fuse_ino_t newparent_ino, const char *newname)
+{
+ struct bch_fs *c = fuse_req_userdata(req);
+ struct bch_inode_unpacked dir_u, inode_u;
+ struct qstr qstr = QSTR(newname);
+ subvol_inum newparent = map_root_ino(newparent_ino);
+ subvol_inum inum = map_root_ino(ino);
+ int ret;
+
+ fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_link(%llu, %llu, %s)\n",
+ inum.inum, newparent.inum, newname);
+
+ ret = bch2_trans_commit_do(c, NULL, NULL, 0,
+ bch2_link_trans(trans, newparent, &dir_u,
+ inum, &inode_u, &qstr));
+
+ if (!ret) {
+ struct fuse_entry_param e = inode_to_entry(c, &inode_u);
+ fuse_reply_entry(req, &e);
+ } else {
+ fuse_reply_err(req, -ret);
+ }
+}
+
+static void bcachefs_fuse_open(fuse_req_t req, fuse_ino_t inum,
+ struct fuse_file_info *fi)
+{
+ fi->direct_io = false;
+ fi->keep_cache = true;
+ fi->cache_readdir = true;
+
+ fuse_reply_open(req, fi);
+}
+
+static void userbio_init(struct bio *bio, struct bio_vec *bv,
+ void *buf, size_t size)
+{
+ bio_init(bio, NULL, bv, 1, 0);
+ bio->bi_iter.bi_size = size;
+ bv->bv_page = buf;
+ bv->bv_len = size;
+ bv->bv_offset = 0;
+}
+
+static int get_inode_io_opts(struct bch_fs *c, subvol_inum inum, struct bch_io_opts *opts)
+{
+ struct bch_inode_unpacked inode;
+ if (bch2_inode_find_by_inum(c, inum, &inode))
+ return -EINVAL;
+
+ bch2_inode_opts_get(opts, c, &inode);
+ return 0;
+}
+
+static void bcachefs_fuse_read_endio(struct bio *bio)
+{
+ closure_put(bio->bi_private);
+}
+
+
+static void bcachefs_fuse_write_endio(struct bch_write_op *op)
+{
+ struct write_aligned_op_t *w = container_of(op,struct write_aligned_op_t,op);
+ closure_put(&w->cl);
+}
+
+
+struct fuse_align_io {
+ off_t start;
+ size_t pad_start;
+ off_t end;
+ size_t pad_end;
+ size_t size;
+};
+
+/* Handle unaligned start and end */
+/* TODO: align to block_bytes, sector size, or page size? */
+static struct fuse_align_io align_io(const struct bch_fs *c, size_t size,
+ off_t offset)
+{
+ struct fuse_align_io align;
+
+ BUG_ON(offset < 0);
+
+ align.start = round_down(offset, block_bytes(c));
+ align.pad_start = offset - align.start;
+
+ off_t end = offset + size;
+ align.end = round_up(end, block_bytes(c));
+ align.pad_end = align.end - end;
+
+ align.size = align.end - align.start;
+
+ return align;
+}
+
+/*
+ * Given an aligned number of bytes transferred, figure out how many unaligned
+ * bytes were transferred.
+ */
+static size_t align_fix_up_bytes(const struct fuse_align_io *align,
+ size_t align_bytes)
+{
+ size_t bytes = 0;
+
+ if (align_bytes > align->pad_start) {
+ bytes = align_bytes - align->pad_start;
+ bytes = bytes > align->pad_end ? bytes - align->pad_end : 0;
+ }
+
+ return bytes;
+}
+
+/*
+ * Read aligned data.
+ */
+static int read_aligned(struct bch_fs *c, subvol_inum inum, size_t aligned_size,
+ off_t aligned_offset, void *buf)
+{
+ BUG_ON(aligned_size & (block_bytes(c) - 1));
+ BUG_ON(aligned_offset & (block_bytes(c) - 1));
+
+ struct bch_io_opts io_opts;
+ if (get_inode_io_opts(c, inum, &io_opts))
+ return -ENOENT;
+
+ struct bch_read_bio rbio;
+ struct bio_vec bv;
+ userbio_init(&rbio.bio, &bv, buf, aligned_size);
+ bio_set_op_attrs(&rbio.bio, REQ_OP_READ, REQ_SYNC);
+ rbio.bio.bi_iter.bi_sector = aligned_offset >> 9;
+
+ struct closure cl;
+ closure_init_stack(&cl);
+
+ closure_get(&cl);
+ rbio.bio.bi_end_io = bcachefs_fuse_read_endio;
+ rbio.bio.bi_private = &cl;
+
+ bch2_read(c, rbio_init(&rbio.bio, io_opts), inum);
+
+ closure_sync(&cl);
+
+ return -blk_status_to_errno(rbio.bio.bi_status);
+}
+
+static void bcachefs_fuse_read(fuse_req_t req, fuse_ino_t ino,
+ size_t size, off_t offset,
+ struct fuse_file_info *fi)
+{
+ subvol_inum inum = map_root_ino(ino);
+ struct bch_fs *c = fuse_req_userdata(req);
+
+ fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_read(%llu, %zd, %lld)\n",
+ inum.inum, size, offset);
+
+ /* Check inode size. */
+ struct bch_inode_unpacked bi;
+ int ret = bch2_inode_find_by_inum(c, inum, &bi);
+ if (ret) {
+ fuse_reply_err(req, -ret);
+ return;
+ }
+
+ off_t end = min_t(u64, bi.bi_size, offset + size);
+ if (end <= offset) {
+ fuse_reply_buf(req, NULL, 0);
+ return;
+ }
+ size = end - offset;
+
+ struct fuse_align_io align = align_io(c, size, offset);
+
+ void *buf = aligned_alloc(PAGE_SIZE, align.size);
+ if (!buf) {
+ fuse_reply_err(req, ENOMEM);
+ return;
+ }
+
+ ret = read_aligned(c, inum, align.size, align.start, buf);
+
+ if (likely(!ret))
+ fuse_reply_buf(req, buf + align.pad_start, size);
+ else
+ fuse_reply_err(req, -ret);
+
+ free(buf);
+}
+
+static int inode_update_times(struct bch_fs *c, subvol_inum inum)
+{
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bch_inode_unpacked inode_u;
+ int ret = 0;
+ u64 now;
+
+ trans = bch2_trans_get(c);
+retry:
+ bch2_trans_begin(trans);
+ now = bch2_current_time(c);
+
+ ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent);
+ if (ret)
+ goto err;
+
+ inode_u.bi_mtime = now;
+ inode_u.bi_ctime = now;
+
+ ret = bch2_inode_write(trans, &iter, &inode_u);
+ if (ret)
+ goto err;
+
+ ret = bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ if (ret == -EINTR)
+ goto retry;
+
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static int write_aligned(struct bch_fs *c, subvol_inum inum,
+ struct bch_io_opts io_opts, void *buf,
+ size_t aligned_size, off_t aligned_offset,
+ off_t new_i_size, size_t *written_out)
+{
+
+ struct write_aligned_op_t w = { 0 };
+ struct bch_write_op *op = &w.op;
+ struct bio_vec bv;
+
+ BUG_ON(aligned_size & (block_bytes(c) - 1));
+ BUG_ON(aligned_offset & (block_bytes(c) - 1));
+
+ *written_out = 0;
+
+ closure_init_stack(&w.cl);
+
+ bch2_write_op_init(op, c, io_opts); /* XXX reads from op?! */
+ op->write_point = writepoint_hashed(0);
+ op->nr_replicas = io_opts.data_replicas;
+ op->target = io_opts.foreground_target;
+ op->subvol = inum.subvol;
+ op->pos = POS(inum.inum, aligned_offset >> 9);
+ op->new_i_size = new_i_size;
+ op->end_io = bcachefs_fuse_write_endio;
+
+ userbio_init(&op->wbio.bio, &bv, buf, aligned_size);
+ bio_set_op_attrs(&op->wbio.bio, REQ_OP_WRITE, REQ_SYNC);
+
+ if (bch2_disk_reservation_get(c, &op->res, aligned_size >> 9,
+ op->nr_replicas, 0)) {
+ /* XXX: use check_range_allocated like dio write path */
+ return -ENOSPC;
+ }
+
+ closure_get(&w.cl);
+
+ closure_call(&op->cl, bch2_write, NULL, NULL);
+
+ closure_sync(&w.cl);
+
+ if (!op->error)
+ *written_out = op->written << 9;
+
+ return op->error;
+}
+
+static void bcachefs_fuse_write(fuse_req_t req, fuse_ino_t ino,
+ const char *buf, size_t size,
+ off_t offset,
+ struct fuse_file_info *fi)
+{
+ subvol_inum inum = map_root_ino(ino);
+ struct bch_fs *c = fuse_req_userdata(req);
+ struct bch_io_opts io_opts;
+ size_t aligned_written;
+ int ret = 0;
+
+ fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_write(%llu, %zd, %lld)\n",
+ inum.inum, size, offset);
+
+ struct fuse_align_io align = align_io(c, size, offset);
+ void *aligned_buf = aligned_alloc(PAGE_SIZE, align.size);
+ BUG_ON(!aligned_buf);
+
+ if (get_inode_io_opts(c, inum, &io_opts)) {
+ ret = -ENOENT;
+ goto err;
+ }
+
+ /* Realign the data and read in start and end, if needed */
+
+ /* Read partial start data. */
+ if (align.pad_start) {
+ memset(aligned_buf, 0, block_bytes(c));
+
+ ret = read_aligned(c, inum, block_bytes(c), align.start,
+ aligned_buf);
+ if (ret)
+ goto err;
+ }
+
+ /*
+ * Read partial end data. If the whole write fits in one block, the
+ * start data and the end data are the same so this isn't needed.
+ */
+ if (align.pad_end &&
+ !(align.pad_start && align.size == block_bytes(c))) {
+ off_t partial_end_start = align.end - block_bytes(c);
+ size_t buf_offset = align.size - block_bytes(c);
+
+ memset(aligned_buf + buf_offset, 0, block_bytes(c));
+
+ ret = read_aligned(c, inum, block_bytes(c), partial_end_start,
+ aligned_buf + buf_offset);
+ if (ret)
+ goto err;
+ }
+
+ /* Overlay what we want to write. */
+ memcpy(aligned_buf + align.pad_start, buf, size);
+
+ /* Actually write. */
+ ret = write_aligned(c, inum, io_opts, aligned_buf,
+ align.size, align.start,
+ offset + size, &aligned_written);
+
+ /* Figure out how many unaligned bytes were written. */
+ size_t written = align_fix_up_bytes(&align, aligned_written);
+ BUG_ON(written > size);
+
+ fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_write: wrote %zd bytes\n",
+ written);
+
+ if (written > 0)
+ ret = 0;
+
+ /*
+ * Update inode times.
+ * TODO: Integrate with bch2_extent_update()
+ */
+ if (!ret)
+ ret = inode_update_times(c, inum);
+
+ if (!ret) {
+ BUG_ON(written == 0);
+ fuse_reply_write(req, written);
+ free(aligned_buf);
+ return;
+ }
+
+err:
+ fuse_reply_err(req, -ret);
+ free(aligned_buf);
+}
+
+static void bcachefs_fuse_symlink(fuse_req_t req, const char *link,
+ fuse_ino_t dir_ino, const char *name)
+{
+ subvol_inum dir = map_root_ino(dir_ino);
+ struct bch_fs *c = fuse_req_userdata(req);
+ struct bch_inode_unpacked new_inode;
+ size_t link_len = strlen(link);
+ int ret;
+
+ fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_symlink(%s, %llu, %s)\n",
+ link, dir.inum, name);
+
+ ret = do_create(c, dir, name, S_IFLNK|S_IRWXUGO, 0, &new_inode);
+ if (ret)
+ goto err;
+
+ struct bch_io_opts io_opts;
+ ret = get_inode_io_opts(c, dir, &io_opts);
+ if (ret)
+ goto err;
+
+ struct fuse_align_io align = align_io(c, link_len + 1, 0);
+
+ void *aligned_buf = aligned_alloc(PAGE_SIZE, align.size);
+ BUG_ON(!aligned_buf);
+
+ memset(aligned_buf, 0, align.size);
+ memcpy(aligned_buf, link, link_len); /* already terminated */
+
+ subvol_inum inum = (subvol_inum) { dir.subvol, new_inode.bi_inum };
+
+ size_t aligned_written;
+ ret = write_aligned(c, inum, io_opts, aligned_buf,
+ align.size, align.start, link_len + 1,
+ &aligned_written);
+ free(aligned_buf);
+
+ if (ret)
+ goto err;
+
+ size_t written = align_fix_up_bytes(&align, aligned_written);
+ BUG_ON(written != link_len + 1); // TODO: handle short
+
+ ret = inode_update_times(c, inum);
+ if (ret)
+ goto err;
+
+ new_inode.bi_size = written;
+
+ struct fuse_entry_param e = inode_to_entry(c, &new_inode);
+ fuse_reply_entry(req, &e);
+ return;
+
+err:
+ fuse_reply_err(req, -ret);
+}
+
+static void bcachefs_fuse_readlink(fuse_req_t req, fuse_ino_t ino)
+{
+ subvol_inum inum = map_root_ino(ino);
+ struct bch_fs *c = fuse_req_userdata(req);
+ char *buf = NULL;
+
+ fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_readlink(%llu)\n", inum.inum);
+
+ struct bch_inode_unpacked bi;
+ int ret = bch2_inode_find_by_inum(c, inum, &bi);
+ if (ret)
+ goto err;
+
+ struct fuse_align_io align = align_io(c, bi.bi_size, 0);
+
+ ret = -ENOMEM;
+ buf = aligned_alloc(PAGE_SIZE, align.size);
+ if (!buf)
+ goto err;
+
+ ret = read_aligned(c, inum, align.size, align.start, buf);
+ if (ret)
+ goto err;
+
+ BUG_ON(buf[align.size - 1] != 0);
+
+ fuse_reply_readlink(req, buf);
+
+err:
+ if (ret)
+ fuse_reply_err(req, -ret);
+
+ free(buf);
+}
+
+#if 0
+/*
+ * FUSE flush is essentially the close() call, however it is not guaranteed
+ * that one flush happens per open/create.
+ *
+ * It doesn't have to do anything, and is mostly relevant for NFS-style
+ * filesystems where close has some relationship to caching.
+ */
+static void bcachefs_fuse_flush(fuse_req_t req, fuse_ino_t inum,
+ struct fuse_file_info *fi)
+{
+ struct bch_fs *c = fuse_req_userdata(req);
+}
+
+static void bcachefs_fuse_release(fuse_req_t req, fuse_ino_t inum,
+ struct fuse_file_info *fi)
+{
+ struct bch_fs *c = fuse_req_userdata(req);
+}
+
+static void bcachefs_fuse_fsync(fuse_req_t req, fuse_ino_t inum, int datasync,
+ struct fuse_file_info *fi)
+{
+ struct bch_fs *c = fuse_req_userdata(req);
+}
+
+static void bcachefs_fuse_opendir(fuse_req_t req, fuse_ino_t inum,
+ struct fuse_file_info *fi)
+{
+ struct bch_fs *c = fuse_req_userdata(req);
+}
+#endif
+
+struct fuse_dir_context {
+ struct dir_context ctx;
+ fuse_req_t req;
+ char *buf;
+ size_t bufsize;
+};
+
+struct fuse_dirent {
+ uint64_t ino;
+ uint64_t off;
+ uint32_t namelen;
+ uint32_t type;
+ char name[];
+};
+
+#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name)
+#define FUSE_DIRENT_ALIGN(x) \
+ (((x) + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1))
+
+static size_t fuse_add_direntry2(char *buf, size_t bufsize,
+ const char *name, int namelen,
+ const struct stat *stbuf, off_t off)
+{
+ size_t entlen = FUSE_NAME_OFFSET + namelen;
+ size_t entlen_padded = FUSE_DIRENT_ALIGN(entlen);
+ struct fuse_dirent *dirent = (struct fuse_dirent *) buf;
+
+ if ((buf == NULL) || (entlen_padded > bufsize))
+ return entlen_padded;
+
+ dirent->ino = stbuf->st_ino;
+ dirent->off = off;
+ dirent->namelen = namelen;
+ dirent->type = (stbuf->st_mode & S_IFMT) >> 12;
+ memcpy(dirent->name, name, namelen);
+ memset(dirent->name + namelen, 0, entlen_padded - entlen);
+
+ return entlen_padded;
+}
+
+static int fuse_filldir(struct dir_context *_ctx,
+ const char *name, int namelen,
+ loff_t pos, u64 ino, unsigned type)
+{
+ struct fuse_dir_context *ctx =
+ container_of(_ctx, struct fuse_dir_context, ctx);
+
+ struct stat statbuf = {
+ .st_ino = unmap_root_ino(ino),
+ .st_mode = type << 12,
+ };
+
+ fuse_log(FUSE_LOG_DEBUG, "fuse_filldir(name=%s inum=%llu pos=%llu)\n",
+ name, statbuf.st_ino, pos);
+
+ size_t len = fuse_add_direntry2(ctx->buf,
+ ctx->bufsize,
+ name,
+ namelen,
+ &statbuf,
+ pos + 1);
+
+ if (len > ctx->bufsize)
+ return -1;
+
+ ctx->buf += len;
+ ctx->bufsize -= len;
+
+ return 0;
+}
+
+static bool handle_dots(struct fuse_dir_context *ctx, fuse_ino_t dir)
+{
+ if (ctx->ctx.pos == 0) {
+ if (fuse_filldir(&ctx->ctx, ".", 1, ctx->ctx.pos,
+ dir, DT_DIR) < 0)
+ return false;
+ ctx->ctx.pos = 1;
+ }
+
+ if (ctx->ctx.pos == 1) {
+ if (fuse_filldir(&ctx->ctx, "..", 2, ctx->ctx.pos,
+ /*TODO: parent*/ 1, DT_DIR) < 0)
+ return false;
+ ctx->ctx.pos = 2;
+ }
+
+ return true;
+}
+
+static void bcachefs_fuse_readdir(fuse_req_t req, fuse_ino_t dir_ino,
+ size_t size, off_t off,
+ struct fuse_file_info *fi)
+{
+ subvol_inum dir = map_root_ino(dir_ino);
+ struct bch_fs *c = fuse_req_userdata(req);
+ struct bch_inode_unpacked bi;
+ char *buf = calloc(size, 1);
+ struct fuse_dir_context ctx = {
+ .ctx.actor = fuse_filldir,
+ .ctx.pos = off,
+ .req = req,
+ .buf = buf,
+ .bufsize = size,
+ };
+ int ret = 0;
+
+ fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_readdir(dir=%llu, size=%zu, "
+ "off=%lld)\n", dir.inum, size, off);
+
+ ret = bch2_inode_find_by_inum(c, dir, &bi);
+ if (ret)
+ goto reply;
+
+ if (!S_ISDIR(bi.bi_mode)) {
+ ret = -ENOTDIR;
+ goto reply;
+ }
+
+ if (!handle_dots(&ctx, dir.inum))
+ goto reply;
+
+ ret = bch2_readdir(c, dir, &ctx.ctx);
+reply:
+ if (!ret) {
+ fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_readdir reply %zd\n",
+ ctx.buf - buf);
+ fuse_reply_buf(req, buf, ctx.buf - buf);
+ } else {
+ fuse_reply_err(req, -ret);
+ }
+
+ free(buf);
+}
+
+#if 0
+static void bcachefs_fuse_readdirplus(fuse_req_t req, fuse_ino_t dir,
+ size_t size, off_t off,
+ struct fuse_file_info *fi)
+{
+
+}
+
+static void bcachefs_fuse_releasedir(fuse_req_t req, fuse_ino_t inum,
+ struct fuse_file_info *fi)
+{
+ struct bch_fs *c = fuse_req_userdata(req);
+}
+
+static void bcachefs_fuse_fsyncdir(fuse_req_t req, fuse_ino_t inum, int datasync,
+ struct fuse_file_info *fi)
+{
+ struct bch_fs *c = fuse_req_userdata(req);
+}
+#endif
+
+static void bcachefs_fuse_statfs(fuse_req_t req, fuse_ino_t inum)
+{
+ struct bch_fs *c = fuse_req_userdata(req);
+ struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
+ unsigned shift = c->block_bits;
+ struct statvfs statbuf = {
+ .f_bsize = block_bytes(c),
+ .f_frsize = block_bytes(c),
+ .f_blocks = usage.capacity >> shift,
+ .f_bfree = (usage.capacity - usage.used) >> shift,
+ //.f_bavail = statbuf.f_bfree,
+ .f_files = usage.nr_inodes,
+ .f_ffree = U64_MAX,
+ .f_namemax = BCH_NAME_MAX,
+ };
+
+ fuse_reply_statfs(req, &statbuf);
+}
+
+#if 0
+static void bcachefs_fuse_setxattr(fuse_req_t req, fuse_ino_t inum,
+ const char *name, const char *value,
+ size_t size, int flags)
+{
+ struct bch_fs *c = fuse_req_userdata(req);
+}
+
+static void bcachefs_fuse_getxattr(fuse_req_t req, fuse_ino_t inum,
+ const char *name, size_t size)
+{
+ struct bch_fs *c = fuse_req_userdata(req);
+
+ fuse_reply_xattr(req, );
+}
+
+static void bcachefs_fuse_listxattr(fuse_req_t req, fuse_ino_t inum, size_t size)
+{
+ struct bch_fs *c = fuse_req_userdata(req);
+}
+
+static void bcachefs_fuse_removexattr(fuse_req_t req, fuse_ino_t inum,
+ const char *name)
+{
+ struct bch_fs *c = fuse_req_userdata(req);
+}
+#endif
+
+static void bcachefs_fuse_create(fuse_req_t req, fuse_ino_t dir_ino,
+ const char *name, mode_t mode,
+ struct fuse_file_info *fi)
+{
+ subvol_inum dir = map_root_ino(dir_ino);
+ struct bch_fs *c = fuse_req_userdata(req);
+ struct bch_inode_unpacked new_inode;
+ int ret;
+
+ fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_create(%llu, %s, %x)\n",
+ dir.inum, name, mode);
+
+ ret = do_create(c, dir, name, mode, 0, &new_inode);
+ if (ret)
+ goto err;
+
+ struct fuse_entry_param e = inode_to_entry(c, &new_inode);
+ fuse_reply_create(req, &e, fi);
+ return;
+err:
+ fuse_reply_err(req, -ret);
+}
+
+#if 0
+static void bcachefs_fuse_write_buf(fuse_req_t req, fuse_ino_t inum,
+ struct fuse_bufvec *bufv, off_t off,
+ struct fuse_file_info *fi)
+{
+ struct bch_fs *c = fuse_req_userdata(req);
+}
+
+static void bcachefs_fuse_fallocate(fuse_req_t req, fuse_ino_t inum, int mode,
+ off_t offset, off_t length,
+ struct fuse_file_info *fi)
+{
+ struct bch_fs *c = fuse_req_userdata(req);
+}
+#endif
+
+static const struct fuse_lowlevel_ops bcachefs_fuse_ops = {
+ .init = bcachefs_fuse_init,
+ .destroy = bcachefs_fuse_destroy,
+ .lookup = bcachefs_fuse_lookup,
+ .getattr = bcachefs_fuse_getattr,
+ .setattr = bcachefs_fuse_setattr,
+ .readlink = bcachefs_fuse_readlink,
+ .mknod = bcachefs_fuse_mknod,
+ .mkdir = bcachefs_fuse_mkdir,
+ .unlink = bcachefs_fuse_unlink,
+ .rmdir = bcachefs_fuse_rmdir,
+ .symlink = bcachefs_fuse_symlink,
+ .rename = bcachefs_fuse_rename,
+ .link = bcachefs_fuse_link,
+ .open = bcachefs_fuse_open,
+ .read = bcachefs_fuse_read,
+ .write = bcachefs_fuse_write,
+ //.flush = bcachefs_fuse_flush,
+ //.release = bcachefs_fuse_release,
+ //.fsync = bcachefs_fuse_fsync,
+ //.opendir = bcachefs_fuse_opendir,
+ .readdir = bcachefs_fuse_readdir,
+ //.readdirplus = bcachefs_fuse_readdirplus,
+ //.releasedir = bcachefs_fuse_releasedir,
+ //.fsyncdir = bcachefs_fuse_fsyncdir,
+ .statfs = bcachefs_fuse_statfs,
+ //.setxattr = bcachefs_fuse_setxattr,
+ //.getxattr = bcachefs_fuse_getxattr,
+ //.listxattr = bcachefs_fuse_listxattr,
+ //.removexattr = bcachefs_fuse_removexattr,
+ .create = bcachefs_fuse_create,
+
+ /* posix locks: */
+#if 0
+ .getlk = bcachefs_fuse_getlk,
+ .setlk = bcachefs_fuse_setlk,
+#endif
+ //.write_buf = bcachefs_fuse_write_buf,
+ //.fallocate = bcachefs_fuse_fallocate,
+
+};
+
+/*
+ * Setup and command parsing.
+ */
+
+struct bf_context {
+ char *devices_str;
+ char **devices;
+ int nr_devices;
+};
+
+static void bf_context_free(struct bf_context *ctx)
+{
+ int i;
+
+ free(ctx->devices_str);
+ for (i = 0; i < ctx->nr_devices; ++i)
+ free(ctx->devices[i]);
+ free(ctx->devices);
+}
+
+static struct fuse_opt bf_opts[] = {
+ FUSE_OPT_END
+};
+
+/*
+ * Fuse option parsing helper -- returning 0 means we consumed the argument, 1
+ * means we did not.
+ */
+static int bf_opt_proc(void *data, const char *arg, int key,
+ struct fuse_args *outargs)
+{
+ struct bf_context *ctx = data;
+
+ switch (key) {
+ case FUSE_OPT_KEY_NONOPT:
+ /* Just extract the first non-option string. */
+ if (!ctx->devices_str) {
+ ctx->devices_str = strdup(arg);
+ return 0;
+ }
+ return 1;
+ }
+
+ return 1;
+}
+
+/*
+ * dev1:dev2 -> [ dev1, dev2 ]
+ * dev -> [ dev ]
+ */
+static void tokenize_devices(struct bf_context *ctx)
+{
+ char *devices_str = strdup(ctx->devices_str);
+ char *devices_tmp = devices_str;
+ char **devices = NULL;
+ int nr = 0;
+ char *dev = NULL;
+
+ while ((dev = strsep(&devices_tmp, ":"))) {
+ if (strlen(dev) > 0) {
+ devices = realloc(devices, (nr + 1) * sizeof *devices);
+ devices[nr] = strdup(dev);
+ nr++;
+ }
+ }
+
+ if (!devices) {
+ devices = malloc(sizeof *devices);
+ devices[0] = strdup(ctx->devices_str);
+ nr = 1;
+ }
+
+ ctx->devices = devices;
+ ctx->nr_devices = nr;
+
+ free(devices_str);
+}
+
+static void usage(char *argv[])
+{
+ printf("Usage: %s fusemount [options] <dev>[:dev2:...] <mountpoint>\n",
+ argv[0]);
+ printf("\n");
+}
+
+int cmd_fusemount(int argc, char *argv[])
+{
+ struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
+ struct bch_opts bch_opts = bch2_opts_empty();
+ struct bf_context ctx = { 0 };
+ struct bch_fs *c = NULL;
+ struct fuse_session *se = NULL;
+ int ret = 0, i;
+
+ /* Parse arguments. */
+ if (fuse_opt_parse(&args, &ctx, bf_opts, bf_opt_proc) < 0)
+ die("fuse_opt_parse err: %m");
+
+ struct fuse_cmdline_opts fuse_opts;
+ if (fuse_parse_cmdline(&args, &fuse_opts) < 0)
+ die("fuse_parse_cmdline err: %m");
+
+ if (fuse_opts.show_help) {
+ usage(argv);
+ fuse_cmdline_help();
+ fuse_lowlevel_help();
+ ret = 0;
+ goto out;
+ }
+ if (fuse_opts.show_version) {
+ printf("FUSE library version %s\n", fuse_pkgversion());
+ fuse_lowlevel_version();
+ printf("bcachefs version: %s\n", VERSION_STRING);
+ ret = 0;
+ goto out;
+ }
+ if (!fuse_opts.mountpoint) {
+ usage(argv);
+ printf("Please supply a mountpoint.\n");
+ ret = 1;
+ goto out;
+ }
+ if (!ctx.devices_str) {
+ usage(argv);
+ printf("Please specify a device or device1:device2:...\n");
+ ret = 1;
+ goto out;
+ }
+ tokenize_devices(&ctx);
+
+ struct printbuf fsname = PRINTBUF;
+ prt_printf(&fsname, "fsname=");
+ for (i = 0; i < ctx.nr_devices; ++i) {
+ if (i)
+ prt_str(&fsname, ":");
+ prt_str(&fsname, ctx.devices[i]);
+ }
+
+ fuse_opt_add_arg(&args, "-o");
+ fuse_opt_add_arg(&args, fsname.buf);
+
+ /* Open bch */
+ printf("Opening bcachefs filesystem on:\n");
+ for (i = 0; i < ctx.nr_devices; ++i)
+ printf("\t%s\n", ctx.devices[i]);
+
+ c = bch2_fs_open(ctx.devices, ctx.nr_devices, bch_opts);
+ if (IS_ERR(c))
+ die("error opening %s: %s", ctx.devices_str,
+ bch2_err_str(PTR_ERR(c)));
+
+ /* Fuse */
+ se = fuse_session_new(&args, &bcachefs_fuse_ops,
+ sizeof(bcachefs_fuse_ops), c);
+ if (!se) {
+ fprintf(stderr, "fuse_lowlevel_new err: %m\n");
+ goto err;
+ }
+
+ if (fuse_set_signal_handlers(se) < 0) {
+ fprintf(stderr, "fuse_set_signal_handlers err: %m\n");
+ goto err;
+ }
+
+ if (fuse_session_mount(se, fuse_opts.mountpoint)) {
+ fprintf(stderr, "fuse_mount err: %m\n");
+ goto err;
+ }
+
+ /* This print statement is a trigger for tests. */
+ printf("Fuse mount initialized.\n");
+
+ if (fuse_opts.foreground == 0){
+ printf("Fuse forcing to foreground mode, due gcc constructors usage.\n");
+ fuse_opts.foreground = 1;
+ }
+
+ fuse_daemonize(fuse_opts.foreground);
+
+ ret = fuse_session_loop(se);
+
+out:
+ if (se) {
+ fuse_session_unmount(se);
+ fuse_remove_signal_handlers(se);
+ fuse_session_destroy(se);
+ }
+
+ free(fuse_opts.mountpoint);
+ fuse_opt_free_args(&args);
+ bf_context_free(&ctx);
+
+ return ret ? 1 : 0;
+
+err:
+ bch2_fs_stop(c);
+ goto out;
+}
+
+#endif /* BCACHEFS_FUSE */
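
The FUSE read and write paths above cannot issue unaligned I/O directly: align_io() rounds the requested [offset, offset + size) range out to block boundaries and records how much padding was added at each end, and align_fix_up_bytes() later converts the aligned transfer count back into user-visible bytes. The following is a minimal standalone version of that arithmetic, not from the tree, using a hard-coded 4096-byte block size in place of block_bytes(c).

#include <stdio.h>
#include <stddef.h>

#define BLOCK 4096u

#define ROUND_DOWN(x, a) ((x) / (a) * (a))
#define ROUND_UP(x, a)   (ROUND_DOWN((x) + (a) - 1, (a)))

struct align {
	unsigned long long start, end;	/* aligned range */
	size_t pad_start, pad_end;	/* padding added at each end */
	size_t size;			/* aligned transfer size */
};

static struct align align_range(unsigned long long offset, size_t size)
{
	struct align a;

	a.start     = ROUND_DOWN(offset, BLOCK);
	a.pad_start = offset - a.start;
	a.end       = ROUND_UP(offset + size, BLOCK);
	a.pad_end   = a.end - (offset + size);
	a.size      = a.end - a.start;
	return a;
}

int main(void)
{
	/* A 3000 byte write at offset 1000 becomes one aligned 4096 byte
	 * block: 1000 bytes of leading pad, 96 bytes of trailing pad. */
	struct align a = align_range(1000, 3000);

	printf("start=%llu size=%zu pad_start=%zu pad_end=%zu\n",
	       a.start, a.size, a.pad_start, a.pad_end);
	return 0;
}
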
diff --git a/cmd_key.c b/c_src/cmd_key.c
index 6052cb00..adb0ac8d 100644
--- a/cmd_key.c
+++ b/c_src/cmd_key.c
@@ -1,4 +1,5 @@
#include <errno.h>
+#include <fcntl.h>
#include <unistd.h>
#include <uuid/uuid.h>
@@ -6,6 +7,7 @@
#include "libbcachefs/checksum.h"
#include "crypto.h"
#include "libbcachefs.h"
+#include "tools-util.h"
static void unlock_usage(void)
{
@@ -14,20 +16,33 @@ static void unlock_usage(void)
"\n"
"Options:\n"
" -c Check if a device is encrypted\n"
+ " -k (session|user|user_session)\n"
+ " Keyring to add to (default: user)\n"
+ " -f Passphrase file to read from (disables passphrase prompt)\n"
" -h Display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
}
int cmd_unlock(int argc, char *argv[])
{
+ const char *keyring = "user";
bool check = false;
+ const char *passphrase_file_path = NULL;
+ char *passphrase = NULL;
+
int opt;
- while ((opt = getopt(argc, argv, "ch")) != -1)
+ while ((opt = getopt(argc, argv, "cf:k:h")) != -1)
switch (opt) {
case 'c':
check = true;
break;
+ case 'k':
+ keyring = strdup(optarg);
+ break;
+ case 'f':
+ passphrase_file_path = strdup(optarg);
+ break;
case 'h':
unlock_usage();
exit(EXIT_SUCCESS);
@@ -49,17 +64,20 @@ int cmd_unlock(int argc, char *argv[])
struct bch_sb_handle sb;
int ret = bch2_read_super(dev, &opts, &sb);
if (ret)
- die("Error opening %s: %s", dev, strerror(-ret));
+ die("Error opening %s: %s", dev, bch2_err_str(ret));
if (!bch2_sb_is_encrypted(sb.sb))
die("%s is not encrypted", dev);
if (check)
exit(EXIT_SUCCESS);
+ if (passphrase_file_path){
+ passphrase = read_file_str(AT_FDCWD, passphrase_file_path);
+ } else {
+ passphrase = read_passphrase("Enter passphrase: ");
+ }
- char *passphrase = read_passphrase("Enter passphrase: ");
-
- bch2_add_key(sb.sb, passphrase);
+ bch2_add_key(sb.sb, "user", keyring, passphrase);
bch2_free_super(&sb);
memzero_explicit(passphrase, strlen(passphrase));
@@ -84,9 +102,9 @@ int cmd_set_passphrase(int argc, char *argv[])
c = bch2_fs_open(argv + 1, argc - 1, opts);
if (IS_ERR(c))
- die("Error opening %s: %s", argv[1], strerror(-PTR_ERR(c)));
+ die("Error opening %s: %s", argv[1], bch2_err_str(PTR_ERR(c)));
- struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb.sb);
+ struct bch_sb_field_crypt *crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
if (!crypt)
die("Filesystem does not have encryption enabled");
@@ -105,6 +123,7 @@ int cmd_set_passphrase(int argc, char *argv[])
die("error encrypting key");
crypt->key = new_key;
+ bch2_revoke_key(c->disk_sb.sb);
bch2_write_super(c);
bch2_fs_stop(c);
return 0;
@@ -121,9 +140,9 @@ int cmd_remove_passphrase(int argc, char *argv[])
opt_set(opts, nostart, true);
c = bch2_fs_open(argv + 1, argc - 1, opts);
if (IS_ERR(c))
- die("Error opening %s: %s", argv[1], strerror(-PTR_ERR(c)));
+ die("Error opening %s: %s", argv[1], bch2_err_str(PTR_ERR(c)));
- struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb.sb);
+ struct bch_sb_field_crypt *crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
if (!crypt)
die("Filesystem does not have encryption enabled");
diff --git a/c_src/cmd_kill_btree_node.c b/c_src/cmd_kill_btree_node.c
new file mode 100644
index 00000000..c8f43150
--- /dev/null
+++ b/c_src/cmd_kill_btree_node.c
@@ -0,0 +1,140 @@
+#include <fcntl.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "cmds.h"
+#include "libbcachefs.h"
+#include "tools-util.h"
+
+#include "libbcachefs/bcachefs.h"
+#include "libbcachefs/btree_iter.h"
+#include "libbcachefs/errcode.h"
+#include "libbcachefs/error.h"
+#include "libbcachefs/sb-members.h"
+#include "libbcachefs/super.h"
+
+static void kill_btree_node_usage(void)
+{
+ puts("bcachefs kill_btree_node - make btree nodes unreadable\n"
+ "Usage: bcachefs kill_btree_node [OPTION]... <devices>\n"
+ "\n"
+ "Options:\n"
+ " -b (extents|inodes|dirents|xattrs) Btree to delete from\n"
+ " -l level Levle to delete from (0 == leaves)\n"
+ " -i index Index of btree node to kill\n"
+ " -h Display this help and exit\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+struct kill_node {
+ unsigned btree;
+ unsigned level;
+ u64 idx;
+};
+
+int cmd_kill_btree_node(int argc, char *argv[])
+{
+ struct bch_opts opts = bch2_opts_empty();
+ DARRAY(struct kill_node) kill_nodes = {};
+ int opt;
+
+ opt_set(opts, read_only, true);
+
+ while ((opt = getopt(argc, argv, "n:h")) != -1)
+ switch (opt) {
+ case 'n': {
+ char *p = optarg;
+ const char *str_btree = strsep(&p, ":");
+ const char *str_level = strsep(&p, ":");
+ const char *str_idx = strsep(&p, ":");
+
+ struct kill_node n = {
+ .btree = read_string_list_or_die(str_btree,
+ __bch2_btree_ids, "btree id"),
+ };
+
+ if (str_level &&
+ (kstrtouint(str_level, 10, &n.level) || n.level >= BTREE_MAX_DEPTH))
+ die("invalid level");
+
+ if (str_idx &&
+ kstrtoull(str_idx, 10, &n.idx))
+ die("invalid index %s", str_idx);
+
+ darray_push(&kill_nodes, n);
+ break;
+ }
+ case 'h':
+ kill_btree_node_usage();
+ exit(EXIT_SUCCESS);
+ }
+ args_shift(optind);
+
+ if (!argc)
+ die("Please supply device(s)");
+
+ struct bch_fs *c = bch2_fs_open(argv, argc, opts);
+ if (IS_ERR(c))
+ die("error opening %s: %s", argv[0], bch2_err_str(PTR_ERR(c)));
+
+ int ret;
+ void *zeroes;
+
+ ret = posix_memalign(&zeroes, c->opts.block_size, c->opts.block_size);
+ if (ret)
+ die("error %s from posix_memalign", bch2_err_str(ret));
+
+ struct btree_trans *trans = bch2_trans_get(c);
+
+ darray_for_each(kill_nodes, i) {
+ ret = __for_each_btree_node(trans, iter, i->btree, POS_MIN, 0, i->level, 0, b, ({
+ if (b->c.level != i->level)
+ continue;
+
+ int ret2 = 0;
+ if (!i->idx) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ bch_info(c, "killing btree node %s l=%u %s",
+ bch2_btree_id_str(i->btree), i->level, buf.buf);
+ printbuf_exit(&buf);
+
+ ret2 = 1;
+
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key));
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
+ if (!ca)
+ continue;
+
+ int ret3 = pwrite(ca->disk_sb.bdev->bd_fd, zeroes,
+ c->opts.block_size, ptr->offset << 9);
+ bch2_dev_put(ca);
+ if (ret3 != c->opts.block_size) {
+ bch_err(c, "pwrite error: expected %u got %i %s",
+ c->opts.block_size, ret, strerror(errno));
+ ret2 = EXIT_FAILURE;
+ }
+ }
+ }
+
+ i->idx--;
+ ret2;
+ }));
+
+ if (ret < 0) {
+ bch_err(c, "error %i walking btree nodes", ret);
+ break;
+ } else if (!ret) {
+ bch_err(c, "node at specified index not found");
+ ret = EXIT_FAILURE;
+ break;
+ }
+ }
+
+ bch2_trans_put(trans);
+ bch2_fs_stop(c);
+ darray_exit(&kill_nodes);
+ return ret < 0 ? ret : 0;
+}
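
The -n argument above takes a btree:level:index triple, split on ':' with strsep(); the level and index fields are optional and default to zero. Here is a small standalone sketch of the same parsing, not from the tree: the btree name is kept as a string rather than resolved through read_string_list_or_die() as the tool does.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct kill_spec {
	char			*btree;
	unsigned		 level;
	unsigned long long	 idx;
};

/* Parse "btree[:level[:idx]]", e.g. "extents:0:3". Missing fields default
 * to zero, mirroring the -n handling in cmd_kill_btree_node(). */
static struct kill_spec parse_kill_spec(const char *arg)
{
	char *s = strdup(arg), *p = s;
	struct kill_spec n = { 0 };

	const char *str_btree = strsep(&p, ":");
	const char *str_level = strsep(&p, ":");
	const char *str_idx   = strsep(&p, ":");

	n.btree = strdup(str_btree);
	if (str_level)
		n.level = strtoul(str_level, NULL, 10);
	if (str_idx)
		n.idx = strtoull(str_idx, NULL, 10);

	free(s);
	return n;
}

int main(void)
{
	struct kill_spec n = parse_kill_spec("extents:0:3");

	printf("btree=%s level=%u idx=%llu\n", n.btree, n.level, n.idx);
	free(n.btree);
	return 0;
}
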
diff --git a/c_src/cmd_list_journal.c b/c_src/cmd_list_journal.c
new file mode 100644
index 00000000..fe7f9b05
--- /dev/null
+++ b/c_src/cmd_list_journal.c
@@ -0,0 +1,306 @@
+#include <fcntl.h>
+#include <getopt.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "cmds.h"
+#include "libbcachefs.h"
+#include "tools-util.h"
+
+#include "libbcachefs/bcachefs.h"
+#include "libbcachefs/btree_iter.h"
+#include "libbcachefs/errcode.h"
+#include "libbcachefs/error.h"
+#include "libbcachefs/journal_io.h"
+#include "libbcachefs/journal_seq_blacklist.h"
+#include "libbcachefs/super.h"
+
+static const char *NORMAL = "\x1B[0m";
+static const char *RED = "\x1B[31m";
+
+static void list_journal_usage(void)
+{
+ puts("bcachefs list_journal - print contents of journal\n"
+ "Usage: bcachefs list_journal [OPTION]... <devices>\n"
+ "\n"
+ "Options:\n"
+ " -a Read entire journal, not just dirty entries\n"
+	     " -n, --nr-entries=nr          Number of journal entries to print, starting from the most recent\n"
+	     " -m msg                       Skip transactions whose log message matches <msg>\n"
+ " -t, --transaction-filter=bbpos Filter transactions not updating <bbpos>\n"
+ " Or entries not matching the range <bbpos-bbpos>\n"
+ " -k, --key-filter=btree Filter keys not updating btree\n"
+ " -v, --verbose Verbose mode\n"
+ " -h, --help Display this help and exit\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+static void star_start_of_lines(char *buf)
+{
+ char *p = buf;
+
+ if (*p == ' ')
+ *p = '*';
+
+ while ((p = strstr(p, "\n ")))
+ p[1] = '*';
+}
+
+static inline bool entry_is_transaction_start(struct jset_entry *entry)
+{
+ return entry->type == BCH_JSET_ENTRY_log && !entry->level;
+}
+
+typedef DARRAY(struct bbpos_range) d_bbpos_range;
+typedef DARRAY(enum btree_id) d_btree_id;
+
+static bool bkey_matches_filter(d_bbpos_range filter, struct jset_entry *entry, struct bkey_i *k)
+{
+ darray_for_each(filter, i) {
+ struct bbpos k_start = BBPOS(entry->btree_id, bkey_start_pos(&k->k));
+ struct bbpos k_end = BBPOS(entry->btree_id, k->k.p);
+
+ if (bbpos_cmp(k_start, i->end) < 0 &&
+ bbpos_cmp(k_end, i->start) > 0)
+ return true;
+ }
+ return false;
+}
+
+static bool entry_matches_transaction_filter(struct jset_entry *entry,
+ d_bbpos_range filter)
+{
+ if (entry->type == BCH_JSET_ENTRY_btree_root ||
+ entry->type == BCH_JSET_ENTRY_btree_keys ||
+ entry->type == BCH_JSET_ENTRY_overwrite)
+ jset_entry_for_each_key(entry, k)
+ if (bkey_matches_filter(filter, entry, k))
+ return true;
+ return false;
+}
+
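+/*
+ * A transaction is printed unless its log message matches one of the message
+ * filters, or key filters were given and none of the transaction's entries
+ * touch a filtered range:
+ */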
+static bool should_print_transaction(struct jset_entry *entry, struct jset_entry *end,
+ darray_str msg_filter,
+ d_bbpos_range key_filter)
+{
+ struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
+ unsigned b = jset_entry_log_msg_bytes(l);
+
+ darray_for_each(msg_filter, i)
+ if (!strncmp(*i, l->d, b))
+ return false;
+
+ if (!key_filter.nr)
+ return true;
+
+ for (entry = vstruct_next(entry);
+ entry != end && !entry_is_transaction_start(entry);
+ entry = vstruct_next(entry))
+ if (entry_matches_transaction_filter(entry, key_filter))
+ return true;
+
+ return false;
+}
+
+static bool should_print_entry(struct jset_entry *entry, d_btree_id filter)
+{
+ if (!filter.nr)
+ return true;
+
+ if (entry->type != BCH_JSET_ENTRY_btree_root &&
+ entry->type != BCH_JSET_ENTRY_btree_keys &&
+ entry->type != BCH_JSET_ENTRY_overwrite)
+ return true;
+
+ jset_entry_for_each_key(entry, k)
+ darray_for_each(filter, id)
+ if (entry->btree_id == *id)
+ return true;
+
+ return false;
+}
+
+static void journal_entry_header_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct journal_replay *p, bool blacklisted)
+{
+ if (blacklisted)
+ prt_str(out, "blacklisted ");
+
+ prt_printf(out,
+ "journal entry %llu\n"
+ " version %u\n"
+ " last seq %llu\n"
+ " flush %u\n"
+ " written at ",
+ le64_to_cpu(p->j.seq),
+ le32_to_cpu(p->j.version),
+ le64_to_cpu(p->j.last_seq),
+ !JSET_NO_FLUSH(&p->j));
+ bch2_journal_ptrs_to_text(out, c, p);
+
+ if (blacklisted)
+ star_start_of_lines(out->buf);
+}
+
+static void journal_entry_header_print(struct bch_fs *c, struct journal_replay *p, bool blacklisted)
+{
+ struct printbuf buf = PRINTBUF;
+ journal_entry_header_to_text(&buf, c, p, blacklisted);
+ printf("%s\n", buf.buf);
+ printbuf_exit(&buf);
+}
+
+static void journal_entries_print(struct bch_fs *c, unsigned nr_entries,
+ darray_str transaction_msg_filter,
+ d_bbpos_range transaction_key_filter,
+ d_btree_id key_filter)
+{
+ struct journal_replay *p, **_p;
+ struct genradix_iter iter;
+ struct printbuf buf = PRINTBUF;
+
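+	/*
+	 * Walk the journal entries read at open time (read_journal_only);
+	 * entries more than nr_entries older than the newest sequence number
+	 * are skipped:
+	 */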
+ genradix_for_each(&c->journal_entries, iter, _p) {
+ bool printed_header = false;
+
+ p = *_p;
+ if (!p)
+ continue;
+
+ if (le64_to_cpu(p->j.seq) + nr_entries < atomic64_read(&c->journal.seq))
+ continue;
+
+ bool blacklisted = p->ignore_blacklisted ||
+ bch2_journal_seq_is_blacklisted(c,
+ le64_to_cpu(p->j.seq), false);
+
+ if (!transaction_msg_filter.nr &&
+ !transaction_key_filter.nr) {
+ journal_entry_header_print(c, p, blacklisted);
+ printed_header = true;
+ }
+
+ struct jset_entry *entry = p->j.start;
+ struct jset_entry *end = vstruct_last(&p->j);
+ while (entry != end) {
+
+ /*
+ * log entries denote the start of a new transaction
+ * commit:
+ */
+ if (entry_is_transaction_start(entry)) {
+ if (!should_print_transaction(entry, end,
+ transaction_msg_filter,
+ transaction_key_filter)) {
+ do {
+ entry = vstruct_next(entry);
+ } while (entry != end && !entry_is_transaction_start(entry));
+
+ continue;
+ }
+
+ prt_newline(&buf);
+ }
+
+ if (!should_print_entry(entry, key_filter))
+ goto next;
+
+ if (!printed_header)
+ journal_entry_header_print(c, p, blacklisted);
+ printed_header = true;
+
+ bool highlight = entry_matches_transaction_filter(entry, transaction_key_filter);
+ if (highlight)
+ fputs(RED, stdout);
+
+ printbuf_indent_add(&buf, 4);
+ bch2_journal_entry_to_text(&buf, c, entry);
+
+ if (blacklisted)
+ star_start_of_lines(buf.buf);
+ printf("%s\n", buf.buf);
+ printbuf_reset(&buf);
+
+ if (highlight)
+ fputs(NORMAL, stdout);
+next:
+ entry = vstruct_next(entry);
+ }
+ }
+
+ printbuf_exit(&buf);
+}
+
+int cmd_list_journal(int argc, char *argv[])
+{
+ static const struct option longopts[] = {
+ { "nr-entries", required_argument, NULL, 'n' },
+ { "transaction-filter", required_argument, NULL, 't' },
+ { "key-filter", required_argument, NULL, 'k' },
+ { "verbose", no_argument, NULL, 'v' },
+ { "help", no_argument, NULL, 'h' },
+ { NULL }
+ };
+ struct bch_opts opts = bch2_opts_empty();
+ u32 nr_entries = U32_MAX;
+ darray_str transaction_msg_filter = {};
+ d_bbpos_range transaction_key_filter = {};
+ d_btree_id key_filter = {};
+ int opt;
+
+ opt_set(opts, noexcl, true);
+ opt_set(opts, nochanges, true);
+ opt_set(opts, norecovery, true);
+ opt_set(opts, read_only, true);
+ opt_set(opts, degraded, true);
+ opt_set(opts, very_degraded, true);
+ opt_set(opts, errors, BCH_ON_ERROR_continue);
+ opt_set(opts, fix_errors, FSCK_FIX_yes);
+ opt_set(opts, retain_recovery_info ,true);
+ opt_set(opts, read_journal_only,true);
+
+ while ((opt = getopt_long(argc, argv, "an:m:t:k:vh",
+ longopts, NULL)) != -1)
+ switch (opt) {
+ case 'a':
+ opt_set(opts, read_entire_journal, true);
+ break;
+ case 'n':
+ if (kstrtouint(optarg, 10, &nr_entries))
+ die("error parsing nr_entries");
+ opt_set(opts, read_entire_journal, true);
+ break;
+ case 'm':
+ darray_push(&transaction_msg_filter, strdup(optarg));
+ break;
+ case 't':
+ darray_push(&transaction_key_filter, bbpos_range_parse(optarg));
+ break;
+ case 'k':
+ darray_push(&key_filter, read_string_list_or_die(optarg, __bch2_btree_ids, "btree id"));
+ break;
+ case 'v':
+ opt_set(opts, verbose, true);
+ break;
+ case 'h':
+ list_journal_usage();
+ exit(EXIT_SUCCESS);
+ }
+ args_shift(optind);
+
+ if (!argc)
+ die("Please supply device(s) to open");
+
+ darray_str devs = get_or_split_cmdline_devs(argc, argv);
+
+ struct bch_fs *c = bch2_fs_open(devs.data, devs.nr, opts);
+ if (IS_ERR(c))
+ die("error opening %s: %s", argv[0], bch2_err_str(PTR_ERR(c)));
+
+ journal_entries_print(c, nr_entries,
+ transaction_msg_filter,
+ transaction_key_filter,
+ key_filter);
+ bch2_fs_stop(c);
+ return 0;
+}
diff --git a/c_src/cmd_migrate.c b/c_src/cmd_migrate.c
new file mode 100644
index 00000000..a5b7786d
--- /dev/null
+++ b/c_src/cmd_migrate.c
@@ -0,0 +1,426 @@
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/vfs.h>
+#include <unistd.h>
+
+#include <linux/fiemap.h>
+#include <linux/fs.h>
+#include <linux/stat.h>
+
+#include <uuid/uuid.h>
+
+#include "cmds.h"
+#include "crypto.h"
+#include "libbcachefs.h"
+#include "posix_to_bcachefs.h"
+
+#include <linux/dcache.h>
+#include <linux/generic-radix-tree.h>
+#include "libbcachefs/bcachefs.h"
+#include "libbcachefs/btree_update.h"
+#include "libbcachefs/buckets.h"
+#include "libbcachefs/dirent.h"
+#include "libbcachefs/errcode.h"
+#include "libbcachefs/inode.h"
+#include "libbcachefs/replicas.h"
+#include "libbcachefs/super.h"
+
+/* XXX cut and pasted from fsck.c */
+#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+
+static char *dev_t_to_path(dev_t dev)
+{
+ char link[PATH_MAX], *p;
+ int ret;
+
+ char *sysfs_dev = mprintf("/sys/dev/block/%u:%u",
+ major(dev), minor(dev));
+ ret = readlink(sysfs_dev, link, sizeof(link));
+ free(sysfs_dev);
+
+ if (ret < 0 || ret >= sizeof(link))
+ die("readlink error while looking up block device: %m");
+
+ link[ret] = '\0';
+
+ p = strrchr(link, '/');
+ if (!p)
+ die("error looking up device name");
+ p++;
+
+ return mprintf("/dev/%s", p);
+}
+
+static bool path_is_fs_root(const char *path)
+{
+ char *line = NULL, *p, *mount;
+ size_t n = 0;
+ FILE *f;
+ bool ret = true;
+
+ f = fopen("/proc/self/mountinfo", "r");
+ if (!f)
+ die("Error getting mount information");
+
+ while (getline(&line, &n, f) != -1) {
+ p = line;
+
+ strsep(&p, " "); /* mount id */
+ strsep(&p, " "); /* parent id */
+ strsep(&p, " "); /* dev */
+ strsep(&p, " "); /* root */
+ mount = strsep(&p, " ");
+ strsep(&p, " ");
+
+ if (mount && !strcmp(path, mount))
+ goto found;
+ }
+
+ ret = false;
+found:
+ fclose(f);
+ free(line);
+ return ret;
+}
+
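+/*
+ * Everything outside the reserved extents still belongs to the filesystem
+ * being migrated: mark those buckets nouse so the allocator won't write over
+ * it:
+ */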
+static void mark_unreserved_space(struct bch_fs *c, ranges extents)
+{
+ struct bch_dev *ca = c->devs[0];
+ struct hole_iter iter;
+ struct range i;
+
+ for_each_hole(iter, extents, bucket_to_sector(ca, ca->mi.nbuckets) << 9, i) {
+ u64 b;
+
+ if (i.start == i.end)
+ return;
+
+ b = sector_to_bucket(ca, i.start >> 9);
+ do {
+ set_bit(b, ca->buckets_nouse);
+ b++;
+ } while (bucket_to_sector(ca, b) << 9 < i.end);
+ }
+}
+
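+/*
+ * Create (or, with -F, reuse) a big file on the filesystem being migrated,
+ * fallocate it, and use fiemap to find the physical extents backing it -
+ * that's the only space the new bcachefs filesystem is allowed to write to:
+ */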
+static ranges reserve_new_fs_space(const char *file_path, unsigned block_size,
+ u64 size, u64 *bcachefs_inum, dev_t dev,
+ bool force)
+{
+ int fd = force
+ ? open(file_path, O_RDWR|O_CREAT, 0600)
+ : open(file_path, O_RDWR|O_CREAT|O_EXCL, 0600);
+ if (fd < 0)
+ die("Error creating %s for bcachefs metadata: %m",
+ file_path);
+
+ struct stat statbuf = xfstat(fd);
+
+ if (statbuf.st_dev != dev)
+ die("bcachefs file has incorrect device");
+
+ *bcachefs_inum = statbuf.st_ino;
+
+ if (fallocate(fd, 0, 0, size))
+ die("Error reserving space for bcachefs metadata: %m");
+
+ fsync(fd);
+
+ struct fiemap_iter iter;
+ struct fiemap_extent e;
+ ranges extents = { 0 };
+
+ fiemap_for_each(fd, iter, e) {
+ if (e.fe_flags & (FIEMAP_EXTENT_UNKNOWN|
+ FIEMAP_EXTENT_ENCODED|
+ FIEMAP_EXTENT_NOT_ALIGNED|
+ FIEMAP_EXTENT_DATA_INLINE))
+ die("Unable to continue: metadata file not fully mapped");
+
+ if ((e.fe_physical & (block_size - 1)) ||
+ (e.fe_length & (block_size - 1)))
+ die("Unable to continue: unaligned extents in metadata file");
+
+ range_add(&extents, e.fe_physical, e.fe_length);
+ }
+ fiemap_iter_exit(&iter);
+ close(fd);
+
+ ranges_sort_merge(&extents);
+ return extents;
+}
+
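+/*
+ * Find a bucket aligned range within the reserved extents large enough for
+ * two superblocks:
+ */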
+static void find_superblock_space(ranges extents,
+ struct format_opts opts,
+ struct dev_opts *dev)
+{
+ darray_for_each(extents, i) {
+ u64 start = round_up(max(256ULL << 10, i->start),
+ dev->bucket_size << 9);
+ u64 end = round_down(i->end,
+ dev->bucket_size << 9);
+
+ /* Need space for two superblocks: */
+ if (start + (opts.superblock_size << 9) * 2 <= end) {
+ dev->sb_offset = start >> 9;
+ dev->sb_end = dev->sb_offset + opts.superblock_size * 2;
+ return;
+ }
+ }
+
+ die("Couldn't find a valid location for superblock");
+}
+
+static void migrate_usage(void)
+{
+ puts("bcachefs migrate - migrate an existing filesystem to bcachefs\n"
+ "Usage: bcachefs migrate [OPTION]...\n"
+ "\n"
+ "Options:\n"
+	     " -f fs                  Root of filesystem to migrate\n"
+ " --encrypted Enable whole filesystem encryption (chacha20/poly1305)\n"
+ " --no_passphrase Don't encrypt master encryption key\n"
+ " -F Force, even if metadata file already exists\n"
+ " -h Display this help and exit\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+static const struct option migrate_opts[] = {
+ { "encrypted", no_argument, NULL, 'e' },
+ { "no_passphrase", no_argument, NULL, 'p' },
+ { NULL }
+};
+
+static int migrate_fs(const char *fs_path,
+ struct bch_opt_strs fs_opt_strs,
+ struct bch_opts fs_opts,
+ struct format_opts format_opts,
+ bool force)
+{
+ if (!path_is_fs_root(fs_path))
+ die("%s is not a filesystem root", fs_path);
+
+ int fs_fd = xopen(fs_path, O_RDONLY|O_NOATIME);
+ struct stat stat = xfstat(fs_fd);
+
+ if (!S_ISDIR(stat.st_mode))
+ die("%s is not a directory", fs_path);
+
+ struct dev_opts dev = dev_opts_default();
+
+ dev.path = dev_t_to_path(stat.st_dev);
+ dev.file = bdev_file_open_by_path(dev.path, BLK_OPEN_READ|BLK_OPEN_WRITE, &dev, NULL);
+
+ int ret = PTR_ERR_OR_ZERO(dev.file);
+ if (ret < 0)
+ die("Error opening device to format %s: %s", dev.path, strerror(-ret));
+ dev.bdev = file_bdev(dev.file);
+
+ opt_set(fs_opts, block_size, get_blocksize(dev.bdev->bd_fd));
+
+ char *file_path = mprintf("%s/bcachefs", fs_path);
+ printf("Creating new filesystem on %s in space reserved at %s\n",
+ dev.path, file_path);
+
+ dev.size = get_size(dev.bdev->bd_fd);
+ dev.bucket_size = bch2_pick_bucket_size(fs_opts, &dev);
+ dev.nbuckets = dev.size / dev.bucket_size;
+
+ bch2_check_bucket_size(fs_opts, &dev);
+
+ u64 bcachefs_inum;
+ ranges extents = reserve_new_fs_space(file_path,
+ fs_opts.block_size >> 9,
+ get_size(dev.bdev->bd_fd) / 5,
+ &bcachefs_inum, stat.st_dev, force);
+
+ find_superblock_space(extents, format_opts, &dev);
+
+ struct bch_sb *sb = bch2_format(fs_opt_strs,
+ fs_opts, format_opts, &dev, 1);
+ u64 sb_offset = le64_to_cpu(sb->layout.sb_offset[0]);
+
+ if (format_opts.passphrase)
+ bch2_add_key(sb, "user", "user", format_opts.passphrase);
+
+ free(sb);
+
+ struct bch_opts opts = bch2_opts_empty();
+ struct bch_fs *c = NULL;
+ char *path[1] = { dev.path };
+
+ opt_set(opts, sb, sb_offset);
+ opt_set(opts, nostart, true);
+ opt_set(opts, noexcl, true);
+
+ c = bch2_fs_open(path, 1, opts);
+ if (IS_ERR(c))
+ die("Error opening new filesystem: %s", bch2_err_str(PTR_ERR(c)));
+
+ ret = bch2_buckets_nouse_alloc(c);
+ if (ret)
+ die("Error allocating buckets_nouse: %s", bch2_err_str(ret));
+
+ mark_unreserved_space(c, extents);
+
+ ret = bch2_fs_start(c);
+ if (ret)
+ die("Error starting new filesystem: %s", bch2_err_str(ret));
+
+ struct copy_fs_state s = {
+ .bcachefs_inum = bcachefs_inum,
+ .dev = stat.st_dev,
+ .extents = extents,
+ .type = BCH_MIGRATE_migrate,
+ };
+
+ copy_fs(c, fs_fd, fs_path, &s);
+
+ bch2_fs_stop(c);
+
+ printf("Migrate complete, running fsck:\n");
+ opt_set(opts, nostart, false);
+ opt_set(opts, nochanges, true);
+ opt_set(opts, read_only, true);
+
+ c = bch2_fs_open(path, 1, opts);
+ if (IS_ERR(c))
+ die("Error opening new filesystem: %s", bch2_err_str(PTR_ERR(c)));
+
+ bch2_fs_stop(c);
+ printf("fsck complete\n");
+
+ printf("To mount the new filesystem, run\n"
+ " mount -t bcachefs -o sb=%llu %s dir\n"
+ "\n"
+ "After verifying that the new filesystem is correct, to create a\n"
+ "superblock at the default offset and finish the migration run\n"
+ " bcachefs migrate-superblock -d %s -o %llu\n"
+ "\n"
+ "The new filesystem will have a file at /old_migrated_filesystem\n"
+ "referencing all disk space that might be used by the existing\n"
+ "filesystem. That file can be deleted once the old filesystem is\n"
+ "no longer needed (and should be deleted prior to running\n"
+ "bcachefs migrate-superblock)\n",
+ sb_offset, dev.path, dev.path, sb_offset);
+ return 0;
+}
+
+int cmd_migrate(int argc, char *argv[])
+{
+ struct format_opts format_opts = format_opts_default();
+ char *fs_path = NULL;
+ bool no_passphrase = false, force = false;
+ int opt;
+
+ struct bch_opt_strs fs_opt_strs =
+ bch2_cmdline_opts_get(&argc, argv, OPT_FORMAT);
+ struct bch_opts fs_opts = bch2_parse_opts(fs_opt_strs);
+
+ while ((opt = getopt_long(argc, argv, "f:Fh",
+ migrate_opts, NULL)) != -1)
+ switch (opt) {
+ case 'f':
+ fs_path = optarg;
+ break;
+ case 'e':
+ format_opts.encrypted = true;
+ break;
+ case 'p':
+ no_passphrase = true;
+ break;
+ case 'F':
+ force = true;
+ break;
+ case 'h':
+ migrate_usage();
+ exit(EXIT_SUCCESS);
+ }
+
+ if (!fs_path)
+ die("Please specify a filesystem to migrate");
+
+ if (format_opts.encrypted && !no_passphrase)
+ format_opts.passphrase = read_passphrase_twice("Enter passphrase: ");
+
+ int ret = migrate_fs(fs_path,
+ fs_opt_strs,
+ fs_opts,
+ format_opts, force);
+ bch2_opt_strs_free(&fs_opt_strs);
+ return ret;
+}
+
+static void migrate_superblock_usage(void)
+{
+ puts("bcachefs migrate-superblock - create default superblock after migrating\n"
+ "Usage: bcachefs migrate-superblock [OPTION]...\n"
+ "\n"
+ "Options:\n"
+ " -d device Device to create superblock for\n"
+ " -o offset Offset of existing superblock\n"
+ " -h Display this help and exit\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+int cmd_migrate_superblock(int argc, char *argv[])
+{
+ char *dev = NULL;
+ u64 offset = 0;
+ int opt, ret;
+
+ while ((opt = getopt(argc, argv, "d:o:h")) != -1)
+ switch (opt) {
+ case 'd':
+ dev = optarg;
+ break;
+ case 'o':
+ ret = kstrtou64(optarg, 10, &offset);
+ if (ret)
+ die("Invalid offset");
+ break;
+ case 'h':
+ migrate_superblock_usage();
+ exit(EXIT_SUCCESS);
+ }
+
+ if (!dev)
+ die("Please specify a device");
+
+ if (!offset)
+ die("Please specify offset of existing superblock");
+
+ int fd = xopen(dev, O_RDWR);
+ struct bch_sb *sb = __bch2_super_read(fd, offset);
+
+ if (sb->layout.nr_superblocks >= ARRAY_SIZE(sb->layout.sb_offset))
+ die("Can't add superblock: no space left in superblock layout");
+
+ unsigned i;
+ for (i = 0; i < sb->layout.nr_superblocks; i++)
+ if (le64_to_cpu(sb->layout.sb_offset[i]) == BCH_SB_SECTOR)
+ die("Superblock layout already has default superblock");
+
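+	/*
+	 * Shift the existing superblock offsets up by one and insert the
+	 * default sector at the start of the layout:
+	 */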
+ memmove(&sb->layout.sb_offset[1],
+ &sb->layout.sb_offset[0],
+ sb->layout.nr_superblocks * sizeof(u64));
+ sb->layout.nr_superblocks++;
+
+ sb->layout.sb_offset[0] = cpu_to_le64(BCH_SB_SECTOR);
+
+ bch2_super_write(fd, sb);
+ close(fd);
+
+ return 0;
+}
diff --git a/c_src/cmd_option.c b/c_src/cmd_option.c
new file mode 100644
index 00000000..21048d7d
--- /dev/null
+++ b/c_src/cmd_option.c
@@ -0,0 +1,168 @@
+/*
+ * Authors: Kent Overstreet <kent.overstreet@gmail.com>
+ *
+ * GPLv2
+ */
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <uuid/uuid.h>
+
+#include "cmds.h"
+#include "libbcachefs.h"
+#include "libbcachefs/errcode.h"
+#include "libbcachefs/opts.h"
+#include "libbcachefs/super-io.h"
+
+static void set_option_usage(void)
+{
+	puts("bcachefs set-fs-option\n"
+	     "Usage: bcachefs set-fs-option [OPTION]... device\n"
+ "\n"
+ "Options:\n");
+ bch2_opts_usage(OPT_MOUNT);
+ puts(" -h, --help display this help and exit\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+ exit(EXIT_SUCCESS);
+}
+
+static int name_to_dev_idx(struct bch_fs *c, const char *dev)
+{
+ int ret = -1;
+
+ rcu_read_lock();
+ for_each_member_device_rcu(c, ca, NULL)
+ if (!strcmp(ca->name, dev)) {
+ ret = ca->dev_idx;
+ break;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+int cmd_set_option(int argc, char *argv[])
+{
+ struct bch_opt_strs new_opt_strs = bch2_cmdline_opts_get(&argc, argv, OPT_MOUNT|OPT_DEVICE);
+ struct bch_opts new_opts = bch2_parse_opts(new_opt_strs);
+ unsigned i;
+ int opt, ret = 0;
+
+ while ((opt = getopt(argc, argv, "h")) != -1)
+ switch (opt) {
+ case 'h':
+ set_option_usage();
+ break;
+ }
+ args_shift(optind);
+
+ if (!argc) {
+ fprintf(stderr, "Please supply device(s)\n");
+ exit(EXIT_FAILURE);
+ }
+
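+	/*
+	 * If any of the devices are mounted, set options through sysfs on the
+	 * mounted filesystem; otherwise open the filesystem offline (nostart)
+	 * and write the new options directly to the superblock:
+	 */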
+ bool online = false;
+ for (i = 0; i < argc; i++)
+ if (dev_mounted(argv[i])) {
+ online = true;
+ break;
+ }
+
+ if (!online) {
+ struct bch_opts open_opts = bch2_opts_empty();
+ opt_set(open_opts, nostart, true);
+
+ struct bch_fs *c = bch2_fs_open(argv, argc, open_opts);
+ if (IS_ERR(c)) {
+ fprintf(stderr, "error opening %s: %s\n", argv[0], bch2_err_str(PTR_ERR(c)));
+ exit(EXIT_FAILURE);
+ }
+
+ for (i = 0; i < bch2_opts_nr; i++) {
+ const struct bch_option *opt = bch2_opt_table + i;
+
+ u64 v = bch2_opt_get_by_id(&new_opts, i);
+
+ if (!bch2_opt_defined_by_id(&new_opts, i))
+ continue;
+
+ ret = bch2_opt_check_may_set(c, i, v);
+ if (ret < 0) {
+ fprintf(stderr, "error setting %s: %i\n", opt->attr.name, ret);
+ continue;
+ }
+
+ if (!(opt->flags & (OPT_FS|OPT_DEVICE)))
+ fprintf(stderr, "Can't set option %s\n", opt->attr.name);
+
+ if (opt->flags & OPT_FS) {
+ bch2_opt_set_sb(c, NULL, opt, v);
+ }
+
+ if (opt->flags & OPT_DEVICE) {
+ for (unsigned dev = 0; dev < argc; dev++) {
+ int dev_idx = name_to_dev_idx(c, argv[dev]);
+ if (dev_idx < 0) {
+				fprintf(stderr, "Couldn't look up device %s\n", argv[dev]);
+ continue;
+ }
+
+ bch2_opt_set_sb(c, c->devs[dev_idx], opt, v);
+ }
+ }
+ }
+
+ bch2_fs_stop(c);
+ return ret;
+ } else {
+ unsigned dev_idx;
+ struct bchfs_handle fs = bchu_fs_open_by_dev(argv[i], &dev_idx);
+
+ for (i = 0; i < argc; i++) {
+ struct bchfs_handle fs2 = bchu_fs_open_by_dev(argv[i], &dev_idx);
+ if (memcmp(&fs.uuid, &fs2.uuid, sizeof(fs2.uuid)))
+ die("Filesystem mounted, but not all devices are members");
+ bcache_fs_close(fs2);
+ }
+
+ for (i = 0; i < bch2_opts_nr; i++) {
+ if (!new_opt_strs.by_id[i])
+ continue;
+
+ const struct bch_option *opt = bch2_opt_table + i;
+
+ if (!(opt->flags & (OPT_FS|OPT_DEVICE)))
+ fprintf(stderr, "Can't set option %s\n", opt->attr.name);
+
+ if (opt->flags & OPT_FS) {
+ char *path = mprintf("options/%s", opt->attr.name);
+
+ write_file_str(fs.sysfs_fd, path, new_opt_strs.by_id[i]);
+ free(path);
+ }
+
+ if (opt->flags & OPT_DEVICE) {
+ for (unsigned dev = 0; dev < argc; dev++) {
+				struct bchfs_handle fs2 = bchu_fs_open_by_dev(argv[dev], &dev_idx);
+				bcache_fs_close(fs2);
+
+ char *path = mprintf("dev-%u/%s", dev_idx, opt->attr.name);
+ write_file_str(fs.sysfs_fd, path, new_opt_strs.by_id[i]);
+ free(path);
+ }
+ }
+ }
+ }
+ return 0;
+}
diff --git a/cmd_run.c b/c_src/cmd_run.c
index 1bf84e5c..1bf84e5c 100644
--- a/cmd_run.c
+++ b/c_src/cmd_run.c
diff --git a/cmd_version.c b/c_src/cmd_version.c
index 3fb4b6e2..5fe30e5e 100644
--- a/cmd_version.c
+++ b/c_src/cmd_version.c
@@ -4,6 +4,6 @@
int cmd_version(int argc, char *argv[])
{
- printf("bcachefs tool version %s\n", VERSION_STRING);
+ printf("%s\n", VERSION_STRING);
return 0;
}
diff --git a/cmds.h b/c_src/cmds.h
index 3ebd12f8..64267dc4 100644
--- a/cmds.h
+++ b/c_src/cmds.h
@@ -11,16 +11,12 @@
int cmd_format(int argc, char *argv[]);
int cmd_show_super(int argc, char *argv[]);
-
-#if 0
-int cmd_assemble(int argc, char *argv[]);
-int cmd_incremental(int argc, char *argv[]);
-int cmd_run(int argc, char *argv[]);
-int cmd_stop(int argc, char *argv[]);
-#endif
+int cmd_reset_counters(int argc, char *argv[]);
+int cmd_set_option(int argc, char *argv[]);
int cmd_fs_usage(int argc, char *argv[]);
+int device_usage(void);
int cmd_device_add(int argc, char *argv[]);
int cmd_device_remove(int argc, char *argv[]);
int cmd_device_online(int argc, char *argv[]);
@@ -28,8 +24,11 @@ int cmd_device_offline(int argc, char *argv[]);
int cmd_device_evacuate(int argc, char *argv[]);
int cmd_device_set_state(int argc, char *argv[]);
int cmd_device_resize(int argc, char *argv[]);
+int cmd_device_resize_journal(int argc, char *argv[]);
+int data_usage(void);
int cmd_data_rereplicate(int argc, char *argv[]);
+int cmd_data_job(int argc, char *argv[]);
int cmd_unlock(int argc, char *argv[]);
int cmd_set_passphrase(int argc, char *argv[]);
@@ -38,11 +37,27 @@ int cmd_remove_passphrase(int argc, char *argv[]);
int cmd_fsck(int argc, char *argv[]);
int cmd_dump(int argc, char *argv[]);
-int cmd_list(int argc, char *argv[]);
+int cmd_list_journal(int argc, char *argv[]);
+int cmd_kill_btree_node(int argc, char *argv[]);
int cmd_migrate(int argc, char *argv[]);
int cmd_migrate_superblock(int argc, char *argv[]);
int cmd_version(int argc, char *argv[]);
+int cmd_setattr(int argc, char *argv[]);
+
+int subvolume_usage(void);
+int cmd_subvolume_create(int argc, char *argv[]);
+int cmd_subvolume_delete(int argc, char *argv[]);
+int cmd_subvolume_snapshot(int argc, char *argv[]);
+
+int cmd_fusemount(int argc, char *argv[]);
+
+void bcachefs_usage(void);
+int device_cmds(int argc, char *argv[]);
+int fs_cmds(int argc, char *argv[]);
+int data_cmds(int argc, char *argv[]);
+int subvolume_cmds(int argc, char *argv[]);
+
#endif /* _CMDS_H */
diff --git a/config.h b/c_src/config.h
index e69de29b..e69de29b 100644
--- a/config.h
+++ b/c_src/config.h
diff --git a/crypto.c b/c_src/crypto.c
index 7f7fbd5a..32671bd8 100644
--- a/crypto.c
+++ b/c_src/crypto.c
@@ -12,7 +12,7 @@
#include <keyutils.h>
#include <linux/random.h>
-#include <libscrypt.h>
+#include <sodium/crypto_pwhash_scryptsalsa208sha256.h>
#include <uuid/uuid.h>
#include "libbcachefs/checksum.h"
@@ -84,12 +84,13 @@ struct bch_key derive_passphrase(struct bch_sb_field_crypt *crypt,
switch (BCH_CRYPT_KDF_TYPE(crypt)) {
case BCH_KDF_SCRYPT:
- ret = libscrypt_scrypt((void *) passphrase, strlen(passphrase),
- salt, sizeof(salt),
- 1ULL << BCH_KDF_SCRYPT_N(crypt),
- 1ULL << BCH_KDF_SCRYPT_R(crypt),
- 1ULL << BCH_KDF_SCRYPT_P(crypt),
- (void *) &key, sizeof(key));
+ ret = crypto_pwhash_scryptsalsa208sha256_ll(
+ (void *) passphrase, strlen(passphrase),
+ salt, sizeof(salt),
+ 1ULL << BCH_KDF_SCRYPT_N(crypt),
+ 1ULL << BCH_KDF_SCRYPT_R(crypt),
+ 1ULL << BCH_KDF_SCRYPT_P(crypt),
+ (void *) &key, sizeof(key));
if (ret)
die("scrypt error: %i", ret);
break;
@@ -104,7 +105,7 @@ bool bch2_sb_is_encrypted(struct bch_sb *sb)
{
struct bch_sb_field_crypt *crypt;
- return (crypt = bch2_sb_get_crypt(sb)) &&
+ return (crypt = bch2_sb_field_get(sb, crypt)) &&
bch2_key_is_encrypted(&crypt->key);
}
@@ -112,7 +113,7 @@ void bch2_passphrase_check(struct bch_sb *sb, const char *passphrase,
struct bch_key *passphrase_key,
struct bch_encrypted_key *sb_key)
{
- struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(sb);
+ struct bch_sb_field_crypt *crypt = bch2_sb_field_get(sb, crypt);
if (!crypt)
die("filesystem is not encrypted");
@@ -132,10 +133,23 @@ void bch2_passphrase_check(struct bch_sb *sb, const char *passphrase,
die("incorrect passphrase");
}
-void bch2_add_key(struct bch_sb *sb, const char *passphrase)
+void bch2_add_key(struct bch_sb *sb,
+ const char *type,
+ const char *keyring_str,
+ const char *passphrase)
{
struct bch_key passphrase_key;
struct bch_encrypted_key sb_key;
+ int keyring;
+
+ if (!strcmp(keyring_str, "session"))
+ keyring = KEY_SPEC_SESSION_KEYRING;
+ else if (!strcmp(keyring_str, "user"))
+ keyring = KEY_SPEC_USER_KEYRING;
+ else if (!strcmp(keyring_str, "user_session"))
+ keyring = KEY_SPEC_USER_SESSION_KEYRING;
+ else
+ die("unknown keyring %s", keyring_str);
bch2_passphrase_check(sb, passphrase,
&passphrase_key,
@@ -146,12 +160,10 @@ void bch2_add_key(struct bch_sb *sb, const char *passphrase)
char *description = mprintf("bcachefs:%s", uuid);
- if (add_key("logon", description,
- &passphrase_key, sizeof(passphrase_key),
- KEY_SPEC_USER_KEYRING) < 0 ||
- add_key("user", description,
+ if (add_key(type,
+ description,
&passphrase_key, sizeof(passphrase_key),
- KEY_SPEC_USER_KEYRING) < 0)
+ keyring) < 0)
die("add_key error: %m");
memzero_explicit(description, strlen(description));
@@ -170,9 +182,9 @@ void bch_sb_crypt_init(struct bch_sb *sb,
if (passphrase) {
SET_BCH_CRYPT_KDF_TYPE(crypt, BCH_KDF_SCRYPT);
- SET_BCH_KDF_SCRYPT_N(crypt, ilog2(SCRYPT_N));
- SET_BCH_KDF_SCRYPT_R(crypt, ilog2(SCRYPT_r));
- SET_BCH_KDF_SCRYPT_P(crypt, ilog2(SCRYPT_p));
+ SET_BCH_KDF_SCRYPT_N(crypt, ilog2(16384));
+ SET_BCH_KDF_SCRYPT_R(crypt, ilog2(8));
+ SET_BCH_KDF_SCRYPT_P(crypt, ilog2(16));
struct bch_key passphrase_key = derive_passphrase(crypt, passphrase);
diff --git a/crypto.h b/c_src/crypto.h
index 7f523c05..baea6d86 100644
--- a/crypto.h
+++ b/c_src/crypto.h
@@ -15,7 +15,7 @@ struct bch_key derive_passphrase(struct bch_sb_field_crypt *, const char *);
bool bch2_sb_is_encrypted(struct bch_sb *);
void bch2_passphrase_check(struct bch_sb *, const char *,
struct bch_key *, struct bch_encrypted_key *);
-void bch2_add_key(struct bch_sb *, const char *);
+void bch2_add_key(struct bch_sb *, const char *, const char *, const char *);
void bch_sb_crypt_init(struct bch_sb *sb, struct bch_sb_field_crypt *,
const char *);
diff --git a/c_src/libbcachefs.c b/c_src/libbcachefs.c
new file mode 100644
index 00000000..75cab72c
--- /dev/null
+++ b/c_src/libbcachefs.c
@@ -0,0 +1,754 @@
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <uuid/uuid.h>
+
+#include "libbcachefs.h"
+#include "crypto.h"
+#include "libbcachefs/bcachefs_format.h"
+#include "libbcachefs/btree_cache.h"
+#include "libbcachefs/buckets.h"
+#include "libbcachefs/checksum.h"
+#include "libbcachefs/disk_groups.h"
+#include "libbcachefs/journal_seq_blacklist.h"
+#include "libbcachefs/opts.h"
+#include "libbcachefs/replicas.h"
+#include "libbcachefs/super-io.h"
+#include "tools-util.h"
+
+#define NSEC_PER_SEC 1000000000L
+
+static void init_layout(struct bch_sb_layout *l,
+ unsigned block_size,
+ unsigned sb_size,
+ u64 sb_start, u64 sb_end)
+{
+ u64 sb_pos = sb_start;
+ unsigned i;
+
+ memset(l, 0, sizeof(*l));
+
+ l->magic = BCHFS_MAGIC;
+ l->layout_type = 0;
+ l->nr_superblocks = 2;
+ l->sb_max_size_bits = ilog2(sb_size);
+
+ /* Create two superblocks in the allowed range: */
+ for (i = 0; i < l->nr_superblocks; i++) {
+ if (sb_pos != BCH_SB_SECTOR)
+ sb_pos = round_up(sb_pos, block_size >> 9);
+
+ l->sb_offset[i] = cpu_to_le64(sb_pos);
+ sb_pos += sb_size;
+ }
+
+ if (sb_pos > sb_end)
+ die("insufficient space for superblocks: start %llu end %llu > %llu size %u",
+ sb_start, sb_pos, sb_end, sb_size);
+}
+
+/* minimum size filesystem we can create, given a bucket size: */
+static u64 min_size(unsigned bucket_size)
+{
+ return BCH_MIN_NR_NBUCKETS * bucket_size;
+}
+
+u64 bch2_pick_bucket_size(struct bch_opts opts, struct dev_opts *dev)
+{
+ u64 bucket_size;
+
+ if (dev->size < min_size(opts.block_size))
+ die("cannot format %s, too small (%llu bytes, min %llu)",
+ dev->path, dev->size, min_size(opts.block_size));
+
+ /* Bucket size must be >= block size: */
+ bucket_size = opts.block_size;
+
+ /* Bucket size must be >= btree node size: */
+ if (opt_defined(opts, btree_node_size))
+ bucket_size = max_t(unsigned, bucket_size,
+ opts.btree_node_size);
+
+ /* Want a bucket size of at least 128k, if possible: */
+ bucket_size = max(bucket_size, 128ULL << 10);
+
+ if (dev->size >= min_size(bucket_size)) {
+ unsigned scale = max(1,
+ ilog2(dev->size / min_size(bucket_size)) / 4);
+
+ scale = rounddown_pow_of_two(scale);
+
+ /* max bucket size 1 mb */
+ bucket_size = min(bucket_size * scale, 1ULL << 20);
+ } else {
+ do {
+ bucket_size /= 2;
+ } while (dev->size < min_size(bucket_size));
+ }
+
+ return bucket_size;
+}
+
+void bch2_check_bucket_size(struct bch_opts opts, struct dev_opts *dev)
+{
+ if (dev->bucket_size < opts.block_size)
+ die("Bucket size (%llu) cannot be smaller than block size (%u)",
+ dev->bucket_size, opts.block_size);
+
+ if (opt_defined(opts, btree_node_size) &&
+ dev->bucket_size < opts.btree_node_size)
+ die("Bucket size (%llu) cannot be smaller than btree node size (%u)",
+ dev->bucket_size, opts.btree_node_size);
+
+ if (dev->nbuckets < BCH_MIN_NR_NBUCKETS)
+ die("Not enough buckets: %llu, need %u (bucket size %llu)",
+ dev->nbuckets, BCH_MIN_NR_NBUCKETS, dev->bucket_size);
+
+ if (dev->bucket_size > (u32) U16_MAX << 9)
+ die("Bucket size (%llu) too big (max %u)",
+ dev->bucket_size, (u32) U16_MAX << 9);
+}
+
+static unsigned parse_target(struct bch_sb_handle *sb,
+ struct dev_opts *devs, size_t nr_devs,
+ const char *s)
+{
+ struct dev_opts *i;
+ int idx;
+
+ if (!s)
+ return 0;
+
+ for (i = devs; i < devs + nr_devs; i++)
+ if (!strcmp(s, i->path))
+ return dev_to_target(i - devs);
+
+ idx = bch2_disk_path_find(sb, s);
+ if (idx >= 0)
+ return group_to_target(idx);
+
+ die("Invalid target %s", s);
+ return 0;
+}
+
+struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs,
+ struct bch_opts fs_opts,
+ struct format_opts opts,
+ struct dev_opts *devs,
+ size_t nr_devs)
+{
+ struct bch_sb_handle sb = { NULL };
+ struct dev_opts *i;
+ unsigned max_dev_block_size = 0;
+ unsigned opt_id;
+ u64 min_bucket_size = U64_MAX;
+
+ for (i = devs; i < devs + nr_devs; i++)
+ max_dev_block_size = max(max_dev_block_size, get_blocksize(i->bdev->bd_fd));
+
+ /* calculate block size: */
+ if (!opt_defined(fs_opts, block_size)) {
+ opt_set(fs_opts, block_size, max_dev_block_size);
+ } else if (fs_opts.block_size < max_dev_block_size)
+ die("blocksize too small: %u, must be greater than device blocksize %u",
+ fs_opts.block_size, max_dev_block_size);
+
+ /* get device size, if it wasn't specified: */
+ for (i = devs; i < devs + nr_devs; i++)
+ if (!i->size)
+ i->size = get_size(i->bdev->bd_fd);
+
+ /* calculate bucket sizes: */
+ for (i = devs; i < devs + nr_devs; i++)
+ min_bucket_size = min(min_bucket_size,
+ i->bucket_size ?: bch2_pick_bucket_size(fs_opts, i));
+
+ for (i = devs; i < devs + nr_devs; i++)
+ if (!i->bucket_size)
+ i->bucket_size = min_bucket_size;
+
+ for (i = devs; i < devs + nr_devs; i++) {
+ i->nbuckets = i->size / i->bucket_size;
+ bch2_check_bucket_size(fs_opts, i);
+ }
+
+ /* calculate btree node size: */
+ if (!opt_defined(fs_opts, btree_node_size)) {
+ /* 256k default btree node size */
+ opt_set(fs_opts, btree_node_size, 256 << 10);
+
+ for (i = devs; i < devs + nr_devs; i++)
+ fs_opts.btree_node_size =
+ min_t(unsigned, fs_opts.btree_node_size,
+ i->bucket_size);
+ }
+
+ if (uuid_is_null(opts.uuid.b))
+ uuid_generate(opts.uuid.b);
+
+ if (bch2_sb_realloc(&sb, 0))
+ die("insufficient memory");
+
+ sb.sb->version = le16_to_cpu(opts.version);
+ sb.sb->version_min = le16_to_cpu(opts.version);
+ sb.sb->magic = BCHFS_MAGIC;
+ sb.sb->user_uuid = opts.uuid;
+ sb.sb->nr_devices = nr_devs;
+
+ if (opts.version == bcachefs_metadata_version_current)
+ sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
+
+ uuid_generate(sb.sb->uuid.b);
+
+ if (opts.label)
+ memcpy(sb.sb->label,
+ opts.label,
+ min(strlen(opts.label), sizeof(sb.sb->label)));
+
+ for (opt_id = 0;
+ opt_id < bch2_opts_nr;
+ opt_id++) {
+ u64 v;
+
+ v = bch2_opt_defined_by_id(&fs_opts, opt_id)
+ ? bch2_opt_get_by_id(&fs_opts, opt_id)
+ : bch2_opt_get_by_id(&bch2_opts_default, opt_id);
+
+ __bch2_opt_set_sb(sb.sb, -1, &bch2_opt_table[opt_id], v);
+ }
+
+ struct timespec now;
+ if (clock_gettime(CLOCK_REALTIME, &now))
+ die("error getting current time: %m");
+
+ sb.sb->time_base_lo = cpu_to_le64(now.tv_sec * NSEC_PER_SEC + now.tv_nsec);
+ sb.sb->time_precision = cpu_to_le32(1);
+
+ /* Member info: */
+ struct bch_sb_field_members_v2 *mi =
+ bch2_sb_field_resize(&sb, members_v2,
+ (sizeof(*mi) + sizeof(struct bch_member) *
+ nr_devs) / sizeof(u64));
+ mi->member_bytes = cpu_to_le16(sizeof(struct bch_member));
+ for (i = devs; i < devs + nr_devs; i++) {
+ struct bch_member *m = bch2_members_v2_get_mut(sb.sb, (i - devs));
+
+ uuid_generate(m->uuid.b);
+ m->nbuckets = cpu_to_le64(i->nbuckets);
+ m->first_bucket = 0;
+ m->bucket_size = cpu_to_le16(i->bucket_size >> 9);
+
+ SET_BCH_MEMBER_DISCARD(m, i->discard);
+ SET_BCH_MEMBER_DATA_ALLOWED(m, i->data_allowed);
+ SET_BCH_MEMBER_DURABILITY(m, i->durability + 1);
+ }
+
+ /* Disk labels*/
+ for (i = devs; i < devs + nr_devs; i++) {
+ struct bch_member *m;
+ int idx;
+
+ if (!i->label)
+ continue;
+
+ idx = bch2_disk_path_find_or_create(&sb, i->label);
+ if (idx < 0)
+ die("error creating disk path: %s", strerror(-idx));
+
+ /*
+ * Recompute mi and m after each sb modification: its location
+ * in memory may have changed due to reallocation.
+ */
+ m = bch2_members_v2_get_mut(sb.sb, (i - devs));
+ SET_BCH_MEMBER_GROUP(m, idx + 1);
+ }
+
+ SET_BCH_SB_FOREGROUND_TARGET(sb.sb,
+ parse_target(&sb, devs, nr_devs, fs_opt_strs.foreground_target));
+ SET_BCH_SB_BACKGROUND_TARGET(sb.sb,
+ parse_target(&sb, devs, nr_devs, fs_opt_strs.background_target));
+ SET_BCH_SB_PROMOTE_TARGET(sb.sb,
+ parse_target(&sb, devs, nr_devs, fs_opt_strs.promote_target));
+ SET_BCH_SB_METADATA_TARGET(sb.sb,
+ parse_target(&sb, devs, nr_devs, fs_opt_strs.metadata_target));
+
+ /* Crypt: */
+ if (opts.encrypted) {
+ struct bch_sb_field_crypt *crypt =
+ bch2_sb_field_resize(&sb, crypt, sizeof(*crypt) / sizeof(u64));
+
+ bch_sb_crypt_init(sb.sb, crypt, opts.passphrase);
+ SET_BCH_SB_ENCRYPTION_TYPE(sb.sb, 1);
+ }
+
+ bch2_sb_members_cpy_v2_v1(&sb);
+
+ for (i = devs; i < devs + nr_devs; i++) {
+ u64 size_sectors = i->size >> 9;
+
+ sb.sb->dev_idx = i - devs;
+
+ if (!i->sb_offset) {
+ i->sb_offset = BCH_SB_SECTOR;
+ i->sb_end = size_sectors;
+ }
+
+ init_layout(&sb.sb->layout, fs_opts.block_size,
+ opts.superblock_size,
+ i->sb_offset, i->sb_end);
+
+ /*
+ * Also create a backup superblock at the end of the disk:
+ *
+ * If we're not creating a superblock at the default offset, it
+ * means we're being run from the migrate tool and we could be
+ * overwriting existing data if we write to the end of the disk:
+ */
+ if (i->sb_offset == BCH_SB_SECTOR) {
+ struct bch_sb_layout *l = &sb.sb->layout;
+ u64 backup_sb = size_sectors - (1 << l->sb_max_size_bits);
+
+ backup_sb = rounddown(backup_sb, i->bucket_size >> 9);
+ l->sb_offset[l->nr_superblocks++] = cpu_to_le64(backup_sb);
+ }
+
+ if (i->sb_offset == BCH_SB_SECTOR) {
+ /* Zero start of disk */
+ static const char zeroes[BCH_SB_SECTOR << 9];
+
+ xpwrite(i->bdev->bd_fd, zeroes, BCH_SB_SECTOR << 9, 0,
+ "zeroing start of disk");
+ }
+
+ bch2_super_write(i->bdev->bd_fd, sb.sb);
+ close(i->bdev->bd_fd);
+ }
+
+ return sb.sb;
+}
+
+void bch2_super_write(int fd, struct bch_sb *sb)
+{
+ struct nonce nonce = { 0 };
+ unsigned bs = get_blocksize(fd);
+
+ unsigned i;
+ for (i = 0; i < sb->layout.nr_superblocks; i++) {
+ sb->offset = sb->layout.sb_offset[i];
+
+ if (sb->offset == BCH_SB_SECTOR) {
+ /* Write backup layout */
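+			/*
+			 * The layout lives at a fixed location just below
+			 * offset 4096: read-modify-write that block so only
+			 * the layout bytes change:
+			 */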
+
+ BUG_ON(bs > 4096);
+
+ char *buf = aligned_alloc(bs, bs);
+ xpread(fd, buf, bs, 4096 - bs);
+ memcpy(buf + bs - sizeof(sb->layout),
+ &sb->layout,
+ sizeof(sb->layout));
+ xpwrite(fd, buf, bs, 4096 - bs,
+ "backup layout");
+ free(buf);
+
+ }
+
+ sb->csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb), nonce, sb);
+ xpwrite(fd, sb, round_up(vstruct_bytes(sb), bs),
+ le64_to_cpu(sb->offset) << 9,
+ "superblock");
+ }
+
+ fsync(fd);
+}
+
+struct bch_sb *__bch2_super_read(int fd, u64 sector)
+{
+ struct bch_sb sb, *ret;
+
+ xpread(fd, &sb, sizeof(sb), sector << 9);
+
+ if (memcmp(&sb.magic, &BCACHE_MAGIC, sizeof(sb.magic)) &&
+ memcmp(&sb.magic, &BCHFS_MAGIC, sizeof(sb.magic)))
+ die("not a bcachefs superblock");
+
+ size_t bytes = vstruct_bytes(&sb);
+
+ ret = malloc(bytes);
+
+ xpread(fd, ret, bytes, sector << 9);
+
+ return ret;
+}
+
+/* ioctl interface: */
+
+/* Global control device: */
+int bcachectl_open(void)
+{
+ return xopen("/dev/bcachefs-ctl", O_RDWR);
+}
+
+/* Filesystem handles (ioctl, sysfs dir): */
+
+#define SYSFS_BASE "/sys/fs/bcachefs/"
+
+void bcache_fs_close(struct bchfs_handle fs)
+{
+ close(fs.ioctl_fd);
+ close(fs.sysfs_fd);
+}
+
+struct bchfs_handle bcache_fs_open(const char *path)
+{
+ struct bchfs_handle ret;
+
+ if (!uuid_parse(path, ret.uuid.b)) {
+ /* It's a UUID, look it up in sysfs: */
+ char *sysfs = mprintf(SYSFS_BASE "%s", path);
+ ret.sysfs_fd = xopen(sysfs, O_RDONLY);
+
+ char *minor = read_file_str(ret.sysfs_fd, "minor");
+ char *ctl = mprintf("/dev/bcachefs%s-ctl", minor);
+ ret.ioctl_fd = xopen(ctl, O_RDWR);
+
+ free(sysfs);
+ free(minor);
+ free(ctl);
+ } else {
+ /* It's a path: */
+ ret.ioctl_fd = open(path, O_RDONLY);
+ if (ret.ioctl_fd < 0)
+ die("Error opening filesystem at %s: %m", path);
+
+ struct bch_ioctl_query_uuid uuid;
+ if (ioctl(ret.ioctl_fd, BCH_IOCTL_QUERY_UUID, &uuid) < 0)
+ die("error opening %s: not a bcachefs filesystem", path);
+
+ ret.uuid = uuid.uuid;
+
+ char uuid_str[40];
+ uuid_unparse(uuid.uuid.b, uuid_str);
+
+ char *sysfs = mprintf(SYSFS_BASE "%s", uuid_str);
+ ret.sysfs_fd = xopen(sysfs, O_RDONLY);
+ free(sysfs);
+ }
+
+ return ret;
+}
+
+/*
+ * Given a path to a block device, open the filesystem it belongs to; also
+ * return the device's idx:
+ */
+struct bchfs_handle bchu_fs_open_by_dev(const char *path, int *idx)
+{
+ struct bch_opts opts = bch2_opts_empty();
+ char buf[1024], *uuid_str;
+
+ struct stat stat = xstat(path);
+
+ if (S_ISBLK(stat.st_mode)) {
+ char *sysfs = mprintf("/sys/dev/block/%u:%u/bcachefs",
+ major(stat.st_dev),
+ minor(stat.st_dev));
+
+ ssize_t len = readlink(sysfs, buf, sizeof(buf));
+ free(sysfs);
+
+ if (len <= 0)
+ goto read_super;
+
+ char *p = strrchr(buf, '/');
+ if (!p || sscanf(p + 1, "dev-%u", idx) != 1)
+ die("error parsing sysfs");
+
+ *p = '\0';
+ p = strrchr(buf, '/');
+ uuid_str = p + 1;
+ } else {
+read_super:
+ opt_set(opts, noexcl, true);
+ opt_set(opts, nochanges, true);
+
+ struct bch_sb_handle sb;
+ int ret = bch2_read_super(path, &opts, &sb);
+ if (ret)
+ die("Error opening %s: %s", path, strerror(-ret));
+
+ *idx = sb.sb->dev_idx;
+ uuid_str = buf;
+ uuid_unparse(sb.sb->user_uuid.b, uuid_str);
+
+ bch2_free_super(&sb);
+ }
+
+ return bcache_fs_open(uuid_str);
+}
+
+int bchu_dev_path_to_idx(struct bchfs_handle fs, const char *dev_path)
+{
+ int idx;
+ struct bchfs_handle fs2 = bchu_fs_open_by_dev(dev_path, &idx);
+
+ if (memcmp(&fs.uuid, &fs2.uuid, sizeof(fs.uuid)))
+ idx = -1;
+ bcache_fs_close(fs2);
+ return idx;
+}
+
+int bchu_data(struct bchfs_handle fs, struct bch_ioctl_data cmd)
+{
+ int progress_fd = xioctl(fs.ioctl_fd, BCH_IOCTL_DATA, &cmd);
+
+ while (1) {
+ struct bch_ioctl_data_event e;
+
+ if (read(progress_fd, &e, sizeof(e)) != sizeof(e))
+ die("error reading from progress fd %m");
+
+ if (e.type)
+ continue;
+
+ if (e.p.data_type == U8_MAX)
+ break;
+
+ printf("\33[2K\r");
+
+ printf("%llu%% complete: current position %s",
+ e.p.sectors_total
+ ? e.p.sectors_done * 100 / e.p.sectors_total
+ : 0,
+ bch2_data_type_str(e.p.data_type));
+
+ switch (e.p.data_type) {
+ case BCH_DATA_btree:
+ case BCH_DATA_user:
+ printf(" %s:%llu:%llu",
+ bch2_btree_id_str(e.p.btree_id),
+ e.p.pos.inode,
+ e.p.pos.offset);
+ }
+
+ fflush(stdout);
+ sleep(1);
+ }
+ printf("\nDone\n");
+
+ close(progress_fd);
+ return 0;
+}
+
+/* option parsing */
+
+void bch2_opt_strs_free(struct bch_opt_strs *opts)
+{
+ unsigned i;
+
+ for (i = 0; i < bch2_opts_nr; i++) {
+ free(opts->by_id[i]);
+ opts->by_id[i] = NULL;
+ }
+}
+
+struct bch_opt_strs bch2_cmdline_opts_get(int *argc, char *argv[],
+ unsigned opt_types)
+{
+ struct bch_opt_strs opts;
+ unsigned i = 1;
+
+ memset(&opts, 0, sizeof(opts));
+
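+	/*
+	 * Scan argv for --<option>[=value] arguments that match bcachefs
+	 * options of the requested types; matching arguments (and their
+	 * values) are copied into the returned strings and removed from argv:
+	 */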
+ while (i < *argc) {
+ char *optstr = strcmp_prefix(argv[i], "--");
+ char *valstr = NULL, *p;
+ int optid, nr_args = 1;
+
+ if (!optstr) {
+ i++;
+ continue;
+ }
+
+ optstr = strdup(optstr);
+
+ p = optstr;
+ while (isalpha(*p) || *p == '_')
+ p++;
+
+ if (*p == '=') {
+ *p = '\0';
+ valstr = p + 1;
+ }
+
+ optid = bch2_opt_lookup(optstr);
+ if (optid < 0 ||
+ !(bch2_opt_table[optid].flags & opt_types)) {
+ i++;
+ goto next;
+ }
+
+ if (!valstr &&
+ bch2_opt_table[optid].type != BCH_OPT_BOOL) {
+ nr_args = 2;
+ valstr = argv[i + 1];
+ }
+
+ if (!valstr)
+ valstr = "1";
+
+ opts.by_id[optid] = strdup(valstr);
+
+ *argc -= nr_args;
+ memmove(&argv[i],
+ &argv[i + nr_args],
+ sizeof(char *) * (*argc - i));
+ argv[*argc] = NULL;
+next:
+ free(optstr);
+ }
+
+ return opts;
+}
+
+struct bch_opts bch2_parse_opts(struct bch_opt_strs strs)
+{
+ struct bch_opts opts = bch2_opts_empty();
+ struct printbuf err = PRINTBUF;
+ unsigned i;
+ int ret;
+ u64 v;
+
+ for (i = 0; i < bch2_opts_nr; i++) {
+ if (!strs.by_id[i])
+ continue;
+
+ ret = bch2_opt_parse(NULL,
+ &bch2_opt_table[i],
+ strs.by_id[i], &v, &err);
+ if (ret < 0 && ret != -BCH_ERR_option_needs_open_fs)
+ die("Invalid option %s", err.buf);
+
+ bch2_opt_set_by_id(&opts, i, v);
+ }
+
+ printbuf_exit(&err);
+ return opts;
+}
+
+#define newline(c) \
+ do { \
+ printf("\n"); \
+ c = 0; \
+ } while(0)
+void bch2_opts_usage(unsigned opt_types)
+{
+ const struct bch_option *opt;
+ unsigned i, c = 0, helpcol = 30;
+
+ for (opt = bch2_opt_table;
+ opt < bch2_opt_table + bch2_opts_nr;
+ opt++) {
+ if (!(opt->flags & opt_types))
+ continue;
+
+ c += printf(" --%s", opt->attr.name);
+
+ switch (opt->type) {
+ case BCH_OPT_BOOL:
+ break;
+ case BCH_OPT_STR:
+ c += printf("=(");
+ for (i = 0; opt->choices[i]; i++) {
+ if (i)
+ c += printf("|");
+ c += printf("%s", opt->choices[i]);
+ }
+ c += printf(")");
+ break;
+ default:
+ c += printf("=%s", opt->hint);
+ break;
+ }
+
+ if (opt->help) {
+ const char *l = opt->help;
+
+ if (c >= helpcol)
+ newline(c);
+
+ while (1) {
+ const char *n = strchrnul(l, '\n');
+
+ while (c < helpcol) {
+ putchar(' ');
+ c++;
+ }
+ printf("%.*s", (int) (n - l), l);
+ newline(c);
+
+ if (!*n)
+ break;
+ l = n + 1;
+ }
+ } else {
+ newline(c);
+ }
+ }
+}
+
+dev_names bchu_fs_get_devices(struct bchfs_handle fs)
+{
+ DIR *dir = fdopendir(fs.sysfs_fd);
+ struct dirent *d;
+ dev_names devs;
+
+ darray_init(&devs);
+
+ while ((errno = 0), (d = readdir(dir))) {
+ struct dev_name n = { 0, NULL, NULL };
+
+ if (sscanf(d->d_name, "dev-%u", &n.idx) != 1)
+ continue;
+
+ char *block_attr = mprintf("dev-%u/block", n.idx);
+
+ char sysfs_block_buf[4096];
+ ssize_t r = readlinkat(fs.sysfs_fd, block_attr,
+ sysfs_block_buf, sizeof(sysfs_block_buf));
+ if (r > 0) {
+ sysfs_block_buf[r] = '\0';
+ n.dev = strdup(basename(sysfs_block_buf));
+ }
+
+ free(block_attr);
+
+ char *label_attr = mprintf("dev-%u/label", n.idx);
+ n.label = read_file_str(fs.sysfs_fd, label_attr);
+ free(label_attr);
+
+ char *durability_attr = mprintf("dev-%u/durability", n.idx);
+ n.durability = read_file_u64(fs.sysfs_fd, durability_attr);
+ free(durability_attr);
+
+ darray_push(&devs, n);
+ }
+
+ closedir(dir);
+
+ return devs;
+}
diff --git a/c_src/libbcachefs.h b/c_src/libbcachefs.h
new file mode 100644
index 00000000..fc6eb8bf
--- /dev/null
+++ b/c_src/libbcachefs.h
@@ -0,0 +1,300 @@
+#ifndef _LIBBCACHE_H
+#define _LIBBCACHE_H
+
+#include <linux/uuid.h>
+#include <stdbool.h>
+
+#include "libbcachefs/bcachefs.h"
+#include "libbcachefs/bcachefs_format.h"
+#include "libbcachefs/bcachefs_ioctl.h"
+#include "libbcachefs/inode.h"
+#include "libbcachefs/opts.h"
+#include "libbcachefs/vstructs.h"
+#include "tools-util.h"
+
+/* option parsing */
+
+#define SUPERBLOCK_SIZE_DEFAULT 2048 /* 1 MB */
+
+struct bch_opt_strs {
+union {
+ char *by_id[bch2_opts_nr];
+struct {
+#define x(_name, ...) char *_name;
+ BCH_OPTS()
+#undef x
+};
+};
+};
+
+void bch2_opt_strs_free(struct bch_opt_strs *);
+struct bch_opt_strs bch2_cmdline_opts_get(int *, char *[], unsigned);
+struct bch_opts bch2_parse_opts(struct bch_opt_strs);
+void bch2_opts_usage(unsigned);
+
+struct format_opts {
+ char *label;
+ __uuid_t uuid;
+ unsigned version;
+ unsigned superblock_size;
+ bool encrypted;
+ char *passphrase;
+ char *source;
+};
+
+static inline struct format_opts format_opts_default()
+{
+ unsigned version = !access( "/sys/module/bcachefs/parameters/version", R_OK)
+ ? read_file_u64(AT_FDCWD, "/sys/module/bcachefs/parameters/version")
+ : bcachefs_metadata_version_current;
+
+ return (struct format_opts) {
+ .version = version,
+ .superblock_size = SUPERBLOCK_SIZE_DEFAULT,
+ };
+}
+
+struct dev_opts {
+ struct file *file;
+ struct block_device *bdev;
+ char *path;
+ u64 size; /* bytes*/
+ u64 bucket_size; /* bytes */
+ const char *label;
+ unsigned data_allowed;
+ unsigned durability;
+ bool discard;
+
+ u64 nbuckets;
+
+ u64 sb_offset;
+ u64 sb_end;
+};
+
+static inline struct dev_opts dev_opts_default()
+{
+ return (struct dev_opts) {
+ .data_allowed = ~0U << 2,
+ .durability = 1,
+ };
+}
+
+u64 bch2_pick_bucket_size(struct bch_opts, struct dev_opts *);
+void bch2_check_bucket_size(struct bch_opts, struct dev_opts *);
+
+struct bch_sb *bch2_format(struct bch_opt_strs,
+ struct bch_opts,
+ struct format_opts, struct dev_opts *, size_t);
+
+void bch2_super_write(int, struct bch_sb *);
+struct bch_sb *__bch2_super_read(int, u64);
+
+/* ioctl interface: */
+
+int bcachectl_open(void);
+
+struct bchfs_handle {
+ __uuid_t uuid;
+ int ioctl_fd;
+ int sysfs_fd;
+};
+
+void bcache_fs_close(struct bchfs_handle);
+struct bchfs_handle bcache_fs_open(const char *);
+struct bchfs_handle bchu_fs_open_by_dev(const char *, int *);
+int bchu_dev_path_to_idx(struct bchfs_handle, const char *);
+
+static inline void bchu_disk_add(struct bchfs_handle fs, char *dev)
+{
+ struct bch_ioctl_disk i = { .dev = (unsigned long) dev, };
+
+ xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_ADD, &i);
+}
+
+static inline void bchu_disk_remove(struct bchfs_handle fs, unsigned dev_idx,
+ unsigned flags)
+{
+ struct bch_ioctl_disk i = {
+ .flags = flags|BCH_BY_INDEX,
+ .dev = dev_idx,
+ };
+
+ xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_REMOVE, &i);
+}
+
+static inline void bchu_disk_online(struct bchfs_handle fs, char *dev)
+{
+ struct bch_ioctl_disk i = { .dev = (unsigned long) dev, };
+
+ xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_ONLINE, &i);
+}
+
+static inline void bchu_disk_offline(struct bchfs_handle fs, unsigned dev_idx,
+ unsigned flags)
+{
+ struct bch_ioctl_disk i = {
+ .flags = flags|BCH_BY_INDEX,
+ .dev = dev_idx,
+ };
+
+ xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_OFFLINE, &i);
+}
+
+static inline void bchu_disk_set_state(struct bchfs_handle fs, unsigned dev,
+ unsigned new_state, unsigned flags)
+{
+ struct bch_ioctl_disk_set_state i = {
+ .flags = flags|BCH_BY_INDEX,
+ .new_state = new_state,
+ .dev = dev,
+ };
+
+ xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_SET_STATE, &i);
+}
+
+static inline struct bch_ioctl_fs_usage *bchu_fs_usage(struct bchfs_handle fs)
+{
+ struct bch_ioctl_fs_usage *u = NULL;
+ size_t replica_entries_bytes = 4096;
+
+ while (1) {
+ u = xrealloc(u, sizeof(*u) + replica_entries_bytes);
+ u->replica_entries_bytes = replica_entries_bytes;
+
+ if (!ioctl(fs.ioctl_fd, BCH_IOCTL_FS_USAGE, u))
+ return u;
+
+ if (errno != ERANGE)
+			die("BCH_IOCTL_FS_USAGE error: %m");
+
+ replica_entries_bytes *= 2;
+ }
+}
+
+static inline struct bch_ioctl_query_accounting *bchu_fs_accounting(struct bchfs_handle fs,
+ unsigned typemask)
+{
+ unsigned accounting_u64s = 128;
+ struct bch_ioctl_query_accounting *ret = NULL;
+
+ while (1) {
+ ret = xrealloc(ret, sizeof(*ret) + accounting_u64s * sizeof(u64));
+
+ memset(ret, 0, sizeof(*ret));
+
+ ret->accounting_u64s = accounting_u64s;
+ ret->accounting_types_mask = typemask;
+
+ if (!ioctl(fs.ioctl_fd, BCH_IOCTL_QUERY_ACCOUNTING, ret))
+ return ret;
+
+ if (errno == ENOTTY)
+ return NULL;
+
+ if (errno == ERANGE) {
+ accounting_u64s *= 2;
+ continue;
+ }
+
+		die("BCH_IOCTL_QUERY_ACCOUNTING error: %m");
+ }
+}
+
+static inline struct bch_ioctl_dev_usage_v2 *bchu_dev_usage(struct bchfs_handle fs,
+ unsigned idx)
+{
+ struct bch_ioctl_dev_usage_v2 *u = xcalloc(sizeof(*u) + sizeof(u->d[0]) * BCH_DATA_NR, 1);
+
+ u->dev = idx;
+ u->flags = BCH_BY_INDEX;
+ u->nr_data_types = BCH_DATA_NR;
+
+ if (!ioctl(fs.ioctl_fd, BCH_IOCTL_DEV_USAGE_V2, u))
+ return u;
+
+ struct bch_ioctl_dev_usage u_v1 = { .dev = idx, .flags = BCH_BY_INDEX};
+ xioctl(fs.ioctl_fd, BCH_IOCTL_DEV_USAGE, &u_v1);
+
+ u->state = u_v1.state;
+ u->nr_data_types = ARRAY_SIZE(u_v1.d);
+ u->bucket_size = u_v1.bucket_size;
+ u->nr_buckets = u_v1.nr_buckets;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(u_v1.d); i++)
+ u->d[i] = u_v1.d[i];
+
+ return u;
+}
+
+static inline struct bch_sb *bchu_read_super(struct bchfs_handle fs, unsigned idx)
+{
+ size_t size = 4096;
+ struct bch_sb *sb = NULL;
+
+ while (1) {
+ sb = xrealloc(sb, size);
+ struct bch_ioctl_read_super i = {
+ .size = size,
+ .sb = (unsigned long) sb,
+ };
+
+ if (idx != -1) {
+ i.flags |= BCH_READ_DEV|BCH_BY_INDEX;
+ i.dev = idx;
+ }
+
+ if (!ioctl(fs.ioctl_fd, BCH_IOCTL_READ_SUPER, &i))
+ return sb;
+ if (errno != ERANGE)
+ die("BCH_IOCTL_READ_SUPER error: %m");
+ size *= 2;
+ }
+}
+
+static inline unsigned bchu_disk_get_idx(struct bchfs_handle fs, dev_t dev)
+{
+ struct bch_ioctl_disk_get_idx i = { .dev = dev };
+
+ return xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_GET_IDX, &i);
+}
+
+static inline void bchu_disk_resize(struct bchfs_handle fs,
+ unsigned idx,
+ u64 nbuckets)
+{
+ struct bch_ioctl_disk_resize i = {
+ .flags = BCH_BY_INDEX,
+ .dev = idx,
+ .nbuckets = nbuckets,
+ };
+
+ xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_RESIZE, &i);
+}
+
+static inline void bchu_disk_resize_journal(struct bchfs_handle fs,
+ unsigned idx,
+ u64 nbuckets)
+{
+ struct bch_ioctl_disk_resize i = {
+ .flags = BCH_BY_INDEX,
+ .dev = idx,
+ .nbuckets = nbuckets,
+ };
+
+ xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_RESIZE_JOURNAL, &i);
+}
+
+int bchu_data(struct bchfs_handle, struct bch_ioctl_data);
+
+struct dev_name {
+ unsigned idx;
+ char *dev;
+ char *label;
+ uuid_t uuid;
+ unsigned durability;
+};
+typedef DARRAY(struct dev_name) dev_names;
+
+dev_names bchu_fs_get_devices(struct bchfs_handle);
+
+#endif /* _LIBBCACHE_H */
diff --git a/c_src/posix_to_bcachefs.c b/c_src/posix_to_bcachefs.c
new file mode 100644
index 00000000..d4701263
--- /dev/null
+++ b/c_src/posix_to_bcachefs.c
@@ -0,0 +1,461 @@
+#include <dirent.h>
+#include <sys/xattr.h>
+#include <linux/xattr.h>
+
+#include "posix_to_bcachefs.h"
+#include "libbcachefs/alloc_foreground.h"
+#include "libbcachefs/buckets.h"
+#include "libbcachefs/fs-common.h"
+#include "libbcachefs/io_write.h"
+#include "libbcachefs/str_hash.h"
+#include "libbcachefs/xattr.h"
+
+void update_inode(struct bch_fs *c,
+ struct bch_inode_unpacked *inode)
+{
+ struct bkey_inode_buf packed;
+ int ret;
+
+ bch2_inode_pack(&packed, inode);
+ packed.inode.k.p.snapshot = U32_MAX;
+ ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
+ NULL, 0, BTREE_ITER_cached);
+ if (ret)
+ die("error updating inode: %s", bch2_err_str(ret));
+}
+
+void create_link(struct bch_fs *c,
+ struct bch_inode_unpacked *parent,
+ const char *name, u64 inum, mode_t mode)
+{
+ struct qstr qstr = QSTR(name);
+ struct bch_inode_unpacked parent_u;
+ struct bch_inode_unpacked inode;
+
+ int ret = bch2_trans_commit_do(c, NULL, NULL, 0,
+ bch2_link_trans(trans,
+ (subvol_inum) { 1, parent->bi_inum }, &parent_u,
+ (subvol_inum) { 1, inum }, &inode, &qstr));
+ if (ret)
+ die("error creating hardlink: %s", bch2_err_str(ret));
+}
+
+struct bch_inode_unpacked create_file(struct bch_fs *c,
+ struct bch_inode_unpacked *parent,
+ const char *name,
+ uid_t uid, gid_t gid,
+ mode_t mode, dev_t rdev)
+{
+ struct qstr qstr = QSTR(name);
+ struct bch_inode_unpacked new_inode;
+
+ bch2_inode_init_early(c, &new_inode);
+
+ int ret = bch2_trans_commit_do(c, NULL, NULL, 0,
+ bch2_create_trans(trans,
+ (subvol_inum) { 1, parent->bi_inum }, parent,
+ &new_inode, &qstr,
+ uid, gid, mode, rdev, NULL, NULL,
+ (subvol_inum) {}, 0));
+ if (ret)
+ die("error creating %s: %s", name, bch2_err_str(ret));
+
+ return new_inode;
+}
+
+#define for_each_xattr_handler(handlers, handler) \
+ if (handlers) \
+ for ((handler) = *(handlers)++; \
+ (handler) != NULL; \
+ (handler) = *(handlers)++)
+
+static const struct xattr_handler *xattr_resolve_name(char **name)
+{
+ const struct xattr_handler * const *handlers = bch2_xattr_handlers;
+ const struct xattr_handler *handler;
+
+ for_each_xattr_handler(handlers, handler) {
+ char *n;
+
+ n = strcmp_prefix(*name, xattr_prefix(handler));
+ if (n) {
+ if (!handler->prefix ^ !*n) {
+ if (*n)
+ continue;
+ return ERR_PTR(-EINVAL);
+ }
+ *name = n;
+ return handler;
+ }
+ }
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+void copy_times(struct bch_fs *c, struct bch_inode_unpacked *dst,
+ struct stat *src)
+{
+ dst->bi_atime = timespec_to_bch2_time(c, src->st_atim);
+ dst->bi_mtime = timespec_to_bch2_time(c, src->st_mtim);
+ dst->bi_ctime = timespec_to_bch2_time(c, src->st_ctim);
+}
+
+void copy_xattrs(struct bch_fs *c, struct bch_inode_unpacked *dst,
+ char *src)
+{
+ struct bch_hash_info hash_info = bch2_hash_info_init(c, dst);
+
+ char attrs[XATTR_LIST_MAX];
+ ssize_t attrs_size = llistxattr(src, attrs, sizeof(attrs));
+ if (attrs_size < 0)
+ die("listxattr error: %m");
+
+ char *next, *attr;
+ for (attr = attrs;
+ attr < attrs + attrs_size;
+ attr = next) {
+ next = attr + strlen(attr) + 1;
+
+ char val[XATTR_SIZE_MAX];
+ ssize_t val_size = lgetxattr(src, attr, val, sizeof(val));
+
+ if (val_size < 0)
+ die("error getting xattr val: %m");
+
+ const struct xattr_handler *h = xattr_resolve_name(&attr);
+ if (IS_ERR(h))
+ continue;
+
+ int ret = bch2_trans_commit_do(c, NULL, NULL, 0,
+ bch2_xattr_set(trans,
+ (subvol_inum) { 1, dst->bi_inum },
+ dst, &hash_info, attr,
+ val, val_size, h->flags, 0));
+ if (ret < 0)
+ die("error creating xattr: %s", bch2_err_str(ret));
+ }
+}
+
+#define WRITE_DATA_BUF (1 << 20)
+
+static char buf[WRITE_DATA_BUF] __aligned(PAGE_SIZE);
+
+static void write_data(struct bch_fs *c,
+ struct bch_inode_unpacked *dst_inode,
+ u64 dst_offset, void *buf, size_t len)
+{
+ struct bch_write_op op;
+ struct bio_vec bv[WRITE_DATA_BUF / PAGE_SIZE];
+
+ BUG_ON(dst_offset & (block_bytes(c) - 1));
+ BUG_ON(len & (block_bytes(c) - 1));
+ BUG_ON(len > WRITE_DATA_BUF);
+
+ bio_init(&op.wbio.bio, NULL, bv, ARRAY_SIZE(bv), 0);
+ bch2_bio_map(&op.wbio.bio, buf, len);
+
+ bch2_write_op_init(&op, c, bch2_opts_to_inode_opts(c->opts));
+ op.write_point = writepoint_hashed(0);
+ op.nr_replicas = 1;
+ op.subvol = 1;
+ op.pos = SPOS(dst_inode->bi_inum, dst_offset >> 9, U32_MAX);
+ op.flags |= BCH_WRITE_SYNC;
+
+ int ret = bch2_disk_reservation_get(c, &op.res, len >> 9,
+ c->opts.data_replicas, 0);
+ if (ret)
+ die("error reserving space in new filesystem: %s", bch2_err_str(ret));
+
+ closure_call(&op.cl, bch2_write, NULL, NULL);
+
+ BUG_ON(!(op.flags & BCH_WRITE_SUBMITTED));
+ dst_inode->bi_sectors += len >> 9;
+
+ if (op.error)
+ die("write error: %s", bch2_err_str(op.error));
+}
+
+void copy_data(struct bch_fs *c,
+ struct bch_inode_unpacked *dst_inode,
+ int src_fd, u64 start, u64 end)
+{
+ while (start < end) {
+ unsigned len = min_t(u64, end - start, sizeof(buf));
+ unsigned pad = round_up(len, block_bytes(c)) - len;
+
+ xpread(src_fd, buf, len, start);
+ memset(buf + len, 0, pad);
+
+ write_data(c, dst_inode, start, buf, len + pad);
+ start += len;
+ }
+}
+
+static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
+ u64 logical, u64 physical, u64 length)
+{
+ struct bch_dev *ca = c->devs[0];
+
+ BUG_ON(logical & (block_bytes(c) - 1));
+ BUG_ON(physical & (block_bytes(c) - 1));
+ BUG_ON(length & (block_bytes(c) - 1));
+
+ logical >>= 9;
+ physical >>= 9;
+ length >>= 9;
+
+ BUG_ON(physical + length > bucket_to_sector(ca, ca->mi.nbuckets));
+
+ while (length) {
+ struct bkey_i_extent *e;
+ BKEY_PADDED_ONSTACK(k, BKEY_EXTENT_VAL_U64s_MAX) k;
+ u64 b = sector_to_bucket(ca, physical);
+ struct disk_reservation res;
+ unsigned sectors;
+ int ret;
+
+ sectors = min(ca->mi.bucket_size -
+ (physical & (ca->mi.bucket_size - 1)),
+ length);
+
+ e = bkey_extent_init(&k.k);
+ e->k.p.inode = dst->bi_inum;
+ e->k.p.offset = logical + sectors;
+ e->k.p.snapshot = U32_MAX;
+ e->k.size = sectors;
+ bch2_bkey_append_ptr(&e->k_i, (struct bch_extent_ptr) {
+ .offset = physical,
+ .dev = 0,
+ .gen = *bucket_gen(ca, b),
+ });
+
+ ret = bch2_disk_reservation_get(c, &res, sectors, 1,
+ BCH_DISK_RESERVATION_NOFAIL);
+ if (ret)
+ die("error reserving space in new filesystem: %s",
+ bch2_err_str(ret));
+
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i, &res, 0, 0);
+ if (ret)
+ die("btree insert error %s", bch2_err_str(ret));
+
+ bch2_disk_reservation_put(c, &res);
+
+ dst->bi_sectors += sectors;
+ logical += sectors;
+ physical += sectors;
+ length -= sectors;
+ }
+}
+
+void copy_link(struct bch_fs *c, struct bch_inode_unpacked *dst,
+ char *src)
+{
+ ssize_t i;
+ ssize_t ret = readlink(src, buf, sizeof(buf));
+ if (ret < 0)
+ die("readlink error: %m");
+
+ for (i = ret; i < round_up(ret, block_bytes(c)); i++)
+ buf[i] = 0;
+
+ write_data(c, dst, 0, buf, round_up(ret, block_bytes(c)));
+}
+
+static void copy_file(struct bch_fs *c, struct bch_inode_unpacked *dst,
+ int src_fd, u64 src_size,
+ char *src_path, struct copy_fs_state *s)
+{
+ struct fiemap_iter iter;
+ struct fiemap_extent e;
+
+ fiemap_for_each(src_fd, iter, e)
+ if (e.fe_flags & FIEMAP_EXTENT_UNKNOWN) {
+ fsync(src_fd);
+ break;
+ }
+ fiemap_iter_exit(&iter);
+
+ fiemap_for_each(src_fd, iter, e) {
+ u64 src_max = roundup(src_size, block_bytes(c));
+
+ e.fe_length = min(e.fe_length, src_max - e.fe_logical);
+
+ if ((e.fe_logical & (block_bytes(c) - 1)) ||
+ (e.fe_length & (block_bytes(c) - 1)))
+ die("Unaligned extent in %s - can't handle", src_path);
+
+ if (BCH_MIGRATE_copy == s->type || (e.fe_flags & (FIEMAP_EXTENT_UNKNOWN|
+ FIEMAP_EXTENT_ENCODED|
+ FIEMAP_EXTENT_NOT_ALIGNED|
+ FIEMAP_EXTENT_DATA_INLINE))) {
+ copy_data(c, dst, src_fd, e.fe_logical,
+ e.fe_logical + min(src_size - e.fe_logical,
+ e.fe_length));
+ continue;
+ }
+
+ /*
+ * if the data is below 1 MB, copy it so it doesn't conflict
+ * with bcachefs's potentially larger superblock:
+ */
+ if (e.fe_physical < 1 << 20) {
+ copy_data(c, dst, src_fd, e.fe_logical,
+ e.fe_logical + min(src_size - e.fe_logical,
+ e.fe_length));
+ continue;
+ }
+
+ if ((e.fe_physical & (block_bytes(c) - 1)))
+ die("Unaligned extent in %s - can't handle", src_path);
+
+ range_add(&s->extents, e.fe_physical, e.fe_length);
+ link_data(c, dst, e.fe_logical, e.fe_physical, e.fe_length);
+ }
+ fiemap_iter_exit(&iter);
+}
+
+static void copy_dir(struct copy_fs_state *s,
+ struct bch_fs *c,
+ struct bch_inode_unpacked *dst,
+ int src_fd, const char *src_path)
+{
+ DIR *dir = fdopendir(src_fd);
+ struct dirent *d;
+
+ while ((errno = 0), (d = readdir(dir))) {
+ struct bch_inode_unpacked inode;
+ int fd;
+
+ if (fchdir(src_fd))
+ die("chdir error: %m");
+
+ struct stat stat =
+ xfstatat(src_fd, d->d_name, AT_SYMLINK_NOFOLLOW);
+
+ if (!strcmp(d->d_name, ".") ||
+ !strcmp(d->d_name, "..") ||
+ !strcmp(d->d_name, "lost+found"))
+ continue;
+
+ if (BCH_MIGRATE_migrate == s->type && stat.st_ino == s->bcachefs_inum)
+ continue;
+
+ char *child_path = mprintf("%s/%s", src_path, d->d_name);
+
+ if (s->type == BCH_MIGRATE_migrate && stat.st_dev != s->dev)
+ die("%s does not have correct st_dev!", child_path);
+
+ u64 *dst_inum = S_ISREG(stat.st_mode)
+ ? genradix_ptr_alloc(&s->hardlinks, stat.st_ino, GFP_KERNEL)
+ : NULL;
+
+ if (dst_inum && *dst_inum) {
+ create_link(c, dst, d->d_name, *dst_inum, S_IFREG);
+ goto next;
+ }
+
+ inode = create_file(c, dst, d->d_name,
+ stat.st_uid, stat.st_gid,
+ stat.st_mode, stat.st_rdev);
+
+ if (dst_inum)
+ *dst_inum = inode.bi_inum;
+
+ copy_xattrs(c, &inode, d->d_name);
+
+ /* copy xattrs */
+
+ switch (mode_to_type(stat.st_mode)) {
+ case DT_DIR:
+ fd = xopen(d->d_name, O_RDONLY|O_NOATIME);
+ copy_dir(s, c, &inode, fd, child_path);
+ close(fd);
+ break;
+ case DT_REG:
+ inode.bi_size = stat.st_size;
+
+ fd = xopen(d->d_name, O_RDONLY|O_NOATIME);
+ copy_file(c, &inode, fd, stat.st_size,
+ child_path, s);
+ close(fd);
+ break;
+ case DT_LNK:
+ inode.bi_size = stat.st_size;
+
+ copy_link(c, &inode, d->d_name);
+ break;
+ case DT_FIFO:
+ case DT_CHR:
+ case DT_BLK:
+ case DT_SOCK:
+ case DT_WHT:
+ /* nothing else to copy for these: */
+ break;
+ default:
+ BUG();
+ }
+
+ copy_times(c, &inode, &stat);
+ update_inode(c, &inode);
+next:
+ free(child_path);
+ }
+
+ if (errno)
+ die("readdir error: %m");
+ closedir(dir);
+}
+
+static void reserve_old_fs_space(struct bch_fs *c,
+ struct bch_inode_unpacked *root_inode,
+ ranges *extents)
+{
+ struct bch_dev *ca = c->devs[0];
+ struct bch_inode_unpacked dst;
+ struct hole_iter iter;
+ struct range i;
+
+ dst = create_file(c, root_inode, "old_migrated_filesystem",
+ 0, 0, S_IFREG|0400, 0);
+ dst.bi_size = bucket_to_sector(ca, ca->mi.nbuckets) << 9;
+
+ ranges_sort_merge(extents);
+
+ for_each_hole(iter, *extents, bucket_to_sector(ca, ca->mi.nbuckets) << 9, i)
+ link_data(c, &dst, i.start, i.start, i.end - i.start);
+
+ update_inode(c, &dst);
+}
+
+void copy_fs(struct bch_fs *c, int src_fd, const char *src_path,
+ struct copy_fs_state *s)
+{
+ syncfs(src_fd);
+
+ struct bch_inode_unpacked root_inode;
+ int ret = bch2_inode_find_by_inum(c, (subvol_inum) { 1, BCACHEFS_ROOT_INO },
+ &root_inode);
+ if (ret)
+ die("error looking up root directory: %s", bch2_err_str(ret));
+
+ if (fchdir(src_fd))
+ die("chdir error: %m");
+
+ struct stat stat = xfstat(src_fd);
+ copy_times(c, &root_inode, &stat);
+ copy_xattrs(c, &root_inode, ".");
+
+
+ /* now, copy: */
+ copy_dir(s, c, &root_inode, src_fd, src_path);
+
+ if (BCH_MIGRATE_migrate == s->type)
+ reserve_old_fs_space(c, &root_inode, &s->extents);
+
+ update_inode(c, &root_inode);
+
+ if (BCH_MIGRATE_migrate == s->type)
+ darray_exit(&s->extents);
+
+ genradix_free(&s->hardlinks);
+}
diff --git a/c_src/posix_to_bcachefs.h b/c_src/posix_to_bcachefs.h
new file mode 100644
index 00000000..facb75ed
--- /dev/null
+++ b/c_src/posix_to_bcachefs.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _POSIX_TO_BCACHEFS_H
+#define _POSIX_TO_BCACHEFS_H
+
+/*
+ * This header exports the functionality needed for copying data from existing
+ * POSIX-compliant filesystems to bcachefs. There are two use cases:
+ * 1. Creating a new bcachefs filesystem using `bcachefs format`: we can
+ * specify a source directory tree, which will be copied into the new
+ * bcachefs filesystem.
+ * 2. Migrating an existing filesystem in place, with `bcachefs migrate`.
+ * This will allocate space for the bcachefs metadata, but the actual data
+ * represented by the extents will not be duplicated. The bcachefs metadata
+ * will simply point to the existing extents.
+ *
+ * To avoid code duplication, `copy_fs` deals with both cases. See the function
+ * documentation for more details.
+ */
+
+#include "libbcachefs.h"
+
+enum bch_migrate_type {
+ BCH_MIGRATE_copy,
+ BCH_MIGRATE_migrate
+};
+
+/*
+ * The migrate action uses all the fields in this struct.
+ * The copy action only uses the `hardlinks` field. Since `hardlinks` is
+ * initialized with zeroes, an empty `copy_fs_state` struct can be passed.
+ */
+struct copy_fs_state {
+ u64 bcachefs_inum;
+ dev_t dev;
+
+ GENRADIX(u64) hardlinks;
+ ranges extents;
+ enum bch_migrate_type type;
+};
+
+/*
+ * The `copy_fs` function is used for both copying a directory tree to a new
+ * bcachefs filesystem and migrating an existing one, depending on the value
+ * from the `type` field in `copy_fs_state` struct.
+ *
+ * In case of copy, an empty `copy_fs_state` structure is passed to `copy_fs`
+ * (only the `hardlinks` field is used, and that is initialized with zeroes).
+ *
+ * In the migrate case, all the fields from `copy_fs_state` need to be
+ * initialized (`hardlinks` is initialized with zeroes).
+ */
+void copy_fs(struct bch_fs *c, int src_fd, const char *src_path,
+ struct copy_fs_state *s);
+#endif /* _POSIX_TO_BCACHEFS_H */
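For illustration, a minimal caller sketch (not part of this commit) of the copy case described above: `c` is assumed to be an already-opened struct bch_fs and `src_fd` an open fd on the source directory tree.

static void copy_tree_example(struct bch_fs *c, int src_fd, const char *src_path)
{
	/*
	 * Copy case: only the zero-initialized `hardlinks` radix tree is
	 * used, so an otherwise empty copy_fs_state suffices.  The migrate
	 * case would additionally fill in bcachefs_inum, dev and extents.
	 */
	struct copy_fs_state s = { .type = BCH_MIGRATE_copy };

	copy_fs(c, src_fd, src_path, &s);
}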
diff --git a/qcow2.c b/c_src/qcow2.c
index b7aa8c26..30a6e056 100644
--- a/qcow2.c
+++ b/c_src/qcow2.c
@@ -46,7 +46,8 @@ static void flush_l2(struct qcow2_image *img)
if (img->l1_index != -1) {
img->l1_table[img->l1_index] =
cpu_to_be64(img->offset|QCOW_OFLAG_COPIED);
- xpwrite(img->fd, img->l2_table, img->block_size, img->offset);
+ xpwrite(img->fd, img->l2_table, img->block_size, img->offset,
+ "qcow2 l2 table");
img->offset += img->block_size;
memset(img->l2_table, 0, img->block_size);
@@ -71,7 +72,7 @@ static void add_l2(struct qcow2_image *img, u64 src_blk, u64 dst_offset)
void qcow2_write_image(int infd, int outfd, ranges *data,
unsigned block_size)
{
- u64 image_size = get_size(NULL, infd);
+ u64 image_size = get_size(infd);
unsigned l2_size = block_size / sizeof(u64);
unsigned l1_size = DIV_ROUND_UP(image_size, (u64) block_size * l2_size);
struct qcow2_hdr hdr = { 0 };
@@ -83,7 +84,6 @@ void qcow2_write_image(int infd, int outfd, ranges *data,
.l1_index = -1,
.offset = round_up(sizeof(hdr), block_size),
};
- struct range *r;
char *buf = xmalloc(block_size);
u64 src_offset, dst_offset;
@@ -93,7 +93,7 @@ void qcow2_write_image(int infd, int outfd, ranges *data,
ranges_sort_merge(data);
/* Write data: */
- darray_foreach(r, *data)
+ darray_for_each(*data, r)
for (src_offset = r->start;
src_offset < r->end;
src_offset += block_size) {
@@ -101,7 +101,8 @@ void qcow2_write_image(int infd, int outfd, ranges *data,
img.offset += img.block_size;
xpread(infd, buf, block_size, src_offset);
- xpwrite(outfd, buf, block_size, dst_offset);
+ xpwrite(outfd, buf, block_size, dst_offset,
+ "qcow2 data");
add_l2(&img, src_offset / block_size, dst_offset);
}
@@ -111,7 +112,8 @@ void qcow2_write_image(int infd, int outfd, ranges *data,
/* Write L1 table: */
dst_offset = img.offset;
img.offset += round_up(l1_size * sizeof(u64), block_size);
- xpwrite(img.fd, img.l1_table, l1_size * sizeof(u64), dst_offset);
+ xpwrite(img.fd, img.l1_table, l1_size * sizeof(u64), dst_offset,
+ "qcow2 l1 table");
/* Write header: */
hdr.magic = cpu_to_be32(QCOW_MAGIC);
@@ -123,7 +125,8 @@ void qcow2_write_image(int infd, int outfd, ranges *data,
memset(buf, 0, block_size);
memcpy(buf, &hdr, sizeof(hdr));
- xpwrite(img.fd, buf, block_size, 0);
+ xpwrite(img.fd, buf, block_size, 0,
+ "qcow2 header");
free(img.l2_table);
free(img.l1_table);
diff --git a/qcow2.h b/c_src/qcow2.h
index 0943d55c..0943d55c 100644
--- a/qcow2.h
+++ b/c_src/qcow2.h
diff --git a/tools-util.c b/c_src/tools-util.c
index ca6d89a5..3a76a02e 100644
--- a/tools-util.c
+++ b/c_src/tools-util.c
@@ -17,6 +17,7 @@
#include <blkid.h>
#include <uuid/uuid.h>
+#include "libbcachefs.h"
#include "libbcachefs/bcachefs_ioctl.h"
#include "linux/sort.h"
#include "tools-util.h"
@@ -31,7 +32,7 @@ void die(const char *fmt, ...)
va_end(args);
fputc('\n', stderr);
- exit(EXIT_FAILURE);
+ _exit(EXIT_FAILURE);
}
char *mprintf(const char *fmt, ...)
@@ -50,36 +51,6 @@ char *mprintf(const char *fmt, ...)
return str;
}
-void *xcalloc(size_t count, size_t size)
-{
- void *p = calloc(count, size);
-
- if (!p)
- die("insufficient memory");
-
- return p;
-}
-
-void *xmalloc(size_t size)
-{
- void *p = malloc(size);
-
- if (!p)
- die("insufficient memory");
-
- memset(p, 0, size);
- return p;
-}
-
-void *xrealloc(void *p, size_t size)
-{
- p = realloc(p, size);
- if (!p)
- die("insufficient memory");
-
- return p;
-}
-
void xpread(int fd, void *buf, size_t count, off_t offset)
{
while (count) {
@@ -94,12 +65,12 @@ void xpread(int fd, void *buf, size_t count, off_t offset)
}
}
-void xpwrite(int fd, const void *buf, size_t count, off_t offset)
+void xpwrite(int fd, const void *buf, size_t count, off_t offset, const char *msg)
{
ssize_t r = pwrite(fd, buf, count, offset);
if (r != count)
- die("write error (ret %zi err %m)", r);
+ die("error writing %s (ret %zi err %m)", msg, r);
}
struct stat xfstatat(int dirfd, const char *path, int flags)
@@ -122,67 +93,23 @@ struct stat xstat(const char *path)
{
struct stat statbuf;
if (stat(path, &statbuf))
- die("stat error: %m");
+ die("stat error statting %s: %m", path);
return statbuf;
}
-/* Formatting: */
-
-int printf_pad(unsigned pad, const char * fmt, ...)
-{
- va_list args;
- int ret;
-
- va_start(args, fmt);
- ret = vprintf(fmt, args);
- va_end(args);
-
- while (ret++ < pad)
- putchar(' ');
-
- return ret;
-}
+/* File parsing (i.e. sysfs) */
-struct units_buf __pr_units(s64 _v, enum units units)
+void write_file_str(int dirfd, const char *path, const char *str)
{
- struct units_buf ret;
- char *out = ret.b, *end = out + sizeof(ret.b);
- u64 v = _v;
+ int fd = xopenat(dirfd, path, O_WRONLY);
+ ssize_t wrote, len = strlen(str);
- if (_v < 0) {
- out += scnprintf(out, end - out, "-");
- v = -_v;
- }
-
- switch (units) {
- case BYTES:
- snprintf(out, end - out, "%llu", v << 9);
- break;
- case SECTORS:
- snprintf(out, end - out, "%llu", v);
- break;
- case HUMAN_READABLE:
- v <<= 9;
-
- if (v >= 1024) {
- int exp = log(v) / log(1024);
- snprintf(out, end - out, "%.1f%c",
- v / pow(1024, exp),
- "KMGTPE"[exp-1]);
- } else {
- snprintf(out, end - out, "%llu", v);
- }
-
- break;
- }
-
- return ret;
+ wrote = write(fd, str, len);
+ if (wrote != len)
+ die("read error: %m");
+ close(fd);
}
-/* Argument parsing stuff: */
-
-/* File parsing (i.e. sysfs) */
-
char *read_file_str(int dirfd, const char *path)
{
int fd = xopenat(dirfd, path, O_RDONLY);
@@ -197,6 +124,10 @@ char *read_file_str(int dirfd, const char *path)
buf[len] = '\0';
if (len && buf[len - 1] == '\n')
buf[len - 1] = '\0';
+ if (!strlen(buf)) {
+ free(buf);
+ buf = NULL;
+ }
close(fd);
@@ -207,7 +138,7 @@ u64 read_file_u64(int dirfd, const char *path)
{
char *buf = read_file_str(dirfd, path);
u64 v;
- if (kstrtou64(buf, 10, &v))
+ if (bch2_strtou64_h(buf, &v))
die("read_file_u64: error parsing %s (got %s)", path, buf);
free(buf);
return v;
@@ -226,7 +157,7 @@ ssize_t read_string_list_or_die(const char *opt, const char * const list[],
}
/* Returns size of file or block device: */
-u64 get_size(const char *path, int fd)
+u64 get_size(int fd)
{
struct stat statbuf = xfstat(fd);
@@ -238,36 +169,60 @@ u64 get_size(const char *path, int fd)
return ret;
}
-/* Returns blocksize in units of 512 byte sectors: */
-unsigned get_blocksize(const char *path, int fd)
+/* Returns blocksize, in bytes: */
+unsigned get_blocksize(int fd)
{
struct stat statbuf = xfstat(fd);
if (!S_ISBLK(statbuf.st_mode))
- return statbuf.st_blksize >> 9;
+ return statbuf.st_blksize;
unsigned ret;
xioctl(fd, BLKPBSZGET, &ret);
- return ret >> 9;
+ return ret;
}
/* Open a block device, do magic blkid stuff to probe for existing filesystems: */
-int open_for_format(const char *dev, bool force)
-{
+int open_for_format(struct dev_opts *dev, bool force)
+{
+ int blkid_version_code = blkid_get_library_version(NULL, NULL);
+ if (blkid_version_code < 2401) {
+ if (force) {
+ fprintf(
+ stderr,
+ "Continuing with out of date libblkid %s because --force was passed.\n",
+ BLKID_VERSION);
+ } else {
+ // Reference for picking 2.40.1:
+ // https://mirrors.edge.kernel.org/pub/linux/utils/util-linux/v2.40/v2.40.1-ReleaseNotes
+ // https://github.com/util-linux/util-linux/issues/3103
+ die(
+ "Refusing to format when using libblkid %s\n"
+ "libblkid >= 2.40.1 is required to check for existing filesystems\n"
+ "Earlier versions may not recognize some bcachefs filesystems.\n", BLKID_VERSION);
+ }
+ }
+
blkid_probe pr;
const char *fs_type = NULL, *fs_label = NULL;
size_t fs_type_len, fs_label_len;
- int fd = xopen(dev, O_RDWR|O_EXCL);
-
- if (force)
- return fd;
+ dev->file = bdev_file_open_by_path(dev->path,
+ BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL|BLK_OPEN_BUFFERED,
+ dev, NULL);
+ int ret = PTR_ERR_OR_ZERO(dev->file);
+ if (ret < 0)
+ die("Error opening device to format %s: %s", dev->path, strerror(-ret));
+ dev->bdev = file_bdev(dev->file);
if (!(pr = blkid_new_probe()))
die("blkid error 1");
- if (blkid_probe_set_device(pr, fd, 0, 0))
+ if (blkid_probe_set_device(pr, dev->bdev->bd_fd, 0, 0))
die("blkid error 2");
- if (blkid_probe_enable_partitions(pr, true))
+ if (blkid_probe_enable_partitions(pr, true) ||
+ blkid_probe_enable_superblocks(pr, true) ||
+ blkid_probe_set_superblocks_flags(pr,
+ BLKID_SUBLKS_LABEL|BLKID_SUBLKS_TYPE|BLKID_SUBLKS_MAGIC))
die("blkid error 3");
if (blkid_do_fullprobe(pr) < 0)
die("blkid error 4");
@@ -278,17 +233,23 @@ int open_for_format(const char *dev, bool force)
if (fs_type) {
if (fs_label)
printf("%s contains a %s filesystem labelled '%s'\n",
- dev, fs_type, fs_label);
+ dev->path, fs_type, fs_label);
else
printf("%s contains a %s filesystem\n",
- dev, fs_type);
- fputs("Proceed anyway?", stdout);
- if (!ask_yn())
- exit(EXIT_FAILURE);
+ dev->path, fs_type);
+ if (!force) {
+ fputs("Proceed anyway?", stdout);
+ if (!ask_yn())
+ exit(EXIT_FAILURE);
+ }
+ while (blkid_do_probe(pr) == 0) {
+ if (blkid_do_wipe(pr, 0))
+ die("Failed to wipe preexisting metadata.");
+ }
}
blkid_free_probe(pr);
- return fd;
+ return ret;
}
bool ask_yn(void)
@@ -322,31 +283,27 @@ static int range_cmp(const void *_l, const void *_r)
void ranges_sort_merge(ranges *r)
{
- struct range *t, *i;
- ranges tmp = { NULL };
+ ranges tmp = { 0 };
- sort(&darray_item(*r, 0), darray_size(*r),
- sizeof(darray_item(*r, 0)), range_cmp, NULL);
+ sort(r->data, r->nr, sizeof(r->data[0]), range_cmp, NULL);
/* Merge contiguous ranges: */
- darray_foreach(i, *r) {
- t = tmp.size ? &tmp.item[tmp.size - 1] : NULL;
+ darray_for_each(*r, i) {
+ struct range *t = tmp.nr ? &tmp.data[tmp.nr - 1] : NULL;
if (t && t->end >= i->start)
t->end = max(t->end, i->end);
else
- darray_append(tmp, *i);
+ darray_push(&tmp, *i);
}
- darray_free(*r);
+ darray_exit(r);
*r = tmp;
}
void ranges_roundup(ranges *r, unsigned block_size)
{
- struct range *i;
-
- darray_foreach(i, *r) {
+ darray_for_each(*r, i) {
i->start = round_down(i->start, block_size);
i->end = round_up(i->end, block_size);
}
@@ -354,9 +311,7 @@ void ranges_roundup(ranges *r, unsigned block_size)
void ranges_rounddown(ranges *r, unsigned block_size)
{
- struct range *i;
-
- darray_foreach(i, *r) {
+ darray_for_each(*r, i) {
i->start = round_up(i->start, block_size);
i->end = round_down(i->end, block_size);
i->end = max(i->end, i->start);
@@ -367,26 +322,26 @@ struct fiemap_extent fiemap_iter_next(struct fiemap_iter *iter)
{
struct fiemap_extent e;
- BUG_ON(iter->idx > iter->f.fm_mapped_extents);
+ BUG_ON(iter->idx > iter->f->fm_mapped_extents);
- if (iter->idx == iter->f.fm_mapped_extents) {
- xioctl(iter->fd, FS_IOC_FIEMAP, &iter->f);
+ if (iter->idx == iter->f->fm_mapped_extents) {
+ xioctl(iter->fd, FS_IOC_FIEMAP, iter->f);
- if (!iter->f.fm_mapped_extents)
+ if (!iter->f->fm_mapped_extents)
return (struct fiemap_extent) { .fe_length = 0 };
iter->idx = 0;
}
- e = iter->f.fm_extents[iter->idx++];
+ e = iter->f->fm_extents[iter->idx++];
BUG_ON(!e.fe_length);
- iter->f.fm_start = e.fe_logical + e.fe_length;
+ iter->f->fm_start = e.fe_logical + e.fe_length;
return e;
}
-const char *strcmp_prefix(const char *a, const char *a_prefix)
+char *strcmp_prefix(char *a, const char *a_prefix)
{
while (*a_prefix && *a == *a_prefix) {
a++;
@@ -395,24 +350,6 @@ const char *strcmp_prefix(const char *a, const char *a_prefix)
return *a_prefix ? NULL : a;
}
-unsigned hatoi_validate(const char *s, const char *msg)
-{
- u64 v;
-
- if (bch2_strtoull_h(s, &v))
- die("bad %s %s", msg, s);
-
- v /= 512;
-
- if (v > USHRT_MAX)
- die("%s too large\n", msg);
-
- if (!v)
- die("%s too small\n", msg);
-
- return v;
-}
-
/* crc32c */
static u32 crc32c_default(u32 crc, const void *buf, size_t size)
@@ -610,32 +547,195 @@ char *dev_to_path(dev_t dev)
return path;
}
-char *dev_to_mount(char *dev)
+struct mntent *dev_to_mount(char *dev)
{
- char *line = NULL, *ret = NULL;
- size_t n = 0;
-
- FILE *f = fopen("/proc/mounts", "r");
+ struct mntent *mnt, *ret = NULL;
+ FILE *f = setmntent("/proc/mounts", "r");
if (!f)
die("error opening /proc/mounts: %m");
- while (getline(&line, &n, f) != -1) {
- char *d, *p = line;
- char *devs = strsep(&p, " ");
- char *mount = strsep(&p, " ");
-
- if (!devs || !mount)
- continue;
-
- p = devs;
- while ((d = strsep(&p, ":")))
- if (!strcmp(d, dev)) {
- ret = strdup(mount);
- goto found;
+ struct stat d1 = xstat(dev);
+
+ while ((mnt = getmntent(f))) {
+ char *d, *p = mnt->mnt_fsname;
+
+ while ((d = strsep(&p, ":"))) {
+ struct stat d2;
+
+ if (stat(d, &d2))
+ continue;
+
+ if (S_ISBLK(d1.st_mode) != S_ISBLK(d2.st_mode))
+ continue;
+
+ if (S_ISBLK(d1.st_mode)) {
+ if (d1.st_rdev != d2.st_rdev)
+ continue;
+ } else {
+ if (d1.st_dev != d2.st_dev ||
+ d1.st_ino != d2.st_ino)
+ continue;
}
+
+ ret = mnt;
+ goto found;
+ }
}
found:
fclose(f);
- free(line);
+ return ret;
+}
+
+int dev_mounted(char *dev)
+{
+ struct mntent *mnt = dev_to_mount(dev);
+
+ if (!mnt)
+ return 0;
+ if (hasmntopt(mnt, "ro"))
+ return 1;
+ return 2;
+}
+
+static char *dev_to_sysfs_path(dev_t dev)
+{
+ return mprintf("/sys/dev/block/%u:%u", major(dev), minor(dev));
+}
+
+char *fd_to_dev_model(int fd)
+{
+ struct stat stat = xfstat(fd);
+
+ if (S_ISBLK(stat.st_mode)) {
+ char *sysfs_path = dev_to_sysfs_path(stat.st_rdev);
+
+ char *model_path = mprintf("%s/device/model", sysfs_path);
+ if (!access(model_path, R_OK))
+ goto got_model;
+ free(model_path);
+
+ /* partition? try parent */
+
+ char buf[1024];
+ if (readlink(sysfs_path, buf, sizeof(buf)) < 0)
+ die("readlink error on %s: %m", sysfs_path);
+
+ free(sysfs_path);
+ sysfs_path = strdup(buf);
+
+ *strrchr(sysfs_path, '/') = 0;
+ model_path = mprintf("%s/device/model", sysfs_path);
+ if (!access(model_path, R_OK))
+ goto got_model;
+
+ return strdup("(unknown device)");
+ char *model;
+got_model:
+ model = read_file_str(AT_FDCWD, model_path);
+ free(model_path);
+ free(sysfs_path);
+ return model;
+ } else {
+ return strdup("(reg file)");
+ }
+}
+
+static int kstrtoull_symbolic(const char *s, unsigned int base, unsigned long long *res)
+{
+ if (!strcmp(s, "U64_MAX")) {
+ *res = U64_MAX;
+ return 0;
+ }
+
+ if (!strcmp(s, "U32_MAX")) {
+ *res = U32_MAX;
+ return 0;
+ }
+
+ return kstrtoull(s, base, res);
+}
+
+static int kstrtouint_symbolic(const char *s, unsigned int base, unsigned *res)
+{
+ unsigned long long tmp;
+ int rv;
+
+ rv = kstrtoull_symbolic(s, base, &tmp);
+ if (rv < 0)
+ return rv;
+ if (tmp != (unsigned long long)(unsigned int)tmp)
+ return -ERANGE;
+ *res = tmp;
+ return 0;
+}
+
+struct bpos bpos_parse(char *buf)
+{
+ char *orig = strdup(buf);
+ char *s = buf;
+
+ char *inode_s = strsep(&s, ":");
+ char *offset_s = strsep(&s, ":");
+ char *snapshot_s = strsep(&s, ":");
+
+ if (!inode_s || !offset_s || s)
+ die("invalid bpos %s", orig);
+ free(orig);
+
+ u64 inode_v = 0, offset_v = 0;
+ u32 snapshot_v = 0;
+ if (kstrtoull_symbolic(inode_s, 10, &inode_v))
+ die("invalid bpos.inode %s", inode_s);
+
+ if (kstrtoull_symbolic(offset_s, 10, &offset_v))
+ die("invalid bpos.offset %s", offset_s);
+
+ if (snapshot_s &&
+ kstrtouint_symbolic(snapshot_s, 10, &snapshot_v))
+ die("invalid bpos.snapshot %s", snapshot_s);
+
+ return (struct bpos) { .inode = inode_v, .offset = offset_v, .snapshot = snapshot_v };
+}
+
+struct bbpos bbpos_parse(char *buf)
+{
+ char *s = buf, *field;
+ struct bbpos ret;
+
+ if (!(field = strsep(&s, ":")))
+ die("invalid bbpos %s", buf);
+
+ ret.btree = read_string_list_or_die(field, __bch2_btree_ids, "btree id");
+
+ if (!s)
+ die("invalid bbpos %s", buf);
+
+ ret.pos = bpos_parse(s);
+ return ret;
+}
+
+struct bbpos_range bbpos_range_parse(char *buf)
+{
+ char *s = buf;
+ char *start_str = strsep(&s, "-");
+ char *end_str = strsep(&s, "-");
+
+ struct bbpos start = bbpos_parse(start_str);
+ struct bbpos end = end_str ? bbpos_parse(end_str) : start;
+
+ return (struct bbpos_range) { .start = start, .end = end };
+}
+
+darray_str get_or_split_cmdline_devs(int argc, char *argv[])
+{
+ darray_str ret = {};
+
+ if (argc == 1) {
+ bch2_split_devs(argv[0], &ret);
+ } else {
+ for (unsigned i = 0; i < argc; i++)
+ darray_push(&ret, strdup(argv[i]));
+ }
+
return ret;
}
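For illustration, a hypothetical snippet (not part of this commit) showing how the new position parsers consume command-line style arguments of the form btree:inode:offset[:snapshot], optionally joined by '-' into a range; U32_MAX and U64_MAX are accepted symbolically:

#include "tools-util.h"	/* bbpos_range_parse(), struct bbpos_range */

static struct bbpos_range parse_range_example(void)
{
	/* the parsers modify their input, so use a writable buffer */
	char arg[] = "extents:4096:0-extents:8192:0";

	/*
	 * start: btree extents, inode 4096, offset 0, snapshot 0
	 * end:   btree extents, inode 8192, offset 0, snapshot 0
	 */
	return bbpos_range_parse(arg);
}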
diff --git a/tools-util.h b/c_src/tools-util.h
index 57f61e50..572aca05 100644
--- a/tools-util.h
+++ b/c_src/tools-util.h
@@ -2,6 +2,7 @@
#define _TOOLS_UTIL_H
#include <errno.h>
+#include <mntent.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
@@ -17,20 +18,52 @@
#include <linux/string.h>
#include <linux/types.h>
#include <linux/uuid.h>
-#include "ccan/darray/darray.h"
+#include "libbcachefs/bcachefs.h"
+#include "libbcachefs/bbpos.h"
+#include "libbcachefs/darray.h"
-void die(const char *, ...);
+#define noreturn __attribute__((noreturn))
+
+void die(const char *, ...)
+ __attribute__ ((format (printf, 1, 2))) noreturn;
char *mprintf(const char *, ...)
__attribute__ ((format (printf, 1, 2)));
-void *xcalloc(size_t, size_t);
-void *xmalloc(size_t);
-void *xrealloc(void *, size_t);
void xpread(int, void *, size_t, off_t);
-void xpwrite(int, const void *, size_t, off_t);
+void xpwrite(int, const void *, size_t, off_t, const char *);
struct stat xfstatat(int, const char *, int);
struct stat xfstat(int);
struct stat xstat(const char *);
+static inline void *xmalloc(size_t size)
+{
+ void *p = malloc(size);
+
+ if (!p)
+ die("insufficient memory");
+
+ memset(p, 0, size);
+ return p;
+}
+
+static inline void *xcalloc(size_t count, size_t size)
+{
+ void *p = calloc(count, size);
+
+ if (!p)
+ die("insufficient memory");
+
+ return p;
+}
+
+static inline void *xrealloc(void *p, size_t size)
+{
+ p = realloc(p, size);
+ if (!p)
+ die("insufficient memory");
+
+ return p;
+}
+
#define xopenat(_dirfd, _path, ...) \
({ \
int _fd = openat((_dirfd), (_path), __VA_ARGS__); \
@@ -49,31 +82,17 @@ struct stat xstat(const char *);
_ret; \
})
-int printf_pad(unsigned pad, const char * fmt, ...);
-
-enum units {
- BYTES,
- SECTORS,
- HUMAN_READABLE,
-};
-
-struct units_buf __pr_units(s64, enum units);
-
-struct units_buf {
- char b[20];
-};
-
-#define pr_units(_v, _u) &(__pr_units(_v, _u).b[0])
-
+void write_file_str(int, const char *, const char *);
char *read_file_str(int, const char *);
u64 read_file_u64(int, const char *);
ssize_t read_string_list_or_die(const char *, const char * const[],
const char *);
-u64 get_size(const char *, int);
-unsigned get_blocksize(const char *, int);
-int open_for_format(const char *, bool);
+u64 get_size(int);
+unsigned get_blocksize(int);
+struct dev_opts;
+int open_for_format(struct dev_opts *, bool);
bool ask_yn(void);
@@ -82,14 +101,14 @@ struct range {
u64 end;
};
-typedef darray(struct range) ranges;
+typedef DARRAY(struct range) ranges;
static inline void range_add(ranges *data, u64 offset, u64 size)
{
- darray_append(*data, (struct range) {
+ darray_push(data, ((struct range) {
.start = offset,
.end = offset + size
- });
+ }));
}
void ranges_sort_merge(ranges *);
@@ -105,9 +124,9 @@ struct hole_iter {
static inline struct range hole_iter_next(struct hole_iter *iter)
{
struct range r = {
- .start = iter->idx ? iter->r.item[iter->idx - 1].end : 0,
- .end = iter->idx < iter->r.size
- ? iter->r.item[iter->idx].start : iter->end,
+ .start = iter->idx ? iter->r.data[iter->idx - 1].end : 0,
+ .end = iter->idx < iter->r.nr
+ ? iter->r.data[iter->idx].start : iter->end,
};
BUG_ON(r.start > r.end);
@@ -118,14 +137,13 @@ static inline struct range hole_iter_next(struct hole_iter *iter)
#define for_each_hole(_iter, _ranges, _end, _i) \
for (_iter = (struct hole_iter) { .r = _ranges, .end = _end }; \
- (_iter.idx <= _iter.r.size && \
+ (_iter.idx <= _iter.r.nr && \
(_i = hole_iter_next(&_iter), true));)
#include <linux/fiemap.h>
struct fiemap_iter {
- struct fiemap f;
- struct fiemap_extent fe[1024];
+ struct fiemap *f;
unsigned idx;
int fd;
};
@@ -134,26 +152,37 @@ static inline void fiemap_iter_init(struct fiemap_iter *iter, int fd)
{
memset(iter, 0, sizeof(*iter));
- iter->f.fm_extent_count = ARRAY_SIZE(iter->fe);
- iter->f.fm_length = FIEMAP_MAX_OFFSET;
+ iter->f = xmalloc(sizeof(struct fiemap) +
+ sizeof(struct fiemap_extent) * 1024);
+
+ iter->f->fm_extent_count = 1024;
+ iter->f->fm_length = FIEMAP_MAX_OFFSET;
iter->fd = fd;
}
+static inline void fiemap_iter_exit(struct fiemap_iter *iter)
+{
+ free(iter->f);
+ memset(iter, 0, sizeof(*iter));
+}
+
struct fiemap_extent fiemap_iter_next(struct fiemap_iter *);
#define fiemap_for_each(fd, iter, extent) \
for (fiemap_iter_init(&iter, fd); \
(extent = fiemap_iter_next(&iter)).fe_length;)
-const char *strcmp_prefix(const char *, const char *);
-
-unsigned hatoi_validate(const char *, const char *);
+char *strcmp_prefix(char *, const char *);
+/* Avoid conflicts with libblkid's crc32 function in static builds */
+#define crc32c bch_crc32c
u32 crc32c(u32, const void *, size_t);
char *dev_to_name(dev_t);
char *dev_to_path(dev_t);
-char *dev_to_mount(char *);
+struct mntent *dev_to_mount(char *);
+int dev_mounted(char *);
+char *fd_to_dev_model(int);
#define args_shift(_nr) \
do { \
@@ -170,4 +199,16 @@ do { \
_ret; \
})
+struct bpos bpos_parse(char *);
+struct bbpos bbpos_parse(char *);
+
+struct bbpos_range {
+ struct bbpos start;
+ struct bbpos end;
+};
+
+struct bbpos_range bbpos_range_parse(char *);
+
+darray_str get_or_split_cmdline_devs(int argc, char *argv[]);
+
#endif /* _TOOLS_UTIL_H */
diff --git a/ccan/compiler/compiler.h b/ccan/compiler/compiler.h
index bce4f25a..bcfe32ee 100644
--- a/ccan/compiler/compiler.h
+++ b/ccan/compiler/compiler.h
@@ -97,7 +97,7 @@
* UNNEEDED - a variable/function may not be needed
*
* This suppresses warnings about unused variables or functions, but tells
- * the compiler that if it is unused it needs not emit it into the source code.
+ * the compiler that if it is unused it need not emit it into the source code.
*
* Example:
* // With some preprocessor options, this is unnecessary.
diff --git a/ccan/darray/LICENSE b/ccan/darray/LICENSE
deleted file mode 100644
index 89de3547..00000000
--- a/ccan/darray/LICENSE
+++ /dev/null
@@ -1,17 +0,0 @@
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
diff --git a/ccan/darray/_info b/ccan/darray/_info
deleted file mode 100644
index b6d5e4ba..00000000
--- a/ccan/darray/_info
+++ /dev/null
@@ -1,57 +0,0 @@
-#include "config.h"
-#include <stdio.h>
-#include <string.h>
-
-#include "ccan/darray/darray.h"
-
-/**
- * darray - Generic resizable arrays
- *
- * darray is a set of macros for managing dynamically-allocated arrays.
- * It removes the tedium of managing realloc'd arrays with pointer, size, and
- * allocated size.
- *
- * Example:
- * #include <ccan/darray/darray.h>
- * #include <stdio.h>
- *
- * int main(void) {
- * darray(int) numbers = darray_new();
- * char buffer[32];
- *
- * for (;;) {
- * int *i;
- * darray_foreach(i, numbers)
- * printf("%d ", *i);
- * if (darray_size(numbers) > 0)
- * puts("");
- *
- * printf("darray> ");
- * fgets(buffer, sizeof(buffer), stdin);
- * if (*buffer == '\0' || *buffer == '\n')
- * break;
- *
- * darray_append(numbers, atoi(buffer));
- * }
- *
- * darray_free(numbers);
- *
- * return 0;
- * }
- *
- * Author: Joey Adams <joeyadams3.14159@gmail.com>
- * License: MIT
- * Version: 0.2
- */
-int main(int argc, char *argv[])
-{
- if (argc != 2)
- return 1;
-
- if (strcmp(argv[1], "depends") == 0) {
- /* Nothing. */
- return 0;
- }
-
- return 1;
-}
diff --git a/ccan/darray/darray.h b/ccan/darray/darray.h
deleted file mode 100644
index 75112419..00000000
--- a/ccan/darray/darray.h
+++ /dev/null
@@ -1,355 +0,0 @@
-/*
- * Copyright (C) 2011 Joseph Adams <joeyadams3.14159@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#ifndef CCAN_DARRAY_H
-#define CCAN_DARRAY_H
-
-#include <stdlib.h>
-#include <string.h>
-#include "config.h"
-
-/*
- * SYNOPSIS
- *
- * Life cycle of a darray (dynamically-allocated array):
- *
- * darray(int) a = darray_new();
- * darray_free(a);
- *
- * struct {darray(int) a;} foo;
- * darray_init(foo.a);
- * darray_free(foo.a);
- *
- * Typedefs for darrays of common types:
- *
- * darray_char, darray_schar, darray_uchar
- * darray_short, darray_int, darray_long
- * darray_ushort, darray_uint, darray_ulong
- *
- * Access:
- *
- * T darray_item(darray(T) arr, size_t index);
- * size_t darray_size(darray(T) arr);
- * size_t darray_alloc(darray(T) arr);
- * bool darray_empty(darray(T) arr);
- *
- * Insertion (single item):
- *
- * void darray_append(darray(T) arr, T item);
- * void darray_prepend(darray(T) arr, T item);
- * void darray_push(darray(T) arr, T item); // same as darray_append
- *
- * Insertion (multiple items):
- *
- * void darray_append_items(darray(T) arr, T *items, size_t count);
- * void darray_prepend_items(darray(T) arr, T *items, size_t count);
- *
- * void darray_appends(darray(T) arr, [T item, [...]]);
- * void darray_prepends(darray(T) arr, [T item, [...]]);
- *
- * // Same functionality as above, but does not require typeof.
- * void darray_appends_t(darray(T) arr, #T, [T item, [...]]);
- * void darray_prepends_t(darray(T) arr, #T, [T item, [...]]);
- *
- * Removal:
- *
- * T darray_pop(darray(T) arr | darray_size(arr) != 0);
- * T* darray_pop_check(darray(T*) arr);
- * void darray_remove(darray(T) arr, size_t index);
- *
- * Replacement:
- *
- * void darray_from_items(darray(T) arr, T *items, size_t count);
- * void darray_from_c(darray(T) arr, T c_array[N]);
- *
- * String buffer:
- *
- * void darray_append_string(darray(char) arr, const char *str);
- * void darray_append_lit(darray(char) arr, char stringLiteral[N+1]);
- *
- * void darray_prepend_string(darray(char) arr, const char *str);
- * void darray_prepend_lit(darray(char) arr, char stringLiteral[N+1]);
- *
- * void darray_from_string(darray(T) arr, const char *str);
- * void darray_from_lit(darray(char) arr, char stringLiteral[N+1]);
- *
- * Size management:
- *
- * void darray_resize(darray(T) arr, size_t newSize);
- * void darray_resize0(darray(T) arr, size_t newSize);
- *
- * void darray_realloc(darray(T) arr, size_t newAlloc);
- * void darray_growalloc(darray(T) arr, size_t newAlloc);
- *
- * void darray_make_room(darray(T) arr, size_t room);
- *
- * Traversal:
- *
- * darray_foreach(T *&i, darray(T) arr) {...}
- * darray_foreach_reverse(T *&i, darray(T) arr) {...}
- *
- * Except for darray_foreach, darray_foreach_reverse, and darray_remove,
- * all macros evaluate their non-darray arguments only once.
- */
-
-/*** Life cycle ***/
-
-#define darray(type) struct {type *item; size_t size; size_t alloc;}
-
-#define darray_new() {0,0,0}
-#define darray_init(arr) do {(arr).item=0; (arr).size=0; (arr).alloc=0;} while(0)
-#define darray_free(arr) do {free((arr).item);} while(0)
-
-
-/*
- * Typedefs for darrays of common types. These are useful
- * when you want to pass a pointer to an darray(T) around.
- *
- * The following will produce an incompatible pointer warning:
- *
- * void foo(darray(int) *arr);
- * darray(int) arr = darray_new();
- * foo(&arr);
- *
- * The workaround:
- *
- * void foo(darray_int *arr);
- * darray_int arr = darray_new();
- * foo(&arr);
- */
-
-typedef darray(char) darray_char;
-typedef darray(signed char) darray_schar;
-typedef darray(unsigned char) darray_uchar;
-
-typedef darray(short) darray_short;
-typedef darray(int) darray_int;
-typedef darray(long) darray_long;
-
-typedef darray(unsigned short) darray_ushort;
-typedef darray(unsigned int) darray_uint;
-typedef darray(unsigned long) darray_ulong;
-
-
-/*** Access ***/
-
-#define darray_item(arr, i) ((arr).item[i])
-#define darray_size(arr) ((arr).size)
-#define darray_alloc(arr) ((arr).alloc)
-#define darray_empty(arr) ((arr).size == 0)
-
-
-/*** Insertion (single item) ***/
-
-#define darray_append(arr, ...) do { \
- darray_resize(arr, (arr).size+1); \
- (arr).item[(arr).size-1] = (__VA_ARGS__); \
- } while(0)
-#define darray_prepend(arr, ...) do { \
- darray_resize(arr, (arr).size+1); \
- memmove((arr).item+1, (arr).item, ((arr).size-1)*sizeof(*(arr).item)); \
- (arr).item[0] = (__VA_ARGS__); \
- } while(0)
-#define darray_push(arr, ...) darray_append(arr, __VA_ARGS__)
-
-
-/*** Insertion (multiple items) ***/
-
-#define darray_append_items(arr, items, count) do { \
- size_t __count = (count), __oldSize = (arr).size; \
- darray_resize(arr, __oldSize + __count); \
- memcpy((arr).item + __oldSize, items, __count * sizeof(*(arr).item)); \
- } while(0)
-
-#define darray_prepend_items(arr, items, count) do { \
- size_t __count = (count), __oldSize = (arr).size; \
- darray_resize(arr, __count + __oldSize); \
- memmove((arr).item + __count, (arr).item, __oldSize * sizeof(*(arr).item)); \
- memcpy((arr).item, items, __count * sizeof(*(arr).item)); \
- } while(0)
-
-#define darray_append_items_nullterminate(arr, items, count) do { \
- size_t __count = (count), __oldSize = (arr).size; \
- darray_resize(arr, __oldSize + __count + 1); \
- memcpy((arr).item + __oldSize, items, __count * sizeof(*(arr).item)); \
- (arr).item[--(arr).size] = 0; \
- } while(0)
-
-#define darray_prepend_items_nullterminate(arr, items, count) do { \
- size_t __count = (count), __oldSize = (arr).size; \
- darray_resize(arr, __count + __oldSize + 1); \
- memmove((arr).item + __count, (arr).item, __oldSize * sizeof(*(arr).item)); \
- memcpy((arr).item, items, __count * sizeof(*(arr).item)); \
- (arr).item[--(arr).size] = 0; \
- } while(0)
-
-#if HAVE_TYPEOF
-#define darray_appends(arr, ...) darray_appends_t(arr, typeof((*(arr).item)), __VA_ARGS__)
-#define darray_prepends(arr, ...) darray_prepends_t(arr, typeof((*(arr).item)), __VA_ARGS__)
-#endif
-
-#define darray_appends_t(arr, type, ...) do { \
- type __src[] = {__VA_ARGS__}; \
- darray_append_items(arr, __src, sizeof(__src)/sizeof(*__src)); \
- } while(0)
-#define darray_prepends_t(arr, type, ...) do { \
- type __src[] = {__VA_ARGS__}; \
- darray_prepend_items(arr, __src, sizeof(__src)/sizeof(*__src)); \
- } while(0)
-
-
-/*** Removal ***/
-
-/* Warning: Do not call darray_pop on an empty darray. */
-#define darray_pop(arr) ((arr).item[--(arr).size])
-#define darray_pop_check(arr) ((arr).size ? darray_pop(arr) : NULL)
-/* Warning, slow: Requires copying all elements after removed item. */
-#define darray_remove(arr, index) do { \
- if (index < arr.size-1) \
- memmove(&(arr).item[index], &(arr).item[index+1], ((arr).size-1-i)*sizeof(*(arr).item)); \
- (arr).size--; \
- } while(0)
-
-
-/*** Replacement ***/
-
-#define darray_from_items(arr, items, count) do {size_t __count = (count); darray_resize(arr, __count); memcpy((arr).item, items, __count*sizeof(*(arr).item));} while(0)
-#define darray_from_c(arr, c_array) darray_from_items(arr, c_array, sizeof(c_array)/sizeof(*(c_array)))
-
-
-/*** String buffer ***/
-
-#define darray_append_string(arr, str) do {const char *__str = (str); darray_append_items(arr, __str, strlen(__str)+1); (arr).size--;} while(0)
-#define darray_append_lit(arr, stringLiteral) do {darray_append_items(arr, stringLiteral, sizeof(stringLiteral)); (arr).size--;} while(0)
-
-#define darray_prepend_string(arr, str) do { \
- const char *__str = (str); \
- darray_prepend_items_nullterminate(arr, __str, strlen(__str)); \
- } while(0)
-#define darray_prepend_lit(arr, stringLiteral) \
- darray_prepend_items_nullterminate(arr, stringLiteral, sizeof(stringLiteral) - 1)
-
-#define darray_from_string(arr, str) do {const char *__str = (str); darray_from_items(arr, __str, strlen(__str)+1); (arr).size--;} while(0)
-#define darray_from_lit(arr, stringLiteral) do {darray_from_items(arr, stringLiteral, sizeof(stringLiteral)); (arr).size--;} while(0)
-
-
-/*** Size management ***/
-
-#define darray_resize(arr, newSize) darray_growalloc(arr, (arr).size = (newSize))
-#define darray_resize0(arr, newSize) do { \
- size_t __oldSize = (arr).size, __newSize = (newSize); \
- (arr).size = __newSize; \
- if (__newSize > __oldSize) { \
- darray_growalloc(arr, __newSize); \
- memset(&(arr).item[__oldSize], 0, (__newSize - __oldSize) * sizeof(*(arr).item)); \
- } \
- } while(0)
-
-#define darray_realloc(arr, newAlloc) do { \
- (arr).item = realloc((arr).item, ((arr).alloc = (newAlloc)) * sizeof(*(arr).item)); \
- } while(0)
-#define darray_growalloc(arr, need) do { \
- size_t __need = (need); \
- if (__need > (arr).alloc) \
- darray_realloc(arr, darray_next_alloc((arr).alloc, __need)); \
- } while(0)
-
-#if HAVE_STATEMENT_EXPR==1
-#define darray_make_room(arr, room) ({size_t newAlloc = (arr).size+(room); if ((arr).alloc<newAlloc) darray_realloc(arr, newAlloc); (arr).item+(arr).size; })
-#endif
-
-static inline size_t darray_next_alloc(size_t alloc, size_t need)
-{
- if (alloc == 0)
- alloc = 1;
- while (alloc < need)
- alloc *= 2;
- return alloc;
-}
-
-
-/*** Traversal ***/
-
-/*
- * darray_foreach(T *&i, darray(T) arr) {...}
- *
- * Traverse a darray. `i` must be declared in advance as a pointer to an item.
- */
-#define darray_foreach(i, arr) \
- for ((i) = &(arr).item[0]; (i) < &(arr).item[(arr).size]; (i)++)
-
-/*
- * darray_foreach_reverse(T *&i, darray(T) arr) {...}
- *
- * Like darray_foreach, but traverse in reverse order.
- */
-#define darray_foreach_reverse(i, arr) \
- for ((i) = &(arr).item[(arr).size]; (i)-- > &(arr).item[0]; )
-
-
-#endif /* CCAN_DARRAY_H */
-
-/*
-
-darray_growalloc(arr, newAlloc) sees if the darray can currently hold newAlloc items;
- if not, it increases the alloc to satisfy this requirement, allocating slack
- space to avoid having to reallocate for every size increment.
-
-darray_from_string(arr, str) copies a string to an darray_char.
-
-darray_push(arr, item) pushes an item to the end of the darray.
-darray_pop(arr) pops it back out. Be sure there is at least one item in the darray before calling.
-darray_pop_check(arr) does the same as darray_pop, but returns NULL if there are no more items left in the darray.
-
-darray_make_room(arr, room) ensures there's 'room' elements of space after the end of the darray, and it returns a pointer to this space.
-Currently requires HAVE_STATEMENT_EXPR, but I plan to remove this dependency by creating an inline function.
-
-The following require HAVE_TYPEOF==1 :
-
-darray_appends(arr, item0, item1...) appends a collection of comma-delimited items to the darray.
-darray_prepends(arr, item0, item1...) prepends a collection of comma-delimited items to the darray.\
-
-
-Examples:
-
- darray(int) arr;
- int *i;
-
- darray_appends(arr, 0,1,2,3,4);
- darray_appends(arr, -5,-4,-3,-2,-1);
- darray_foreach(i, arr)
- printf("%d ", *i);
- printf("\n");
-
- darray_free(arr);
-
-
- typedef struct {int n,d;} Fraction;
- darray(Fraction) fractions;
- Fraction *i;
-
- darray_appends(fractions, {3,4}, {3,5}, {2,1});
- darray_foreach(i, fractions)
- printf("%d/%d\n", i->n, i->d);
-
- darray_free(fractions);
-*/
diff --git a/cmd_data.c b/cmd_data.c
deleted file mode 100644
index f495b6c0..00000000
--- a/cmd_data.c
+++ /dev/null
@@ -1,48 +0,0 @@
-
-
-#include <stdio.h>
-#include <sys/ioctl.h>
-
-#include "libbcachefs/bcachefs_ioctl.h"
-
-#include "cmds.h"
-#include "libbcachefs.h"
-
-static void data_rereplicate_usage(void)
-{
- puts("bcachefs data rereplicate\n"
- "Usage: bcachefs data rereplicate filesystem\n"
- "\n"
- "Walks existing data in a filesystem, writing additional copies\n"
- "of any degraded data\n"
- "\n"
- "Options:\n"
- " -h, --help display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
- exit(EXIT_SUCCESS);
-}
-
-int cmd_data_rereplicate(int argc, char *argv[])
-{
- int opt;
-
- while ((opt = getopt(argc, argv, "h")) != -1)
- switch (opt) {
- case 'h':
- data_rereplicate_usage();
- }
- args_shift(optind);
-
- char *fs_path = arg_pop();
- if (!fs_path)
- die("Please supply a filesystem");
-
- if (argc)
- die("too many arguments");
-
- return bchu_data(bcache_fs_open(fs_path), (struct bch_ioctl_data) {
- .op = BCH_DATA_OP_REREPLICATE,
- .start = POS_MIN,
- .end = POS_MAX,
- });
-}
diff --git a/cmd_debug.c b/cmd_debug.c
deleted file mode 100644
index 11d73b35..00000000
--- a/cmd_debug.c
+++ /dev/null
@@ -1,332 +0,0 @@
-#include <fcntl.h>
-#include <string.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-
-#include "cmds.h"
-#include "libbcachefs.h"
-#include "qcow2.h"
-#include "tools-util.h"
-
-#include "libbcachefs/bcachefs.h"
-#include "libbcachefs/alloc.h"
-#include "libbcachefs/bset.h"
-#include "libbcachefs/btree_cache.h"
-#include "libbcachefs/btree_iter.h"
-#include "libbcachefs/buckets.h"
-#include "libbcachefs/error.h"
-#include "libbcachefs/journal.h"
-#include "libbcachefs/super.h"
-
-static void dump_usage(void)
-{
- puts("bcachefs dump - dump filesystem metadata\n"
- "Usage: bcachefs dump [OPTION]... <devices>\n"
- "\n"
- "Options:\n"
- " -o output Output qcow2 image(s)\n"
- " -f Force; overwrite when needed\n"
- " -h Display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
-}
-
-static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd)
-{
- struct bch_sb *sb = ca->disk_sb.sb;
- ranges data;
- unsigned i;
-
- darray_init(data);
-
- /* Superblock: */
- range_add(&data, BCH_SB_LAYOUT_SECTOR << 9,
- sizeof(struct bch_sb_layout));
-
- for (i = 0; i < sb->layout.nr_superblocks; i++)
- range_add(&data,
- le64_to_cpu(sb->layout.sb_offset[i]) << 9,
- vstruct_bytes(sb));
-
- /* Journal: */
- for (i = 0; i < ca->journal.nr; i++)
- if (ca->journal.bucket_seq[i] >= c->journal.last_seq_ondisk) {
- u64 bucket = ca->journal.buckets[i];
-
- range_add(&data,
- bucket_bytes(ca) * bucket,
- bucket_bytes(ca));
- }
-
- /* Btree: */
- for (i = 0; i < BTREE_ID_NR; i++) {
- const struct bch_extent_ptr *ptr;
- struct btree_iter iter;
- struct btree *b;
-
- for_each_btree_node(&iter, c, i, POS_MIN, 0, b) {
- struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
-
- extent_for_each_ptr(e, ptr)
- if (ptr->dev == ca->dev_idx)
- range_add(&data,
- ptr->offset << 9,
- b->written << 9);
- }
- bch2_btree_iter_unlock(&iter);
- }
-
- qcow2_write_image(ca->disk_sb.bdev->bd_fd, fd, &data,
- max_t(unsigned, btree_bytes(c) / 8, block_bytes(c)));
-}
-
-int cmd_dump(int argc, char *argv[])
-{
- struct bch_opts opts = bch2_opts_empty();
- struct bch_dev *ca;
- char *out = NULL;
- unsigned i, nr_devices = 0;
- bool force = false;
- int fd, opt;
-
- opt_set(opts, nochanges, true);
- opt_set(opts, noreplay, true);
- opt_set(opts, degraded, true);
- opt_set(opts, errors, BCH_ON_ERROR_CONTINUE);
-
- while ((opt = getopt(argc, argv, "o:fh")) != -1)
- switch (opt) {
- case 'o':
- out = optarg;
- break;
- case 'f':
- force = true;
- break;
- case 'h':
- dump_usage();
- exit(EXIT_SUCCESS);
- }
- args_shift(optind);
-
- if (!out)
- die("Please supply output filename");
-
- if (!argc)
- die("Please supply device(s) to check");
-
- struct bch_fs *c = bch2_fs_open(argv, argc, opts);
- if (IS_ERR(c))
- die("error opening %s: %s", argv[0], strerror(-PTR_ERR(c)));
-
- down_read(&c->gc_lock);
-
- for_each_online_member(ca, c, i)
- nr_devices++;
-
- BUG_ON(!nr_devices);
-
- for_each_online_member(ca, c, i) {
- int flags = O_WRONLY|O_CREAT|O_TRUNC;
-
- if (!force)
- flags |= O_EXCL;
-
- if (!c->devs[i])
- continue;
-
- char *path = nr_devices > 1
- ? mprintf("%s.%u", out, i)
- : strdup(out);
- fd = xopen(path, flags, 0600);
- free(path);
-
- dump_one_device(c, ca, fd);
- close(fd);
- }
-
- up_read(&c->gc_lock);
-
- bch2_fs_stop(c);
- return 0;
-}
-
-static void list_keys(struct bch_fs *c, enum btree_id btree_id,
- struct bpos start, struct bpos end)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- char buf[512];
-
- for_each_btree_key(&iter, c, btree_id, start,
- BTREE_ITER_PREFETCH, k) {
- if (bkey_cmp(k.k->p, end) > 0)
- break;
-
- bch2_bkey_val_to_text(c, bkey_type(0, btree_id),
- buf, sizeof(buf), k);
- puts(buf);
- }
- bch2_btree_iter_unlock(&iter);
-}
-
-static void list_btree_formats(struct bch_fs *c, enum btree_id btree_id,
- struct bpos start, struct bpos end)
-{
- struct btree_iter iter;
- struct btree *b;
- char buf[4096];
-
- for_each_btree_node(&iter, c, btree_id, start, 0, b) {
- if (bkey_cmp(b->key.k.p, end) > 0)
- break;
-
- bch2_print_btree_node(c, b, buf, sizeof(buf));
- puts(buf);
- }
- bch2_btree_iter_unlock(&iter);
-}
-
-static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id,
- struct bpos start, struct bpos end)
-{
- struct btree_iter iter;
- struct btree_node_iter node_iter;
- struct bkey unpacked;
- struct bkey_s_c k;
- struct btree *b;
- char buf[4096];
-
- for_each_btree_node(&iter, c, btree_id, start, 0, b) {
- if (bkey_cmp(b->key.k.p, end) > 0)
- break;
-
- bch2_print_btree_node(c, b, buf, sizeof(buf));
- fputs(buf, stdout);
-
- buf[0] = '\t';
-
- for_each_btree_node_key_unpack(b, k, &node_iter,
- btree_node_is_extents(b),
- &unpacked) {
- bch2_bkey_val_to_text(c, bkey_type(0, btree_id),
- buf + 1, sizeof(buf) - 1, k);
- puts(buf);
- }
- }
- bch2_btree_iter_unlock(&iter);
-}
-
-static struct bpos parse_pos(char *buf)
-{
- char *s = buf, *field;
- u64 inode_v = 0, offset_v = 0;
-
- if (!(field = strsep(&s, ":")) ||
- kstrtoull(field, 10, &inode_v))
- die("invalid bpos %s", buf);
-
- if ((field = strsep(&s, ":")) &&
- kstrtoull(field, 10, &offset_v))
- die("invalid bpos %s", buf);
-
- if (s)
- die("invalid bpos %s", buf);
-
- return (struct bpos) { .inode = inode_v, .offset = offset_v };
-}
-
-static void list_keys_usage(void)
-{
- puts("bcachefs list - list filesystem metadata to stdout\n"
- "Usage: bcachefs list [OPTION]... <devices>\n"
- "\n"
- "Options:\n"
- " -b (extents|inodes|dirents|xattrs) Btree to list from\n"
- " -s inode:offset Start position to list from\n"
- " -e inode:offset End position\n"
- " -i inode List keys for a given inode number\n"
- " -m (keys|formats) List mode\n"
- " -f Check (fsck) the filesystem first\n"
- " -v Verbose mode\n"
- " -h Display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
-}
-
-static const char * const list_modes[] = {
- "keys",
- "formats",
- "nodes",
- NULL
-};
-
-int cmd_list(int argc, char *argv[])
-{
- struct bch_opts opts = bch2_opts_empty();
- enum btree_id btree_id = BTREE_ID_EXTENTS;
- struct bpos start = POS_MIN, end = POS_MAX;
- u64 inum;
- int mode = 0, opt;
-
- opt_set(opts, nochanges, true);
- opt_set(opts, norecovery, true);
- opt_set(opts, degraded, true);
- opt_set(opts, errors, BCH_ON_ERROR_CONTINUE);
-
- while ((opt = getopt(argc, argv, "b:s:e:i:m:fvh")) != -1)
- switch (opt) {
- case 'b':
- btree_id = read_string_list_or_die(optarg,
- bch2_btree_ids, "btree id");
- break;
- case 's':
- start = parse_pos(optarg);
- break;
- case 'e':
- end = parse_pos(optarg);
- break;
- case 'i':
- if (kstrtoull(optarg, 10, &inum))
- die("invalid inode %s", optarg);
- start = POS(inum, 0);
- end = POS(inum + 1, 0);
- break;
- case 'm':
- mode = read_string_list_or_die(optarg,
- list_modes, "list mode");
- break;
- case 'f':
- opt_set(opts, fix_errors, FSCK_OPT_YES);
- opt_set(opts, norecovery, false);
- break;
- case 'v':
- opt_set(opts, verbose_recovery, true);
- break;
- case 'h':
- list_keys_usage();
- exit(EXIT_SUCCESS);
- }
- args_shift(optind);
-
- if (!argc)
- die("Please supply device(s)");
-
- struct bch_fs *c = bch2_fs_open(argv, argc, opts);
- if (IS_ERR(c))
- die("error opening %s: %s", argv[0], strerror(-PTR_ERR(c)));
-
- switch (mode) {
- case 0:
- list_keys(c, btree_id, start, end);
- break;
- case 1:
- list_btree_formats(c, btree_id, start, end);
- break;
- case 2:
- list_nodes_keys(c, btree_id, start, end);
- break;
- default:
- die("Invalid mode");
- }
-
- bch2_fs_stop(c);
- return 0;
-}
diff --git a/cmd_device.c b/cmd_device.c
deleted file mode 100644
index 428d3047..00000000
--- a/cmd_device.c
+++ /dev/null
@@ -1,434 +0,0 @@
-#include <errno.h>
-#include <fcntl.h>
-#include <getopt.h>
-#include <libgen.h>
-#include <stdbool.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/ioctl.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-#include "libbcachefs/bcachefs_ioctl.h"
-#include "libbcachefs/super-io.h"
-#include "cmds.h"
-#include "libbcachefs.h"
-#include "libbcachefs/opts.h"
-#include "tools-util.h"
-
-static void device_add_usage(void)
-{
- puts("bcachefs device add - add a device to an existing filesystem\n"
- "Usage: bcachefs device add [OPTION]... filesystem device\n"
- "\n"
- "Options:\n"
- " --fs_size=size Size of filesystem on device\n"
- " --bucket=size Bucket size\n"
- " --discard Enable discards\n"
- " -t, --tier=# Higher tier (e.g. 1) indicates slower devices\n"
- " -f, --force Use device even if it appears to already be formatted\n"
- " -h, --help Display this help and exit\n"
- "\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
-}
-
-int cmd_device_add(int argc, char *argv[])
-{
- static const struct option longopts[] = {
- { "fs_size", required_argument, NULL, 'S' },
- { "bucket", required_argument, NULL, 'B' },
- { "discard", no_argument, NULL, 'D' },
- { "group", required_argument, NULL, 'g' },
- { "force", no_argument, NULL, 'f' },
- { "help", no_argument, NULL, 'h' },
- { NULL }
- };
- struct format_opts format_opts = format_opts_default();
- struct dev_opts dev_opts = dev_opts_default();
- bool force = false;
- int opt;
-
- while ((opt = getopt_long(argc, argv, "t:fh",
- longopts, NULL)) != -1)
- switch (opt) {
- case 'S':
- if (bch2_strtoull_h(optarg, &dev_opts.size))
- die("invalid filesystem size");
-
- dev_opts.size >>= 9;
- break;
- case 'B':
- dev_opts.bucket_size =
- hatoi_validate(optarg, "bucket size");
- break;
- case 'D':
- dev_opts.discard = true;
- break;
- case 'g':
- dev_opts.group = strdup(optarg);
- break;
- case 'f':
- force = true;
- break;
- case 'h':
- device_add_usage();
- exit(EXIT_SUCCESS);
- }
- args_shift(optind);
-
- char *fs_path = arg_pop();
- if (!fs_path)
- die("Please supply a filesystem");
-
- char *dev_path = arg_pop();
- if (!dev_path)
- die("Please supply a device");
-
- if (argc)
- die("too many arguments");
-
- struct bchfs_handle fs = bcache_fs_open(fs_path);
-
- dev_opts.path = dev_path;
- dev_opts.fd = open_for_format(dev_opts.path, force);
-
- format_opts.block_size =
- read_file_u64(fs.sysfs_fd, "block_size") >> 9;
- format_opts.btree_node_size =
- read_file_u64(fs.sysfs_fd, "btree_node_size") >> 9;
-
- struct bch_sb *sb = bch2_format(format_opts, &dev_opts, 1);
- free(sb);
- fsync(dev_opts.fd);
- close(dev_opts.fd);
-
- bchu_disk_add(fs, dev_opts.path);
- return 0;
-}
-
-static void device_remove_usage(void)
-{
- puts("bcachefs device_remove - remove a device from a filesystem\n"
- "Usage: bcachefs device remove device\n"
- "\n"
- "Options:\n"
- " -f, --force Force removal, even if some data\n"
- " couldn't be migrated\n"
- " --force-metadata Force removal, even if some metadata\n"
- " couldn't be migrated\n"
- " -h, --help display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
- exit(EXIT_SUCCESS);
-}
-
-int cmd_device_remove(int argc, char *argv[])
-{
- static const struct option longopts[] = {
- { "force", 0, NULL, 'f' },
- { "force-metadata", 0, NULL, 'F' },
- { "help", 0, NULL, 'h' },
- { NULL }
- };
- int opt, flags = BCH_FORCE_IF_DEGRADED;
-
- while ((opt = getopt_long(argc, argv, "fh", longopts, NULL)) != -1)
- switch (opt) {
- case 'f':
- flags |= BCH_FORCE_IF_DATA_LOST;
- break;
- case 'F':
- flags |= BCH_FORCE_IF_METADATA_LOST;
- break;
- case 'h':
- device_remove_usage();
- }
- args_shift(optind);
-
- char *dev = arg_pop();
- if (!dev)
- die("Please supply a device to remove");
-
- if (argc)
- die("too many arguments");
-
- unsigned dev_idx;
- struct bchfs_handle fs = bchu_fs_open_by_dev(dev, &dev_idx);
- bchu_disk_remove(fs, dev_idx, flags);
- return 0;
-}
-
-static void device_online_usage(void)
-{
- puts("bcachefs device online - readd a device to a running filesystem\n"
- "Usage: bcachefs device online [OPTION]... device\n"
- "\n"
- "Options:\n"
- " -h, --help Display this help and exit\n"
- "\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
-}
-
-int cmd_device_online(int argc, char *argv[])
-{
- int opt;
-
- while ((opt = getopt(argc, argv, "h")) != -1)
- switch (opt) {
- case 'h':
- device_online_usage();
- exit(EXIT_SUCCESS);
- }
- args_shift(optind);
-
- char *dev = arg_pop();
- if (!dev)
- die("Please supply a device");
-
- if (argc)
- die("too many arguments");
-
- unsigned dev_idx;
- struct bchfs_handle fs = bchu_fs_open_by_dev(dev, &dev_idx);
- bchu_disk_online(fs, dev);
- return 0;
-}
-
-static void device_offline_usage(void)
-{
- puts("bcachefs device offline - take a device offline, without removing it\n"
- "Usage: bcachefs device offline [OPTION]... device\n"
- "\n"
- "Options:\n"
- " -f, --force Force, if data redundancy will be degraded\n"
- " -h, --help Display this help and exit\n"
- "\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
-}
-
-int cmd_device_offline(int argc, char *argv[])
-{
- static const struct option longopts[] = {
- { "force", 0, NULL, 'f' },
- { NULL }
- };
- int opt, flags = 0;
-
- while ((opt = getopt_long(argc, argv, "fh",
- longopts, NULL)) != -1)
- switch (opt) {
- case 'f':
- flags |= BCH_FORCE_IF_DEGRADED;
- break;
- case 'h':
- device_offline_usage();
- exit(EXIT_SUCCESS);
- }
- args_shift(optind);
-
- char *dev = arg_pop();
- if (!dev)
- die("Please supply a device");
-
- if (argc)
- die("too many arguments");
-
- unsigned dev_idx;
- struct bchfs_handle fs = bchu_fs_open_by_dev(dev, &dev_idx);
- bchu_disk_offline(fs, dev_idx, flags);
- return 0;
-}
-
-static void device_evacuate_usage(void)
-{
- puts("bcachefs device evacuate - move data off of a given device\n"
- "Usage: bcachefs device evacuate [OPTION]... device\n"
- "\n"
- "Options:\n"
- " -h, --help Display this help and exit\n"
- "\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
-}
-
-int cmd_device_evacuate(int argc, char *argv[])
-{
- int opt;
-
- while ((opt = getopt(argc, argv, "h")) != -1)
- switch (opt) {
- case 'h':
- device_evacuate_usage();
- exit(EXIT_SUCCESS);
- }
- args_shift(optind);
-
- char *dev_path = arg_pop();
- if (!dev_path)
- die("Please supply a device");
-
- if (argc)
- die("too many arguments");
-
- unsigned dev_idx;
- struct bchfs_handle fs = bchu_fs_open_by_dev(dev_path, &dev_idx);
-
- return bchu_data(fs, (struct bch_ioctl_data) {
- .op = BCH_DATA_OP_MIGRATE,
- .start = POS_MIN,
- .end = POS_MAX,
- .migrate.dev = dev_idx,
- });
-}
-
-static void device_set_state_usage(void)
-{
- puts("bcachefs device set-state\n"
- "Usage: bcachefs device set-state device new-state\n"
- "\n"
- "Options:\n"
- " -f, --force Force, if data redundancy will be degraded\n"
- " -h, --help display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
- exit(EXIT_SUCCESS);
-}
-
-int cmd_device_set_state(int argc, char *argv[])
-{
- static const struct option longopts[] = {
- { "force", 0, NULL, 'f' },
- { "help", 0, NULL, 'h' },
- { NULL }
- };
- int opt, flags = 0;
-
- while ((opt = getopt_long(argc, argv, "fh", longopts, NULL)) != -1)
- switch (opt) {
- case 'f':
- flags |= BCH_FORCE_IF_DEGRADED;
- break;
- case 'h':
- device_set_state_usage();
- }
- args_shift(optind);
-
- char *dev_path = arg_pop();
- if (!dev_path)
- die("Please supply a device");
-
- char *new_state_str = arg_pop();
- if (!new_state_str)
- die("Please supply a device state");
-
- unsigned new_state = read_string_list_or_die(new_state_str,
- bch2_dev_state, "device state");
-
- unsigned dev_idx;
- struct bchfs_handle fs = bchu_fs_open_by_dev(dev_path, &dev_idx);
-
- bchu_disk_set_state(fs, dev_idx, new_state, flags);
- return 0;
-}
-
-static void device_resize_usage(void)
-{
- puts("bcachefs device resize \n"
- "Usage: bcachefs device resize device [ size ]\n"
- "\n"
- "Options:\n"
- " -h, --help display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
- exit(EXIT_SUCCESS);
-}
-
-int cmd_device_resize(int argc, char *argv[])
-{
- static const struct option longopts[] = {
- { "help", 0, NULL, 'h' },
- { NULL }
- };
- u64 size;
- int opt;
-
- while ((opt = getopt_long(argc, argv, "h", longopts, NULL)) != -1)
- switch (opt) {
- case 'h':
- device_resize_usage();
- }
- args_shift(optind);
-
- char *dev = arg_pop();
- if (!dev)
- die("Please supply a device to resize");
-
- int dev_fd = xopen(dev, O_RDONLY);
-
- char *size_arg = arg_pop();
- if (!size_arg)
- size = get_size(dev, dev_fd);
- else if (bch2_strtoull_h(size_arg, &size))
- die("invalid size");
-
- size >>= 9;
-
- if (argc)
- die("Too many arguments");
-
- struct stat dev_stat = xfstat(dev_fd);
-
- char *mount = dev_to_mount(dev);
- if (mount) {
- if (!S_ISBLK(dev_stat.st_mode))
- die("%s is mounted but isn't a block device?!", dev);
-
- printf("Doing online resize of %s\n", dev);
-
- struct bchfs_handle fs = bcache_fs_open(mount);
-
- unsigned idx = bchu_disk_get_idx(fs, dev_stat.st_rdev);
-
- struct bch_sb *sb = bchu_read_super(fs, -1);
- if (idx >= sb->nr_devices)
- die("error reading superblock: dev idx >= sb->nr_devices");
-
- struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
- if (!mi)
- die("error reading superblock: no member info");
-
- /* could also just read this out of sysfs... meh */
- struct bch_member *m = mi->members + idx;
-
- u64 nbuckets = size / le16_to_cpu(m->bucket_size);
-
- printf("resizing %s to %llu buckets\n", dev, nbuckets);
- bchu_disk_resize(fs, idx, nbuckets);
- } else {
- printf("Doing offline resize of %s\n", dev);
-
- struct bch_fs *c = bch2_fs_open(&dev, 1, bch2_opts_empty());
- if (IS_ERR(c))
- die("error opening %s: %s", dev, strerror(-PTR_ERR(c)));
-
- struct bch_dev *ca, *resize = NULL;
- unsigned i;
-
- for_each_online_member(ca, c, i) {
- if (resize)
- die("confused: more than one online device?");
- resize = ca;
- percpu_ref_get(&resize->io_ref);
- }
-
- u64 nbuckets = size / le16_to_cpu(resize->mi.bucket_size);
-
- printf("resizing %s to %llu buckets\n", dev, nbuckets);
- int ret = bch2_dev_resize(c, resize, nbuckets);
- if (ret)
- fprintf(stderr, "resize error: %s\n", strerror(-ret));
-
- percpu_ref_put(&resize->io_ref);
- bch2_fs_stop(c);
- }
- return 0;
-}
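
The resize paths above work entirely in 512-byte sectors: the size given on the command line (or read from the device) is shifted down by 9, then divided by the member's bucket size, which is itself stored in sectors. A minimal sketch of that arithmetic, using a hypothetical helper name and made-up numbers rather than the tool's real internals:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper: bytes -> 512-byte sectors -> whole buckets,
 * mirroring the nbuckets calculation in cmd_device_resize(). */
static uint64_t bytes_to_nbuckets(uint64_t size_bytes, uint16_t bucket_size_sectors)
{
	uint64_t size_sectors = size_bytes >> 9;	/* 512-byte sectors */
	return size_sectors / bucket_size_sectors;	/* partial buckets are dropped */
}

int main(void)
{
	/* e.g. resizing to 1 TiB with 512 KiB (1024-sector) buckets: */
	printf("%llu buckets\n",
	       (unsigned long long) bytes_to_nbuckets(1ULL << 40, 1024));
	return 0;
}
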
diff --git a/cmd_format.c b/cmd_format.c
deleted file mode 100644
index 75efd521..00000000
--- a/cmd_format.c
+++ /dev/null
@@ -1,372 +0,0 @@
-/*
- * Authors: Kent Overstreet <kent.overstreet@gmail.com>
- * Gabriel de Perthuis <g2p.code@gmail.com>
- * Jacob Malevich <jam@datera.io>
- *
- * GPLv2
- */
-#include <errno.h>
-#include <fcntl.h>
-#include <getopt.h>
-#include <stdbool.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-#include <uuid/uuid.h>
-
-#include "ccan/darray/darray.h"
-
-#include "cmds.h"
-#include "libbcachefs.h"
-#include "crypto.h"
-#include "libbcachefs/opts.h"
-#include "libbcachefs/super-io.h"
-#include "libbcachefs/util.h"
-
-#define OPTS \
-t("bcachefs format - create a new bcachefs filesystem on one or more devices") \
-t("Usage: bcachefs format [OPTION]... <devices>") \
-t("") \
-x('b', block_size, "size", NULL) \
-x(0, btree_node_size, "size", "Default 256k") \
-x(0, metadata_checksum_type, "(none|crc32c|crc64)", NULL) \
-x(0, data_checksum_type, "(none|crc32c|crc64)", NULL) \
-x(0, compression_type, "(none|lz4|gzip)", NULL) \
-x(0, background_compression_type, "(none|lz4|gzip)", NULL) \
-x(0, replicas, "#", NULL) \
-x(0, data_replicas, "#", NULL) \
-x(0, metadata_replicas, "#", NULL) \
-x(0, foreground_target, "target", NULL) \
-x(0, background_target, "target", NULL) \
-x(0, promote_target, "target", NULL) \
-x(0, encrypted, NULL, "Enable whole filesystem encryption (chacha20/poly1305)")\
-x(0, no_passphrase, NULL, "Don't encrypt master encryption key")\
-x('e', error_action, "(continue|remount-ro|panic)", NULL) \
-x('L', label, "label", NULL) \
-x('U', uuid, "uuid", NULL) \
-x('f', force, NULL, NULL) \
-t("") \
-t("Device specific options:") \
-x(0, fs_size, "size", "Size of filesystem on device")\
-x(0, bucket_size, "size", "Bucket size") \
-x('g', group, "label", "Disk group")\
-x(0, discard, NULL, NULL) \
-x(0, data_allowed, "journal,btree,data", "Allowed types of data on this device")\
-x(0, durability, "#", "Number of times data written to this device will have been considered replicated")\
-t("Device specific options must come before corresponding devices, e.g.") \
-t(" bcachefs format --group cache /dev/sdb --tier 1 /dev/sdc") \
-t("") \
-x('q', quiet, NULL, "Only print errors") \
-x('h', help, NULL, "Display this help and exit")
-
-static void usage(void)
-{
-#define t(text) puts(text "\n")
-#define x(shortopt, longopt, arg, help) do { \
- OPTS
-#undef x
-#undef t
-
- puts("bcachefs format - create a new bcachefs filesystem on one or more devices\n"
- "Usage: bcachefs format [OPTION]... <devices>\n"
- "\n"
- "Options:\n"
- " -b, --block=size\n"
- " --btree_node=size Btree node size, default 256k\n"
- " --metadata_checksum_type=(none|crc32c|crc64)\n"
- " --data_checksum_type=(none|crc32c|crc64)\n"
- " --compression_type=(none|lz4|gzip|zstd)\n"
- " --background_compression_type=(none|lz4|gzip|zstd)\n"
- " --data_replicas=# Number of data replicas\n"
- " --metadata_replicas=# Number of metadata replicas\n"
- " --replicas=# Sets both data and metadata replicas\n"
- " --encrypted Enable whole filesystem encryption (chacha20/poly1305)\n"
- " --no_passphrase Don't encrypt master encryption key\n"
- " --error_action=(continue|remount-ro|panic)\n"
- " Action to take on filesystem error\n"
- " -L, --label=label\n"
- " -U, --uuid=uuid\n"
- " -f, --force\n"
- "\n"
- "Device specific options:\n"
- " --fs_size=size Size of filesystem on device\n"
- " --bucket=size Bucket size\n"
- " --discard Enable discards\n"
- " --durability=# Device durability (0-4)\n"
- " -g, --group=label Disk group\n"
- "\n"
- " -q, --quiet Only print errors\n"
- " -h, --help Display this help and exit\n"
- "\n"
- "Device specific options must come before corresponding devices, e.g.\n"
- " bcachefs format --group cache /dev/sdb --tier 1 /dev/sdc\n"
- "\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
-}
-
-enum {
- O_no_opt = 1,
-#define t(text)
-#define x(shortopt, longopt, arg, help) O_##longopt,
- OPTS
-#undef x
-#undef t
-};
-
-static const struct option format_opts[] = {
-#define t(text)
-#define x(shortopt, longopt, arg, help) { \
- .name = #longopt, \
- .has_arg = arg ? required_argument : no_argument, \
- .flag = NULL, \
- .val = O_##longopt, \
-},
- OPTS
-#undef x
-#undef t
- { NULL }
-};
-
-u64 read_flag_list_or_die(char *opt, const char * const list[],
- const char *msg)
-{
- u64 v = bch2_read_flag_list(opt, list);
- if (v == (u64) -1)
- die("Bad %s %s", msg, opt);
-
- return v;
-}
-
-int cmd_format(int argc, char *argv[])
-{
- darray(struct dev_opts) devices;
- struct format_opts opts = format_opts_default();
- struct dev_opts dev_opts = dev_opts_default(), *dev;
- bool force = false, no_passphrase = false, quiet = false;
- int opt;
-
- darray_init(devices);
-
- while ((opt = getopt_long(argc, argv,
- "-b:e:L:U:ft:qh",
- format_opts,
- NULL)) != -1)
- switch (opt) {
- case O_block_size:
- case 'b':
- opts.block_size =
- hatoi_validate(optarg, "block size");
- break;
- case O_btree_node_size:
- opts.btree_node_size =
- hatoi_validate(optarg, "btree node size");
- break;
- case O_metadata_checksum_type:
- opts.meta_csum_type =
- read_string_list_or_die(optarg,
- bch2_csum_types, "checksum type");
- break;
- case O_data_checksum_type:
- opts.data_csum_type =
- read_string_list_or_die(optarg,
- bch2_csum_types, "checksum type");
- break;
- case O_compression_type:
- opts.compression_type =
- read_string_list_or_die(optarg,
- bch2_compression_types,
- "compression type");
- break;
- case O_background_compression_type:
- opts.background_compression_type =
- read_string_list_or_die(optarg,
- bch2_compression_types,
- "compression type");
- break;
- case O_data_replicas:
- if (kstrtouint(optarg, 10, &opts.data_replicas) ||
- !opts.data_replicas ||
- opts.data_replicas > BCH_REPLICAS_MAX)
- die("invalid replicas");
- break;
- case O_metadata_replicas:
- if (kstrtouint(optarg, 10, &opts.meta_replicas) ||
- !opts.meta_replicas ||
- opts.meta_replicas > BCH_REPLICAS_MAX)
- die("invalid replicas");
- break;
- case O_replicas:
- if (kstrtouint(optarg, 10, &opts.data_replicas) ||
- !opts.data_replicas ||
- opts.data_replicas > BCH_REPLICAS_MAX)
- die("invalid replicas");
- opts.meta_replicas = opts.data_replicas;
- break;
- case O_foreground_target:
- opts.foreground_target = strdup(optarg);
- break;
- case O_background_target:
- opts.background_target = strdup(optarg);
- break;
- case O_promote_target:
- opts.promote_target = strdup(optarg);
- break;
- case O_encrypted:
- opts.encrypted = true;
- break;
- case O_no_passphrase:
- no_passphrase = true;
- break;
- case O_error_action:
- case 'e':
- opts.on_error_action =
- read_string_list_or_die(optarg,
- bch2_error_actions, "error action");
- break;
- case O_label:
- case 'L':
- opts.label = strdup(optarg);
- break;
- case O_uuid:
- case 'U':
- if (uuid_parse(optarg, opts.uuid.b))
- die("Bad uuid");
- break;
- case O_force:
- case 'f':
- force = true;
- break;
- case O_fs_size:
- if (bch2_strtoull_h(optarg, &dev_opts.size))
- die("invalid filesystem size");
-
- dev_opts.size >>= 9;
- break;
- case O_bucket_size:
- dev_opts.bucket_size =
- hatoi_validate(optarg, "bucket size");
- break;
- case O_group:
- case 'g':
- dev_opts.group = strdup(optarg);
- break;
- case O_discard:
- dev_opts.discard = true;
- break;
- case O_data_allowed:
- dev_opts.data_allowed =
- read_flag_list_or_die(optarg,
- bch2_data_types, "data type");
- break;
- case O_durability:
- if (kstrtouint(optarg, 10, &dev_opts.durability) ||
- dev_opts.durability > BCH_REPLICAS_MAX)
- die("invalid durability");
- break;
- case O_no_opt:
- dev_opts.path = strdup(optarg);
- darray_append(devices, dev_opts);
- dev_opts.size = 0;
- break;
- case O_quiet:
- case 'q':
- quiet = true;
- break;
- case O_help:
- case 'h':
- usage();
- exit(EXIT_SUCCESS);
- break;
- }
-
- if (!darray_size(devices))
- die("Please supply a device");
-
- if (opts.encrypted && !no_passphrase)
- opts.passphrase = read_passphrase_twice("Enter passphrase: ");
-
- darray_foreach(dev, devices)
- dev->fd = open_for_format(dev->path, force);
-
- struct bch_sb *sb =
- bch2_format(opts, devices.item, darray_size(devices));
-
- if (!quiet)
- bch2_sb_print(sb, false, 1 << BCH_SB_FIELD_members, HUMAN_READABLE);
- free(sb);
-
- if (opts.passphrase) {
- memzero_explicit(opts.passphrase, strlen(opts.passphrase));
- free(opts.passphrase);
- }
-
- return 0;
-}
-
-static void show_super_usage(void)
-{
- puts("bcachefs show-super \n"
- "Usage: bcachefs show-super [OPTION].. device\n"
- "\n"
- "Options:\n"
- " -f, --fields=(fields) list of sections to print\n"
- " -l, --layout print superblock layout\n"
- " -h, --help display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
- exit(EXIT_SUCCESS);
-}
-
-int cmd_show_super(int argc, char *argv[])
-{
- static const struct option longopts[] = {
- { "fields", 1, NULL, 'f' },
- { "layout", 0, NULL, 'l' },
- { "help", 0, NULL, 'h' },
- { NULL }
- };
- unsigned fields = 1 << BCH_SB_FIELD_members;
- bool print_layout = false;
- int opt;
-
- while ((opt = getopt_long(argc, argv, "f:lh", longopts, NULL)) != -1)
- switch (opt) {
- case 'f':
- fields = !strcmp(optarg, "all")
- ? ~0
- : read_flag_list_or_die(optarg,
- bch2_sb_fields, "superblock field");
- break;
- case 'l':
- print_layout = true;
- break;
- case 'h':
- show_super_usage();
- break;
- }
- args_shift(optind);
-
- char *dev = arg_pop();
- if (!dev)
- die("please supply a device");
- if (argc)
- die("too many arguments");
-
- struct bch_opts opts = bch2_opts_empty();
-
- opt_set(opts, noexcl, true);
- opt_set(opts, nochanges, true);
-
- struct bch_sb_handle sb;
- int ret = bch2_read_super(dev, &opts, &sb);
- if (ret)
- die("Error opening %s: %s", dev, strerror(-ret));
-
- bch2_sb_print(sb.sb, print_layout, fields, HUMAN_READABLE);
- bch2_free_super(&sb);
- return 0;
-}
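
cmd_format.c above derives its option enum and its getopt_long() table from the single OPTS x-macro, so the option list only has to be written once. A stripped-down sketch of the same pattern, with hypothetical option names and simplified so that every option takes an argument:

#include <getopt.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical option list: each x() entry names one long option. */
#define MY_OPTS		\
	x(block_size)	\
	x(encrypted)	\
	x(force)

enum {
	O_no_opt = 1,
#define x(longopt) O_##longopt,
	MY_OPTS
#undef x
};

static const struct option my_longopts[] = {
#define x(longopt) {				\
	.name		= #longopt,		\
	.has_arg	= required_argument,	\
	.flag		= NULL,			\
	.val		= O_##longopt,		\
},
	MY_OPTS
#undef x
	{ NULL }
};

int main(void)
{
	/* Each expansion of MY_OPTS produced one enum value and one table entry: */
	for (const struct option *o = my_longopts; o->name; o++)
		printf("--%s -> val %d\n", o->name, o->val);
	return 0;
}
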
diff --git a/cmd_fs.c b/cmd_fs.c
deleted file mode 100644
index c5aeb7b7..00000000
--- a/cmd_fs.c
+++ /dev/null
@@ -1,159 +0,0 @@
-
-#include <stdio.h>
-#include <sys/ioctl.h>
-
-#include <uuid/uuid.h>
-
-#include "ccan/darray/darray.h"
-
-#include "linux/sort.h"
-
-#include "libbcachefs/bcachefs_ioctl.h"
-#include "libbcachefs/opts.h"
-
-#include "cmds.h"
-#include "libbcachefs.h"
-
-static void print_dev_usage(struct bch_ioctl_dev_usage *d, unsigned idx,
- const char *label, enum units units)
-{
- char *name = NULL;
- u64 available = d->nr_buckets;
- unsigned i;
-
- printf("\n");
- printf_pad(20, "%s (device %u):", label, idx);
-
- name = !d->dev ? strdup("(offline)")
- : dev_to_path(d->dev)
- ?: strdup("(device not found)");
- printf("%24s%12s\n", name, bch2_dev_state[d->state]);
- free(name);
-
- printf("%-20s%12s%12s%12s\n",
- "", "data", "buckets", "fragmented");
-
- for (i = BCH_DATA_SB; i < BCH_DATA_NR; i++) {
- u64 frag = max((s64) d->buckets[i] * d->bucket_size -
- (s64) d->sectors[i], 0LL);
-
- printf_pad(20, " %s:", bch2_data_types[i]);
- printf("%12s%12llu%12s\n",
- pr_units(d->sectors[i], units),
- d->buckets[i],
- pr_units(frag, units));
-
- if (i != BCH_DATA_CACHED)
- available -= d->buckets[i];
- }
-
- printf_pad(20, " available:");
- printf("%12s%12llu\n",
- pr_units(available * d->bucket_size, units),
- available);
-
- printf_pad(20, " capacity:");
- printf("%12s%12llu\n",
- pr_units(d->nr_buckets * d->bucket_size, units),
- d->nr_buckets);
-}
-
-struct dev_by_label {
- unsigned idx;
- char *label;
-};
-
-static int dev_by_label_cmp(const void *_l, const void *_r)
-{
- const struct dev_by_label *l = _l, *r = _r;
-
- return strcmp(l->label, r->label);
-}
-
-static void print_fs_usage(const char *path, enum units units)
-{
- unsigned i, j;
- char uuid[40];
-
- struct bchfs_handle fs = bcache_fs_open(path);
- struct bch_ioctl_usage *u = bchu_usage(fs);
-
- uuid_unparse(fs.uuid.b, uuid);
- printf("Filesystem %s:\n", uuid);
-
- printf("%-20s%12s\n", "Size:", pr_units(u->fs.capacity, units));
- printf("%-20s%12s\n", "Used:", pr_units(u->fs.used, units));
-
- printf("%-20s%12s%12s%12s%12s\n",
- "By replicas:", "1x", "2x", "3x", "4x");
-
- for (j = BCH_DATA_BTREE; j < BCH_DATA_NR; j++) {
- printf_pad(20, " %s:", bch2_data_types[j]);
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++)
- printf("%12s", pr_units(u->fs.sectors[j][i], units));
- printf("\n");
- }
-
- printf_pad(20, " %s:", "reserved");
- for (i = 0; i < BCH_REPLICAS_MAX; i++)
- printf("%12s", pr_units(u->fs.persistent_reserved[i], units));
- printf("\n");
-
- printf("%-20s%12s\n", " online reserved:", pr_units(u->fs.online_reserved, units));
-
- darray(struct dev_by_label) devs_by_label;
- darray_init(devs_by_label);
-
- for (i = 0; i < u->nr_devices; i++) {
- struct bch_ioctl_dev_usage *d = u->devs + i;
-
- if (!d->alive)
- continue;
-
- char *label_attr = mprintf("dev-%u/label", i);
- char *label = read_file_str(fs.sysfs_fd, label_attr);
- free(label_attr);
-
- darray_append(devs_by_label,
- (struct dev_by_label) { i, label });
- }
-
- sort(&darray_item(devs_by_label, 0), darray_size(devs_by_label),
- sizeof(darray_item(devs_by_label, 0)), dev_by_label_cmp, NULL);
-
- struct dev_by_label *d;
- darray_foreach(d, devs_by_label)
- print_dev_usage(u->devs + d->idx, d->idx, d->label, units);
-
- darray_foreach(d, devs_by_label)
- free(d->label);
- darray_free(devs_by_label);
-
- free(u);
- bcache_fs_close(fs);
-}
-
-int cmd_fs_usage(int argc, char *argv[])
-{
- enum units units = BYTES;
- char *fs;
- int opt;
-
- while ((opt = getopt(argc, argv, "h")) != -1)
- switch (opt) {
- case 'h':
- units = HUMAN_READABLE;
- break;
- }
- args_shift(optind);
-
- if (!argc) {
- print_fs_usage(".", units);
- } else {
- while ((fs = arg_pop()))
- print_fs_usage(fs, units);
- }
-
- return 0;
-}
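
In print_dev_usage() above, per-type "fragmented" space is the bucket space charged to that data type minus the sectors actually holding data, clamped at zero, and "available" starts from the device's bucket count and subtracts every allocation except cached data. A minimal worked sketch of that accounting with made-up numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical device: 1000 buckets of 512 sectors each. */
	uint64_t bucket_size = 512, nr_buckets = 1000;

	/* Say one data type owns 100 buckets but only 40000 of those
	 * 51200 sectors hold live data: */
	uint64_t buckets_used = 100, sectors_used = 40000;

	int64_t frag = (int64_t) (buckets_used * bucket_size) - (int64_t) sectors_used;
	if (frag < 0)
		frag = 0;

	/* Cached data would not be subtracted here, only "real" allocations: */
	uint64_t available = nr_buckets - buckets_used;

	printf("fragmented %lld sectors, %llu buckets still available\n",
	       (long long) frag, (unsigned long long) available);
	return 0;
}
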
diff --git a/cmd_fsck.c b/cmd_fsck.c
deleted file mode 100644
index d16760ac..00000000
--- a/cmd_fsck.c
+++ /dev/null
@@ -1,70 +0,0 @@
-
-#include "cmds.h"
-#include "libbcachefs/error.h"
-#include "libbcachefs.h"
-#include "libbcachefs/super.h"
-#include "tools-util.h"
-
-static void usage(void)
-{
- puts("bcachefs fsck - filesystem check and repair\n"
- "Usage: bcachefs fsck [OPTION]... <devices>\n"
- "\n"
- "Options:\n"
- " -p Automatic repair (no questions)\n"
- " -n Don't repair, only check for errors\n"
- " -y Assume \"yes\" to all questions\n"
- " -f Force checking even if filesystem is marked clean\n"
- " -v Be verbose\n"
-	     "  -h                      Display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
-}
-
-int cmd_fsck(int argc, char *argv[])
-{
- struct bch_opts opts = bch2_opts_empty();
- int opt, ret = 0;
-
- opt_set(opts, degraded, true);
- opt_set(opts, fix_errors, FSCK_OPT_ASK);
-
- while ((opt = getopt(argc, argv, "apynfvh")) != -1)
- switch (opt) {
- case 'a': /* outdated alias for -p */
- case 'p':
- opt_set(opts, fix_errors, FSCK_OPT_YES);
- break;
- case 'y':
- opt_set(opts, fix_errors, FSCK_OPT_YES);
- break;
- case 'n':
- opt_set(opts, nochanges, true);
- opt_set(opts, fix_errors, FSCK_OPT_NO);
- break;
- case 'f':
- /* force check, even if filesystem marked clean: */
- break;
- case 'v':
- opt_set(opts, verbose_recovery, true);
- break;
- case 'h':
- usage();
- exit(EXIT_SUCCESS);
- }
- args_shift(optind);
-
- if (!argc)
- die("Please supply device(s) to check");
-
- struct bch_fs *c = bch2_fs_open(argv, argc, opts);
- if (IS_ERR(c))
- die("error opening %s: %s", argv[0], strerror(-PTR_ERR(c)));
-
- if (test_bit(BCH_FS_FSCK_FIXED_ERRORS, &c->flags))
- ret = 2;
- if (test_bit(BCH_FS_FSCK_UNFIXED_ERRORS, &c->flags))
- ret = 4;
-
- bch2_fs_stop(c);
- return ret;
-}
diff --git a/cmd_migrate.c b/cmd_migrate.c
deleted file mode 100644
index 44283c3c..00000000
--- a/cmd_migrate.c
+++ /dev/null
@@ -1,852 +0,0 @@
-#include <dirent.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <getopt.h>
-#include <string.h>
-#include <sys/ioctl.h>
-#include <sys/stat.h>
-#include <sys/sysmacros.h>
-#include <sys/types.h>
-#include <sys/vfs.h>
-#include <unistd.h>
-#include <attr/xattr.h>
-
-#include <linux/fiemap.h>
-#include <linux/fs.h>
-#include <linux/stat.h>
-
-#include <uuid/uuid.h>
-
-#include "cmds.h"
-#include "crypto.h"
-#include "libbcachefs.h"
-
-#include <linux/dcache.h>
-#include <linux/generic-radix-tree.h>
-#include <linux/xattr.h>
-#include "libbcachefs/bcachefs.h"
-#include "libbcachefs/btree_update.h"
-#include "libbcachefs/buckets.h"
-#include "libbcachefs/dirent.h"
-#include "libbcachefs/fs.h"
-#include "libbcachefs/inode.h"
-#include "libbcachefs/io.h"
-#include "libbcachefs/replicas.h"
-#include "libbcachefs/str_hash.h"
-#include "libbcachefs/super.h"
-#include "libbcachefs/xattr.h"
-
-static char *dev_t_to_path(dev_t dev)
-{
- char link[PATH_MAX], *p;
- int ret;
-
- char *sysfs_dev = mprintf("/sys/dev/block/%u:%u",
- major(dev), minor(dev));
- ret = readlink(sysfs_dev, link, sizeof(link));
- free(sysfs_dev);
-
- if (ret < 0 || ret >= sizeof(link))
- die("readlink error while looking up block device: %m");
-
- link[ret] = '\0';
-
- p = strrchr(link, '/');
- if (!p)
- die("error looking up device name");
- p++;
-
- return mprintf("/dev/%s", p);
-}
-
-static bool path_is_fs_root(const char *path)
-{
- char *line = NULL, *p, *mount;
- size_t n = 0;
- FILE *f;
- bool ret = true;
-
- f = fopen("/proc/self/mountinfo", "r");
- if (!f)
- die("Error getting mount information");
-
- while (getline(&line, &n, f) != -1) {
- p = line;
-
- strsep(&p, " "); /* mount id */
- strsep(&p, " "); /* parent id */
- strsep(&p, " "); /* dev */
- strsep(&p, " "); /* root */
- mount = strsep(&p, " ");
- strsep(&p, " ");
-
- if (mount && !strcmp(path, mount))
- goto found;
- }
-
- ret = false;
-found:
- fclose(f);
- free(line);
- return ret;
-}
-
-static void mark_unreserved_space(struct bch_fs *c, ranges extents)
-{
- struct bch_dev *ca = c->devs[0];
- struct hole_iter iter;
- struct range i;
-
- for_each_hole(iter, extents, bucket_to_sector(ca, ca->mi.nbuckets) << 9, i) {
- struct bucket_mark new;
- u64 b;
-
- if (i.start == i.end)
- return;
-
- b = sector_to_bucket(ca, i.start >> 9);
- do {
- struct bucket *g = bucket(ca, b);
- bucket_cmpxchg(g, new, new.nouse = 1);
- b++;
- } while (bucket_to_sector(ca, b) << 9 < i.end);
- }
-}
-
-static void update_inode(struct bch_fs *c,
- struct bch_inode_unpacked *inode)
-{
- struct bkey_inode_buf packed;
- int ret;
-
- bch2_inode_pack(&packed, inode);
- ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
- NULL, NULL, NULL, 0);
- if (ret)
- die("error creating file: %s", strerror(-ret));
-}
-
-static void create_dirent(struct bch_fs *c,
- struct bch_inode_unpacked *parent,
- const char *name, u64 inum, mode_t mode)
-{
- struct bch_hash_info parent_hash_info = bch2_hash_info_init(c, parent);
- struct qstr qname = { { { .len = strlen(name), } }, .name = name };
-
- int ret = bch2_dirent_create(c, parent->bi_inum, &parent_hash_info,
- mode_to_type(mode), &qname,
- inum, NULL, BCH_HASH_SET_MUST_CREATE);
- if (ret)
- die("error creating file: %s", strerror(-ret));
-
- if (S_ISDIR(mode))
- parent->bi_nlink++;
-}
-
-static void create_link(struct bch_fs *c,
- struct bch_inode_unpacked *parent,
- const char *name, u64 inum, mode_t mode)
-{
- struct bch_inode_unpacked inode;
- int ret = bch2_inode_find_by_inum(c, inum, &inode);
- if (ret)
- die("error looking up hardlink: %s", strerror(-ret));
-
- inode.bi_nlink++;
- update_inode(c, &inode);
-
- create_dirent(c, parent, name, inum, mode);
-}
-
-static struct bch_inode_unpacked create_file(struct bch_fs *c,
- struct bch_inode_unpacked *parent,
- const char *name,
- uid_t uid, gid_t gid,
- mode_t mode, dev_t rdev)
-{
- struct bch_inode_unpacked new_inode;
- int ret;
-
- bch2_inode_init(c, &new_inode, uid, gid, mode, rdev, parent);
-
- ret = bch2_inode_create(c, &new_inode, BLOCKDEV_INODE_MAX, 0,
- &c->unused_inode_hint);
- if (ret)
- die("error creating file: %s", strerror(-ret));
-
- create_dirent(c, parent, name, new_inode.bi_inum, mode);
-
- return new_inode;
-}
-
-#define for_each_xattr_handler(handlers, handler) \
- if (handlers) \
- for ((handler) = *(handlers)++; \
- (handler) != NULL; \
- (handler) = *(handlers)++)
-
-static const struct xattr_handler *xattr_resolve_name(const char **name)
-{
- const struct xattr_handler **handlers = bch2_xattr_handlers;
- const struct xattr_handler *handler;
-
- for_each_xattr_handler(handlers, handler) {
- const char *n;
-
- n = strcmp_prefix(*name, xattr_prefix(handler));
- if (n) {
- if (!handler->prefix ^ !*n) {
- if (*n)
- continue;
- return ERR_PTR(-EINVAL);
- }
- *name = n;
- return handler;
- }
- }
- return ERR_PTR(-EOPNOTSUPP);
-}
-
-static void copy_times(struct bch_fs *c, struct bch_inode_unpacked *dst,
- struct stat *src)
-{
- dst->bi_atime = timespec_to_bch2_time(c, src->st_atim);
- dst->bi_mtime = timespec_to_bch2_time(c, src->st_mtim);
- dst->bi_ctime = timespec_to_bch2_time(c, src->st_ctim);
-}
-
-static void copy_xattrs(struct bch_fs *c, struct bch_inode_unpacked *dst,
- char *src)
-{
- struct bch_hash_info hash_info = bch2_hash_info_init(c, dst);
-
- char attrs[XATTR_LIST_MAX];
- ssize_t attrs_size = llistxattr(src, attrs, sizeof(attrs));
- if (attrs_size < 0)
- die("listxattr error: %m");
-
- const char *next, *attr;
- for (attr = attrs;
- attr < attrs + attrs_size;
- attr = next) {
- next = attr + strlen(attr) + 1;
-
- char val[XATTR_SIZE_MAX];
- ssize_t val_size = lgetxattr(src, attr, val, sizeof(val));
-
- if (val_size < 0)
- die("error getting xattr val: %m");
-
- const struct xattr_handler *h = xattr_resolve_name(&attr);
-
- int ret = bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC,
- bch2_xattr_set(&trans, dst->bi_inum, &hash_info, attr,
- val, val_size, h->flags, 0));
- if (ret < 0)
- die("error creating xattr: %s", strerror(-ret));
- }
-}
-
-static char buf[1 << 20] __aligned(PAGE_SIZE);
-static const size_t buf_pages = sizeof(buf) / PAGE_SIZE;
-
-static void write_data(struct bch_fs *c,
- struct bch_inode_unpacked *dst_inode,
- u64 dst_offset, void *buf, size_t len)
-{
- struct {
- struct bch_write_op op;
- struct bio_vec bv[buf_pages];
- } o;
- struct closure cl;
-
- BUG_ON(dst_offset & (block_bytes(c) - 1));
- BUG_ON(len & (block_bytes(c) - 1));
-
- closure_init_stack(&cl);
-
- bio_init(&o.op.wbio.bio, o.bv, buf_pages);
- o.op.wbio.bio.bi_iter.bi_size = len;
- bch2_bio_map(&o.op.wbio.bio, buf);
-
- bch2_write_op_init(&o.op, c, bch2_opts_to_inode_opts(c->opts));
- o.op.write_point = writepoint_hashed(0);
- o.op.nr_replicas = 1;
- o.op.pos = POS(dst_inode->bi_inum, dst_offset >> 9);
-
- int ret = bch2_disk_reservation_get(c, &o.op.res, len >> 9,
- c->opts.data_replicas, 0);
- if (ret)
- die("error reserving space in new filesystem: %s", strerror(-ret));
-
- closure_call(&o.op.cl, bch2_write, NULL, &cl);
- closure_sync(&cl);
-
- dst_inode->bi_sectors += len >> 9;
-}
-
-static void copy_data(struct bch_fs *c,
- struct bch_inode_unpacked *dst_inode,
- int src_fd, u64 start, u64 end)
-{
- while (start < end) {
- unsigned len = min_t(u64, end - start, sizeof(buf));
- unsigned pad = round_up(len, block_bytes(c)) - len;
-
- xpread(src_fd, buf, len, start);
- memset(buf + len, 0, pad);
-
- write_data(c, dst_inode, start, buf, len + pad);
- start += len;
- }
-}
-
-static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
- u64 logical, u64 physical, u64 length)
-{
- struct bch_dev *ca = c->devs[0];
-
- BUG_ON(logical & (block_bytes(c) - 1));
- BUG_ON(physical & (block_bytes(c) - 1));
- BUG_ON(length & (block_bytes(c) - 1));
-
- logical >>= 9;
- physical >>= 9;
- length >>= 9;
-
- BUG_ON(physical + length > bucket_to_sector(ca, ca->mi.nbuckets));
-
- while (length) {
- struct bkey_i_extent *e;
- BKEY_PADDED(k) k;
- u64 b = sector_to_bucket(ca, physical);
- struct disk_reservation res;
- unsigned sectors;
- int ret;
-
- sectors = min(ca->mi.bucket_size -
- (physical & (ca->mi.bucket_size - 1)),
- length);
-
- e = bkey_extent_init(&k.k);
- e->k.p.inode = dst->bi_inum;
- e->k.p.offset = logical + sectors;
- e->k.size = sectors;
- extent_ptr_append(e, (struct bch_extent_ptr) {
- .offset = physical,
- .dev = 0,
- .gen = bucket(ca, b)->mark.gen,
- });
-
- set_bit(b, ca->buckets_dirty);
-
- ret = bch2_disk_reservation_get(c, &res, sectors, 1,
- BCH_DISK_RESERVATION_NOFAIL);
- if (ret)
- die("error reserving space in new filesystem: %s",
- strerror(-ret));
-
- bch2_mark_bkey_replicas(c, BCH_DATA_USER,
- extent_i_to_s_c(e).s_c);
-
- ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i,
- &res, NULL, NULL, 0);
- if (ret)
- die("btree insert error %s", strerror(-ret));
-
- bch2_disk_reservation_put(c, &res);
-
- dst->bi_sectors += sectors;
- logical += sectors;
- physical += sectors;
- length -= sectors;
- }
-}
-
-static void copy_link(struct bch_fs *c, struct bch_inode_unpacked *dst,
- char *src)
-{
- ssize_t ret = readlink(src, buf, sizeof(buf));
- if (ret < 0)
- die("readlink error: %m");
-
- write_data(c, dst, 0, buf, round_up(ret, block_bytes(c)));
-}
-
-static void copy_file(struct bch_fs *c, struct bch_inode_unpacked *dst,
- int src_fd, u64 src_size,
- char *src_path, ranges *extents)
-{
- struct fiemap_iter iter;
- struct fiemap_extent e;
-
- fiemap_for_each(src_fd, iter, e)
- if (e.fe_flags & FIEMAP_EXTENT_UNKNOWN) {
- fsync(src_fd);
- break;
- }
-
- fiemap_for_each(src_fd, iter, e) {
- if ((e.fe_logical & (block_bytes(c) - 1)) ||
- (e.fe_length & (block_bytes(c) - 1)))
- die("Unaligned extent in %s - can't handle", src_path);
-
- if (e.fe_flags & (FIEMAP_EXTENT_UNKNOWN|
- FIEMAP_EXTENT_ENCODED|
- FIEMAP_EXTENT_NOT_ALIGNED|
- FIEMAP_EXTENT_DATA_INLINE)) {
- copy_data(c, dst, src_fd, e.fe_logical,
- min(src_size - e.fe_logical,
- e.fe_length));
- continue;
- }
-
- /*
- * if the data is below 1 MB, copy it so it doesn't conflict
- * with bcachefs's potentially larger superblock:
- */
- if (e.fe_physical < 1 << 20) {
- copy_data(c, dst, src_fd, e.fe_logical,
- min(src_size - e.fe_logical,
- e.fe_length));
- continue;
- }
-
- if ((e.fe_physical & (block_bytes(c) - 1)))
- die("Unaligned extent in %s - can't handle", src_path);
-
- range_add(extents, e.fe_physical, e.fe_length);
- link_data(c, dst, e.fe_logical, e.fe_physical, e.fe_length);
- }
-}
-
-struct copy_fs_state {
- u64 bcachefs_inum;
- dev_t dev;
-
- GENRADIX(u64) hardlinks;
- ranges extents;
-};
-
-static void copy_dir(struct copy_fs_state *s,
- struct bch_fs *c,
- struct bch_inode_unpacked *dst,
- int src_fd, const char *src_path)
-{
- DIR *dir = fdopendir(src_fd);
- struct dirent *d;
-
- while ((errno = 0), (d = readdir(dir))) {
- struct bch_inode_unpacked inode;
- int fd;
-
- if (fchdir(src_fd))
- die("chdir error: %m");
-
- struct stat stat =
- xfstatat(src_fd, d->d_name, AT_SYMLINK_NOFOLLOW);
-
- if (!strcmp(d->d_name, ".") ||
- !strcmp(d->d_name, "..") ||
- stat.st_ino == s->bcachefs_inum)
- continue;
-
- char *child_path = mprintf("%s/%s", src_path, d->d_name);
-
- if (stat.st_dev != s->dev)
- die("%s does not have correct st_dev!", child_path);
-
- u64 *dst_inum = S_ISREG(stat.st_mode)
- ? genradix_ptr_alloc(&s->hardlinks, stat.st_ino, GFP_KERNEL)
- : NULL;
-
- if (dst_inum && *dst_inum) {
- create_link(c, dst, d->d_name, *dst_inum, S_IFREG);
- goto next;
- }
-
- inode = create_file(c, dst, d->d_name,
- stat.st_uid, stat.st_gid,
- stat.st_mode, stat.st_rdev);
-
- if (dst_inum)
- *dst_inum = inode.bi_inum;
-
- copy_times(c, &inode, &stat);
- copy_xattrs(c, &inode, d->d_name);
-
- /* copy xattrs */
-
- switch (mode_to_type(stat.st_mode)) {
- case DT_DIR:
- fd = xopen(d->d_name, O_RDONLY|O_NOATIME);
- copy_dir(s, c, &inode, fd, child_path);
- close(fd);
- break;
- case DT_REG:
- inode.bi_size = stat.st_size;
-
- fd = xopen(d->d_name, O_RDONLY|O_NOATIME);
- copy_file(c, &inode, fd, stat.st_size,
- child_path, &s->extents);
- close(fd);
- break;
- case DT_LNK:
- inode.bi_size = stat.st_size;
-
- copy_link(c, &inode, d->d_name);
- break;
- case DT_FIFO:
- case DT_CHR:
- case DT_BLK:
- case DT_SOCK:
- case DT_WHT:
- /* nothing else to copy for these: */
- break;
- default:
- BUG();
- }
-
- update_inode(c, &inode);
-next:
- free(child_path);
- }
-
- if (errno)
- die("readdir error: %m");
-}
-
-static ranges reserve_new_fs_space(const char *file_path, unsigned block_size,
- u64 size, u64 *bcachefs_inum, dev_t dev,
- bool force)
-{
- int fd = force
- ? open(file_path, O_RDWR|O_CREAT, 0600)
- : open(file_path, O_RDWR|O_CREAT|O_EXCL, 0600);
- if (fd < 0)
- die("Error creating %s for bcachefs metadata: %m",
- file_path);
-
- struct stat statbuf = xfstat(fd);
-
- if (statbuf.st_dev != dev)
- die("bcachefs file has incorrect device");
-
- *bcachefs_inum = statbuf.st_ino;
-
- if (fallocate(fd, 0, 0, size))
- die("Error reserving space for bcachefs metadata: %m");
-
- fsync(fd);
-
- struct fiemap_iter iter;
- struct fiemap_extent e;
- ranges extents = { NULL };
-
- fiemap_for_each(fd, iter, e) {
- if (e.fe_flags & (FIEMAP_EXTENT_UNKNOWN|
- FIEMAP_EXTENT_ENCODED|
- FIEMAP_EXTENT_NOT_ALIGNED|
- FIEMAP_EXTENT_DATA_INLINE))
- die("Unable to continue: metadata file not fully mapped");
-
- if ((e.fe_physical & (block_size - 1)) ||
- (e.fe_length & (block_size - 1)))
- die("Unable to continue: unaligned extents in metadata file");
-
- range_add(&extents, e.fe_physical, e.fe_length);
- }
- close(fd);
-
- ranges_sort_merge(&extents);
- return extents;
-}
-
-static void reserve_old_fs_space(struct bch_fs *c,
- struct bch_inode_unpacked *root_inode,
- ranges *extents)
-{
- struct bch_dev *ca = c->devs[0];
- struct bch_inode_unpacked dst;
- struct hole_iter iter;
- struct range i;
-
- dst = create_file(c, root_inode, "old_migrated_filesystem",
- 0, 0, S_IFREG|0400, 0);
- dst.bi_size = bucket_to_sector(ca, ca->mi.nbuckets) << 9;
-
- ranges_sort_merge(extents);
-
- for_each_hole(iter, *extents, bucket_to_sector(ca, ca->mi.nbuckets) << 9, i)
- link_data(c, &dst, i.start, i.start, i.end - i.start);
-
- update_inode(c, &dst);
-}
-
-static void copy_fs(struct bch_fs *c, int src_fd, const char *src_path,
- u64 bcachefs_inum, ranges *extents)
-{
- syncfs(src_fd);
-
- struct bch_inode_unpacked root_inode;
- int ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, &root_inode);
- if (ret)
- die("error looking up root directory: %s", strerror(-ret));
-
- if (fchdir(src_fd))
- die("chdir error: %m");
-
- struct stat stat = xfstat(src_fd);
- copy_times(c, &root_inode, &stat);
- copy_xattrs(c, &root_inode, ".");
-
- struct copy_fs_state s = {
- .bcachefs_inum = bcachefs_inum,
- .dev = stat.st_dev,
- .extents = *extents,
- };
-
- /* now, copy: */
- copy_dir(&s, c, &root_inode, src_fd, src_path);
-
- reserve_old_fs_space(c, &root_inode, &s.extents);
-
- update_inode(c, &root_inode);
-
- darray_free(s.extents);
- genradix_free(&s.hardlinks);
-
- bch2_alloc_write(c);
-}
-
-static void find_superblock_space(ranges extents, struct dev_opts *dev)
-{
- struct range *i;
-
- darray_foreach(i, extents) {
- u64 start = round_up(max(256ULL << 10, i->start),
- dev->bucket_size << 9);
- u64 end = round_down(i->end,
- dev->bucket_size << 9);
-
- if (start + (128 << 10) <= end) {
- dev->sb_offset = start >> 9;
- dev->sb_end = dev->sb_offset + 256;
- return;
- }
- }
-
- die("Couldn't find a valid location for superblock");
-}
-
-static void migrate_usage(void)
-{
- puts("bcachefs migrate - migrate an existing filesystem to bcachefs\n"
- "Usage: bcachefs migrate [OPTION]...\n"
- "\n"
- "Options:\n"
-	     "  -f fs                  Root of filesystem to migrate\n"
- " --encrypted Enable whole filesystem encryption (chacha20/poly1305)\n"
- " --no_passphrase Don't encrypt master encryption key\n"
- " -F Force, even if metadata file already exists\n"
- " -h Display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
-}
-
-static const struct option migrate_opts[] = {
- { "encrypted", no_argument, NULL, 'e' },
- { "no_passphrase", no_argument, NULL, 'p' },
- { NULL }
-};
-
-static int migrate_fs(const char *fs_path,
- struct format_opts format_opts,
- bool force)
-{
- if (!path_is_fs_root(fs_path))
-		die("%s is not a filesystem root", fs_path);
-
- int fs_fd = xopen(fs_path, O_RDONLY|O_NOATIME);
- struct stat stat = xfstat(fs_fd);
-
- if (!S_ISDIR(stat.st_mode))
- die("%s is not a directory", fs_path);
-
- struct dev_opts dev = dev_opts_default();
-
- dev.path = dev_t_to_path(stat.st_dev);
- dev.fd = xopen(dev.path, O_RDWR);
-
- unsigned block_size = get_blocksize(dev.path, dev.fd) << 9;
- BUG_ON(!is_power_of_2(block_size) || block_size < 512);
- format_opts.block_size = block_size >> 9;
-
- char *file_path = mprintf("%s/bcachefs", fs_path);
- printf("Creating new filesystem on %s in space reserved at %s\n",
- dev.path, file_path);
-
- bch2_pick_bucket_size(format_opts, &dev);
-
- u64 bcachefs_inum;
- ranges extents = reserve_new_fs_space(file_path,
- format_opts.block_size << 9,
- get_size(dev.path, dev.fd) / 5,
- &bcachefs_inum, stat.st_dev, force);
-
- find_superblock_space(extents, &dev);
-
- struct bch_sb *sb = bch2_format(format_opts, &dev, 1);
- u64 sb_offset = le64_to_cpu(sb->layout.sb_offset[0]);
-
- if (format_opts.passphrase)
- bch2_add_key(sb, format_opts.passphrase);
-
- free(sb);
-
- struct bch_opts opts = bch2_opts_empty();
- struct bch_fs *c = NULL;
- char *path[1] = { dev.path };
-
- opt_set(opts, sb, sb_offset);
- opt_set(opts, nostart, true);
- opt_set(opts, noexcl, true);
-
- c = bch2_fs_open(path, 1, opts);
- if (IS_ERR(c))
- die("Error opening new filesystem: %s", strerror(-PTR_ERR(c)));
-
- mark_unreserved_space(c, extents);
-
- const char *err = bch2_fs_start(c);
- if (err)
- die("Error starting new filesystem: %s", err);
-
- copy_fs(c, fs_fd, fs_path, bcachefs_inum, &extents);
-
- bch2_fs_stop(c);
-
- printf("Migrate complete, running fsck:\n");
- opt_set(opts, nostart, false);
- opt_set(opts, nochanges, true);
-
- c = bch2_fs_open(path, 1, opts);
- if (IS_ERR(c))
- die("Error opening new filesystem: %s", strerror(-PTR_ERR(c)));
-
- bch2_fs_stop(c);
- printf("fsck complete\n");
-
- printf("To mount the new filesystem, run\n"
- " mount -t bcachefs -o sb=%llu %s dir\n"
- "\n"
- "After verifying that the new filesystem is correct, to create a\n"
- "superblock at the default offset and finish the migration run\n"
- " bcachefs migrate-superblock -d %s -o %llu\n"
- "\n"
-	       "The new filesystem will have a file at /old_migrated_filesystem\n"
- "referencing all disk space that might be used by the existing\n"
- "filesystem. That file can be deleted once the old filesystem is\n"
- "no longer needed (and should be deleted prior to running\n"
- "bcachefs migrate-superblock)\n",
- sb_offset, dev.path, dev.path, sb_offset);
- return 0;
-}
-
-int cmd_migrate(int argc, char *argv[])
-{
- struct format_opts format_opts = format_opts_default();
- char *fs_path = NULL;
- bool no_passphrase = false, force = false;
- int opt;
-
- while ((opt = getopt_long(argc, argv, "f:Fh",
- migrate_opts, NULL)) != -1)
- switch (opt) {
- case 'f':
- fs_path = optarg;
- break;
- case 'e':
- format_opts.encrypted = true;
- break;
- case 'p':
- no_passphrase = true;
- break;
- case 'F':
- force = true;
- break;
- case 'h':
- migrate_usage();
- exit(EXIT_SUCCESS);
- }
-
- if (!fs_path)
-		die("Please specify a filesystem to migrate");
-
- if (format_opts.encrypted && !no_passphrase)
- format_opts.passphrase = read_passphrase_twice("Enter passphrase: ");
-
- return migrate_fs(fs_path, format_opts, force);
-}
-
-static void migrate_superblock_usage(void)
-{
- puts("bcachefs migrate-superblock - create default superblock after migrating\n"
- "Usage: bcachefs migrate-superblock [OPTION]...\n"
- "\n"
- "Options:\n"
- " -d device Device to create superblock for\n"
- " -o offset Offset of existing superblock\n"
- " -h Display this help and exit\n"
- "Report bugs to <linux-bcache@vger.kernel.org>");
-}
-
-int cmd_migrate_superblock(int argc, char *argv[])
-{
- char *dev = NULL;
- u64 offset = 0;
- int opt, ret;
-
- while ((opt = getopt(argc, argv, "d:o:h")) != -1)
- switch (opt) {
- case 'd':
- dev = optarg;
- break;
- case 'o':
- ret = kstrtou64(optarg, 10, &offset);
- if (ret)
- die("Invalid offset");
- break;
- case 'h':
- migrate_superblock_usage();
- exit(EXIT_SUCCESS);
- }
-
- if (!dev)
- die("Please specify a device");
-
- if (!offset)
- die("Please specify offset of existing superblock");
-
- int fd = xopen(dev, O_RDWR);
- struct bch_sb *sb = __bch2_super_read(fd, offset);
-
- if (sb->layout.nr_superblocks >= ARRAY_SIZE(sb->layout.sb_offset))
- die("Can't add superblock: no space left in superblock layout");
-
- unsigned i;
- for (i = 0; i < sb->layout.nr_superblocks; i++)
- if (le64_to_cpu(sb->layout.sb_offset[i]) == BCH_SB_SECTOR)
- die("Superblock layout already has default superblock");
-
- memmove(&sb->layout.sb_offset[1],
- &sb->layout.sb_offset[0],
- sb->layout.nr_superblocks * sizeof(u64));
- sb->layout.nr_superblocks++;
-
- sb->layout.sb_offset[0] = cpu_to_le64(BCH_SB_SECTOR);
-
- bch2_super_write(fd, sb);
- close(fd);
-
- return 0;
-}
diff --git a/debian/bcachefs-tools.dirs b/debian/bcachefs-tools.dirs
deleted file mode 100644
index ea98e984..00000000
--- a/debian/bcachefs-tools.dirs
+++ /dev/null
@@ -1,2 +0,0 @@
-sbin/
-usr/share/man/man8/
diff --git a/debian/bcachefs-tools.postinst b/debian/bcachefs-tools.postinst
new file mode 100644
index 00000000..56dd8905
--- /dev/null
+++ b/debian/bcachefs-tools.postinst
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+set -e
+
+#DEBHELPER#
+
+case "$1" in
+ configure)
+ if which update-initramfs >/dev/null; then
+ update-initramfs -u
+ fi
+ ;;
+esac
+
diff --git a/debian/bcachefs-tools.postrm b/debian/bcachefs-tools.postrm
new file mode 100644
index 00000000..2d913367
--- /dev/null
+++ b/debian/bcachefs-tools.postrm
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+set -e
+
+#DEBHELPER#
+
+case "$1" in
+ remove)
+ if which update-initramfs >/dev/null; then
+ update-initramfs -u
+ fi
+ ;;
+esac
+
diff --git a/debian/bcachefs.triggers b/debian/bcachefs.triggers
new file mode 100644
index 00000000..6c9f4543
--- /dev/null
+++ b/debian/bcachefs.triggers
@@ -0,0 +1 @@
+activate update-initramfs
diff --git a/debian/changelog b/debian/changelog
index 26d64694..b2eaedce 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,44 +1,158 @@
-bcachefs-tools (1.0.8-2~bpo8+1) jessie-backports; urgency=medium
+bcachefs-tools (1:1.9.1-1) unstable; urgency=medium
- * Rebuild for jessie-backports.
+ * New upstream release
+  * New build-dependency: librust-strum-dev
+ * Update copyright file
- -- Mathieu Parent <sathieu@debian.org> Mon, 21 Sep 2015 21:18:39 +0200
+ -- Jonathan Carter <jcc@debian.org> Thu, 20 Jun 2024 17:28:12 +0200
-bcachefs-tools (1.0.8-2) unstable; urgency=medium
+bcachefs-tools (1:1.7.0-1) unstable; urgency=medium
- * Only run update-initramfs if installed. Fix dracut. (Closes: #788442)
+ * Upload to unstable (Closes: #1066929)
+ * Bump epoch
+ - Ack from pochu:
+ https://lists.debian.org/msgid-search/1c5f86c9-1525-4d44-996f-3d4eed1e64d6@debian.org
- -- David Mohr <david@mcbf.net> Thu, 11 Jun 2015 10:23:48 -0600
+ -- Jonathan Carter <jcc@debian.org> Wed, 15 May 2024 12:04:38 +0200
-bcachefs-tools (1.0.8-1) unstable; urgency=medium
+bcachefs-tools (24+really1.7.0-1~exp1) experimental; urgency=medium
- [ James Page ]
- * d/control: Add Vcs fields.
+ [ Steinar H. Gunderson ]
+ * New upstream release.
+ * Remove the do-not-install-mount-symlink patch.
+ * Add epoch to deal with new upstream versioning scheme,
+ and disable pristine-tar option in gbp.conf, since there is no
+ pristine-tar branch. (Closes: #1054620)
+ * revert-bindgen-changes.patch: New patch, reinstates upstream
+ hack that makes bcachefs-tools work with bindgen older than 0.69.4
+ (which isn't in Debian yet).
+ * Build-depend on systemd-dev, since udev.pc has moved there.
+ * Enable Rust parts, including mount.bcachefs. (Closes: #1060256)
+ * Include the right .mk flags to get DEB_HOST_RUST_TYPE etc., and
+ export that (needed for Debian's cargo wrapper).
+ * Override CARGO, and set CARGO_HOME during installation.
+ * Run prepare-debian in dh_auto_configure, and clean up the vendor dir
+ in dh_auto_clean.
+ * Remove Cargo.lock before the build, so that we can use Debian's versions
+ of all Rust crates. Also remove it in dh_auto_clean, so that it does not
+ take a diff.
+ * revert-bindgen-changes.patch: New patch, loosen required versions
+ of Rust crates
+ - errno (0.2 -> >= 0.2, < 1; Debian has 0.4)
+ - udev (0.7 -> >= 0.7, < 1; Debian has 0.8)
+ - memoffset (0.8.0 -> 0.6; Debian has 0.6.5, so a downgrade)
+ - paste (1.0.11 -> 1.0; Debian has 1.0.8, so a downgrade)
+ - bindgen (0.69.4 -> 0.66; see revert-bindgen-changes.patch above)
+ * Add build-dependency on all relevant Rust crates, and on python3:native
+ due to use of the Cargo wrapper.
+ * Build-Depend on pkgconf instead of pkg-config (Lintian warning).
+ * Add debhelper tokens to postinst and postrm scripts (Lintian warning).
+ * Depend on python3:any due to bcachefsck_all being a Python script
+ (Lintian warning).
- [ David Mohr ]
- * Don't depend on initramfs-tools, instead recommend it (Closes: #775674)
- * New upstream release 1.0.8
- * Update changelog
- * Add patch to clean bcache-register
- * Update changelog
- * Adding dep3 headers to the 0001 patch
- * Update watch to use http://evilpiepirate.org/git/bcache-tools.git
-  * Add patch for gcc-5 compatibility.
- Thanks to James Cowgill (Closes: #777798)
+ [ Jonathan Carter ]
+ * Upload to experimental
+ * Add librust-chrono-dev, librust-getset-dev and
+ librust-gag-dev to build-depends
+ * Recreate dependencies patch to match new upstream source
+ * Update standards version to 4.7.0
+ * Update above mentioned revert-bindgen-changes.patch
- -- David Mohr <david@mcbf.net> Tue, 26 May 2015 20:57:58 -0600
+ -- Jonathan Carter <jcc@debian.org> Thu, 25 Apr 2024 13:53:55 +0200
-bcachefs-tools (1.0.7-1) unstable; urgency=medium
+bcachefs-tools (24+really1.3.4-2) unstable; urgency=medium
- [ David Mohr ]
- * Based on work by Gabriel de Perthuis <g2p.code+debian@gmail.com>
- * Initial release. (Closes: #708132)
+ * Upload to unstable
- [ Robie Basak ]
- * Remove unnecessary file bcache-tools.postrm.
- * debian/copyright fixes.
- * Add shebang to bcache-tools.preinst.
- * Drop Vcs-* for now.
- * Add self to Uploaders.
+ -- Jonathan Carter <jcc@debian.org> Wed, 03 Jan 2024 17:06:16 +0200
- -- Robie Basak <robie@justgohome.co.uk> Mon, 27 Oct 2014 13:32:08 +0000
+bcachefs-tools (24+really1.3.4-2~exp1) experimental; urgency=medium
+
+ [ Chris Hofstaedtler ]
+ * Non-maintainer upload.
+ * Install files into /usr instead of /. (Closes: #1059373)
+
+ [ Jonathan Carter ]
+ * Do not install bcachefs.mount symlink (Closes: #1057295)
+
+ -- Jonathan Carter <jcc@debian.org> Wed, 27 Dec 2023 19:22:06 +0200
+
+bcachefs-tools (24+really1.3.4-1) unstable; urgency=medium
+
+ * New upstream release
+
+ -- Jonathan Carter <jcc@debian.org> Tue, 21 Nov 2023 17:26:13 +0200
+
+bcachefs-tools (24+really1.2-1) unstable; urgency=medium
+
+ * New upstream release (Closes: #1054613)
+ * Disable new rust build
+ (dependencies currently unavailable in Debian, see debian/README.todo)
+ * Remove unneeded override_dh_auto_clean from debian/rules
+ (Closes: #1043654)
+
+ -- Jonathan Carter <jcc@debian.org> Thu, 28 Sep 2023 19:54:47 +0200
+
+bcachefs-tools (24-1) unstable; urgency=medium
+
+ * New upstream release
+
+ -- Jonathan Carter <jcc@debian.org> Tue, 29 Nov 2022 09:40:27 +0200
+
+bcachefs-tools (23-1) unstable; urgency=medium
+
+ * New upstream release
+ * Update standards version to 4.6.1
+
+ -- Jonathan Carter <jcc@debian.org> Mon, 31 Oct 2022 11:45:25 +0200
+
+bcachefs-tools (0.1+git20220216.a1e928a-1) unstable; urgency=medium
+
+ * New upstream snapshot
+ * Grab patch from Ubuntu to reduce memory on amd64 builders
+ (http://launchpadlibrarian.net/580140160/bcachefs-tools_0.1+git20210805.6c42566-2_0.1+git20210805.6c42566-2ubuntu1.diff.gz)
+ * Update copyright years
+
+ -- Jonathan Carter <jcc@debian.org> Wed, 16 Feb 2022 14:42:20 +0200
+
+bcachefs-tools (0.1+git20210805.6c42566-2) unstable; urgency=medium
+
+ * Remove valgrind as build-dependency, seems unneeded unless
+ doing debug work and is not available on all architectures.
+
+ -- Jonathan Carter <jcc@debian.org> Fri, 03 Sep 2021 16:07:11 +0200
+
+bcachefs-tools (0.1+git20210805.6c42566-1) unstable; urgency=medium
+
+ * New upstream snapshot
+ * Update standards version to 4.6.0
+ * Add python3-docutils as dependency
+ * Do not run tests at package build time
+
+ -- Jonathan Carter <jcc@debian.org> Tue, 06 Apr 2021 15:11:27 +0200
+
+bcachefs-tools (0.1+git20201025.742dbbdb-1) unstable; urgency=medium
+
+ * New upstream snapshot
+
+ -- Jonathan Carter <jcc@debian.org> Mon, 26 Oct 2020 08:45:37 +0200
+
+bcachefs-tools (0.1+git20201017.8a4408-1~exp1) unstable; urgency=medium
+
+ * New upstream release
+ * Bump debhelper-compat to 13
+ * Update standards version to 4.5.0
+ * Declare Rules-Requires-Root: no
+ * debian/copyright:
+ - Update copyright years
+ - Update copyright owners
+ - Add intel and ccan copyright
+ * Add build-dep on libudev-dev, python3-pytest and valgrind
+
+ -- Jonathan Carter <jcc@debian.org> Sun, 18 Oct 2020 17:29:27 +0200
+
+bcachefs-tools (0.1+git20190829.aa2a42b-1~exp1) experimental; urgency=medium
+
+ * Initial Release (Closes: #935178)
+
+ -- Jonathan Carter <jcc@debian.org> Mon, 16 Sep 2019 10:36:04 +0000
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index f599e28b..00000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-10
diff --git a/debian/control b/debian/control
index 08673f4b..9e5ff022 100644
--- a/debian/control
+++ b/debian/control
@@ -1,17 +1,39 @@
Source: bcachefs-tools
-Maintainer: Kent Overstreet <kent.overstreet@gmail.com>
+Maintainer: Jonathan Carter <jcc@debian.org>
Section: utils
Priority: optional
-Standards-Version: 3.9.5
-Build-Depends: debhelper (>= 9), pkg-config, libblkid-dev, uuid-dev,
- libscrypt-dev, libsodium-dev, libkeyutils-dev, liburcu-dev, zlib1g-dev,
- libattr1-dev, libaio-dev, libzstd-dev
-Homepage: http://bcache.evilpiepirate.org/
+Standards-Version: 4.7.0
+Rules-Requires-Root: no
+Build-Depends: debhelper-compat (= 13),
+ cargo,
+ python3:native,
+ pkgconf,
+ python3-docutils,
+ libaio-dev,
+ libfuse3-dev,
+ libblkid-dev,
+ libkeyutils-dev,
+ liblz4-dev,
+ libscrypt-dev,
+ libsodium-dev,
+ libudev-dev,
+ liburcu-dev,
+ libzstd-dev,
+ systemd-dev,
+ uuid-dev,
+ zlib1g-dev,
+Homepage: https://bcachefs.org/
+Vcs-Git: https://salsa.debian.org/jcc/bcachefs-tools.git
+Vcs-Browser: https://salsa.debian.org/jcc/bcachefs-tools
Package: bcachefs-tools
Architecture: linux-any
-Depends: ${shlibs:Depends}, ${misc:Depends}
+Depends: ${shlibs:Depends}, ${misc:Depends}, python3:any
Recommends: initramfs-tools | linux-initramfs-tool
Description: bcachefs userspace tools
Userspace tools for bcachefs, a modern copy on write, checksumming, multi
device filesystem.
+ .
+ Note: The current Debian kernels do not come with bcachefs support; you
+ will have to use your own kernel or one provided by a 3rd party that
+ contains bcachefs support.
diff --git a/debian/copyright b/debian/copyright
index 1491ae80..4a1d369b 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -1,22 +1,67 @@
-Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
+Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Contact: kmo@daterainc.com
linux-bcache@vger.kernel.org
-Source: http://evilpiepirate.org/git/bcache-tools.git
+Source: https://evilpiepirate.org/git/bcachefs-tools.git
Files: *
-Copyright: 2013 Kent Overstreet <kmo@daterainc.com>
+Copyright: 2023-2024 Oracle
+ 2013-2022 Kent Overstreet <kmo@daterainc.com>
+ 2018 SUSE Linux
+ 2012-2016, Yann Collet
+ 2013 Gabriel de Perthuis <g2p.code@gmail.com>
+ 2015 Martin Willi
+ 2007-2015 Herbert Xu <herbert@gondor.apana.org.au>
+ 2014-2015 Thomas Graf <tgraf@suug.ch>
+ 2008-2014 Patrick McHardy <kaber@trash.net>
+ 2004-2014 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ 2012 Google Inc
+ 2010-2011 Inter Corp
+ 2009-2010 Jozsef Kadlecsik (kadlec@blackhole.kfki.hu)
+ 2006-2008 Greg Kroah-Hartman <greg@kroah.com>
+ 2006-2008 Novell Inc.
+ 2007-2008 Intel Corporation <willy@linux.intel.com>
+ 2002-2003 Patrick Mochel
+ 2002-2003 Open Source Development Labs
+ 2002 James Morris <jmorris@intercode.com.au>
+ 2001-2002 Silicon Graphics, Inc.
+ 2002 David S. Miller (davem@redhat.com)
+ 2001 Jens Axboe <axboe@kernel.dk>
+ 2001 Ming Lei <ming.lei@canonical.com>
+ 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org>
+ 2001 Linus Torvalds
License: GPL-2
-Files: bcache-super-show.c
-Copyright: 2013 Gabriel de Perthuis <g2p.code@gmail.com>
-License: GPL-2
+Files: ccan/*
+Copyright: Copyright waived
+License: cc0-1.0
+ On Debian systems, the complete text of the Creative Commons CC0 1.0
+ Universal license (CC0-1.0) can be found in
+ "/usr/share/common-licenses/CC0-1.0".
-Files: bcache.c
-Copyright: 1996-2001, PostgreSQL Global Development Group
-License: PostgreSQL
+Files: ccan/darray/*
+Copyright: 2011 Joseph Adams <joeyadams3.14159@gmail.com>
+License: expat
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+ .
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+ .
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
Files: debian/*
-Copyright: 2014 Tom Strickx <tstrickx@rootcu.be>,
+Copyright: 2019-2024 Jonathan Carter <jcc@debian.org>
+ 2014 Tom Strickx <tstrickx@rootcu.be>,
2014 David Mohr <david@mcbf.net>
License: GPL-2+
@@ -62,25 +107,3 @@ License: GPL-2+
On Debian systems, the full text of the GNU General Public
License version 2 can be found in the file
`/usr/share/common-licenses/GPL-2'.
-
-License: PostgreSQL
- Permission to use, copy, modify, and distribute this
- software and its documentation for any purpose, without fee,
- and without a written agreement is hereby granted, provided
- that the above copyright notice and this paragraph and the
- following two paragraphs appear in all copies.
- .
- IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO
- ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR
- CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT
- OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF
- THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE
- POSSIBILITY OF SUCH DAMAGE.
- .
- THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY
- WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS"
- BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS
- TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
- MODIFICATIONS.
diff --git a/debian/gbp.conf b/debian/gbp.conf
index a347f100..27e23cb0 100644
--- a/debian/gbp.conf
+++ b/debian/gbp.conf
@@ -1,5 +1,5 @@
[DEFAULT]
-pristine-tar = True
+pristine-tar = False
upstream-tag = v%(version)s
ignore-branch = True
diff --git a/debian/rules b/debian/rules
index ae98f5ce..17729457 100755
--- a/debian/rules
+++ b/debian/rules
@@ -1,9 +1,41 @@
#!/usr/bin/make -f
+include /usr/share/dpkg/architecture.mk
+include /usr/share/rustc/architecture.mk
+
+export DEB_HOST_RUST_TYPE DEB_HOST_GNU_TYPE
+
+export DEB_BUILD_MAINT_OPTIONS=hardening=+all
+export CARGO=/usr/share/cargo/bin/cargo
+export CARGO_HOME=$(CURDIR)/debian/cargo_home
+export DEB_CARGO_CRATE=bcachefs-tools_$(DEB_VERSION_UPSTREAM)
+
PREFIX := /usr
+ROOT_SBINDIR := /usr/sbin
+
+DEB_BUILD_ARCH ?= $(shell dpkg-architecture -qDEB_BUILD_ARCH)
+
+ifeq ($(DEB_BUILD_ARCH),amd64)
+ DEB_BUILD_MAINT_OPTIONS += optimize=-lto
+endif
%:
dh $@
+override_dh_auto_configure:
+ $(CARGO) prepare-debian $(CURDIR)/vendor --link-from-system
+
+override_dh_auto_build:
+ $(RM) Cargo.lock
+ dh_auto_build -- CARGO="$(CARGO)"
+
override_dh_auto_install:
- dh_auto_install -- "PREFIX=$(PREFIX)"
+ dh_auto_install -- "PREFIX=$(PREFIX)" "ROOT_SBINDIR=$(ROOT_SBINDIR)"
+
+override_dh_auto_clean:
+ ! [ -d $(CURDIR)/vendor ] || $(RM) -r $(CURDIR)/vendor
+ ! [ -d $(CARGO_HOME) ] || $(RM) -r $(CARGO_HOME)
+ $(RM) Cargo.lock
+ dh_auto_clean
+
+override_dh_auto_test:
diff --git a/debian/watch b/debian/watch
index f9ca3c6d..12df5313 100644
--- a/debian/watch
+++ b/debian/watch
@@ -1,3 +1,3 @@
-version=3
-
-http://evilpiepirate.org/git/bcache-tools.git/refs/ /git/bcache-tools.git/tag/\?id=v(\d[\d.]*)
+version=4
+opts=filenamemangle=s/.+\/v?(\d\S+)\.tar\.gz/bcachefs-tools_$1\.tar\.gz/,uversionmangle=s/(\d)/$1/ \
+ https://github.com/koverstreet/bcachefs-tools/tags .*/v?(\d\S+)\.tar\.gz
diff --git a/default.nix b/default.nix
index f19ff107..6466507b 100644
--- a/default.nix
+++ b/default.nix
@@ -1,32 +1,9 @@
-{ nixpkgs ? (import ./nix/nixpkgs.nix)
-}:
-
-with nixpkgs;
-
-stdenv.mkDerivation rec {
- name = "bcachefs-tools-${version}";
- version = "git";
-
- src = lib.cleanSource ./.; # NOTE: ignore .git, otherwise things get weird!
-
- nativeBuildInputs = [ git pkgconfig ];
- buildInputs =
- [ liburcu libuuid libaio zlib attr keyutils
- libsodium libscrypt
- ];
-
- enableParallelBuilding = true;
- makeFlags =
- [ "PREFIX=$(out)"
- ];
-
- meta = with stdenv.lib; {
- description = "Userspace tools for bcachefs";
- homepage = http://bcachefs.org;
- license = licenses.gpl2;
- platforms = platforms.linux;
- maintainers =
- [ "Kent Overstreet <kent.overstreet@gmail.com>"
- ];
- };
-}
+(import (
+ let
+ lock = builtins.fromJSON (builtins.readFile ./flake.lock);
+ in
+ fetchTarball {
+ url = "https://github.com/edolstra/flake-compat/archive/${lock.nodes.flake-compat.locked.rev}.tar.gz";
+ sha256 = lock.nodes.flake-compat.locked.narHash;
+ }
+) { src = ./.; }).defaultNix
diff --git a/doc/bcachefs-principles-of-operation.tex b/doc/bcachefs-principles-of-operation.tex
new file mode 100644
index 00000000..841108bd
--- /dev/null
+++ b/doc/bcachefs-principles-of-operation.tex
@@ -0,0 +1,1244 @@
+\documentclass{article}
+
+\usepackage{imakeidx}
+\usepackage[pdfborder={0 0 0}]{hyperref}
+\usepackage{longtable}
+
+\title{bcachefs: Principles of Operation}
+\author{Kent Overstreet}
+
+\date{}
+
+\begin{document}
+
+\maketitle
+\tableofcontents
+
+\section{Introduction and overview}
+
+Bcachefs is a modern, general purpose, copy on write filesystem descended from
+bcache, a block layer cache.
+
+The internal architecture is very different from most existing filesystems where
+the inode is central and many data structures hang off of the inode. Instead,
+bcachefs is architected more like a filesystem on top of a relational database,
+with tables for the different filesystem data types - extents, inodes, dirents,
+xattrs, et cetera.
+
+bcachefs supports almost all of the same features as other modern COW
+filesystems, such as ZFS and btrfs, but in general with a cleaner, simpler,
+higher performance design.
+
+\subsection{Performance overview}
+
+The core of the architecture is a very high performance and very low latency b+
+tree, which is not a conventional b+ tree but more of a hybrid, taking
+concepts from compacting data structures: btree nodes are very large, log
+structured, and compacted (resorted) as necessary in memory. This means our b+
+trees are very shallow compared to other filesystems.
+
+What this means for the end user is that since we require very few seeks or disk
+reads, filesystem latency is extremely good - especially cache cold filesystem
+latency, which does not show up in most benchmarks but has a huge impact on real
+world performance, as well as how fast the system "feels" in normal interactive
+usage. Latency has been a major focus throughout the codebase - notably, we have
+assertions that we never hold b+ tree locks while doing IO, and the btree
+transaction layer makes it easy to aggressively drop and retake locks as
+needed - one major goal of bcachefs is to be the first general purpose soft
+realtime filesystem.
+
+Additionally, unlike other COW btrees, btree updates are journalled. This
+greatly improves our write efficiency on random update workloads, as it means
+btree writes are only done when we have a large block of updates, or when
+required by memory reclaim or journal reclaim.
+
+\subsection{Bucket based allocation}
+
+As mentioned bcachefs is descended from bcache, where the ability to efficiently
+invalidate cached data and reuse disk space was a core design requirement. To
+make this possible the allocator divides the disk up into buckets, typically
+512k to 2M but possibly larger or smaller. Buckets and data pointers have
+generation numbers: we can reuse a bucket with cached data in it without finding
+and deleting all the data pointers by incrementing the generation number.
+
+In keeping with the copy-on-write theme of avoiding update in place wherever
+possible, we never rewrite or overwrite data within a bucket - when we allocate
+a bucket, we write to it sequentially and then we don't write to it again until
+the bucket has been invalidated and the generation number incremented.
+
+This means we require a copying garbage collector to deal with internal
+fragmentation, when patterns of random writes leave us with many buckets that
+are partially empty (because the data they contained was overwritten) - copy GC
+evacuates buckets that are mostly empty by writing the data they contain to new
+buckets. This also means that we need to reserve space on the device for the
+copy GC reserve when formatting - typically 8\% or 12\%.
+
+There are some advantages to structuring the allocator this way, besides being
+able to support cached data:
+\begin{itemize}
+ \item By maintaining multiple write points that are writing to different buckets,
+ we're able to easily and naturally segregate unrelated IO from different
+ processes, which helps greatly with fragmentation.
+
+ \item The fast path of the allocator is essentially a simple bump allocator - the
+ disk space allocation is extremely fast
+
+ \item Fragmentation is generally a non issue unless copygc has to kick
+ in, and it usually doesn't under typical usage patterns. The
+ allocator and copygc are doing essentially the same things as
+ the flash translation layer in SSDs, but within the filesystem
+ we have much greater visibility into where writes are coming
+ from and how to segregate them, as well as which data is
+ actually live - performance is generally more predictable than
+ with SSDs under similar usage patterns.
+
+ \item The same algorithms will in the future be used for managing SMR
+ hard drives directly, avoiding the translation layer in the hard
+ drive - doing this work within the filesystem should give much
+ better performance and much more predictable latency.
+\end{itemize}
+
+\section{Feature overview}
+
+\subsection{IO path options}
+
+Most options that control the IO path can be set at either the filesystem level
+or on individual inodes (files and directories). When set on a directory via the
+\texttt{bcachefs setattr} command, they will be automatically applied recursively.
+
+\subsubsection{Checksumming}
+
+bcachefs supports both metadata and data checksumming - crc32c by default, but
+stronger checksums are available as well. Enabling data checksumming incurs some
+performance overhead - besides the checksum calculation, writes have to be
+bounced for checksum stability (Linux generally cannot guarantee that the buffer
+being written is not modified in flight), but reads generally do not have to be
+bounced.
+
+Checksum granularity in bcachefs is at the level of individual extents, which
+results in smaller metadata but means we have to read entire extents in order to
+verify the checksum. By default, checksummed and compressed extents are capped
+at 64k. For most applications and usage scenarios this is an ideal trade off, but
+small random \texttt{O\_DIRECT} reads will incur significant overhead. In the
+future, checksum granularity will be a per-inode option.
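+
+As an illustration, a stronger data checksum could be selected at format time
+(option names follow the option list later in this document; the exact flag
+syntax accepted by \texttt{bcachefs format} may differ between versions):
+\begin{quote} \begin{verbatim}
+bcachefs format --data_checksum=crc64 /dev/sda1
+\end{verbatim} \end{quote}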
+
+\subsubsection{Encryption}
+
+bcachefs supports authenticated (AEAD style) encryption - ChaCha20/Poly1305.
+When encryption is enabled, the poly1305 MAC replaces the normal data and
+metadata checksums. This style of encryption is superior to typical block layer
+or filesystem level encryption (usually AES-XTS), which only operates on blocks
+and doesn't have a way to store nonces or MACs. In contrast, we store a nonce
+and cryptographic MAC alongside data pointers, meaning we have a chain of trust
+up to the superblock (or journal, in the case of unclean shutdowns) and can
+definitely tell if metadata has been modified, dropped, or replaced with an
+earlier version. Therefore, replay attacks are not possible, with the exception
+of an offline rollback of the entire filesystem to a previous version (but see
+the WARNING below).
+
+Encryption can only be specified for the entire filesystem, not per file or
+directory - this is because metadata blocks do not belong to a particular file.
+All data and metadata except for the superblock is encrypted, and all data
+and metadata is authenticated.
+
+In the future we'll probably add AES-GCM for platforms that have hardware
+acceleration for AES, but in the meantime software implementations of ChaCha20
+are also quite fast on most platforms.
+
+\texttt{scrypt} is currently used for the key derivation function (KDF), which
+converts the user supplied passphrase to an encryption key. This is the same
+function used by Tarsnap and Qubes OS’s backup support. The key derivation is
+implemented entirely in user-space, so other means of deriving a key can be used
+in the future without any kernel changes.
+
+
+To format a filesystem with encryption, use
+\begin{quote} \begin{verbatim}
+bcachefs format --encrypted /dev/sda1
+\end{verbatim} \end{quote}
+
+You will be prompted for a passphrase. Then, to use an encrypted filesystem
+use the command
+\begin{quote} \begin{verbatim}
+bcachefs unlock /dev/sda1
+\end{verbatim} \end{quote}
+
+You will be prompted for the passphrase and the encryption key will be added to
+your in-kernel keyring; mount, fsck and other commands will then work as usual.
+
+The passphrase on an existing encrypted filesystem can be changed with the
+\texttt{bcachefs set-passphrase} command. To permanently unlock an encrypted
+filesystem, use the \texttt{bcachefs remove-passphrase} command - this can be
+useful when dumping filesystem metadata for debugging by the developers.
+
+There is a \texttt{wide\_macs} option which controls the size of the
+cryptographic MACs stored on disk. By default, only 80 bits are stored, which
+should be sufficient security for most applications. With the
+\texttt{wide\_macs} option enabled we store the full 128 bit MAC, at the cost of
+making extents 8 bytes bigger. \texttt{wide\_macs} is recommended for cases
+where an attacker can make repeated attempts at forging a MAC, such as scenarios
+where the storage device itself is untrusted (but see below).
+
+For technical reasons, bcachefs encryption is unsafe if the underlying storage
+is snapshotted and rolled back to an earlier version. (Using bcachefs's own
+snapshot functionality \textit{is} safe.) Therefore, one must exercise care
+when using bcachefs encryption with ``fancy'' storage devices. It is safe to
+rely on bcachefs encryption if both of the following hold:
+
+\begin{itemize}
+ \item You trust your drives to not be actively malicious. For the
+ internal storage on your laptop or desktop, this is probably a
+ safe assumption, and if it is not, you likely have much worse
+ problems. However, it is not necessarily a safe assumption for
+ e.g. USB drives or network storage. In those cases you will
+ need to decide for yourself if you are worried about this.
+
+ \item You are not using ``fancy'' storage systems that support snapshots.
+ This includes e.g. LVM, ZFS, and loop devices on reflinked or
+ snapshotted files. Most network storage and/or virtualization
+ solutions also support snapshots.
+\end{itemize}
+
+If you \textit{are} using snapshots, you must make sure that you never mount
+a snapshotted, encrypted volume, except with \texttt{-o nochanges}. If this
+rule is violated, an attacker might be able to recover sensitive data that
+the encryption was supposed to protect \footnotemark. Future versions of
+bcachefs will not have this limitation. In the meantime, one can make this
+problem much more difficult to exploit by encrypting the volumes on which
+bcachefs resides using LUKS, provided that LUKS is above anything that could
+take a snapshot. For instance, if you are using bcachefs on LVM and might
+take an LVM snapshot, LUKS would need to be between LVM and bcachefs.
+
+\footnotetext{Technical details: AEAD algorithms, such as ChaCha20/Poly1305,
+require that a \textit{nonce} be used for every encryption. This nonce does not
+need to be kept secret, but one must never encrypt more than one message with
+the same (key, nonce) pair. In the case of ChaCha20/Poly1305, violating this
+rule loses confidentiality and integrity for all messages with the reused nonce.
+Unfortunately, bcachefs currently derives the nonce for data and journal extents
+from on-disk state. If a volume is snapshotted and the snapshot mounted,
+bcachefs will use the same keys and nonces for both the original volume and the
+snapshot. As long as at least one of the volumes is strictly read-only, everything
+is okay, but as soon as data is written, bcachefs will use the same nonce to
+encrypt what is almost certain to be two different messages, which is insecure.
+Encrypting the volume bcachefs is on makes this much harder to exploit because
+the attacks rely on observing the XOR of the ChaCha20 ciphertexts, and disk
+encryption hides this information.}
+
+\subsubsection{Compression}
+
+bcachefs supports gzip, lz4 and zstd compression. As with data checksumming, we
+compress entire extents, not individual disk blocks - this gives us better
+compression ratios than other filesystems, at the cost of reduced small random
+read performance.
+
+Data can also be compressed or recompressed with a different algorithm in the
+background by the rebalance thread, if the \texttt{background\_compression}
+option is set.
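+
+For example, foreground writes might use lz4 while the rebalance thread
+recompresses data with zstd in the background (a sketch; flag names mirror the
+option names listed later and may vary by version):
+\begin{quote} \begin{verbatim}
+bcachefs format --compression=lz4 \
+                --background_compression=zstd \
+                /dev/sda1
+\end{verbatim} \end{quote}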
+
+\subsection{Multiple devices}
+
+bcachefs is a multi-device filesystem. Devices need not be the same size: by
+default, the allocator will stripe across all available devices, biasing in
+favor of the devices with more free space, so that all devices in the filesystem
+fill up at the same rate. Devices need not have the same performance
+characteristics: we track device IO latency and direct reads to the device that
+is currently fastest.
+
+\subsubsection{Replication}
+
+bcachefs supports standard RAID1/10 style redundancy with the
+\texttt{data\_replicas} and \texttt{metadata\_replicas} options. Layout is not
+fixed as with RAID10: a given extent can be replicated across any set of
+devices; the \texttt{bcachefs fs usage} command shows how data is replicated
+within a filesystem.
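+
+A sketch of creating a two-device filesystem with two replicas and then
+inspecting how extents are replicated (device names are illustrative):
+\begin{quote} \begin{verbatim}
+bcachefs format --replicas=2 /dev/sdb /dev/sdc
+mount -t bcachefs /dev/sdb:/dev/sdc /mnt
+bcachefs fs usage /mnt
+\end{verbatim} \end{quote}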
+
+\subsubsection{Erasure coding}
+
+bcachefs also supports Reed-Solomon erasure coding - the same algorithm used by
+most RAID5/6 implementations. When enabled with the \texttt{ec} option, the
+desired redundancy is taken from the \texttt{data\_replicas} option - erasure
+coding of metadata is not supported.
+
+Erasure coding works significantly differently from both conventional RAID
+implementations and other filesystems with similar features. In conventional
+RAID, the "write hole" is a significant problem - doing a small write within a
+stripe requires the P and Q (recovery) blocks to be updated as well, and since
+those writes cannot be done atomically there is a window where the P and Q
+blocks are inconsistent - meaning that if the system crashes and recovers with a
+drive missing, reconstruct reads for unrelated data within that stripe will be
+corrupted.
+
+ZFS avoids this by fragmenting individual writes so that every write becomes a
+new stripe - this works, but the fragmentation has a negative effect on
+performance: metadata becomes bigger, and both read and write requests are
+excessively fragmented. Btrfs's erasure coding implementation is more
+conventional, and still subject to the write hole problem.
+
+bcachefs's erasure coding takes advantage of our copy on write nature - since
+updating stripes in place is a problem, we simply don't do that. And since
+excessively small stripes is a problem for fragmentation, we don't erasure code
+individual extents, we erasure code entire buckets - taking advantage of bucket
+based allocation and copying garbage collection.
+
+When erasure coding is enabled, writes are initially replicated, but one of the
+replicas is allocated from a bucket that is queued up to be part of a new
+stripe. When we finish filling up the new stripe, we write out the P and Q
+buckets and then drop the extra replicas for all the data within that stripe -
+the effect is similar to full data journalling, and it means that after erasure
+coding is done the layout of our data on disk is ideal.
+
+Since disks have write caches that are only flushed when we issue a cache flush
+command - which we only do on journal commit - if we can tweak the allocator so
+that the buckets used for the extra replicas are reused (and then overwritten
+again) immediately, this full data journalling should have negligible overhead -
+this optimization is not implemented yet, however.
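+
+As a sketch (the option appears as \texttt{erasure\_code} in the option list
+below; exact flag syntax may vary), erasure coding could be enabled at format
+time together with the desired replica count:
+\begin{quote} \begin{verbatim}
+bcachefs format --replicas=2 --erasure_code /dev/sdb /dev/sdc /dev/sdd
+\end{verbatim} \end{quote}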
+
+\subsubsection{Device labels and targets}
+
+By default, writes are striped across all devices in a filesystem, but they may
+be directed to a specific device or set of devices with the various target
+options. The allocator only prefers to allocate from devices matching the
+specified target; if those devices are full, it will fall back to allocating
+from any device in the filesystem.
+
+Target options may refer to a device directly, e.g.
+\texttt{foreground\_target=/dev/sda1}, or they may refer to a device label. A
+device label is a path delimited by periods - e.g. ssd.ssd1 (and labels need not
+be unique). This gives us ways of referring to multiple devices in target
+options: If we specify ssd in a target option, that will refer to all devices
+with the label ssd or labels that start with ssd (e.g. ssd.ssd1, ssd.ssd2).
+
+Four target options exist. These options all may be set at the filesystem level
+(at format time, at mount time, or at runtime via sysfs), or on a particular
+file or directory:
+
+\begin{description}
+ \item \texttt{foreground\_target}: normal foreground data writes, and
+ metadata if \\ \texttt{metadata\_target} is not set
+ \item \texttt{metadata\_target}: btree writes
+ \item \texttt{background\_target}: If set, user data (not metadata) will
+ be moved to this target in the background
+ \item\texttt{promote\_target}: If set, a cached copy will be added to
+ this target on read, if none exists
+\end{description}
+
+\subsubsection{Caching}
+
+When an extent has multiple copies on different devices, some of those copies
+may be marked as cached. Buckets containing only cached data are discarded as
+needed by the allocator in LRU order.
+
+When data is moved from one device to another according to the \\
+\texttt{background\_target} option, the original copy is left in place but
+marked as cached. With the \texttt{promote\_target} option, the original copy is
+left unchanged and the new copy on the \texttt{promote\_target} device is marked
+as cached.
+
+To do writeback caching, set \texttt{foreground\_target} and
+\texttt{promote\_target} to the cache device, and \texttt{background\_target} to
+the backing device. To do writearound caching, set \texttt{foreground\_target}
+to the backing device and \texttt{promote\_target} to the cache device.
+
+\subsubsection{Durability}
+
+Some devices may be considered to be more reliable than others. For example, we
+might have a filesystem composed of a hardware RAID array and several NVME flash
+devices, to be used as cache. We can set replicas=2 so that losing any of the
+NVME flash devices will not cause us to lose data, and then additionally we can
+set durability=2 for the hardware RAID device to tell bcachefs that we don't
+need extra replicas for data on that device - data on that device will count as
+two replicas, not just one.
+
+The durability option can also be used for writethrough caching: by setting
+durability=0 for a device, it can be used as a cache and only as a cache -
+bcachefs won't consider copies on that device to count towards the number of
+replicas we're supposed to keep.
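+
+A sketch of the scenario above, assuming \texttt{/dev/md0} is the hardware RAID
+array and the NVMe devices are used as cache (here \texttt{--durability} is
+assumed to apply, like \texttt{--label}, to the device that follows it):
+\begin{quote} \begin{verbatim}
+bcachefs format --replicas=2 \
+    --durability=2 /dev/md0 \
+    --durability=1 /dev/nvme0n1 \
+    --durability=1 /dev/nvme1n1
+\end{verbatim} \end{quote}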
+
+\subsection{Reflink}
+
+bcachefs supports reflink, similarly to other filesystems with the same feature.
+\texttt{cp --reflink} will create a copy that shares the underlying storage.
+Reading from that file will become slightly slower - the extent pointing to that
+data is moved to the reflink btree (with a refcount added) and in the extents
+btree we leave a key that points to the indirect extent in the reflink btree,
+meaning that we now have to do two btree lookups to read from that data instead
+of just one.
+
+\subsection{Inline data extents}
+
+bcachefs supports inline data extents, controlled by the \texttt{inline\_data}
+option (on by default). When the end of a file is being written and is smaller
+than half of the filesystem blocksize, it will be written as an inline data
+extent. Inline data extents can also be reflinked (moved to the reflink btree
+with a refcount added): as a todo item we also intend to support compressed
+inline data extents.
+
+\subsection{Subvolumes and snapshots}
+
+bcachefs supports subvolumes and snapshots with a similar userspace interface as
+btrfs. A new subvolume may be created empty, or it may be created as a snapshot
+of another subvolume. Snapshots are writeable and may be snapshotted again,
+creating a tree of snapshots.
+
+Snapshots are very cheap to create: they're not based on cloning of COW btrees
+as with btrfs, but instead are based on versioning of individual keys in the
+btrees. Many thousands or millions of snapshots can be created, with the only
+limitation being disk space.
+
+The following subcommands exist for managing subvolumes and snapshots:
+\begin{itemize}
+ \item \texttt{bcachefs subvolume create}: Create a new, empty subvolume
+ \item \texttt{bcachefs subvolume delete}: Delete an existing subvolume
+ or snapshot
+ \item \texttt{bcachefs subvolume snapshot}: Create a snapshot of an
+ existing subvolume
+\end{itemize}
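+
+Typical usage might look like the following (paths are illustrative, and the
+argument order for \texttt{snapshot} - source then destination - is an
+assumption):
+\begin{quote} \begin{verbatim}
+bcachefs subvolume create /mnt/work
+bcachefs subvolume snapshot /mnt/work /mnt/work.snap
+bcachefs subvolume delete /mnt/work.snap
+\end{verbatim} \end{quote}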
+
+A subvolume can also be deleted with a normal rmdir after deleting all the
+contents, as with \texttt{rm -rf}. Still to be implemented: read-only snapshots,
+recursive snapshot creation, and a method for recursively listing subvolumes.
+
+\subsection{Quotas}
+
+bcachefs supports conventional user/group/project quotas. Quotas do not
+currently apply to snapshot subvolumes, because if a file changes ownership in
+the snapshot it would be ambiguous as to what quota data within that file
+should be charged to.
+
+When a directory has a project ID set it is inherited automatically by
+descendants on creation and rename. When renaming a directory would cause the
+project ID to change, we return -EXDEV so that the move is done file by file
+and the project ID is propagated correctly to descendants - thus, project
+quotas can be used as subdirectory quotas.
+
+\section{Management}
+
+\subsection{Formatting}
+
+To format a new bcachefs filesystem use the subcommand \texttt{bcachefs
+format}, or \texttt{mkfs.bcachefs}. All persistent filesystem-wide options can
+be specified at format time. For an example of a multi device filesystem with
+compression, encryption, replication and writeback caching:
+\begin{quote} \begin{verbatim}
+bcachefs format --compression=lz4 \
+ --encrypted \
+ --replicas=2 \
+ --label=ssd.ssd1 /dev/sda \
+ --label=ssd.ssd2 /dev/sdb \
+ --label=hdd.hdd1 /dev/sdc \
+ --label=hdd.hdd2 /dev/sdd \
+ --label=hdd.hdd3 /dev/sde \
+ --label=hdd.hdd4 /dev/sdf \
+ --foreground_target=ssd \
+ --promote_target=ssd \
+ --background_target=hdd
+\end{verbatim} \end{quote}
+
+\subsection{Mounting}
+
+To mount a multi device filesystem, there are two options. You can specify all
+component devices, separated by colons, e.g.
+\begin{quote} \begin{verbatim}
+mount -t bcachefs /dev/sda:/dev/sdb:/dev/sdc /mnt
+\end{verbatim} \end{quote}
+Or, use the mount.bcachefs tool to mount by filesystem UUID. Still todo: improve
+the mount.bcachefs tool to support mounting by filesystem label.
+
+No special handling is needed for recovering from unclean shutdown. Journal
+replay happens automatically, and diagnostic messages in the dmesg log will
+indicate whether recovery was from clean or unclean shutdown.
+
+The \texttt{-o degraded} option will allow a filesystem to be mounted without
+all the devices, but will fail if data would be missing. The
+\texttt{-o very\_degraded} option can be used to attempt mounting when data would be
+missing.
+
+Also relevant is the \texttt{-o nochanges} option. It disallows any and all
+writes to the underlying devices, pinning dirty data in memory as needed if,
+for example, journal replay was necessary - think of it as a "super read-only"
+mode. It can be used for data recovery, and for testing version upgrades.
+
+The \texttt{-o verbose} option enables additional log output during the mount process.
+
+\subsection{Fsck}
+
+It is possible to run fsck either in userspace with the \texttt{bcachefs fsck}
+subcommand (also available as \texttt{fsck.bcachefs}), or in the kernel while
+mounting by specifying the \texttt{-o fsck} mount option. In either case the
+exact same fsck implementation is run; only the environment is different.
+Running fsck in the kernel at mount time has the advantage of somewhat better
+performance, while running in userspace has the ability to be stopped with
+ctrl-c and can prompt the user for fixing errors. To fix errors while running
+fsck in the kernel, use the \texttt{-o fix\_errors} option.
+
+The \texttt{-n} option passed to fsck implies the \texttt{-o nochanges} option;
+\texttt{bcachefs fsck -ny} can be used to test filesystem repair in dry-run
+mode.
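+
+For example, one might first run a read-only check from userspace and then
+repair at mount time (device and mountpoint are illustrative):
+\begin{quote} \begin{verbatim}
+bcachefs fsck -n /dev/sda1
+mount -t bcachefs -o fsck,fix_errors /dev/sda1 /mnt
+\end{verbatim} \end{quote}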
+
+\subsection{Status of data}
+
+The \texttt{bcachefs fs usage} subcommand may be used to display filesystem usage broken
+out in various ways. Data usage is broken out by type: superblock, journal,
+btree, data, cached data, and parity, and by which sets of devices extents are
+replicated across. We also give per-device usage which includes fragmentation
+due to partially used buckets.
+
+\subsection{Journal}
+
+The journal has a number of tunables that affect filesystem performance. Journal
+commits are fairly expensive operations as they require issuing FLUSH and FUA
+operations to the underlying devices. By default, we issue a journal flush one
+second after a filesystem update has been done; this is controlled with the
+\texttt{journal\_flush\_delay} option, which takes a parameter in milliseconds.
+
+Filesystem sync and fsync operations issue journal flushes; this can be disabled
+with the \texttt{journal\_flush\_disabled} option - the
+\texttt{journal\_flush\_delay} option will still apply, and in the event of a
+system crash we will never lose more than (by default) one second of work. This
+option may be useful on a personal workstation or laptop, and perhaps less
+appropriate on a server.
+
+The journal reclaim thread runs in the background, kicking off btree node writes
+and btree key cache flushes to free up space in the journal. Even in the absence
+of space pressure it will run slowly in the background: this is controlled by
+the \texttt{journal\_reclaim\_delay} parameter, with a default of 100
+milliseconds.
+
+The journal should be sized sufficiently that bursts of activity do not fill up
+the journal too quickly; also, a larger journal means that we can queue up
+larger btree writes. The \texttt{bcachefs device resize-journal} subcommand can be used for
+resizing the journal on disk on a particular device - it can be used on a
+mounted or unmounted filesystem.
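+
+An illustrative invocation (the size argument format is an assumption and may
+differ between versions):
+\begin{quote} \begin{verbatim}
+bcachefs device resize-journal /dev/sda1 2G
+\end{verbatim} \end{quote}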
+
+In the future, we should implement a method to see how much space is currently
+utilized in the journal.
+
+\subsection{Device management}
+
+\subsubsection{Filesystem resize}
+
+A filesystem can be resized on a particular device with the
+\texttt{bcachefs device resize} subcommand. Currently only growing is supported,
+not shrinking.
+
+\subsubsection{Device add/removal}
+
+The following subcommands exist for adding and removing devices from a mounted
+filesystem:
+\begin{itemize}
+ \item \texttt{bcachefs device add}: Formats and adds a new device to an
+ existing filesystem.
+ \item \texttt{bcachefs device remove}: Permanently removes a device from
+ an existing filesystem.
+ \item \texttt{bcachefs device online}: Connects a device to a running
+ filesystem that was mounted without it (i.e. in degraded mode)
+ \item \texttt{bcachefs device offline}: Disconnects a device from a
+ mounted filesystem without removing it.
+ \item \texttt{bcachefs device evacuate}: Migrates data off of a
+ particular device to prepare for removal, setting it read-only
+ if necessary.
+ \item \texttt{bcachefs device set-state}: Changes the state of a member
+ device: one of rw (readwrite), ro (readonly), failed, or spare.
+
+ A failed device is considered to have 0 durability, and replicas
+ on that device won't be counted towards the number of replicas
+ an extent should have by rereplicate - however, bcachefs will
+ still attempt to read from devices marked as failed.
+\end{itemize}
+
+The \texttt{bcachefs device remove}, \texttt{bcachefs device offline} and
+\texttt{bcachefs device set-state} commands take force options for when they
+would leave the filesystem degraded or with data missing. Todo: regularize and
+improve those options.
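+
+A sketch of growing a filesystem and later retiring a device (device paths are
+illustrative, and the argument order for \texttt{device add} - mount point then
+device - is an assumption):
+\begin{quote} \begin{verbatim}
+bcachefs device add /mnt /dev/sdd
+bcachefs device evacuate /dev/sdd
+bcachefs device remove /dev/sdd
+\end{verbatim} \end{quote}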
+
+\subsection{Data management}
+
+\subsubsection{Data rereplicate}
+
+The \texttt{bcachefs data rereplicate} command may be used to scan for extents
+that have insufficient replicas and write additional replicas, e.g. after a
+device has been removed from a filesystem or after replication has been enabled
+or increased.
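+
+For example, after raising the replica count on a mounted filesystem (a sketch;
+see the options section below for the sysfs interface), additional copies can
+be written in the background:
+\begin{quote} \begin{verbatim}
+echo 2 > /sys/fs/bcachefs/<uuid>/options/data_replicas
+bcachefs data rereplicate /mnt
+\end{verbatim} \end{quote}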
+
+\subsubsection{Rebalance}
+
+To be implemented: a command for moving data between devices to equalize usage
+on each device. Not normally required because the allocator attempts to equalize
+usage across devices as it stripes, but can be necessary in certain scenarios -
+i.e. when a two-device filesystem with replication enabled that is very full has
+a third device added.
+
+\subsubsection{Scrub}
+
+To be implemented: a command for reading all data within a filesystem and
+ensuring that checksums are valid, fixing bitrot when a valid copy can be found.
+
+\section{Options}
+
+Most bcachefs options can be set filesystem wide, and a significant subset can
+also be set on inodes (files and directories), overriding the global defaults.
+Filesystem wide options may be set when formatting, when mounting, or at runtime
+via \texttt{/sys/fs/bcachefs/<uuid>/options/}. When set at runtime via sysfs,
+the persistent options in the superblock are updated as well; when options are
+passed as mount parameters the persistent options are unmodified.
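+
+For instance, a runtime change via sysfs (which also updates the persistent
+option in the superblock) might look like:
+\begin{quote} \begin{verbatim}
+echo zstd > /sys/fs/bcachefs/<uuid>/options/compression
+cat /sys/fs/bcachefs/<uuid>/options/compression
+\end{verbatim} \end{quote}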
+
+\subsection{File and directory options}
+
+Options on files and directories are set with the \texttt{bcachefs setattr}
+command, or via the extended attribute interface described below.
+
+Options set on inodes (files and directories) are automatically inherited by
+their descendants, and inodes also record whether a given option was explicitly
+set or inherited from their parent. When renaming a directory would cause
+inherited attributes to change we fail the rename with -EXDEV, causing userspace
+to do the rename file by file so that inherited attributes stay consistent.
+
+Inode options are available as extended attributes. The options that have been
+explicitly set are available under the \texttt{bcachefs} namespace, and the
+effective options (explicitly set and inherited options) are available under the
+\texttt{bcachefs\_effective} namespace. Examples of listing options with the
+getfattr command:
+
+\begin{quote} \begin{verbatim}
+$ getfattr -d -m '^bcachefs\.' filename
+$ getfattr -d -m '^bcachefs_effective\.' filename
+\end{verbatim} \end{quote}
+
+Options may be set via the extended attribute interface, but it is preferable to
+use the \texttt{bcachefs setattr} command as it will correctly propagate options
+recursively.
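+
+A sketch of setting an option recursively on a directory with
+\texttt{bcachefs setattr} (the exact flag syntax is an assumption):
+\begin{quote} \begin{verbatim}
+bcachefs setattr --compression=lz4 /mnt/photos
+\end{verbatim} \end{quote}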
+
+\subsection{Full option list}
+
+\begin{tabbing}
+\hspace{0.2in} \= \kill
+ \texttt{block\_size} \` \textbf{format} \\
+ \> \parbox{4.3in}{Filesystem block size (default 4k)} \\ \\
+
+ \texttt{btree\_node\_size} \` \textbf{format} \\
+ \> Btree node size, default 256k \\ \\
+
+ \texttt{errors} \` \textbf{format,mount,runtime} \\
+ \> Action to take on filesystem error \\ \\
+
+ \texttt{metadata\_replicas} \` \textbf{format,mount,runtime} \\
+ \> Number of replicas for metadata (journal and btree) \\ \\
+
+ \texttt{data\_replicas} \` \textbf{format,mount,runtime,inode} \\
+ \> Number of replicas for user data \\ \\
+
+ \texttt{replicas} \` \textbf{format} \\
+ \> Alias for both metadata\_replicas and data\_replicas \\ \\
+
+ \texttt{metadata\_checksum} \` \textbf{format,mount,runtime} \\
+ \> Checksum type for metadata writes \\ \\
+
+ \texttt{data\_checksum} \` \textbf{format,mount,runtime,inode} \\
+ \> Checksum type for data writes \\ \\
+
+ \texttt{compression} \` \textbf{format,mount,runtime,inode} \\
+ \> Compression type \\ \\
+
+ \texttt{background\_compression} \` \textbf{format,mount,runtime,inode} \\
+ \> Background compression type \\ \\
+
+ \texttt{str\_hash} \` \textbf{format,mount,runtime,inode} \\
+ \> Hash function for string hash tables (directories and xattrs) \\ \\
+
+ \texttt{metadata\_target} \` \textbf{format,mount,runtime,inode} \\
+ \> Preferred target for metadata writes \\ \\
+
+ \texttt{foreground\_target} \` \textbf{format,mount,runtime,inode} \\
+ \> Preferred target for foreground writes \\ \\
+
+ \texttt{background\_target} \` \textbf{format,mount,runtime,inode} \\
+ \> Target for data to be moved to in the background \\ \\
+
+ \texttt{promote\_target} \` \textbf{format,mount,runtime,inode} \\
+ \> Target for data to be copied to on read \\ \\
+
+ \texttt{erasure\_code} \` \textbf{format,mount,runtime,inode} \\
+ \> Enable erasure coding \\ \\
+
+ \texttt{inodes\_32bit} \` \textbf{format,mount,runtime} \\
+ \> Restrict new inode numbers to 32 bits \\ \\
+
+ \texttt{shard\_inode\_numbers} \` \textbf{format,mount,runtime} \\
+ \> Use CPU id for high bits of new inode numbers. \\ \\
+
+ \texttt{wide\_macs} \` \textbf{format,mount,runtime} \\
+ \> Store full 128 bit cryptographic MACs (default 80) \\ \\
+
+ \texttt{inline\_data} \` \textbf{format,mount,runtime} \\
+ \> Enable inline data extents (default on) \\ \\
+
+ \texttt{journal\_flush\_delay} \` \textbf{format,mount,runtime} \\
+ \> Delay in milliseconds before automatic journal commit (default 1000) \\ \\
+
+ \texttt{journal\_flush\_disabled}\`\textbf{format,mount,runtime} \\
+ \> \begin{minipage}{4.3in}Disables journal flush on sync/fsync.
+ \texttt{journal\_flush\_delay} remains in effect, thus with the
+ default setting not more than 1 second of work will be lost.
+ \end{minipage} \\ \\
+
+ \texttt{journal\_reclaim\_delay}\` \textbf{format,mount,runtime} \\
+ \> Delay in milliseconds before automatic journal reclaim \\ \\
+
+ \texttt{acl} \` \textbf{format,mount} \\
+ \> Enable POSIX ACLs \\ \\
+
+ \texttt{usrquota} \` \textbf{format,mount} \\
+ \> Enable user quotas \\ \\
+
+ \texttt{grpquota} \` \textbf{format,mount} \\
+ \> Enable group quotas \\ \\
+
+ \texttt{prjquota} \` \textbf{format,mount} \\
+ \> Enable project quotas \\ \\
+
+ \texttt{degraded} \` \textbf{mount} \\
+ \> Allow mounting with data degraded \\ \\
+
+ \texttt{very\_degraded} \` \textbf{mount} \\
+ \> Allow mounting with data missing \\ \\
+
+ \texttt{verbose} \` \textbf{mount} \\
+ \> Extra debugging info during mount/recovery \\ \\
+
+ \texttt{fsck} \` \textbf{mount} \\
+ \> Run fsck during mount \\ \\
+
+ \texttt{fix\_errors} \` \textbf{mount} \\
+ \> Fix errors without asking during fsck \\ \\
+
+ \texttt{ratelimit\_errors} \` \textbf{mount} \\
+ \> Ratelimit error messages during fsck \\ \\
+
+ \texttt{read\_only} \` \textbf{mount} \\
+ \> Mount in read only mode \\ \\
+
+ \texttt{nochanges} \` \textbf{mount} \\
+ \> Issue no writes, even for journal replay \\ \\
+
+ \texttt{norecovery} \` \textbf{mount} \\
+ \> Don't replay the journal (not recommended) \\ \\
+
+ \texttt{noexcl} \` \textbf{mount} \\
+ \> Don't open devices in exclusive mode \\ \\
+
+ \texttt{version\_upgrade} \` \textbf{mount} \\
+ \> Upgrade on disk format to latest version \\ \\
+
+ \texttt{discard} \` \textbf{device} \\
+ \> Enable discard/TRIM support \\ \\
+\end{tabbing}
+
+\subsection{Error actions}
+The \texttt{errors} option is used for inconsistencies that indicate some sort
+of a bug. Valid error actions are:
+\begin{description}
+ \item[{\tt continue}] Log the error but continue normal operation
+ \item[{\tt ro}] Emergency read only, immediately halting any changes
+ to the filesystem on disk
+ \item[{\tt panic}] Immediately halt the entire machine, printing a
+ backtrace on the system console
+\end{description}
+
+\subsection{Checksum types}
+Valid checksum types are:
+\begin{description}
+ \item[{\tt none}]
+ \item[{\tt crc32c}] (default)
+ \item[{\tt crc64}]
+\end{description}
+
+\subsection{Compression types}
+Valid compression types are:
+\begin{description}
+ \item[{\tt none}] (default)
+ \item[{\tt lz4}]
+ \item[{\tt gzip}]
+ \item[{\tt zstd}]
+\end{description}
+
+\subsection{String hash types}
+Valid hash types for string hash tables are:
+\begin{description}
+ \item[{\tt crc32c}]
+ \item[{\tt crc64}]
+ \item[{\tt siphash}] (default)
+\end{description}
+
+\section{Debugging tools}
+
+\subsection{Sysfs interface}
+
+Mounted filesystems are available in sysfs at \texttt{/sys/fs/bcachefs/<uuid>/}
+with various options, performance counters and internal debugging aids.
+
+\subsubsection{Options}
+
+Filesystem options may be viewed and changed via \\
+\texttt{/sys/fs/bcachefs/<uuid>/options/}, and settings changed via sysfs will
+be persistently changed in the superblock as well.
+
+\subsubsection{Time stats}
+
+bcachefs tracks the latency and frequency of various operations and events, with
+quantiles for latency/duration in the
+\texttt{/sys/fs/bcachefs/<uuid>/time\_stats/} directory.
+
+\begin{description}
+ \item \texttt{blocked\_allocate} \\
+ Tracks when allocating a bucket must wait because none are
+ immediately available, meaning the copygc thread is not keeping
+ up with evacuating mostly empty buckets or the allocator thread
+ is not keeping up with invalidating and discarding buckets.
+
+ \item \texttt{blocked\_allocate\_open\_bucket} \\
+ Tracks when allocating a bucket must wait because all of our
+ handles for pinning open buckets are in use (we statically
+ allocate 1024).
+
+ \item \texttt{blocked\_journal} \\
+ Tracks when getting a journal reservation must wait, either
+ because journal reclaim isn't keeping up with reclaiming space
+ in the journal, or because journal writes are taking too long to
+ complete and we already have too many in flight.
+
+ \item \texttt{btree\_gc} \\
+ Tracks when the btree\_gc code must walk the btree at runtime -
+ for recalculating the oldest outstanding generation number of
+ every bucket in the btree.
+
+ \item \texttt{btree\_lock\_contended\_read}
+ \item \texttt{btree\_lock\_contended\_intent}
+ \item \texttt{btree\_lock\_contended\_write} \\
+ Track when taking a read, intent or write lock on a btree node
+ must block.
+
+ \item \texttt{btree\_node\_mem\_alloc} \\
+ Tracks the total time to allocate memory in the btree node cache
+ for a new btree node.
+
+ \item \texttt{btree\_node\_split} \\
+ Tracks btree node splits - when a btree node becomes full and is
+ split into two new nodes
+
+ \item \texttt{btree\_node\_compact} \\
+ Tracks btree node compactions - when a btree node becomes full
+ and needs to be compacted on disk.
+
+ \item \texttt{btree\_node\_merge} \\
+ Tracks when two adjacent btree nodes are merged.
+
+ \item \texttt{btree\_node\_sort} \\
+ Tracks sorting and resorting entire btree nodes in memory,
+ either after reading them in from disk or for compacting prior
+ to creating a new sorted array of keys.
+
+ \item \texttt{btree\_node\_read} \\
+ Tracks reading in btree nodes from disk.
+
+ \item \texttt{btree\_interior\_update\_foreground} \\
+ Tracks foreground time for btree updates that change btree
+ topology - i.e. btree node splits, compactions and merges; the
+ duration measured roughly corresponds to lock held time.
+
+ \item \texttt{btree\_interior\_update\_total} \\
+ Tracks time to completion for topology changing btree updates;
+ first they have a foreground part that updates btree nodes in
+ memory, then after the new nodes are written there is a
+ transaction phase that records an update to an interior node or
+ a new btree root as well as changes to the alloc btree.
+
+ \item \texttt{data\_read} \\
+ Tracks the core read path - looking up a request in the extents
+ (and possibly also reflink) btree, allocating bounce buffers if
+ necessary, issuing reads, checksumming, decompressing, decrypting,
+ and delivering completions.
+
+ \item \texttt{data\_write} \\
+ Tracks the core write path - allocating space on disk for a new
+ write, allocating bounce buffers if necessary,
+ compressing, encrypting, checksumming, issuing writes, and
+ updating the extents btree to point to the new data.
+
+ \item \texttt{data\_promote} \\
+ Tracks promote operations, which happen when a read operation
+ writes an additional cached copy of an extent to
+ \texttt{promote\_target}. This is done asynchronously from the
+ original read.
+
+ \item \texttt{journal\_flush\_write} \\
+ Tracks writing of flush journal entries to disk, which first
+ issue cache flush operations to the underlying devices then
+ issue the journal writes as FUA writes. Time is tracked starting
+ from after all journal reservations have released their
+ references or the completion of the previous journal write.
+
+ \item \texttt{journal\_noflush\_write} \\
+ Tracks writing of non-flush journal entries to disk, which do
+ not issue cache flushes or FUA writes.
+
+ \item \texttt{journal\_flush\_seq} \\
+ Tracks time to flush a journal sequence number to disk by
+ filesystem sync and fsync operations, as well as the allocator
+		prior to reusing buckets when every bucket available for reuse
+		still requires a journal flush.
+\end{description}
+
+\subsubsection{Internals}
+
+\begin{description}
+ \item \texttt{btree\_cache} \\
+ Shows information on the btree node cache: number of cached
+ nodes, number of dirty nodes, and whether the cannibalize lock
+ (for reclaiming cached nodes to allocate new nodes) is held.
+
+ \item \texttt{dirty\_btree\_nodes} \\
+ Prints information related to the interior btree node update
+ machinery, which is responsible for ensuring dependent btree
+ node writes are ordered correctly.
+
+ For each dirty btree node, prints:
+ \begin{itemize}
+ \item Whether the \texttt{need\_write} flag is set
+ \item The level of the btree node
+ \item The number of sectors written
+ \item Whether writing this node is blocked, waiting for
+ other nodes to be written
+ \item Whether it is waiting on a btree\_update to
+ complete and make it reachable on-disk
+ \end{itemize}
+
+ \item \texttt{btree\_key\_cache} \\
+ Prints information on the btree key cache: number of freed keys
+		(which must wait for an SRCU barrier to complete before being
+ freed), number of cached keys, and number of dirty keys.
+
+ \item \texttt{btree\_transactions} \\
+		Lists each running btree transaction that has locks held,
+ listing which nodes they have locked and what type of lock, what
+ node (if any) the process is blocked attempting to lock, and
+ where the btree transaction was invoked from.
+
+ \item \texttt{btree\_updates} \\
+ Lists outstanding interior btree updates: the mode (nothing
+ updated yet, or updated a btree node, or wrote a new btree root,
+ or was reparented by another btree update), whether its new
+ btree nodes have finished writing, its embedded closure's
+ refcount (while nonzero, the btree update is still waiting), and
+ the pinned journal sequence number.
+
+ \item \texttt{journal\_debug} \\
+ Prints a variety of internal journal state.
+
+	\item \texttt{journal\_pins} \\
+ Lists items pinning journal entries, preventing them from being
+ reclaimed.
+
+ \item \texttt{new\_stripes} \\
+ Lists new erasure-coded stripes being created.
+
+ \item \texttt{stripes\_heap} \\
+ Lists erasure-coded stripes that are available to be reused.
+
+ \item \texttt{open\_buckets} \\
+ Lists buckets currently being written to, along with data type
+ and refcount.
+
+ \item \texttt{io\_timers\_read} \\
+ \item \texttt{io\_timers\_write} \\
+ Lists outstanding IO timers - timers that wait on total reads or
+ writes to the filesystem.
+
+ \item \texttt{trigger\_journal\_flush} \\
+ Echoing to this file triggers a journal commit.
+
+ \item \texttt{trigger\_gc} \\
+ Echoing to this file causes the GC code to recalculate each
+ bucket's oldest\_gen field.
+
+ \item \texttt{prune\_cache} \\
+ Echoing to this file prunes the btree node cache.
+
+ \item \texttt{read\_realloc\_races} \\
+ This counts events where the read path reads an extent and
+ discovers the bucket that was read from has been reused while
+ the IO was in flight, causing the read to be retried.
+
+ \item \texttt{extent\_migrate\_done} \\
+ This counts extents moved by the core move path, used by copygc
+ and rebalance.
+
+ \item \texttt{extent\_migrate\_raced} \\
+ This counts extents that the move path attempted to move but no
+ longer existed when doing the final btree update.
+\end{description}
+
+\subsubsection{Unit and performance tests}
+
+Echoing into \texttt{/sys/fs/bcachefs/<uuid>/perf\_test} runs various low level
+btree tests, some intended as unit tests and others as performance tests. The
+syntax is
+\begin{quote} \begin{verbatim}
+ echo <test_name> <nr_iterations> <nr_threads> > perf_test
+\end{verbatim} \end{quote}
+
+When complete, the elapsed time will be printed in the dmesg log. The full list
+of tests that can be run can be found near the bottom of
+\texttt{fs/bcachefs/tests.c}.
+
+\subsection{Debugfs interface}
+
+The contents of every btree, as well as various internal per-btree-node
+information, are available under \texttt{/sys/kernel/debug/bcachefs/<uuid>/}.
+
+For every btree, we have the following files:
+
+\begin{description}
+ \item \textit{btree\_name} \\
+ Entire btree contents, one key per line
+
+ \item \textit{btree\_name}\texttt{-formats} \\
+ Information about each btree node: the size of the packed bkey
+ format, how full each btree node is, number of packed and
+ unpacked keys, and number of nodes and failed nodes in the
+ in-memory search trees.
+
+ \item \textit{btree\_name}\texttt{-bfloat-failed} \\
+ For each sorted set of keys in a btree node, we construct a
+ binary search tree in eytzinger layout with compressed keys.
+ Sometimes we aren't able to construct a correct compressed
+ search key, which results in slower lookups; this file lists the
+ keys that resulted in these failed nodes.
+\end{description}
+
+\subsection{Listing and dumping filesystem metadata}
+
+\subsubsection{bcachefs show-super}
+
+This subcommand is used for examining and printing bcachefs superblocks. It
+takes two optional parameters:
+\begin{description}
+ \item \texttt{-l}: Print superblock layout, which records the amount of
+ space reserved for the superblock and the locations of the
+ backup superblocks.
+ \item \texttt{-f, --fields=(fields)}: List of superblock sections to
+ print, \texttt{all} to print all sections.
+\end{description}
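+
+For example (the \texttt{members} field name is illustrative; field names
+correspond to superblock sections):
+\begin{quote} \begin{verbatim}
+bcachefs show-super -l /dev/sda1
+bcachefs show-super --fields=members /dev/sda1
+\end{verbatim} \end{quote}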
+
+\subsubsection{bcachefs list}
+
+This subcommand gives access to the same functionality as the debugfs interface,
+listing btree nodes and contents, but for offline filesystems.
+
+\subsubsection{bcachefs list\_journal}
+
+This subcommand lists the contents of the journal, which primarily records btree
+updates ordered by when they occurred.
+
+\subsubsection{bcachefs dump}
+
+This subcommand can dump all metadata in a filesystem (including multi device
+filesystems) as qcow2 images: when encountering issues that \texttt{fsck} can
+not recover from and need attention from the developers, this makes it possible
+to send the developers only the required metadata. Encrypted filesystems must
+first be unlocked with \texttt{bcachefs remove-passphrase}.
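+
+An illustrative invocation (the \texttt{-o} output flag is an assumption; one
+qcow2 image is written per member device):
+\begin{quote} \begin{verbatim}
+bcachefs dump -o /tmp/metadata.qcow2 /dev/sda1
+\end{verbatim} \end{quote}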
+
+\section{ioctl interface}
+
+This section documents bcachefs-specific ioctls:
+
+\begin{description}
+ \item \texttt{BCH\_IOCTL\_QUERY\_UUID} \\
+ Returns the UUID of the filesystem: used to find the sysfs
+ directory given a path to a mounted filesystem.
+
+ \item \texttt{BCH\_IOCTL\_FS\_USAGE} \\
+ Queries filesystem usage, returning global counters and a list
+ of counters by \texttt{bch\_replicas} entry.
+
+ \item \texttt{BCH\_IOCTL\_DEV\_USAGE} \\
+ Queries usage for a particular device, as bucket and sector
+ counts broken out by data type.
+
+ \item \texttt{BCH\_IOCTL\_READ\_SUPER} \\
+ Returns the filesystem superblock, and optionally the superblock
+ for a particular device given that device's index.
+
+ \item \texttt{BCH\_IOCTL\_DISK\_ADD} \\
+ Given a path to a device, adds it to a mounted and running
+ filesystem. The device must already have a bcachefs superblock;
+ options and parameters are read from the new device's superblock
+ and added to the member info section of the existing
+ filesystem's superblock.
+
+ \item \texttt{BCH\_IOCTL\_DISK\_REMOVE} \\
+ Given a path to a device or a device index, attempts to remove
+ it from a mounted and running filesystem. This operation
+ requires walking the btree to remove all references to this
+ device, and may fail if data would become degraded or lost,
+ unless appropriate force flags are set.
+
+ \item \texttt{BCH\_IOCTL\_DISK\_ONLINE} \\
+ Given a path to a device that is a member of a running
+ filesystem (in degraded mode), brings it back online.
+
+ \item \texttt{BCH\_IOCTL\_DISK\_OFFLINE} \\
+ Given a path or device index of a device in a multi device
+ filesystem, attempts to close it without removing it, so that
+ the device may be re-added later and the contents will still be
+ available.
+
+ \item \texttt{BCH\_IOCTL\_DISK\_SET\_STATE} \\
+ Given a path or device index of a device in a multi device
+ filesystem, attempts to set its state to one of read-write,
+ read-only, failed or spare. Takes flags to force if the
+ filesystem would become degraded.
+
+ \item \texttt{BCH\_IOCTL\_DISK\_GET\_IDX} \\
+ \item \texttt{BCH\_IOCTL\_DISK\_RESIZE} \\
+ \item \texttt{BCH\_IOCTL\_DISK\_RESIZE\_JOURNAL} \\
+ \item \texttt{BCH\_IOCTL\_DATA} \\
+ Starts a data job, which walks all data and/or metadata in a
+		filesystem, performing some operation on each btree
+ node and extent. Returns a file descriptor which can be read
+ from to get the current status of the job, and closing the file
+		descriptor (i.e. on process exit) stops the data job.
+
+ \item \texttt{BCH\_IOCTL\_SUBVOLUME\_CREATE} \\
+ \item \texttt{BCH\_IOCTL\_SUBVOLUME\_DESTROY} \\
+ \item \texttt{BCHFS\_IOC\_REINHERIT\_ATTRS} \\
+\end{description}
+
+\section{On disk format}
+
+\subsection{Superblock}
+
+The superblock is the first thing to be read when accessing a bcachefs
+filesystem. It is located 4kb from the start of the device, with redundant
+copies elsewhere - typically one immediately after the first superblock, and one
+at the end of the device.
+
+The \texttt{bch\_sb\_layout} records the amount of space reserved for the
+superblock as well as the locations of all the superblocks. It is included with
+every superblock, and additionally written 3584 bytes from the start of the
+device (512 bytes before the first superblock).
+
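+To make the offsets concrete, a sketch of the default layout at the start of a
+device (the additional copy near the end of the device is not shown):
+
+\begin{verbatim}
+offset (bytes)
+     3584    bch_sb_layout   (512 bytes before the first superblock)
+     4096    superblock 0
+      ...    superblock 1    (typically immediately after superblock 0)
+\end{verbatim}
+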
+Most of the superblock is identical on every device. The exceptions are the
+\texttt{dev\_idx} field, and the journal section which gives the location of the
+journal.
+
+The main section of the superblock contains UUIDs, version numbers, number of
+devices within the filesystem and device index, block size, filesystem creation
+time, and various options and settings. The superblock also has a number of
+variable length sections:
+
+\begin{description}
+ \item \texttt{BCH\_SB\_FIELD\_journal} \\
+ List of buckets used for the journal on this device.
+
+ \item \texttt{BCH\_SB\_FIELD\_members} \\
+ List of member devices, as well as per-device options and
+ settings, including bucket size, number of buckets and time when
+ last mounted.
+
+ \item \texttt{BCH\_SB\_FIELD\_crypt} \\
+ Contains the main chacha20 encryption key, encrypted by the
+ user's passphrase, as well as key derivation function settings.
+
+ \item \texttt{BCH\_SB\_FIELD\_replicas} \\
+ Contains a list of replica entries, which are lists of devices
+ that have extents replicated across them.
+
+ \item \texttt{BCH\_SB\_FIELD\_quota} \\
+ Contains timelimit and warnlimit fields for each quota type
+ (user, group and project) and counter (space, inodes).
+
+ \item \texttt{BCH\_SB\_FIELD\_disk\_groups} \\
+	      Device labels were formerly referred to as disk groups (and still
+	      are throughout the code); this section contains device label
+	      strings and records the tree structure of label paths, allowing a
+	      label, once parsed, to be referred to by integer ID by the target
+	      options.
+
+ \item \texttt{BCH\_SB\_FIELD\_clean} \\
+ When the filesystem is clean, this section contains a list of
+ journal entries that are normally written with each journal
+ write (\texttt{struct jset}): btree roots, as well as filesystem
+ usage and read/write counters (total amount of data read/written
+ to this filesystem). This allows reading the journal to be
+ skipped after clean shutdowns.
+\end{description}
+
+\subsection{Journal}
+
+Every journal write (\texttt{struct jset}) contains a list of entries:
+\texttt{struct jset\_entry}. Below are listed the various journal entry types.
+
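+For orientation, the entry header looks roughly as follows (a sketch; the
+authoritative definition is \texttt{struct jset\_entry} in
+\texttt{bcachefs\_format.h}):
+
+\begin{verbatim}
+struct jset_entry {
+        __le16          u64s;           /* size of this entry, in u64s */
+        __u8            btree_id;
+        __u8            level;
+        __u8            type;           /* BCH_JSET_ENTRY_* */
+        __u8            pad[3];
+
+        struct bkey_i   start[0];       /* keys, if any, follow inline */
+};
+\end{verbatim}
+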
+\begin{description}
+ \item \texttt{BCH\_JSET\_ENTRY\_btree\_key} \\
+ This entry type is used to record every btree update that
+ happens. It contains one or more btree keys (\texttt{struct
+ bkey}), and the \texttt{btree\_id} and \texttt{level} fields of
+ \texttt{jset\_entry} record the btree ID and level the key
+ belongs to.
+
+ \item \texttt{BCH\_JSET\_ENTRY\_btree\_root} \\
+	      This entry type is used for pointers to btree roots. In the
+	      current implementation, every journal write still records every
+	      btree root, although that is subject to change. A btree root is
+	      a bkey of type \texttt{KEY\_TYPE\_btree\_ptr\_v2}, and the
+	      \texttt{btree\_id} and \texttt{level} fields of
+	      \texttt{jset\_entry} record the btree ID and depth.
+
+ \item \texttt{BCH\_JSET\_ENTRY\_clock} \\
+	      Records IO time, not wall clock time - i.e. the total amount of
+	      data read and written, in 512-byte sectors, since the filesystem
+	      was created.
+
+ \item \texttt{BCH\_JSET\_ENTRY\_usage} \\
+ Used for certain persistent counters: number of inodes, current
+ maximum key version, and sectors of persistent reservations.
+
+ \item \texttt{BCH\_JSET\_ENTRY\_data\_usage} \\
+ Stores replica entries with a usage counter, in sectors.
+
+ \item \texttt{BCH\_JSET\_ENTRY\_dev\_usage} \\
+ Stores usage counters for each device: sectors used and buckets
+ used, broken out by each data type.
+\end{description}
+
+\subsection{Btrees}
+
+\subsection{Btree keys}
+
+\begin{description}
+ \item \texttt{KEY\_TYPE\_deleted}
+ \item \texttt{KEY\_TYPE\_whiteout}
+ \item \texttt{KEY\_TYPE\_error}
+ \item \texttt{KEY\_TYPE\_cookie}
+ \item \texttt{KEY\_TYPE\_hash\_whiteout}
+ \item \texttt{KEY\_TYPE\_btree\_ptr}
+ \item \texttt{KEY\_TYPE\_extent}
+ \item \texttt{KEY\_TYPE\_reservation}
+ \item \texttt{KEY\_TYPE\_inode}
+ \item \texttt{KEY\_TYPE\_inode\_generation}
+ \item \texttt{KEY\_TYPE\_dirent}
+ \item \texttt{KEY\_TYPE\_xattr}
+ \item \texttt{KEY\_TYPE\_alloc}
+ \item \texttt{KEY\_TYPE\_quota}
+ \item \texttt{KEY\_TYPE\_stripe}
+ \item \texttt{KEY\_TYPE\_reflink\_p}
+ \item \texttt{KEY\_TYPE\_reflink\_v}
+ \item \texttt{KEY\_TYPE\_inline\_data}
+ \item \texttt{KEY\_TYPE\_btree\_ptr\_v2}
+ \item \texttt{KEY\_TYPE\_indirect\_inline\_data}
+ \item \texttt{KEY\_TYPE\_alloc\_v2}
+ \item \texttt{KEY\_TYPE\_subvolume}
+ \item \texttt{KEY\_TYPE\_snapshot}
+ \item \texttt{KEY\_TYPE\_inode\_v2}
+ \item \texttt{KEY\_TYPE\_alloc\_v3}
+\end{description}
+
+\end{document}
diff --git a/doc/bcachefs.5.rst.tmpl b/doc/bcachefs.5.rst.tmpl
new file mode 100644
index 00000000..e896b861
--- /dev/null
+++ b/doc/bcachefs.5.rst.tmpl
@@ -0,0 +1,120 @@
+========
+bcachefs
+========
+
+--------------------------------------------------
+bcachefs overview, user's manual and configuration
+--------------------------------------------------
+:Manual section: 5
+
+DESCRIPTION
+-----------
+Bcachefs is a multi-device copy-on-write filesystem that supports
+
+- Checksumming
+- Compression
+- Encryption
+- Reflink
+- Caching
+- Replication
+- Erasure coding (reed-solomon)
+
+And more. This document is intended to be an overview of the various features
+and use cases.
+
+Configuration
+-------------
+Most configuration is done via filesystem options that can be set at format
+time, at mount time (as mount -o parameters), or changed at runtime via sysfs
+(the /sys/fs/bcachefs/<UUID>/options/ directory).
+
+Many of those options (particularly those that control the IO path) can also be
+set on individual files and directories via the bcachefs set-file-option
+command. Internally this mostly works via the extended attribute interface, but
+the set-file-option command takes care to propagate options to children
+correctly.
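+
+For example, compressing an archive directory in the background might look like
+this (a sketch - option and flag spellings should be checked against bcachefs
+set-file-option --help and the options table below):
+
+.. code-block:: bash
+
+  # set on the directory; the option is propagated to existing children
+  bcachefs set-file-option --background_compression=lz4 /srv/archive
+
+  # the options are also visible through the extended attribute interface
+  getfattr -d -m 'bcachefs' /srv/archive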
+
+OPTIONS
+-------
+OPTIONS_TABLE
+
+Device management
+-----------------
+Devices can be added, removed, and resized at will, at runtime. There is no
+fixed topology or data layout as there is with hardware RAID or ZFS, and devices
+need not be the same size - the allocator will stripe across multiple disks,
+preferring to allocate from disks with more free space so that disks all fill up
+at the same time.
+
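+For example (a sketch - see bcachefs device --help for the exact syntax):
+
+.. code-block:: bash
+
+  # add a new device to the filesystem mounted at /mnt
+  bcachefs device add /mnt /dev/sdc
+
+  # grow a member after enlarging the underlying block device
+  bcachefs device resize /dev/sdc
+
+  # remove a device; this fails rather than degrade or lose data,
+  # unless the appropriate force flags are given
+  bcachefs device remove /dev/sdc
+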
+We generally avoid per-device options, preferring instead options that can be
+overridden on files or directories, but there are some:
+
+ *durability*
+
+Device labels, targets
+----------------------
+
+Configuration options that point to targets (i.e. a disk or group of disks) may
+be passed either a device (e.g. /dev/sda) or a label. Labels are assigned to
+disks (and need not be unique), and these labels form a nested hierarchy: this
+allows disks to be grouped together and referred to either individually or as a
+group.
+
+For example, given disks formatted with these labels:
+
+.. code-block:: bash
+
+  bcachefs format -g controller1.hdd.hdd1 /dev/sda \
+                  -g controller1.hdd.hdd2 /dev/sdb \
+                  -g controller1.ssd.ssd1 /dev/sdc \
+                  -g controller1.ssd.ssd2 /dev/sdd \
+                  -g controller2.hdd1 /dev/sde \
+                  -g controller2.hdd2 /dev/sdf
+
+Configuration options such as foreground_target may then refer to controller1,
+or controller1.hdd, or controller1.hdd.hdd1 - or to /dev/sda directly.
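+
+Because targets are ordinary filesystem options, they can be set at format
+time, at mount time, or at runtime; for example, at runtime via sysfs (a
+sketch, with a placeholder UUID):
+
+.. code-block:: bash
+
+  # route foreground writes to controller1's SSDs
+  echo controller1.ssd > /sys/fs/bcachefs/<UUID>/options/foreground_target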
+
+Data placement, caching
+-----------------------
+
+The following options control which disks data is written to:
+
+- foreground_target
+- background_target
+- promote_target
+
+The foreground_target option is used to direct writes from applications. The
+background_target option, if set, will cause data to be moved to that target in
+the background by the rebalance thread some time after it has been initially
+written - leaving behind the original copy, but marking it cached so that it can
+be discarded by the allocator. The promote_target option will cause reads to
+write a cached copy of the data being read to that target, if one does not
+already exist.
+
+Together, these options can be used for writeback caching, like so:
+
+.. code-block:: bash
+
+ foreground_target=ssd
+ background_target=hdd
+ promote_target=ssd
+
+Writethrough caching requires telling bcachefs not to trust the cache device,
+which does require a per-device option and thus can't completely be done with
+per-file options. This is done by setting the device's durability to 0.
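+
+For example, a writethrough-style cache might be formatted like this (a sketch -
+whether a per-device option applies to the device(s) listed after it should be
+verified against bcachefs format --help):
+
+.. code-block:: bash
+
+  bcachefs format -g hdd /dev/sdb \
+                  --durability=0 -g ssd /dev/sdc \
+                  --foreground_target=ssd \
+                  --promote_target=ssd \
+                  --background_target=hdd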
+
+These options can all be set on individual files or directories. They can also
+be used to pin a specific file or directory to a specific device or target:
+
+.. code-block:: bash
+
+ foreground_target=ssd
+ background_target=
+ promote_target=
+
+Note that if the target specified is full, the write will spill over to the rest
+of the filesystem.
+
+Data protection
+---------------
+
+foo
diff --git a/doc/macro2rst.py b/doc/macro2rst.py
new file mode 100755
index 00000000..e80f7edb
--- /dev/null
+++ b/doc/macro2rst.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+'''
+A utility script for generating documentation.
+
+Preprocessor macro output from opts_macro.h is parsed and combined with
+bcachefs.5.rst.tmpl to generate bcachefs.5.rst.
+
+>=python3.6
+'''
+
+import sys
+import re
+
+INDENT = ' '
+TEMPLATE = './doc/bcachefs.5.rst.tmpl'
+RST_FILE = './doc/bcachefs.5.rst'
+SANITIZE_CHARS = [
+ '\\\\n',
+ '\\n',
+ ' ',
+ '"',
+ '\\',
+ ]
+
+def sanitize(text):
+ '''
+ Parses opts_macro.h preprocessor output
+ :param text: text to sanitize
+ :type text: str
+ :returns: a list of options
+ :rtype: list
+ '''
+
+ args = []
+ reg = re.search('FMT_START_SECTION(.*)FMT_END_SECTION', text,
+ flags=re.DOTALL)
+ if not reg:
+ raise re.error('no text found')
+
+ # decoding would probably be better, but this works
+ for char in SANITIZE_CHARS:
+ text = text.replace(char, '')
+
+ text = re.split(r'FMT_END_LINE', text)
+
+ # this seemed easier than getting preprocessor macros to play nice
+ # with python's builtin csv module
+ for line in text:
+ vals = line.split(';')
+ if not vals:
+ continue
+ if len(vals) != 4:
+ continue
+ vals = list(map(str.strip, vals))
+ name, is_bool, desc, arg_name = vals
+
+ # this macro value from opts.h indicates that no values are passed
+ if is_bool == 'OPT_BOOL()':
+ args.append(f'--{name}\n{INDENT}{desc}')
+ else:
+ args.append(f'--{name} <{arg_name}>\n{INDENT}{desc}')
+ if not args:
+ raise re.error('no args found, likely parsing error')
+
+ return args
+
+
+def main():
+ ''' Transform stdin to option list and write templated output to new file '''
+ out = ''
+
+ stdin = sys.stdin.read()
+ opts = sanitize(stdin)
+ opts = '\n'.join(opts)
+
+ # Insert into template
+ with open(TEMPLATE, 'r') as in_handle:
+ in_handle = in_handle.read()
+ out = in_handle.replace('OPTIONS_TABLE', opts)
+ with open(RST_FILE, 'w') as out_handle:
+ out_handle.write(out)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/doc/opts_macro.h b/doc/opts_macro.h
new file mode 100644
index 00000000..9172802a
--- /dev/null
+++ b/doc/opts_macro.h
@@ -0,0 +1,12 @@
+#include "../libbcachefs/opts.h"
+
+/**
+ * generate tables from definitions in opts.h
+ */
+
+#define NULL (null)
+
+FMT_START_SECTION
+#define x(_name, _shortopt, _type, _in_mem_type, _mode, _sb_opt, _desc , _usage)\
+_name;_in_mem_type;_usage;_desc FMT_END_LINE
+BCH_OPTS() FMT_END_SECTION
diff --git a/flake.lock b/flake.lock
new file mode 100644
index 00000000..ba4887e2
--- /dev/null
+++ b/flake.lock
@@ -0,0 +1,156 @@
+{
+ "nodes": {
+ "crane": {
+ "inputs": {
+ "nixpkgs": [
+ "nixpkgs"
+ ]
+ },
+ "locked": {
+ "lastModified": 1721842668,
+ "narHash": "sha256-k3oiD2z2AAwBFLa4+xfU+7G5fisRXfkvrMTCJrjZzXo=",
+ "owner": "ipetkov",
+ "repo": "crane",
+ "rev": "529c1a0b1f29f0d78fa3086b8f6a134c71ef3aaf",
+ "type": "github"
+ },
+ "original": {
+ "owner": "ipetkov",
+ "repo": "crane",
+ "type": "github"
+ }
+ },
+ "fenix": {
+ "inputs": {
+ "nixpkgs": [
+ "nixpkgs"
+ ],
+ "rust-analyzer-src": "rust-analyzer-src"
+ },
+ "locked": {
+ "lastModified": 1722320953,
+ "narHash": "sha256-DfGaJtgrzcwPQYLTvjL1KaVIjpvi85b2MpM6yEGvJzM=",
+ "owner": "nix-community",
+ "repo": "fenix",
+ "rev": "483df76def3e5010d709aa3a0418ba2088503994",
+ "type": "github"
+ },
+ "original": {
+ "owner": "nix-community",
+ "repo": "fenix",
+ "type": "github"
+ }
+ },
+ "flake-compat": {
+ "flake": false,
+ "locked": {
+ "lastModified": 1696426674,
+ "narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=",
+ "owner": "edolstra",
+ "repo": "flake-compat",
+ "rev": "0f9255e01c2351cc7d116c072cb317785dd33b33",
+ "type": "github"
+ },
+ "original": {
+ "owner": "edolstra",
+ "repo": "flake-compat",
+ "type": "github"
+ }
+ },
+ "flake-parts": {
+ "inputs": {
+ "nixpkgs-lib": "nixpkgs-lib"
+ },
+ "locked": {
+ "lastModified": 1719994518,
+ "narHash": "sha256-pQMhCCHyQGRzdfAkdJ4cIWiw+JNuWsTX7f0ZYSyz0VY=",
+ "owner": "hercules-ci",
+ "repo": "flake-parts",
+ "rev": "9227223f6d922fee3c7b190b2cc238a99527bbb7",
+ "type": "github"
+ },
+ "original": {
+ "owner": "hercules-ci",
+ "repo": "flake-parts",
+ "type": "github"
+ }
+ },
+ "nixpkgs": {
+ "locked": {
+ "lastModified": 1722185531,
+ "narHash": "sha256-veKR07psFoJjINLC8RK4DiLniGGMgF3QMlS4tb74S6k=",
+ "owner": "nixos",
+ "repo": "nixpkgs",
+ "rev": "52ec9ac3b12395ad677e8b62106f0b98c1f8569d",
+ "type": "github"
+ },
+ "original": {
+ "owner": "nixos",
+ "ref": "nixos-unstable",
+ "repo": "nixpkgs",
+ "type": "github"
+ }
+ },
+ "nixpkgs-lib": {
+ "locked": {
+ "lastModified": 1719876945,
+ "narHash": "sha256-Fm2rDDs86sHy0/1jxTOKB1118Q0O3Uc7EC0iXvXKpbI=",
+ "type": "tarball",
+ "url": "https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz"
+ },
+ "original": {
+ "type": "tarball",
+ "url": "https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz"
+ }
+ },
+ "root": {
+ "inputs": {
+ "crane": "crane",
+ "fenix": "fenix",
+ "flake-compat": "flake-compat",
+ "flake-parts": "flake-parts",
+ "nixpkgs": "nixpkgs",
+ "treefmt-nix": "treefmt-nix"
+ }
+ },
+ "rust-analyzer-src": {
+ "flake": false,
+ "locked": {
+ "lastModified": 1722262053,
+ "narHash": "sha256-KxjkPVn9rQqYam6DhiN/V2NcMXtYW25maxkJoiVMpmE=",
+ "owner": "rust-lang",
+ "repo": "rust-analyzer",
+ "rev": "a021b85be57d34b1eed687fcafd5d5ec64b2d853",
+ "type": "github"
+ },
+ "original": {
+ "owner": "rust-lang",
+ "ref": "nightly",
+ "repo": "rust-analyzer",
+ "type": "github"
+ }
+ },
+ "treefmt-nix": {
+ "inputs": {
+ "nixpkgs": [
+ "nixpkgs"
+ ]
+ },
+ "locked": {
+ "lastModified": 1722330636,
+ "narHash": "sha256-uru7JzOa33YlSRwf9sfXpJG+UAV+bnBEYMjrzKrQZFw=",
+ "owner": "numtide",
+ "repo": "treefmt-nix",
+ "rev": "768acdb06968e53aa1ee8de207fd955335c754b7",
+ "type": "github"
+ },
+ "original": {
+ "owner": "numtide",
+ "repo": "treefmt-nix",
+ "type": "github"
+ }
+ }
+ },
+ "root": "root",
+ "version": 7
+}
diff --git a/flake.nix b/flake.nix
new file mode 100644
index 00000000..4aa66a86
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,212 @@
+{
+ description = "Userspace tools for bcachefs";
+
+ inputs = {
+ nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
+
+ flake-parts.url = "github:hercules-ci/flake-parts";
+
+ treefmt-nix = {
+ url = "github:numtide/treefmt-nix";
+ inputs.nixpkgs.follows = "nixpkgs";
+ };
+
+ crane = {
+ url = "github:ipetkov/crane";
+ inputs.nixpkgs.follows = "nixpkgs";
+ };
+
+ fenix = {
+ url = "github:nix-community/fenix";
+ inputs.nixpkgs.follows = "nixpkgs";
+ };
+
+ flake-compat = {
+ url = "github:edolstra/flake-compat";
+ flake = false;
+ };
+ };
+
+ outputs =
+ inputs@{
+ self,
+ nixpkgs,
+ flake-parts,
+ treefmt-nix,
+ fenix,
+ crane,
+ ...
+ }:
+ flake-parts.lib.mkFlake { inherit inputs; } {
+ imports = [ inputs.treefmt-nix.flakeModule ];
+
+ # can be extended, but these have proper binary cache support in nixpkgs
+ # as of writing.
+ systems = [
+ "aarch64-linux"
+ "x86_64-linux"
+ "i686-linux"
+ ];
+
+ perSystem =
+ {
+ self',
+ config,
+ lib,
+ pkgs,
+ system,
+ ...
+ }:
+ let
+ inherit (builtins) readFile split;
+ inherit (lib.lists) findFirst;
+ inherit (lib.strings) hasPrefix removePrefix substring;
+
+ cargoToml = builtins.fromTOML (builtins.readFile ./Cargo.toml);
+ rustfmtToml = builtins.fromTOML (builtins.readFile ./rustfmt.toml);
+
+ craneLib = crane.mkLib pkgs;
+
+ rev = self.shortRev or self.dirtyShortRev or (substring 0 8 self.lastModifiedDate);
+ makefileVersion = removePrefix "VERSION=" (
+ findFirst (line: hasPrefix "VERSION=" line) "VERSION=0.0.0" (split "\n" (readFile ./Makefile))
+ );
+ version = "${makefileVersion}+${rev}";
+
+ commonArgs = {
+ inherit version;
+ src = self;
+
+ env = {
+ PKG_CONFIG_SYSTEMD_SYSTEMDSYSTEMUNITDIR = "${placeholder "out"}/lib/systemd/system";
+ PKG_CONFIG_UDEV_UDEVDIR = "${placeholder "out"}/lib/udev";
+ };
+
+ makeFlags = [
+ "INITRAMFS_DIR=${placeholder "out"}/etc/initramfs-tools"
+ "PREFIX=${placeholder "out"}"
+ "VERSION=${version}"
+ ];
+
+ dontStrip = true;
+
+ nativeBuildInputs = with pkgs; [
+ pkg-config
+ rustPlatform.bindgenHook
+ ];
+
+ buildInputs = with pkgs; [
+ attr
+ keyutils
+ libaio
+ libsodium
+ liburcu
+ libuuid
+ lz4
+ udev
+ zlib
+ zstd
+ ];
+
+ meta = {
+ description = "Userspace tools for bcachefs";
+ license = lib.licenses.gpl2Only;
+ mainProgram = "bcachefs";
+ };
+ };
+
+ cargoArtifacts = craneLib.buildDepsOnly (commonArgs // { pname = cargoToml.package.name; });
+ in
+ {
+ packages.default = config.packages.bcachefs-tools;
+ packages.bcachefs-tools = craneLib.buildPackage (
+ commonArgs
+ // {
+ inherit cargoArtifacts;
+
+ enableParallelBuilding = true;
+ buildPhaseCargoCommand = ''
+ make ''${enableParallelBuilding:+-j''${NIX_BUILD_CORES}} $makeFlags
+ '';
+ installPhaseCommand = ''
+ make ''${enableParallelBuilding:+-j''${NIX_BUILD_CORES}} $makeFlags install
+ '';
+
+ doInstallCheck = true;
+ installCheckPhase = ''
+ runHook preInstallCheck
+
+ test "$($out/bin/bcachefs version)" = "${version}"
+
+ runHook postInstallCheck
+ '';
+ }
+ );
+
+ packages.bcachefs-tools-fuse = config.packages.bcachefs-tools.overrideAttrs (
+ final: prev: {
+ makeFlags = prev.makeFlags ++ [ "BCACHEFS_FUSE=1" ];
+ buildInputs = prev.buildInputs ++ [ pkgs.fuse3 ];
+ }
+ );
+
+ checks.cargo-clippy = craneLib.cargoClippy (
+ commonArgs
+ // {
+ inherit cargoArtifacts;
+ cargoClippyExtraArgs = "--all-targets -- --deny warnings";
+ }
+ );
+
+ # we have to build our own `craneLib.cargoTest`
+ checks.cargo-test = craneLib.mkCargoDerivation (
+ commonArgs
+ // {
+ inherit cargoArtifacts;
+ doCheck = true;
+
+ enableParallelChecking = true;
+
+ pnameSuffix = "-test";
+ buildPhaseCargoCommand = "";
+ checkPhaseCargoCommand = ''
+ make ''${enableParallelChecking:+-j''${NIX_BUILD_CORES}} $makeFlags libbcachefs.a
+ cargo test --profile release -- --nocapture
+ '';
+ }
+ );
+
+ devShells.default = pkgs.mkShell {
+ inputsFrom = [
+ config.packages.default
+ config.treefmt.build.devShell
+ ];
+
+ # here go packages that aren't required for builds but are used for
+ # development, and might need to be version matched with build
+ # dependencies (e.g. clippy or rust-analyzer).
+ packages = with pkgs; [
+ bear
+ cargo-audit
+ cargo-outdated
+ clang-tools
+ clippy
+ rust-analyzer
+ rustc
+ ];
+ };
+
+ treefmt.config = {
+ projectRootFile = "flake.nix";
+ flakeCheck = false;
+
+ programs = {
+ nixfmt.enable = true;
+ rustfmt.edition = rustfmtToml.edition;
+ rustfmt.enable = true;
+ rustfmt.package = fenix.packages.${system}.default.rustfmt;
+ };
+ };
+ };
+ };
+}
diff --git a/fsck.bcachefs b/fsck.bcachefs
deleted file mode 100755
index f8de4a8c..00000000
--- a/fsck.bcachefs
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/sh
-
-SDIR="$(readlink -f "$0")"
-exec "${SDIR%/*}/bcachefs" fsck "$@"
diff --git a/fsck/.gitignore b/fsck/.gitignore
new file mode 100644
index 00000000..5be35c9e
--- /dev/null
+++ b/fsck/.gitignore
@@ -0,0 +1,2 @@
+*.service
+bcachefsck_all
\ No newline at end of file
diff --git a/fsck/bcachefsck@.service.in b/fsck/bcachefsck@.service.in
new file mode 100644
index 00000000..86c1824c
--- /dev/null
+++ b/fsck/bcachefsck@.service.in
@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2023-2024 Oracle. All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+[Unit]
+Description=Online bcachefsck for %f
+OnFailure=bcachefsck_fail@%i.service
+Documentation=man:bcachefs(8)
+
+# Explicitly require the capabilities that this program needs
+ConditionCapability=CAP_SYS_ADMIN
+ConditionCapability=CAP_FOWNER
+ConditionCapability=CAP_DAC_OVERRIDE
+ConditionCapability=CAP_DAC_READ_SEARCH
+ConditionCapability=CAP_SYS_RAWIO
+
+# Must be a mountpoint
+ConditionPathIsMountPoint=%f
+RequiresMountsFor=%f
+
+[Service]
+Type=oneshot
+Environment=SERVICE_MODE=1
+ExecStart=bcachefs fsck --real-mountpoint /tmp/scrub/ @bcachefsck_args@ %f
+SyslogIdentifier=%N
+
+# Run scrub with minimal CPU and IO priority so that nothing else will starve.
+IOSchedulingClass=idle
+CPUSchedulingPolicy=idle
+CPUAccounting=true
+Nice=19
+
+# Create the service underneath the background service slice so that we can
+# control resource usage.
+Slice=system-bcachefsck.slice
+
+# No realtime CPU scheduling
+RestrictRealtime=true
+
+# Dynamically create a user that isn't root
+DynamicUser=true
+
+# Make the entire filesystem readonly and /home inaccessible, then bind mount
+# the filesystem we're supposed to be checking into our private /tmp dir.
+# 'norbind' means that we don't bind anything under that original mount.
+# This enables checking filesystems mounted under /tmp in the global mount
+# namespace.
+ProtectSystem=strict
+ProtectHome=yes
+PrivateTmp=true
+BindPaths=%f:/tmp/scrub:norbind
+
+# No network access
+PrivateNetwork=true
+ProtectHostname=true
+RestrictAddressFamilies=none
+IPAddressDeny=any
+
+# Don't let the program mess with the kernel configuration at all
+ProtectKernelLogs=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectControlGroups=true
+ProtectProc=invisible
+RestrictNamespaces=true
+
+# Hide everything in /proc, even /proc/mounts
+ProcSubset=pid
+
+# Only allow the default personality Linux
+LockPersonality=true
+
+# No writable memory pages
+MemoryDenyWriteExecute=true
+
+# Don't let our mounts leak out to the host
+PrivateMounts=true
+
+# Restrict system calls to the native arch and only enough to get things going
+SystemCallArchitectures=native
+SystemCallFilter=@system-service
+SystemCallFilter=~@privileged
+SystemCallFilter=~@resources
+SystemCallFilter=~@mount
+
+# bcachefsck needs these privileges to run, and no others
+CapabilityBoundingSet=CAP_SYS_ADMIN CAP_FOWNER CAP_DAC_OVERRIDE CAP_DAC_READ_SEARCH CAP_SYS_RAWIO
+AmbientCapabilities=CAP_SYS_ADMIN CAP_FOWNER CAP_DAC_OVERRIDE CAP_DAC_READ_SEARCH CAP_SYS_RAWIO
+NoNewPrivileges=true
+
+# bcachefsck doesn't create files
+UMask=7777
+
+# No access to hardware /dev files except for block devices
+ProtectClock=true
+DevicePolicy=closed
+DeviceAllow=block-*
diff --git a/fsck/bcachefsck_all.in b/fsck/bcachefsck_all.in
new file mode 100644
index 00000000..4f6031eb
--- /dev/null
+++ b/fsck/bcachefsck_all.in
@@ -0,0 +1,481 @@
+#!/usr/bin/python3
+
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (C) 2023-2024 Oracle. All rights reserved.
+#
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+# Run bcachefsck in parallel, but avoid thrashing.
+
+import subprocess
+import json
+import threading
+import time
+import sys
+import os
+import argparse
+import signal
+import dbus
+from io import TextIOWrapper
+from pathlib import Path
+from datetime import timedelta
+from datetime import datetime
+from datetime import timezone
+
+retcode = 0
+terminate = False
+debug = False
+
+def DEVNULL():
+ '''Return /dev/null in subprocess writable format.'''
+ try:
+ from subprocess import DEVNULL
+ return DEVNULL
+ except ImportError:
+ return open(os.devnull, 'wb')
+
+def find_mounts():
+ '''Map mountpoints to physical disks.'''
+ def find_bcachefs_mounts(bdev, fs, lastdisk):
+ '''Attach all lastdisk to each fs found under bdev.'''
+ if bdev['fstype'] == 'bcachefs' and bdev['mountpoint'] is not None:
+ mnt = bdev['mountpoint']
+ if mnt in fs:
+			fs[mnt].update(lastdisk.split(':'))
+ else:
+ fs[mnt] = set(lastdisk.split(':'))
+ if 'children' not in bdev:
+ return
+ for child in bdev['children']:
+ find_bcachefs_mounts(child, fs, lastdisk)
+
+ fs = {}
+ cmd=['lsblk', '-o', 'NAME,KNAME,TYPE,FSTYPE,MOUNTPOINT', '-J']
+ result = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+ result.wait()
+ if result.returncode != 0:
+ return fs
+ sarray = [x.decode(sys.stdout.encoding) for x in result.stdout.readlines()]
+ output = ' '.join(sarray)
+ bdevdata = json.loads(output)
+
+ # The lsblk output had better be in disks-then-partitions order
+ for bdev in bdevdata['blockdevices']:
+ lastdisk = bdev['kname']
+ find_bcachefs_mounts(bdev, fs, lastdisk)
+
+ return fs
+
+def backtick(cmd):
+ '''Generator function that yields lines of a program's stdout.'''
+ p = subprocess.Popen(cmd, stdout = subprocess.PIPE)
+ for line in TextIOWrapper(p.stdout, encoding="utf-8"):
+ yield line.strip()
+
+def remove_killfunc(killfuncs, fn):
+ '''Ensure fn is not in killfuncs.'''
+ try:
+ killfuncs.remove(fn)
+ except:
+ pass
+
+class scrub_control(object):
+ '''Control object for bcachefsck.'''
+ def __init__(self):
+ pass
+
+ def start(self):
+ '''Start scrub and wait for it to complete. Returns -1 if the
+ service was not started, 0 if it succeeded, or 1 if it
+ failed.'''
+ assert False
+
+ def stop(self):
+ '''Stop scrub.'''
+ assert False
+
+class scrub_subprocess(scrub_control):
+ '''Control object for bcachefsck subprocesses.'''
+ def __init__(self, mnt):
+ cmd = ['bcachefs', 'fsck']
+ cmd += '@bcachefsck_args@'.split()
+ cmd += [mnt]
+ self.cmdline = cmd
+ self.proc = None
+
+ def start(self):
+ '''Start bcachefsck and wait for it to complete. Returns -1 if
+ the service was not started, 0 if it succeeded, or 1 if it
+ failed.'''
+ global debug
+
+ if debug:
+ print('run ', ' '.join(self.cmdline))
+
+ try:
+ self.proc = subprocess.Popen(self.cmdline)
+ self.proc.wait()
+ except:
+ return -1
+
+ proc = self.proc
+ self.proc = None
+ return proc.returncode
+
+ def stop(self):
+ '''Stop bcachefsck.'''
+ global debug
+
+ if debug:
+ print('kill ', ' '.join(self.cmdline))
+ if self.proc is not None:
+ self.proc.terminate()
+
+def run_subprocess(mnt, killfuncs):
+ '''Run a killable program. Returns program retcode or -1 if we can't
+ start it.'''
+ try:
+ p = scrub_subprocess(mnt)
+ killfuncs.add(p.stop)
+ ret = p.start()
+ remove_killfunc(killfuncs, p.stop)
+ return ret
+ except:
+ return -1
+
+# systemd doesn't like unit instance names with slashes in them, so it
+# replaces them with dashes when it invokes the service. Filesystem paths
+# need a special --path argument so that dashes do not get mangled.
+def path_to_serviceunit(path):
+ '''Convert a pathname into a systemd service unit name.'''
+
+ svcname = 'bcachefsck@.service'
+ cmd = ['systemd-escape', '--template', svcname, '--path', path]
+
+ proc = subprocess.Popen(cmd, stdout = subprocess.PIPE)
+ proc.wait()
+ for line in proc.stdout:
+ return line.decode(sys.stdout.encoding).strip()
+
+def fibonacci(max_ret):
+ '''Yield fibonacci sequence up to but not including max_ret.'''
+ if max_ret < 1:
+ return
+
+ x = 0
+ y = 1
+ yield 1
+
+ z = x + y
+ while z <= max_ret:
+ yield z
+ x = y
+ y = z
+ z = x + y
+
+class scrub_service(scrub_control):
+ '''Control object for bcachefsck systemd service.'''
+ def __init__(self, mnt):
+ self.unitname = path_to_serviceunit(mnt)
+ self.prop = None
+ self.unit = None
+ self.bind()
+
+ def bind(self):
+ '''Bind to the dbus proxy object for this service.'''
+ sysbus = dbus.SystemBus()
+ systemd1 = sysbus.get_object('org.freedesktop.systemd1',
+ '/org/freedesktop/systemd1')
+ manager = dbus.Interface(systemd1,
+ 'org.freedesktop.systemd1.Manager')
+ path = manager.LoadUnit(self.unitname)
+
+ svc_obj = sysbus.get_object('org.freedesktop.systemd1', path)
+ self.prop = dbus.Interface(svc_obj,
+ 'org.freedesktop.DBus.Properties')
+ self.unit = dbus.Interface(svc_obj,
+ 'org.freedesktop.systemd1.Unit')
+
+ def __dbusrun(self, lambda_fn):
+ '''Call the lambda function to execute something on dbus. dbus
+ exceptions result in retries with Fibonacci backoff, and the
+ bindings will be rebuilt every time.'''
+ global debug
+
+ fatal_ex = None
+
+ for i in fibonacci(30):
+ try:
+ return lambda_fn()
+ except dbus.exceptions.DBusException as e:
+ if debug:
+ print(e)
+ fatal_ex = e
+ time.sleep(i)
+ self.bind()
+ raise fatal_ex
+
+ def state(self):
+ '''Retrieve the active state for a systemd service. As of
+ systemd 249, this is supposed to be one of the following:
+ "active", "reloading", "inactive", "failed", "activating",
+ or "deactivating". These strings are not localized.'''
+ global debug
+
+ l = lambda: self.prop.Get('org.freedesktop.systemd1.Unit',
+ 'ActiveState')
+ try:
+ return self.__dbusrun(l)
+ except Exception as e:
+ if debug:
+ print(e, file = sys.stderr)
+ return 'failed'
+
+ def wait(self, interval = 1):
+ '''Wait until the service finishes.'''
+ global debug
+
+ # Use a poll/sleep loop to wait for the service to finish.
+ # Avoid adding a dependency on python3 glib, which is required
+ # to use an event loop to receive a dbus signal.
+ s = self.state()
+ while s not in ['failed', 'inactive']:
+ if debug:
+ print('waiting %s %s' % (self.unitname, s))
+ time.sleep(interval)
+ s = self.state()
+ if debug:
+ print('waited %s %s' % (self.unitname, s))
+ if s == 'failed':
+ return 1
+ return 0
+
+ def start(self):
+ '''Start the service and wait for it to complete. Returns -1
+ if the service was not started, 0 if it succeeded, or 1 if it
+ failed.'''
+ global debug
+
+ if debug:
+ print('starting %s' % self.unitname)
+
+ try:
+ self.__dbusrun(lambda: self.unit.Start('replace'))
+ return self.wait()
+ except Exception as e:
+ print(e, file = sys.stderr)
+ return -1
+
+ def stop(self):
+ '''Stop the service.'''
+ global debug
+
+ if debug:
+ print('stopping %s' % self.unitname)
+
+ try:
+ self.__dbusrun(lambda: self.unit.Stop('replace'))
+ return self.wait()
+ except Exception as e:
+ print(e, file = sys.stderr)
+ return -1
+
+def run_service(mnt, killfuncs):
+ '''Run scrub as a service.'''
+ try:
+ svc = scrub_service(mnt)
+ except:
+ return -1
+
+ killfuncs.add(svc.stop)
+ retcode = svc.start()
+ remove_killfunc(killfuncs, svc.stop)
+ return retcode
+
+def run_scrub(mnt, cond, running_devs, mntdevs, killfuncs):
+ '''Run a scrub process.'''
+ global retcode, terminate
+
+ print("Scrubbing %s..." % mnt)
+ sys.stdout.flush()
+
+ try:
+ if terminate:
+ return
+
+ # Run per-mount systemd bcachefsck service only if we ourselves
+ # are running as a systemd service.
+ if 'SERVICE_MODE' in os.environ:
+ ret = run_service(mnt, killfuncs)
+ if ret == 0 or ret == 1:
+ print("Scrubbing %s done, (err=%d)" % (mnt, ret))
+ sys.stdout.flush()
+ retcode |= ret
+ return
+
+ if terminate:
+ return
+
+ # Invoke bcachefsck manually if we're running in the foreground.
+ # We also permit this if we're running as a cronjob where
+ # systemd services are unavailable.
+ ret = run_subprocess(mnt, killfuncs)
+ if ret >= 0:
+ print("Scrubbing %s done, (err=%d)" % (mnt, ret))
+ sys.stdout.flush()
+ retcode |= ret
+ return
+
+ if terminate:
+ return
+
+ print("Unable to start scrub tool.")
+ sys.stdout.flush()
+ finally:
+ running_devs -= mntdevs
+ cond.acquire()
+ cond.notify()
+ cond.release()
+
+def signal_scrubs(signum, cond):
+ '''Handle termination signals by killing bcachefsck children.'''
+ global debug, terminate
+
+ if debug:
+ print('Signal handler called with signal', signum)
+ sys.stdout.flush()
+
+ terminate = True
+ cond.acquire()
+ cond.notify()
+ cond.release()
+
+def wait_for_termination(cond, killfuncs):
+ '''Wait for a child thread to terminate. Returns True if we should
+ abort the program, False otherwise.'''
+ global debug, terminate
+
+ if debug:
+ print('waiting for threads to terminate')
+ sys.stdout.flush()
+
+ cond.acquire()
+ try:
+ cond.wait()
+ except KeyboardInterrupt:
+ terminate = True
+ cond.release()
+
+ if not terminate:
+ return False
+
+ print("Terminating...")
+ sys.stdout.flush()
+ while len(killfuncs) > 0:
+ fn = killfuncs.pop()
+ fn()
+ return True
+
+def scan_interval(string):
+ '''Convert a textual scan interval argument into a time delta.'''
+
+ if string.endswith('y'):
+ year = timedelta(seconds = 31556952)
+ return year * float(string[:-1])
+ if string.endswith('q'):
+ return timedelta(days = 90 * float(string[:-1]))
+ if string.endswith('mo'):
+ return timedelta(days = 30 * float(string[:-2]))
+ if string.endswith('w'):
+ return timedelta(weeks = float(string[:-1]))
+ if string.endswith('d'):
+ return timedelta(days = float(string[:-1]))
+ if string.endswith('h'):
+ return timedelta(hours = float(string[:-1]))
+ if string.endswith('m'):
+ return timedelta(minutes = float(string[:-1]))
+ if string.endswith('s'):
+ return timedelta(seconds = float(string[:-1]))
+ return timedelta(seconds = int(string))
+
+def utcnow():
+ '''Create a representation of the time right now, in UTC.'''
+
+ dt = datetime.utcnow()
+ return dt.replace(tzinfo = timezone.utc)
+
+def main():
+ '''Find mounts, schedule bcachefsck runs.'''
+ def thr(mnt, devs):
+ a = (mnt, cond, running_devs, devs, killfuncs)
+ thr = threading.Thread(target = run_scrub, args = a)
+ thr.start()
+ global retcode, terminate, debug
+
+ parser = argparse.ArgumentParser( \
+ description = "Scrub all mounted bcachefs filesystems.")
+ parser.add_argument("--debug", help = "Enabling debugging messages.", \
+ action = "store_true")
+ args = parser.parse_args()
+
+ if args.debug:
+ debug = True
+
+ fs = find_mounts()
+
+ # Schedule scrub jobs...
+ running_devs = set()
+ killfuncs = set()
+ cond = threading.Condition()
+
+ signal.signal(signal.SIGINT, lambda s, f: signal_scrubs(s, cond))
+ signal.signal(signal.SIGTERM, lambda s, f: signal_scrubs(s, cond))
+
+ while len(fs) > 0:
+ if len(running_devs) == 0:
+ mnt, devs = fs.popitem()
+ running_devs.update(devs)
+ thr(mnt, devs)
+ poppers = set()
+ for mnt in fs:
+ devs = fs[mnt]
+ can_run = True
+ for dev in devs:
+ if dev in running_devs:
+ can_run = False
+ break
+ if can_run:
+ running_devs.update(devs)
+ poppers.add(mnt)
+ thr(mnt, devs)
+ for p in poppers:
+ fs.pop(p)
+
+ # Wait for one thread to finish
+ if wait_for_termination(cond, killfuncs):
+ break
+
+ # Wait for the rest of the threads to finish
+ while len(killfuncs) > 0:
+ wait_for_termination(cond, killfuncs)
+
+ # If we're being run as a service, the return code must fit the LSB
+ # init script action error guidelines, which is to say that we compress
+ # all errors to 1 ("generic or unspecified error", LSB 5.0 section
+ # 22.2) and hope the admin will scan the log for what actually
+ # happened.
+ #
+ # We have to sleep 2 seconds here because journald uses the pid to
+ # connect our log messages to the systemd service. This is critical
+ # for capturing all the log messages if the scrub fails, because the
+ # fail service uses the service name to gather log messages for the
+ # error report.
+ if 'SERVICE_MODE' in os.environ:
+ time.sleep(2)
+ if retcode != 0:
+ retcode = 1
+
+ sys.exit(retcode)
+
+if __name__ == '__main__':
+ main()
diff --git a/fsck/bcachefsck_all.service.in b/fsck/bcachefsck_all.service.in
new file mode 100644
index 00000000..b5718263
--- /dev/null
+++ b/fsck/bcachefsck_all.service.in
@@ -0,0 +1,84 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2023-2024 Oracle. All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+[Unit]
+Description=Online bcachefsck for All Filesystems
+OnFailure=bcachefsck_all_fail.service
+ConditionACPower=true
+Documentation=man:bcachefsck_all(8)
+After=paths.target multi-user.target network.target network-online.target systemd-networkd.service NetworkManager.service connman.service
+
+[Service]
+Type=oneshot
+Environment=SERVICE_MODE=1
+ExecStart=@libexecdir@/bcachefsck_all
+SyslogIdentifier=bcachefsck_all
+
+# Create the service underneath the scrub background service slice so that we
+# can control resource usage.
+Slice=system-bcachefsck.slice
+
+# Run bcachefsck_all with minimal CPU and IO priority so that nothing else will starve.
+IOSchedulingClass=idle
+CPUSchedulingPolicy=idle
+CPUAccounting=true
+Nice=19
+
+# No realtime scheduling
+RestrictRealtime=true
+
+# No special privileges, but we still have to run as root so that we can
+# contact the service manager to start the sub-units.
+CapabilityBoundingSet=
+NoNewPrivileges=true
+RestrictSUIDSGID=true
+
+# Make the entire filesystem readonly except for the media scan stamp file
+# directory. We don't want to hide anything because we need to find all
+# mounted bcachefs filesystems in the host.
+ProtectSystem=strict
+ProtectHome=read-only
+PrivateTmp=false
+
+# No network access except to the systemd control socket
+PrivateNetwork=true
+ProtectHostname=true
+RestrictAddressFamilies=AF_UNIX
+IPAddressDeny=any
+
+# Don't let the program mess with the kernel configuration at all
+ProtectKernelLogs=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectControlGroups=true
+ProtectProc=invisible
+RestrictNamespaces=true
+
+# Hide everything in /proc, even /proc/mounts
+ProcSubset=pid
+
+# Only allow the default personality Linux
+LockPersonality=true
+
+# No writable memory pages
+MemoryDenyWriteExecute=true
+
+# Don't let our mounts leak out to the host
+PrivateMounts=true
+
+# Restrict system calls to the native arch and only enough to get things going
+SystemCallArchitectures=native
+SystemCallFilter=@system-service
+SystemCallFilter=~@privileged
+SystemCallFilter=~@resources
+SystemCallFilter=~@mount
+
+# Media scan stamp file shouldn't be readable by regular users
+UMask=0077
+
+# lsblk ignores mountpoints if it can't find the device files, so we cannot
+# hide them
+#ProtectClock=true
+#PrivateDevices=true
diff --git a/fsck/bcachefsck_all.timer b/fsck/bcachefsck_all.timer
new file mode 100644
index 00000000..65470d40
--- /dev/null
+++ b/fsck/bcachefsck_all.timer
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2023-2024 Oracle. All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+[Unit]
+Description=Periodic bcachefsck for All Filesystems
+
+[Timer]
+# Run on Sunday at 3:10am, to avoid running afoul of DST changes
+OnCalendar=Sun *-*-* 03:10:00
+RandomizedDelaySec=60
+Persistent=true
+
+[Install]
+WantedBy=timers.target
diff --git a/fsck/bcachefsck_all_fail.service.in b/fsck/bcachefsck_all_fail.service.in
new file mode 100644
index 00000000..fe584514
--- /dev/null
+++ b/fsck/bcachefsck_all_fail.service.in
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2023-2024 Oracle. All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+[Unit]
+Description=Online bcachefsck for All Filesystems Failure Reporting
+Documentation=man:bcachefsck_all(8)
+
+[Service]
+Type=oneshot
+Environment=EMAIL_ADDR=root
+ExecStart=@libexecdir@/bcachefsck_fail "${EMAIL_ADDR}" bcachefsck_all
+User=mail
+Group=mail
+SupplementaryGroups=systemd-journal
+
+# No realtime scheduling
+RestrictRealtime=true
+
+# Make the entire filesystem readonly and /home inaccessible.
+ProtectSystem=full
+ProtectHome=yes
+PrivateTmp=true
+RestrictSUIDSGID=true
+
+# Emailing reports requires network access, but not the ability to change the
+# hostname.
+ProtectHostname=true
+
+# Don't let the program mess with the kernel configuration at all
+ProtectKernelLogs=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectControlGroups=true
+ProtectProc=invisible
+RestrictNamespaces=true
+
+# Can't hide /proc because journalctl needs it to find various pieces of log
+# information
+#ProcSubset=pid
+
+# Only allow the default personality Linux
+LockPersonality=true
+
+# No writable memory pages
+MemoryDenyWriteExecute=true
+
+# Don't let our mounts leak out to the host
+PrivateMounts=true
+
+# Restrict system calls to the native arch and only enough to get things going
+SystemCallArchitectures=native
+SystemCallFilter=@system-service
+SystemCallFilter=~@privileged
+SystemCallFilter=~@resources
+SystemCallFilter=~@mount
+
+# Failure reporting needs no extra privileges to run
+CapabilityBoundingSet=
+NoNewPrivileges=true
+
+# Failure reporting shouldn't create world-readable files
+UMask=0077
+
+# Clean up any IPC objects when this unit stops
+RemoveIPC=true
+
+# No access to hardware device files
+PrivateDevices=true
+ProtectClock=true
diff --git a/fsck/bcachefsck_fail b/fsck/bcachefsck_fail
new file mode 100755
index 00000000..283cee70
--- /dev/null
+++ b/fsck/bcachefsck_fail
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2023-2024 Oracle. All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+# Email logs of failed bcachefsck and bcachefsck_all unit runs
+
+recipient="$1"
+test -z "${recipient}" && exit 0
+service="$2"
+test -z "${service}" && exit 0
+mntpoint="$3"
+
+hostname="$(hostname -f 2>/dev/null)"
+test -z "${hostname}" && hostname="${HOSTNAME}"
+
+mailer="$(command -v sendmail)"
+if [ ! -x "${mailer}" ]; then
+ echo "${mailer}: Mailer program not found."
+ exit 1
+fi
+
+fail_mail_mntpoint() {
+ local scrub_svc
+
+ # Turn the mountpoint into a properly escaped systemd instance name
+ scrub_svc="$(systemd-escape --template "${service}@.service" --path "${mntpoint}")"
+ cat << ENDL
+To: ${recipient}
+From: <${service}@${hostname}>
+Subject: ${service} failure on ${mntpoint}
+Content-Transfer-Encoding: 8bit
+Content-Type: text/plain; charset=UTF-8
+
+So sorry, the automatic ${service} of ${mntpoint} on ${hostname} failed.
+Please do not reply to this message.
+
+A log of what happened follows:
+ENDL
+ systemctl status --full --lines 4294967295 "${scrub_svc}"
+}
+
+fail_mail() {
+ cat << ENDL
+To: ${recipient}
+From: <${service}@${hostname}>
+Subject: ${service} failure
+
+So sorry, the automatic ${service} on ${hostname} failed.
+
+A log of what happened follows:
+ENDL
+ systemctl status --full --lines 4294967295 "${service}"
+}
+
+if [ -n "${mntpoint}" ]; then
+ fail_mail_mntpoint | "${mailer}" -t -i
+else
+ fail_mail | "${mailer}" -t -i
+fi
+exit "${PIPESTATUS[1]}"
diff --git a/fsck/bcachefsck_fail@.service.in b/fsck/bcachefsck_fail@.service.in
new file mode 100644
index 00000000..8d5ed4c6
--- /dev/null
+++ b/fsck/bcachefsck_fail@.service.in
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2023-2024 Oracle. All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+[Unit]
+Description=Online bcachefsck Failure Reporting for %f
+Documentation=man:bcachefs(8)
+
+[Service]
+Type=oneshot
+Environment=EMAIL_ADDR=root
+ExecStart=@libexecdir@/bcachefsck_fail "${EMAIL_ADDR}" bcachefs %f
+User=mail
+Group=mail
+SupplementaryGroups=systemd-journal
+
+# Create the service underneath the background service slice so that we can
+# control resource usage.
+Slice=system-bcachefsck.slice
+
+# No realtime scheduling
+RestrictRealtime=true
+
+# Make the entire filesystem readonly and /home inaccessible.
+ProtectSystem=full
+ProtectHome=yes
+PrivateTmp=true
+RestrictSUIDSGID=true
+
+# Emailing reports requires network access, but not the ability to change the
+# hostname.
+ProtectHostname=true
+
+# Don't let the program mess with the kernel configuration at all
+ProtectKernelLogs=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectControlGroups=true
+ProtectProc=invisible
+RestrictNamespaces=true
+
+# Can't hide /proc because journalctl needs it to find various pieces of log
+# information
+#ProcSubset=pid
+
+# Only allow the default personality Linux
+LockPersonality=true
+
+# No writable memory pages
+MemoryDenyWriteExecute=true
+
+# Don't let our mounts leak out to the host
+PrivateMounts=true
+
+# Restrict system calls to the native arch and only enough to get things going
+SystemCallArchitectures=native
+SystemCallFilter=@system-service
+SystemCallFilter=~@privileged
+SystemCallFilter=~@resources
+SystemCallFilter=~@mount
+
+# Failure reporting needs no extra privileges to run
+CapabilityBoundingSet=
+NoNewPrivileges=true
+
+# Failure reporting shouldn't create world-readable files
+UMask=0077
+
+# Clean up any IPC objects when this unit stops
+RemoveIPC=true
+
+# No access to hardware device files
+PrivateDevices=true
+ProtectClock=true
diff --git a/fsck/system-bcachefsck.slice b/fsck/system-bcachefsck.slice
new file mode 100644
index 00000000..ea368032
--- /dev/null
+++ b/fsck/system-bcachefsck.slice
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2023-2024 Oracle. All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+[Unit]
+Description=bcachefsck background service slice
+Before=slices.target
+
+[Slice]
+
+# If the CPU usage cgroup controller is available, don't use more than 60% of a
+# single core for all background processes.
+CPUQuota=60%
+CPUAccounting=true
+
+[Install]
+# As of systemd 249, the systemd cgroupv2 configuration code will drop resource
+# controllers from the root and system.slice cgroups at startup if it doesn't
+# find any direct dependencies that require a given controller. Newly
+# activated units with resource control directives are created under the system
+# slice but do not cause a reconfiguration of the slice's resource controllers.
+# Hence we cannot put CPUQuota= into the bcachefsck service units directly.
+#
+# For the CPUQuota directive to have any effect, we must therefore create an
+# explicit definition file for the slice that systemd creates to contain the
+# bcachefsck instance units (e.g. bcachefsck@.service) and we must configure this
+# slice as a dependency of the system slice to establish the direct dependency
+# relation.
+WantedBy=system.slice
diff --git a/include/asm/page.h b/include/asm/page.h
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/include/asm/page.h
diff --git a/include/crypto/chacha20.h b/include/crypto/chacha.h
index 1cdc77ba..f004cfb5 100644
--- a/include/crypto/chacha20.h
+++ b/include/crypto/chacha.h
@@ -8,8 +8,8 @@
#include <linux/types.h>
#include <linux/crypto.h>
-#define CHACHA20_IV_SIZE 16
-#define CHACHA20_KEY_SIZE 32
-#define CHACHA20_BLOCK_SIZE 64
+#define CHACHA_IV_SIZE 16
+#define CHACHA_KEY_SIZE 32
+#define CHACHA_BLOCK_SIZE 64
#endif
diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h
new file mode 100644
index 00000000..8a46202b
--- /dev/null
+++ b/include/crypto/sha2.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common values for SHA algorithms
+ */
+
+#ifndef _CRYPTO_SHA_H
+#define _CRYPTO_SHA_H
+
+#include <linux/types.h>
+
+#define SHA1_DIGEST_SIZE 20
+#define SHA1_BLOCK_SIZE 64
+
+#define SHA224_DIGEST_SIZE 28
+#define SHA224_BLOCK_SIZE 64
+
+#define SHA256_DIGEST_SIZE 32
+#define SHA256_BLOCK_SIZE 64
+
+#define SHA384_DIGEST_SIZE 48
+#define SHA384_BLOCK_SIZE 128
+
+#define SHA512_DIGEST_SIZE 64
+#define SHA512_BLOCK_SIZE 128
+
+#define SHA1_H0 0x67452301UL
+#define SHA1_H1 0xefcdab89UL
+#define SHA1_H2 0x98badcfeUL
+#define SHA1_H3 0x10325476UL
+#define SHA1_H4 0xc3d2e1f0UL
+
+#define SHA224_H0 0xc1059ed8UL
+#define SHA224_H1 0x367cd507UL
+#define SHA224_H2 0x3070dd17UL
+#define SHA224_H3 0xf70e5939UL
+#define SHA224_H4 0xffc00b31UL
+#define SHA224_H5 0x68581511UL
+#define SHA224_H6 0x64f98fa7UL
+#define SHA224_H7 0xbefa4fa4UL
+
+#define SHA256_H0 0x6a09e667UL
+#define SHA256_H1 0xbb67ae85UL
+#define SHA256_H2 0x3c6ef372UL
+#define SHA256_H3 0xa54ff53aUL
+#define SHA256_H4 0x510e527fUL
+#define SHA256_H5 0x9b05688cUL
+#define SHA256_H6 0x1f83d9abUL
+#define SHA256_H7 0x5be0cd19UL
+
+#define SHA384_H0 0xcbbb9d5dc1059ed8ULL
+#define SHA384_H1 0x629a292a367cd507ULL
+#define SHA384_H2 0x9159015a3070dd17ULL
+#define SHA384_H3 0x152fecd8f70e5939ULL
+#define SHA384_H4 0x67332667ffc00b31ULL
+#define SHA384_H5 0x8eb44a8768581511ULL
+#define SHA384_H6 0xdb0c2e0d64f98fa7ULL
+#define SHA384_H7 0x47b5481dbefa4fa4ULL
+
+#define SHA512_H0 0x6a09e667f3bcc908ULL
+#define SHA512_H1 0xbb67ae8584caa73bULL
+#define SHA512_H2 0x3c6ef372fe94f82bULL
+#define SHA512_H3 0xa54ff53a5f1d36f1ULL
+#define SHA512_H4 0x510e527fade682d1ULL
+#define SHA512_H5 0x9b05688c2b3e6c1fULL
+#define SHA512_H6 0x1f83d9abfb41bd6bULL
+#define SHA512_H7 0x5be0cd19137e2179ULL
+
+extern const u8 sha1_zero_message_hash[SHA1_DIGEST_SIZE];
+
+extern const u8 sha224_zero_message_hash[SHA224_DIGEST_SIZE];
+
+extern const u8 sha256_zero_message_hash[SHA256_DIGEST_SIZE];
+
+extern const u8 sha384_zero_message_hash[SHA384_DIGEST_SIZE];
+
+extern const u8 sha512_zero_message_hash[SHA512_DIGEST_SIZE];
+
+struct sha1_state {
+ u32 state[SHA1_DIGEST_SIZE / 4];
+ u64 count;
+ u8 buffer[SHA1_BLOCK_SIZE];
+};
+
+struct sha256_state {
+ u32 state[SHA256_DIGEST_SIZE / 4];
+ u64 count;
+ u8 buf[SHA256_BLOCK_SIZE];
+};
+
+struct sha512_state {
+ u64 state[SHA512_DIGEST_SIZE / 8];
+ u64 count[2];
+ u8 buf[SHA512_BLOCK_SIZE];
+};
+
+struct shash_desc;
+
+extern int crypto_sha1_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len);
+
+extern int crypto_sha1_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *hash);
+
+extern int crypto_sha256_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len);
+
+extern int crypto_sha256_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *hash);
+
+extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len);
+
+extern int crypto_sha512_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *hash);
+#endif
diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h
index c9e887c9..833729dc 100644
--- a/include/crypto/skcipher.h
+++ b/include/crypto/skcipher.h
@@ -36,14 +36,29 @@ struct crypto_skcipher {
struct crypto_tfm base;
};
+struct crypto_sync_skcipher {
+ struct crypto_skcipher base;
+};
+
struct crypto_skcipher *crypto_alloc_skcipher(const char *alg_name,
u32 type, u32 mask);
+static inline struct crypto_sync_skcipher *
+crypto_alloc_sync_skcipher(const char *alg_name, u32 type, u32 mask)
+{
+ return (void *) crypto_alloc_skcipher(alg_name, type, mask);
+}
+
static inline void crypto_free_skcipher(struct crypto_skcipher *tfm)
{
kfree(tfm);
}
+static inline void crypto_free_sync_skcipher(struct crypto_sync_skcipher *tfm)
+{
+ crypto_free_skcipher(&tfm->base);
+}
+
struct skcipher_request {
unsigned cryptlen;
u8 *iv;
@@ -54,9 +69,14 @@ struct skcipher_request {
struct crypto_tfm *tfm;
};
-#define SKCIPHER_REQUEST_ON_STACK(name, tfm) \
- struct skcipher_request __##name##_desc; \
- struct skcipher_request *name = &__##name##_desc
+#define MAX_SYNC_SKCIPHER_REQSIZE 384
+#define SYNC_SKCIPHER_REQUEST_ON_STACK(name, tfm) \
+ char __##name##_desc[sizeof(struct skcipher_request) + \
+ MAX_SYNC_SKCIPHER_REQSIZE + \
+ (!(sizeof((struct crypto_sync_skcipher *)1 == \
+ (typeof(tfm))1))) \
+ ] CRYPTO_MINALIGN_ATTR; \
+ struct skcipher_request *name = (void *)__##name##_desc
static inline int crypto_skcipher_setkey(struct crypto_skcipher *tfm,
const u8 *key, unsigned int keylen)
@@ -86,6 +106,14 @@ static inline void skcipher_request_set_tfm(struct skcipher_request *req,
req->tfm = &tfm->base;
}
+static inline void skcipher_request_set_sync_tfm(struct skcipher_request *req,
+ struct crypto_sync_skcipher *tfm)
+{
+ skcipher_request_set_tfm(req, &tfm->base);
+}
+
+#define skcipher_request_set_callback(...) do {} while (0)
+
static inline void skcipher_request_set_crypt(
struct skcipher_request *req,
struct scatterlist *src, struct scatterlist *dst,
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 7471bd97..c594ff8b 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -26,17 +26,32 @@ typedef struct {
#define __ATOMIC_READ(p) uatomic_read(p)
#define __ATOMIC_SET(p, v) uatomic_set(p, v)
+#define __ATOMIC_SET_RELEASE(p, v) uatomic_set(p, v)
#define __ATOMIC_ADD_RETURN(v, p) uatomic_add_return(p, v)
#define __ATOMIC_SUB_RETURN(v, p) uatomic_sub_return(p, v)
#define __ATOMIC_ADD(v, p) uatomic_add(p, v)
#define __ATOMIC_SUB(v, p) uatomic_sub(p, v)
#define __ATOMIC_INC(p) uatomic_inc(p)
#define __ATOMIC_DEC(p) uatomic_dec(p)
+#define __ATOMIC_AND(v, p) uatomic_and(p, v)
+#define __ATOMIC_OR(v, p) uatomic_or(p, v)
#define xchg(p, v) uatomic_xchg(p, v)
#define xchg_acquire(p, v) uatomic_xchg(p, v)
#define cmpxchg(p, old, new) uatomic_cmpxchg(p, old, new)
#define cmpxchg_acquire(p, old, new) uatomic_cmpxchg(p, old, new)
+#define cmpxchg_release(p, old, new) uatomic_cmpxchg(p, old, new)
+
+#define try_cmpxchg(p, _old, _new) \
+({ \
+ typeof(*(_old)) _v = cmpxchg(p, *(_old), _new); \
+ bool _ret = _v == *(_old); \
+ *(_old) = _v; \
+ _ret; \
+})
+
+#define try_cmpxchg_acquire(p, _old, _new) \
+ try_cmpxchg(p, _old, _new)
#define smp_mb__before_atomic() cmm_smp_mb__before_uatomic_add()
#define smp_mb__after_atomic() cmm_smp_mb__after_uatomic_add()
@@ -44,19 +59,30 @@ typedef struct {
#define smp_rmb() cmm_smp_rmb()
#define smp_mb() cmm_smp_mb()
#define smp_read_barrier_depends() cmm_smp_read_barrier_depends()
+#define smp_acquire__after_ctrl_dep() cmm_smp_mb()
#else /* C11_ATOMICS */
#define __ATOMIC_READ(p) __atomic_load_n(p, __ATOMIC_RELAXED)
#define __ATOMIC_SET(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED)
+#define __ATOMIC_SET_RELEASE(p, v) __atomic_store_n(p, v, __ATOMIC_RELEASE)
#define __ATOMIC_ADD_RETURN(v, p) __atomic_add_fetch(p, v, __ATOMIC_RELAXED)
#define __ATOMIC_ADD_RETURN_RELEASE(v, p) \
__atomic_add_fetch(p, v, __ATOMIC_RELEASE)
#define __ATOMIC_SUB_RETURN(v, p) __atomic_sub_fetch(p, v, __ATOMIC_RELAXED)
+#define __ATOMIC_SUB_RETURN_RELEASE(v, p) \
+ __atomic_sub_fetch(p, v, __ATOMIC_RELEASE)
+#define __ATOMIC_AND(v, p)	__atomic_and_fetch(p, v, __ATOMIC_RELAXED)
+#define __ATOMIC_OR(v, p)	__atomic_or_fetch(p, v, __ATOMIC_RELAXED)
#define xchg(p, v) __atomic_exchange_n(p, v, __ATOMIC_SEQ_CST)
#define xchg_acquire(p, v) __atomic_exchange_n(p, v, __ATOMIC_ACQUIRE)
+#define try_cmpxchg(p, old, new) \
+	__atomic_compare_exchange_n((p), (old), new, false, \
+				    __ATOMIC_SEQ_CST, \
+				    __ATOMIC_SEQ_CST)
+
#define cmpxchg(p, old, new) \
({ \
typeof(*(p)) __old = (old); \
@@ -77,6 +103,16 @@ typedef struct {
__old; \
})
+#define cmpxchg_release(p, old, new) \
+({ \
+ typeof(*(p)) __old = (old); \
+ \
+ __atomic_compare_exchange_n((p), &__old, new, false, \
+ __ATOMIC_RELEASE, \
+ __ATOMIC_RELEASE); \
+ __old; \
+})
+
#define smp_mb__before_atomic() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#define smp_mb__after_atomic() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#define smp_wmb() __atomic_thread_fence(__ATOMIC_SEQ_CST)
@@ -112,6 +148,11 @@ do { \
({ smp_mb__before_atomic(); __ATOMIC_ADD_RETURN(i, v); })
#endif
+#ifndef __ATOMIC_SUB_RETURN_RELEASE
+#define __ATOMIC_SUB_RETURN_RELEASE(i, v) \
+ ({ smp_mb__before_atomic(); __ATOMIC_SUB_RETURN(i, v); })
+#endif
+
#ifndef __ATOMIC_SUB
#define __ATOMIC_SUB(i, v) __ATOMIC_SUB_RETURN(i, v)
#endif
@@ -138,11 +179,23 @@ static inline i_type a_type##_read(const a_type##_t *v) \
return __ATOMIC_READ(&v->counter); \
} \
\
+static inline i_type a_type##_read_acquire(const a_type##_t *v) \
+{ \
+ i_type ret = __ATOMIC_READ(&v->counter); \
+ smp_mb__after_atomic(); \
+ return ret; \
+} \
+ \
static inline void a_type##_set(a_type##_t *v, i_type i) \
{ \
return __ATOMIC_SET(&v->counter, i); \
} \
\
+static inline void a_type##_set_release(a_type##_t *v, i_type i) \
+{ \
+ return __ATOMIC_SET_RELEASE(&v->counter, i); \
+} \
+ \
static inline i_type a_type##_add_return(i_type i, a_type##_t *v) \
{ \
return __ATOMIC_ADD_RETURN(i, &v->counter); \
@@ -153,6 +206,11 @@ static inline i_type a_type##_add_return_release(i_type i, a_type##_t *v)\
return __ATOMIC_ADD_RETURN_RELEASE(i, &v->counter); \
} \
\
+static inline i_type a_type##_sub_return_release(i_type i, a_type##_t *v)\
+{ \
+ return __ATOMIC_SUB_RETURN_RELEASE(i, &v->counter); \
+} \
+ \
static inline i_type a_type##_sub_return(i_type i, a_type##_t *v) \
{ \
return __ATOMIC_SUB_RETURN(i, &v->counter); \
@@ -178,6 +236,11 @@ static inline i_type a_type##_dec_return(a_type##_t *v) \
return __ATOMIC_DEC_RETURN(&v->counter); \
} \
\
+static inline i_type a_type##_dec_return_release(a_type##_t *v) \
+{ \
+ return __ATOMIC_SUB_RETURN_RELEASE(1, &v->counter); \
+} \
+ \
static inline void a_type##_inc(a_type##_t *v) \
{ \
__ATOMIC_INC(&v->counter); \
@@ -221,6 +284,16 @@ static inline bool a_type##_inc_not_zero(a_type##_t *v) \
return a_type##_add_unless(v, 1, 0); \
} \
\
+static inline void a_type##_and(i_type a, a_type##_t *v) \
+{ \
+ __ATOMIC_AND(a, v); \
+} \
+ \
+static inline void a_type##_or(i_type a, a_type##_t *v) \
+{ \
+ __ATOMIC_OR(a, v); \
+} \
+ \
static inline i_type a_type##_xchg(a_type##_t *v, i_type i) \
{ \
return xchg(&v->counter, i); \
@@ -231,9 +304,21 @@ static inline i_type a_type##_cmpxchg(a_type##_t *v, i_type old, i_type new)\
return cmpxchg(&v->counter, old, new); \
} \
\
+static inline bool a_type##_try_cmpxchg(a_type##_t *v, i_type *old, i_type new)\
+{ \
+ return try_cmpxchg(&v->counter, old, new); \
+} \
+ \
static inline i_type a_type##_cmpxchg_acquire(a_type##_t *v, i_type old, i_type new)\
{ \
return cmpxchg_acquire(&v->counter, old, new); \
+} \
+ \
+static inline bool a_type##_try_cmpxchg_acquire(a_type##_t *v, i_type *old, i_type new)\
+{ \
+ i_type prev = *old; \
+ *old = cmpxchg_acquire(&v->counter, *old, new); \
+ return prev == *old; \
}
DEF_ATOMIC_OPS(atomic, int)
@@ -243,6 +328,13 @@ DEF_ATOMIC_OPS(atomic_long, long)
DEF_ATOMIC_OPS(atomic64, s64)
#else
s64 atomic64_read(const atomic64_t *v);
+static inline s64 atomic64_read_acquire(const atomic64_t *v)
+{
+ s64 ret = atomic64_read(v);
+ smp_mb__after_atomic();
+ return ret;
+}
+
void atomic64_set(atomic64_t *v, s64);
s64 atomic64_add_return(s64, atomic64_t *);
@@ -252,6 +344,7 @@ void atomic64_sub(s64, atomic64_t *);
s64 atomic64_xchg(atomic64_t *, s64);
s64 atomic64_cmpxchg(atomic64_t *, s64, s64);
+bool atomic64_try_cmpxchg(atomic64_t *, s64 *, s64);
#define atomic64_add_negative(a, v) (atomic64_add_return((a), (v)) < 0)
#define atomic64_inc(v) atomic64_add(1LL, (v))
@@ -274,6 +367,12 @@ static inline s64 atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new)
return atomic64_cmpxchg(v, old, new);
}
+static inline s64 atomic64_sub_return_release(s64 i, atomic64_t *v)
+{
+ smp_mb__before_atomic();
+ return atomic64_sub_return(i, v);
+}
+
#endif
#endif /* __TOOLS_LINUX_ATOMIC_H */
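The try_cmpxchg()/atomic_try_cmpxchg() helpers added above follow the kernel convention: the expected value is passed by pointer and is refreshed with the observed value on failure, so a retry loop needs no separate re-read. A minimal sketch of that pattern against this shim (the helper below is illustrative, not part of the patch):

    #include <stdbool.h>
    #include <linux/atomic.h>

    /* Add @a to @v only if @v is currently nonzero. */
    static bool counter_add_unless_zero(atomic_t *v, int a)
    {
            int old = atomic_read(v);

            do {
                    if (!old)
                            return false;
                    /* on failure, try_cmpxchg writes the observed value back into old */
            } while (!atomic_try_cmpxchg(v, &old, old + a));

            return true;
    }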
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/include/linux/backing-dev-defs.h
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7736198f..b3755406 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -113,8 +113,17 @@ static inline void *bio_data(struct bio *bio)
#define __bio_kunmap_atomic(addr) kunmap_atomic(addr)
-#define bio_for_each_segment_all(bvl, bio, i) \
- for (i = 0, bvl = (bio)->bi_io_vec; i < (bio)->bi_vcnt; i++, bvl++)
+static inline struct bio_vec *bio_next_segment(const struct bio *bio,
+ struct bvec_iter_all *iter)
+{
+ if (iter->idx >= bio->bi_vcnt)
+ return NULL;
+
+ return &bio->bi_io_vec[iter->idx];
+}
+
+#define bio_for_each_segment_all(bvl, bio, iter) \
+ for ((iter).idx = 0; (bvl = bio_next_segment((bio), &(iter))); (iter).idx++)
static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
unsigned bytes)
@@ -136,6 +145,9 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
#define bio_for_each_segment(bvl, bio, iter) \
__bio_for_each_segment(bvl, bio, iter, (bio)->bi_iter)
+#define __bio_for_each_bvec(bvl, bio, iter, start) \
+ __bio_for_each_segment(bvl, bio, iter, start)
+
#define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len)
static inline unsigned bio_segments(struct bio *bio)
@@ -200,23 +212,19 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors,
struct bio_set {
unsigned int front_pad;
+ unsigned int back_pad;
+ mempool_t bio_pool;
+ mempool_t bvec_pool;
};
-static inline void bioset_exit(struct bio_set *bs) {}
static inline void bioset_free(struct bio_set *bs)
{
kfree(bs);
}
-static inline int bioset_init(struct bio_set *bs,
- unsigned pool_size,
- unsigned front_pad,
- int flags)
-{
- bs->front_pad = front_pad;
- return 0;
-}
+void bioset_exit(struct bio_set *);
+int bioset_init(struct bio_set *, unsigned, unsigned, int);
extern struct bio_set *bioset_create(unsigned int, unsigned int);
extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int);
@@ -225,29 +233,26 @@ enum {
BIOSET_NEED_RESCUER = 1 << 1,
};
-extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
+struct bio *bio_alloc_bioset(struct block_device *, unsigned,
+ blk_opf_t, gfp_t, struct bio_set *);
+
extern void bio_put(struct bio *);
-extern void __bio_clone_fast(struct bio *, struct bio *);
-extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
-extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
+int bio_add_page(struct bio *, struct page *, unsigned, unsigned);
-static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
-{
- return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
-}
+struct bio *bio_alloc_clone(struct block_device *, struct bio *,
+ gfp_t, struct bio_set *);
-static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
-{
- return bio_clone_bioset(bio, gfp_mask, NULL);
+struct bio *bio_alloc(struct block_device *, unsigned,
+ blk_opf_t, gfp_t);
-}
+struct bio *bio_kmalloc(unsigned int, gfp_t);
extern void bio_endio(struct bio *);
extern void bio_advance(struct bio *, unsigned);
-extern void bio_reset(struct bio *);
+extern void bio_reset(struct bio *, struct block_device *, unsigned);
void bio_chain(struct bio *, struct bio *);
extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
@@ -414,20 +419,15 @@ static inline void bio_inc_remaining(struct bio *bio)
atomic_inc(&bio->__bi_remaining);
}
-static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
-{
- return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
-}
-
-static inline struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
-{
- return bio_clone_bioset(bio, gfp_mask, NULL);
-}
-
-static inline void bio_init(struct bio *bio, struct bio_vec *table,
- unsigned short max_vecs)
+static inline void bio_init(struct bio *bio,
+ struct block_device *bdev,
+ struct bio_vec *table,
+ unsigned short max_vecs,
+ unsigned int opf)
{
memset(bio, 0, sizeof(*bio));
+ bio->bi_bdev = bdev;
+ bio->bi_opf = opf;
atomic_set(&bio->__bi_remaining, 1);
atomic_set(&bio->__bi_cnt, 1);
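With this rework, bio_for_each_segment_all() takes a struct bvec_iter_all instead of an integer index, and bio_init()/bio_alloc_bioset() carry the block device and opf up front (bio_init(bio, bdev, vecs, nr_vecs, opf)) rather than having callers set them afterwards. A rough sketch of the iterator change (the helper is illustrative, not part of the patch):

    #include <linux/bio.h>

    /* Sum the payload bytes of an already-built bio using the new iterator. */
    static unsigned bio_payload_bytes(struct bio *bio)
    {
            struct bvec_iter_all iter;
            struct bio_vec *bv;
            unsigned bytes = 0;

            bio_for_each_segment_all(bv, bio, iter)
                    bytes += bv->bv_len;

            return bytes;
    }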
diff --git a/include/linux/bit_spinlock.h b/include/linux/bit_spinlock.h
index 0e88820a..873f08c2 100644
--- a/include/linux/bit_spinlock.h
+++ b/include/linux/bit_spinlock.h
@@ -3,38 +3,81 @@
#include <linux/kernel.h>
#include <linux/preempt.h>
-#include <linux/atomic.h>
-#include <linux/bug.h>
+#include <linux/futex.h>
+#include <urcu/futex.h>
-static inline void bit_spin_lock(int bitnum, unsigned long *addr)
+/*
+ * The futex wait op wants an explicit 32-bit address and value. If the bitmap
+ * used for the spinlock is 64-bit, cast down and pass the right 32-bit region
+ * for the in-kernel checks. The value is the copy that has already been read
+ * from the atomic op.
+ *
+ * The futex wake op interprets the value as the number of waiters to wake (up
+ * to INT_MAX), so pass that along directly.
+ */
+static inline void do_futex(int nr, unsigned long *addr, unsigned long v, int futex_flags)
{
- while (unlikely(test_and_set_bit_lock(bitnum, addr))) {
- do {
- cpu_relax();
- } while (test_bit(bitnum, addr));
+ u32 *addr32 = (u32 *) addr;
+ u32 *v32 = (u32 *) &v;
+ int shift = 0;
+
+ futex_flags |= FUTEX_PRIVATE_FLAG;
+
+#if BITS_PER_LONG == 64
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ shift = (nr >= 32) ? 1 : 0;
+#else
+ shift = (nr < 32) ? 1 : 0;
+#endif
+#endif
+ if (shift) {
+ addr32 += shift;
+ v32 += shift;
}
+ /*
+ * The shift to determine the futex address may have cast away a
+ * literal wake count value. The value is capped to INT_MAX and thus
+ * always in the low bytes of v regardless of bit nr. Copy in the wake
+ * count to whatever 32-bit range was selected.
+ */
+ if (futex_flags == FUTEX_WAKE_PRIVATE)
+ *v32 = (u32) v;
+ futex(addr32, futex_flags, *v32, NULL, NULL, 0);
}
-static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
+static inline void bit_spin_lock(int nr, unsigned long *_addr)
{
- return !test_and_set_bit_lock(bitnum, addr);
-}
+ unsigned long mask;
+ unsigned long *addr = _addr + (nr / BITS_PER_LONG);
+ unsigned long v;
-static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
-{
- BUG_ON(!test_bit(bitnum, addr));
+ nr &= BITS_PER_LONG - 1;
+ mask = 1UL << nr;
- clear_bit_unlock(bitnum, addr);
+ while (1) {
+ v = __atomic_fetch_or(addr, mask, __ATOMIC_ACQUIRE);
+ if (!(v & mask))
+ break;
+
+ do_futex(nr, addr, v, FUTEX_WAIT);
+ }
}
-static inline void __bit_spin_unlock(int bitnum, unsigned long *addr)
+static inline void bit_spin_wake(int nr, unsigned long *_addr)
{
- bit_spin_unlock(bitnum, addr);
+ do_futex(nr, _addr, INT_MAX, FUTEX_WAKE);
}
-static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
+static inline void bit_spin_unlock(int nr, unsigned long *_addr)
{
- return test_bit(bitnum, addr);
+ unsigned long mask;
+ unsigned long *addr = _addr + (nr / BITS_PER_LONG);
+
+ nr &= BITS_PER_LONG - 1;
+ mask = 1UL << nr;
+
+ __atomic_and_fetch(addr, ~mask, __ATOMIC_RELEASE);
+ do_futex(nr, addr, INT_MAX, FUTEX_WAKE);
}
#endif /* __LINUX_BIT_SPINLOCK_H */
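The calling convention is unchanged from the kernel's bit spinlocks even though this shim now sleeps on a futex instead of spinning: the caller owns an unsigned long bitmap and locks a single bit of it. A small sketch (the struct and bit number are illustrative):

    #include <linux/bit_spinlock.h>

    struct bucket {
            unsigned long   lock_and_flags; /* bit 0 doubles as a spinlock */
            unsigned        gen;
    };

    #define BUCKET_LOCK_BIT 0

    static void bucket_bump_gen(struct bucket *b)
    {
            bit_spin_lock(BUCKET_LOCK_BIT, &b->lock_and_flags);
            b->gen++;
            bit_spin_unlock(BUCKET_LOCK_BIT, &b->lock_and_flags);
    }

bit_spin_wake() exists for callers that cleared the bit by other means and still need to wake waiters.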
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 80e8ecda..c5592a5a 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -3,6 +3,7 @@
#include <string.h>
#include <linux/bitops.h>
+#include <linux/kernel.h>
#include <stdlib.h>
#define DECLARE_BITMAP(name,bits) \
@@ -61,6 +62,29 @@ static inline void bitmap_complement(unsigned long *dst, const unsigned long *sr
dst[k] = ~src[k];
}
+static inline bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
+ const unsigned long *bitmap2, unsigned int bits)
+{
+ unsigned int k;
+ unsigned int lim = bits/BITS_PER_LONG;
+ unsigned long result = 0;
+
+ for (k = 0; k < lim; k++)
+ result |= (dst[k] = bitmap1[k] & ~bitmap2[k]);
+ if (bits % BITS_PER_LONG)
+ result |= (dst[k] = bitmap1[k] & ~bitmap2[k] &
+ BITMAP_LAST_WORD_MASK(bits));
+ return result != 0;
+}
+
+static inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1,
+ const unsigned long *src2, unsigned int nbits)
+{
+ if (small_const_nbits(nbits))
+ return (*dst = *src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0;
+ return __bitmap_andnot(dst, src1, src2, nbits);
+}
+
static inline void bitmap_zero(unsigned long *dst, int nbits)
{
memset(dst, 0, BITS_TO_LONGS(nbits) * sizeof(unsigned long));
@@ -135,4 +159,12 @@ static inline unsigned long find_next_zero_bit(const unsigned long *addr, unsign
#define find_first_bit(addr, size) find_next_bit((addr), (size), 0)
#define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0)
+static inline bool bitmap_empty(const unsigned long *src, unsigned nbits)
+{
+ if (small_const_nbits(nbits))
+ return !(*src & BITMAP_LAST_WORD_MASK(nbits));
+
+ return find_first_bit(src, nbits) == nbits;
+}
+
#endif /* _PERF_BITOPS_H */
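bitmap_andnot() computes dst = src1 & ~src2 and reports whether any bit survived, which pairs naturally with the new bitmap_empty(). A short sketch (the size and names are illustrative):

    #include <stdbool.h>
    #include <linux/bitmap.h>

    #define NR_ITEMS 128

    /* Returns true if some wanted item has not been completed yet. */
    static bool any_pending(const unsigned long *wanted, const unsigned long *done)
    {
            DECLARE_BITMAP(pending, NR_ITEMS);

            return bitmap_andnot(pending, wanted, done, NR_ITEMS);
    }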
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index dc2927b3..93739532 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -2,7 +2,6 @@
#define _TOOLS_LINUX_BITOPS_H_
#include <asm/types.h>
-#include <linux/kernel.h>
#include <linux/compiler.h>
#include <linux/page.h>
@@ -16,6 +15,7 @@
#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
+#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
#define BITS_PER_BYTE 8
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
#define BITS_TO_U64(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u64))
@@ -85,6 +85,17 @@ static inline bool test_and_set_bit(long nr, volatile unsigned long *addr)
return (old & mask) != 0;
}
+static inline bool test_and_clear_bit(long nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BIT_MASK(nr);
+ unsigned long *p = ((unsigned long *) addr) + BIT_WORD(nr);
+ unsigned long old;
+
+ old = __atomic_fetch_and(p, ~mask, __ATOMIC_RELAXED);
+
+ return (old & mask) != 0;
+}
+
static inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
{
unsigned long mask = BIT_MASK(nr);
@@ -120,6 +131,17 @@ static inline unsigned long hweight_long(unsigned long w)
return __builtin_popcountl(w);
}
+static inline unsigned long hweight64(u64 w)
+{
+ return __builtin_popcount((u32) w) +
+ __builtin_popcount(w >> 32);
+}
+
+static inline unsigned long hweight32(u32 w)
+{
+ return __builtin_popcount(w);
+}
+
static inline unsigned long hweight8(unsigned long w)
{
return __builtin_popcountl(w);
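test_and_clear_bit() mirrors the existing test_and_set_bit(): it atomically clears the bit and returns whether it had been set, which is the usual way to consume a pending flag exactly once; hweight32()/hweight64() are plain popcounts. Illustrative sketch:

    #include <stdbool.h>
    #include <linux/bitops.h>

    /* Only the caller that actually cleared the bit gets true back. */
    static bool consume_pending(unsigned long *flags, int nr)
    {
            return test_and_clear_bit(nr, flags);
    }

    static unsigned nr_devs_in_mask(u64 devs)
    {
            return hweight64(devs);
    }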
diff --git a/include/linux/bitrev.h b/include/linux/bitrev.h
deleted file mode 100644
index fb790b84..00000000
--- a/include/linux/bitrev.h
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef _LINUX_BITREV_H
-#define _LINUX_BITREV_H
-
-#include <linux/types.h>
-
-#ifdef CONFIG_HAVE_ARCH_BITREVERSE
-#include <asm/bitrev.h>
-
-#define __bitrev32 __arch_bitrev32
-#define __bitrev16 __arch_bitrev16
-#define __bitrev8 __arch_bitrev8
-
-#else
-extern u8 const byte_rev_table[256];
-static inline u8 __bitrev8(u8 byte)
-{
- return byte_rev_table[byte];
-}
-
-static inline u16 __bitrev16(u16 x)
-{
- return (__bitrev8(x & 0xff) << 8) | __bitrev8(x >> 8);
-}
-
-static inline u32 __bitrev32(u32 x)
-{
- return (__bitrev16(x & 0xffff) << 16) | __bitrev16(x >> 16);
-}
-
-#endif /* CONFIG_HAVE_ARCH_BITREVERSE */
-
-#define __constant_bitrev32(x) \
-({ \
- u32 __x = x; \
- __x = (__x >> 16) | (__x << 16); \
- __x = ((__x & (u32)0xFF00FF00UL) >> 8) | ((__x & (u32)0x00FF00FFUL) << 8); \
- __x = ((__x & (u32)0xF0F0F0F0UL) >> 4) | ((__x & (u32)0x0F0F0F0FUL) << 4); \
- __x = ((__x & (u32)0xCCCCCCCCUL) >> 2) | ((__x & (u32)0x33333333UL) << 2); \
- __x = ((__x & (u32)0xAAAAAAAAUL) >> 1) | ((__x & (u32)0x55555555UL) << 1); \
- __x; \
-})
-
-#define __constant_bitrev16(x) \
-({ \
- u16 __x = x; \
- __x = (__x >> 8) | (__x << 8); \
- __x = ((__x & (u16)0xF0F0U) >> 4) | ((__x & (u16)0x0F0FU) << 4); \
- __x = ((__x & (u16)0xCCCCU) >> 2) | ((__x & (u16)0x3333U) << 2); \
- __x = ((__x & (u16)0xAAAAU) >> 1) | ((__x & (u16)0x5555U) << 1); \
- __x; \
-})
-
-#define __constant_bitrev8(x) \
-({ \
- u8 __x = x; \
- __x = (__x >> 4) | (__x << 4); \
- __x = ((__x & (u8)0xCCU) >> 2) | ((__x & (u8)0x33U) << 2); \
- __x = ((__x & (u8)0xAAU) >> 1) | ((__x & (u8)0x55U) << 1); \
- __x; \
-})
-
-#define bitrev32(x) \
-({ \
- u32 __x = x; \
- __builtin_constant_p(__x) ? \
- __constant_bitrev32(__x) : \
- __bitrev32(__x); \
-})
-
-#define bitrev16(x) \
-({ \
- u16 __x = x; \
- __builtin_constant_p(__x) ? \
- __constant_bitrev16(__x) : \
- __bitrev16(__x); \
- })
-
-#define bitrev8(x) \
-({ \
- u8 __x = x; \
- __builtin_constant_p(__x) ? \
- __constant_bitrev8(__x) : \
- __bitrev8(__x) ; \
- })
-#endif /* _LINUX_BITREV_H */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 42cd0032..3cbf8c9e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -8,12 +8,64 @@
#include <linux/atomic.h>
#include <linux/types.h>
#include <linux/bvec.h>
+#include <linux/kobject.h>
struct bio_set;
struct bio;
-struct block_device;
typedef void (bio_end_io_t) (struct bio *);
+#define BDEVNAME_SIZE 32
+
+typedef unsigned int __bitwise blk_mode_t;
+
+/* open for reading */
+#define BLK_OPEN_READ ((__force blk_mode_t)(1 << 0))
+/* open for writing */
+#define BLK_OPEN_WRITE ((__force blk_mode_t)(1 << 1))
+/* open exclusively (vs other exclusive openers) */
+#define BLK_OPEN_EXCL ((__force blk_mode_t)(1 << 2))
+/* opened with O_NDELAY */
+#define BLK_OPEN_NDELAY ((__force blk_mode_t)(1 << 3))
+/* open for "writes" only for ioctls (special hack for floppy.c) */
+#define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4))
+
+#define BLK_OPEN_BUFFERED ((__force blk_mode_t)(1 << 5))
+
+struct inode {
+ unsigned long i_ino;
+ loff_t i_size;
+ struct super_block *i_sb;
+ blk_mode_t mode;
+};
+
+struct request_queue {
+ struct backing_dev_info *backing_dev_info;
+};
+
+struct gendisk {
+ struct backing_dev_info *bdi;
+ struct backing_dev_info __bdi;
+};
+
+struct hd_struct {
+ struct kobject kobj;
+};
+
+struct block_device {
+ struct kobject kobj;
+ dev_t bd_dev;
+ char name[BDEVNAME_SIZE];
+ struct inode *bd_inode;
+ struct inode __bd_inode;
+ struct request_queue queue;
+ void *bd_holder;
+ struct gendisk * bd_disk;
+ struct gendisk __bd_disk;
+ int bd_fd;
+};
+
+#define bdev_kobj(_bdev) (&((_bdev)->kobj))
+
/*
* Block error status values. See block/blk-core:blk_errors for the details.
*/
@@ -35,6 +87,8 @@ typedef u8 __bitwise blk_status_t;
#define BLK_STS_AGAIN ((__force blk_status_t)12)
+#define BIO_INLINE_VECS 4
+
/*
* main unit of I/O for the block layer and lower layers (ie drivers and
* stacking drivers)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1d5581dc..b295bd9a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -6,7 +6,12 @@
#include <linux/kobject.h>
#include <linux/types.h>
+#define MAX_LFS_FILESIZE ((loff_t)LLONG_MAX)
+
+#define BIO_MAX_VECS 256U
+
typedef unsigned fmode_t;
+typedef __u32 __bitwise blk_opf_t;
struct bio;
struct user_namespace;
@@ -18,36 +23,6 @@ struct user_namespace;
#define MINOR(dev) ((unsigned int) ((dev) & MINORMASK))
#define MKDEV(ma,mi) (((ma) << MINORBITS) | (mi))
-/* file is open for reading */
-#define FMODE_READ ((__force fmode_t)0x1)
-/* file is open for writing */
-#define FMODE_WRITE ((__force fmode_t)0x2)
-/* file is seekable */
-#define FMODE_LSEEK ((__force fmode_t)0x4)
-/* file can be accessed using pread */
-#define FMODE_PREAD ((__force fmode_t)0x8)
-/* file can be accessed using pwrite */
-#define FMODE_PWRITE ((__force fmode_t)0x10)
-/* File is opened for execution with sys_execve / sys_uselib */
-#define FMODE_EXEC ((__force fmode_t)0x20)
-/* File is opened with O_NDELAY (only set for block devices) */
-#define FMODE_NDELAY ((__force fmode_t)0x40)
-/* File is opened with O_EXCL (only set for block devices) */
-#define FMODE_EXCL ((__force fmode_t)0x80)
-/* File is opened using open(.., 3, ..) and is writeable only for ioctls
- (specialy hack for floppy.c) */
-#define FMODE_WRITE_IOCTL ((__force fmode_t)0x100)
-/* 32bit hashes as llseek() offset (for directories) */
-#define FMODE_32BITHASH ((__force fmode_t)0x200)
-/* 64bit hashes as llseek() offset (for directories) */
-#define FMODE_64BITHASH ((__force fmode_t)0x400)
-
-struct inode {
- unsigned long i_ino;
- loff_t i_size;
- struct super_block *i_sb;
-};
-
struct file {
struct inode *f_inode;
};
@@ -57,36 +32,8 @@ static inline struct inode *file_inode(const struct file *f)
return f->f_inode;
}
-#define BDEVNAME_SIZE 32
-
-struct request_queue {
- struct backing_dev_info *backing_dev_info;
-};
-
-struct gendisk {
-};
-
-struct hd_struct {
- struct kobject kobj;
-};
-
#define part_to_dev(part) (part)
-struct block_device {
- char name[BDEVNAME_SIZE];
- struct inode *bd_inode;
- struct request_queue queue;
- void *bd_holder;
- struct hd_struct *bd_part;
- struct gendisk *bd_disk;
- struct gendisk __bd_disk;
- int bd_fd;
- int bd_sync_fd;
-
- struct backing_dev_info *bd_bdi;
- struct backing_dev_info __bd_bdi;
-};
-
void generic_make_request(struct bio *);
int submit_bio_wait(struct bio *);
@@ -95,21 +42,41 @@ static inline void submit_bio(struct bio *bio)
generic_make_request(bio);
}
-int blkdev_issue_discard(struct block_device *, sector_t,
- sector_t, gfp_t, unsigned long);
+int blkdev_issue_discard(struct block_device *, sector_t, sector_t, gfp_t);
+int blkdev_issue_zeroout(struct block_device *, sector_t, sector_t, gfp_t, unsigned);
#define bdev_get_queue(bdev) (&((bdev)->queue))
-#define blk_queue_discard(q) ((void) (q), 0)
+#ifndef SECTOR_SHIFT
+#define SECTOR_SHIFT 9
+#endif
+#ifndef SECTOR_SIZE
+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
+#endif
+
+#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
+#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT)
+#define SECTOR_MASK (PAGE_SECTORS - 1)
+
+#define bdev_max_discard_sectors(bdev) ((void) (bdev), 0)
#define blk_queue_nonrot(q) ((void) (q), 0)
unsigned bdev_logical_block_size(struct block_device *bdev);
sector_t get_capacity(struct gendisk *disk);
-void blkdev_put(struct block_device *bdev, fmode_t mode);
-void bdput(struct block_device *bdev);
-struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, void *holder);
-struct block_device *lookup_bdev(const char *path);
+struct blk_holder_ops {
+ void (*mark_dead)(struct block_device *bdev);
+};
+
+static inline struct block_device *file_bdev(struct file *file)
+{
+ return container_of(file->f_inode, struct block_device, __bd_inode);
+}
+
+void bdev_fput(struct file *);
+struct file *bdev_file_open_by_path(const char *, blk_mode_t, void *,
+ const struct blk_holder_ops *);
+int lookup_bdev(const char *path, dev_t *);
struct super_block {
void *s_fs_info;
@@ -131,6 +98,7 @@ struct super_block {
#define DT_LNK 10
#define DT_SOCK 12
#define DT_WHT 14
+#define DT_MAX 16
#endif
/*
@@ -199,6 +167,7 @@ static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx)
int blk_status_to_errno(blk_status_t status);
blk_status_t errno_to_blk_status(int errno);
+const char *blk_status_to_str(blk_status_t status);
#endif /* __TOOLS_LINUX_BLKDEV_H */
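The device open path in this shim now matches the current kernel interface: bdev_file_open_by_path() yields a struct file, file_bdev() recovers the block_device behind it, and bdev_fput() releases it again. A hedged sketch (error handling reduced to IS_ERR; the flags come from the BLK_OPEN_* set in blk_types.h; the helpers are illustrative):

    #include <linux/blkdev.h>
    #include <linux/err.h>

    /* Open read-write; the returned file is eventually released with bdev_fput(). */
    static struct file *open_bdev_file(const char *path, void *holder)
    {
            return bdev_file_open_by_path(path, BLK_OPEN_READ|BLK_OPEN_WRITE,
                                          holder, NULL);
    }

    static unsigned bdev_file_block_size(struct file *f)
    {
            return IS_ERR(f) ? 0 : bdev_logical_block_size(file_bdev(f));
    }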
diff --git a/include/linux/bsearch.h b/include/linux/bsearch.h
new file mode 100644
index 00000000..e66b711d
--- /dev/null
+++ b/include/linux/bsearch.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_BSEARCH_H
+#define _LINUX_BSEARCH_H
+
+#include <linux/types.h>
+
+static __always_inline
+void *__inline_bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp)
+{
+ const char *pivot;
+ int result;
+
+ while (num > 0) {
+ pivot = base + (num >> 1) * size;
+ result = cmp(key, pivot);
+
+ if (result == 0)
+ return (void *)pivot;
+
+ if (result > 0) {
+ base = pivot + size;
+ num--;
+ }
+ num >>= 1;
+ }
+
+ return NULL;
+}
+
+extern void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp);
+
+#endif /* _LINUX_BSEARCH_H */
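__inline_bsearch() is the kernel's binary search over a sorted, fixed-stride array; the comparator always receives the search key first and the array element second. A self-contained sketch (assuming cmp_func_t is provided by linux/types.h, as in the kernel):

    #include <stdbool.h>
    #include <linux/bsearch.h>

    static int cmp_u32(const void *key, const void *elem)
    {
            u32 a = *(const u32 *) key, b = *(const u32 *) elem;

            return a < b ? -1 : a > b ? 1 : 0;
    }

    static bool sorted_contains(const u32 *sorted, size_t nr, u32 key)
    {
            return __inline_bsearch(&key, sorted, nr, sizeof(*sorted), cmp_u32) != NULL;
    }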
diff --git a/include/linux/bug.h b/include/linux/bug.h
index f8929688..1a10f7e6 100644
--- a/include/linux/bug.h
+++ b/include/linux/bug.h
@@ -2,30 +2,65 @@
#define __TOOLS_LINUX_BUG_H
#include <assert.h>
+#include <stdio.h>
#include <linux/compiler.h>
+#ifdef CONFIG_VALGRIND
+#include <valgrind/memcheck.h>
+
+#define DEBUG_MEMORY_FREED(p, len) VALGRIND_MAKE_MEM_UNDEFINED(p, len)
+#endif
+
#define BUILD_BUG_ON_NOT_POWER_OF_2(n) \
BUILD_BUG_ON((n) == 0 || (((n) & ((n) - 1)) != 0))
#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
#define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); }))
-#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
+#define BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2*!!(cond)]))
-#define BUG() do { assert(0); unreachable(); } while (0)
+#define BUG() do { fflush(stdout); assert(0); unreachable(); } while (0)
#define BUG_ON(cond) assert(!(cond))
-#define WARN_ON_ONCE(cond) ({ bool _r = (cond); if (_r) assert(0); _r; })
-#define WARN_ONCE(cond, ...) ({ bool _r = (cond); if (_r) assert(0); _r; })
+#define WARN(cond, fmt, ...) \
+({ \
+ int __ret_warn_on = unlikely(!!(cond)); \
+ if (__ret_warn_on) \
+ fprintf(stderr, "WARNING at " __FILE__ ":%d: " fmt "\n",\
+ __LINE__, ##__VA_ARGS__); \
+ __ret_warn_on; \
+})
-#define __WARN() assert(0)
-#define __WARN_printf(arg...) assert(0)
-#define WARN(cond, ...) assert(!(cond))
+#define __WARN() \
+do { \
+ fprintf(stderr, "WARNING at " __FILE__ ":%d\n", __LINE__); \
+} while (0)
-#define WARN_ON(condition) ({ \
- int __ret_warn_on = unlikely(!!(condition)); \
+#define WARN_ON(cond) ({ \
+ int __ret_warn_on = unlikely(!!(cond)); \
if (__ret_warn_on) \
__WARN(); \
__ret_warn_on; \
})
+#define WARN_ONCE(cond, fmt, ...) \
+({ \
+ static bool __warned; \
+ int __ret_warn_on = unlikely(!!(cond)); \
+ if (__ret_warn_on && !__warned) { \
+ __warned = true; \
+ __WARN(); \
+ } \
+ __ret_warn_on; \
+})
+
+#define WARN_ON_ONCE(cond) ({ \
+ static bool __warned; \
+ int __ret_warn_on = unlikely(!!(cond)); \
+ if (__ret_warn_on && !__warned) { \
+ __warned = true; \
+ __WARN(); \
+ } \
+ __ret_warn_on; \
+})
+
#endif /* __TOOLS_LINUX_BUG_H */
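Unlike the old assert()-based stubs, WARN(), WARN_ONCE() and WARN_ON_ONCE() now print a diagnostic and keep running, and they still evaluate to the tested condition, so they can gate a recovery path inline. Sketch (the helper is illustrative):

    #include <linux/bug.h>

    static unsigned clamp_bucket_size(unsigned size)
    {
            /* complain once about suspicious input, then continue with a sane value */
            if (WARN_ONCE(size > (1U << 16), "bucket size %u too large", size))
                    size = 1U << 16;

            return size;
    }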
diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 89b65b82..5bc68b42 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -43,6 +43,10 @@ struct bvec_iter {
current bvec */
};
+struct bvec_iter_all {
+ int idx;
+};
+
/*
* various member access, note that bio_data should of course not be used
* on highmem page vectors
diff --git a/include/linux/closure.h b/include/linux/closure.h
index a9de6d93..880fe85e 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -1,8 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CLOSURE_H
#define _LINUX_CLOSURE_H
#include <linux/llist.h>
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/workqueue.h>
/*
@@ -102,7 +104,8 @@
struct closure;
struct closure_syncer;
-typedef void (closure_fn) (struct closure *);
+typedef void (closure_fn) (struct work_struct *);
+extern struct dentry *bcache_debug;
struct closure_waitlist {
struct llist_head list;
@@ -125,10 +128,10 @@ enum closure_state {
* annotate where references are being transferred.
*/
- CLOSURE_BITS_START = (1U << 27),
- CLOSURE_DESTRUCTOR = (1U << 27),
- CLOSURE_WAITING = (1U << 29),
- CLOSURE_RUNNING = (1U << 31),
+ CLOSURE_BITS_START = (1U << 26),
+ CLOSURE_DESTRUCTOR = (1U << 26),
+ CLOSURE_WAITING = (1U << 28),
+ CLOSURE_RUNNING = (1U << 30),
};
#define CLOSURE_GUARD_MASK \
@@ -151,12 +154,14 @@ struct closure {
struct closure *parent;
atomic_t remaining;
+ bool closure_get_happened;
#ifdef CONFIG_DEBUG_CLOSURES
#define CLOSURE_MAGIC_DEAD 0xc054dead
#define CLOSURE_MAGIC_ALIVE 0xc054a11e
+#define CLOSURE_MAGIC_STACK 0xc05451cc
- unsigned magic;
+ unsigned int magic;
struct list_head all;
unsigned long ip;
unsigned long waiting_on;
@@ -169,6 +174,11 @@ void __closure_wake_up(struct closure_waitlist *list);
bool closure_wait(struct closure_waitlist *list, struct closure *cl);
void __closure_sync(struct closure *cl);
+static inline unsigned closure_nr_remaining(struct closure *cl)
+{
+ return atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK;
+}
+
/**
 * closure_sync - sleep until a closure has nothing left to wait on
*
@@ -177,10 +187,26 @@ void __closure_sync(struct closure *cl);
*/
static inline void closure_sync(struct closure *cl)
{
- if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1)
+#ifdef CONFIG_DEBUG_CLOSURES
+ BUG_ON(closure_nr_remaining(cl) != 1 && !cl->closure_get_happened);
+#endif
+
+ if (cl->closure_get_happened)
__closure_sync(cl);
}
+int __closure_sync_timeout(struct closure *cl, unsigned long timeout);
+
+static inline int closure_sync_timeout(struct closure *cl, unsigned long timeout)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+ BUG_ON(closure_nr_remaining(cl) != 1 && !cl->closure_get_happened);
+#endif
+ return cl->closure_get_happened
+ ? __closure_sync_timeout(cl, timeout)
+ : 0;
+}
+
#ifdef CONFIG_DEBUG_CLOSURES
void closure_debug_create(struct closure *cl);
@@ -225,19 +251,23 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
closure_set_ip(cl);
cl->fn = fn;
cl->wq = wq;
- /* between atomic_dec() in closure_put() */
- smp_mb__before_atomic();
}
static inline void closure_queue(struct closure *cl)
{
struct workqueue_struct *wq = cl->wq;
+ /**
+ * Changes made to closure, work_struct, or a couple of other structs
+ * may leave work.func pointing to the wrong location.
+ */
+ BUILD_BUG_ON(offsetof(struct closure, fn)
+ != offsetof(struct work_struct, func));
if (wq) {
INIT_WORK(&cl->work, cl->work.func);
- queue_work(wq, &cl->work);
+ BUG_ON(!queue_work(wq, &cl->work));
} else
- cl->fn(cl);
+ cl->fn(&cl->work);
}
/**
@@ -245,6 +275,8 @@ static inline void closure_queue(struct closure *cl)
*/
static inline void closure_get(struct closure *cl)
{
+ cl->closure_get_happened = true;
+
#ifdef CONFIG_DEBUG_CLOSURES
BUG_ON((atomic_inc_return(&cl->remaining) &
CLOSURE_REMAINING_MASK) <= 1);
@@ -254,6 +286,21 @@ static inline void closure_get(struct closure *cl)
}
/**
+ * closure_get_not_zero - take a new ref on @cl, unless its refcount has already hit zero
+ */
+static inline bool closure_get_not_zero(struct closure *cl)
+{
+ unsigned old = atomic_read(&cl->remaining);
+ do {
+ if (!(old & CLOSURE_REMAINING_MASK))
+ return false;
+
+ } while (!atomic_try_cmpxchg_acquire(&cl->remaining, &old, old + 1));
+
+ return true;
+}
+
+/**
* closure_init - Initialize a closure, setting the refcount to 1
* @cl: closure to initialize
* @parent: parent of the new closure. cl will take a refcount on it for its
@@ -267,6 +314,7 @@ static inline void closure_init(struct closure *cl, struct closure *parent)
closure_get(parent);
atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+ cl->closure_get_happened = false;
closure_debug_create(cl);
closure_set_ip(cl);
@@ -276,22 +324,35 @@ static inline void closure_init_stack(struct closure *cl)
{
memset(cl, 0, sizeof(struct closure));
atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+#ifdef CONFIG_DEBUG_CLOSURES
+ cl->magic = CLOSURE_MAGIC_STACK;
+#endif
+}
+
+static inline void closure_init_stack_release(struct closure *cl)
+{
+ memset(cl, 0, sizeof(struct closure));
+ atomic_set_release(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+#ifdef CONFIG_DEBUG_CLOSURES
+ cl->magic = CLOSURE_MAGIC_STACK;
+#endif
}
/**
- * closure_wake_up - wake up all closures on a wait list.
+ * closure_wake_up - wake up all closures on a wait list,
+ * with memory barrier
*/
static inline void closure_wake_up(struct closure_waitlist *list)
{
+ /* Memory barrier for the wait list */
smp_mb();
__closure_wake_up(list);
}
-#define continue_at_noreturn(_cl, _fn, _wq) \
-do { \
- set_closure_fn(_cl, _fn, _wq); \
- closure_sub(_cl, CLOSURE_RUNNING + 1); \
-} while (0)
+#define CLOSURE_CALLBACK(name) void name(struct work_struct *ws)
+#define closure_type(name, type, member) \
+ struct closure *cl = container_of(ws, struct closure, work); \
+ type *name = container_of(cl, type, member)
/**
* continue_at - jump to another function with barrier
@@ -300,16 +361,16 @@ do { \
* been dropped with closure_put()), it will resume execution at @fn running out
* of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
*
- * NOTE: This macro expands to a return in the calling function!
- *
* This is because after calling continue_at() you no longer have a ref on @cl,
* and whatever @cl owns may be freed out from under you - a running closure fn
* has a ref on its own closure which continue_at() drops.
+ *
+ * Note you are expected to immediately return after using this macro.
*/
#define continue_at(_cl, _fn, _wq) \
do { \
- continue_at_noreturn(_cl, _fn, _wq); \
- return; \
+ set_closure_fn(_cl, _fn, _wq); \
+ closure_sub(_cl, CLOSURE_RUNNING + 1); \
} while (0)
/**
@@ -322,38 +383,27 @@ do { \
*/
#define closure_return(_cl) continue_at((_cl), NULL, NULL)
+void closure_return_sync(struct closure *cl);
+
/**
* continue_at_nobarrier - jump to another function without barrier
*
* Causes @fn to be executed out of @cl, in @wq context (or called directly if
* @wq is NULL).
*
- * NOTE: like continue_at(), this macro expands to a return in the caller!
- *
* The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
* thus it's not safe to touch anything protected by @cl after a
* continue_at_nobarrier().
*/
#define continue_at_nobarrier(_cl, _fn, _wq) \
do { \
- closure_set_ip(_cl); \
- if (_wq) { \
- INIT_WORK(&(_cl)->work, (void *) _fn); \
- queue_work((_wq), &(_cl)->work); \
- } else { \
- (_fn)(_cl); \
- } \
- return; \
-} while (0)
-
-#define closure_return_with_destructor_noreturn(_cl, _destructor) \
-do { \
- set_closure_fn(_cl, _destructor, NULL); \
- closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \
+ set_closure_fn(_cl, _fn, _wq); \
+ closure_queue(_cl); \
} while (0)
/**
- * closure_return - finish execution of a closure, with destructor
+ * closure_return_with_destructor - finish execution of a closure,
+ * with destructor
*
* Works like closure_return(), except @destructor will be called when all
* outstanding refs on @cl have been dropped; @destructor may be used to safely
@@ -363,8 +413,8 @@ do { \
*/
#define closure_return_with_destructor(_cl, _destructor) \
do { \
- closure_return_with_destructor_noreturn(_cl, _destructor); \
- return; \
+ set_closure_fn(_cl, _destructor, NULL); \
+ closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \
} while (0)
/**
@@ -404,4 +454,39 @@ do { \
__closure_wait_event(waitlist, _cond); \
} while (0)
+#define __closure_wait_event_timeout(waitlist, _cond, _until) \
+({ \
+ struct closure cl; \
+ long _t; \
+ \
+ closure_init_stack(&cl); \
+ \
+ while (1) { \
+ closure_wait(waitlist, &cl); \
+ if (_cond) { \
+ _t = max_t(long, 1L, _until - jiffies); \
+ break; \
+ } \
+ _t = max_t(long, 0L, _until - jiffies); \
+ if (!_t) \
+ break; \
+ closure_sync_timeout(&cl, _t); \
+ } \
+ closure_wake_up(waitlist); \
+ closure_sync(&cl); \
+ _t; \
+})
+
+/*
+ * Returns 0 if timeout expired, remaining time in jiffies (at least 1) if
+ * condition became true
+ */
+#define closure_wait_event_timeout(waitlist, _cond, _timeout) \
+({ \
+ unsigned long _until = jiffies + _timeout; \
+ (_cond) \
+ ? max_t(long, 1L, _until - jiffies) \
+ : __closure_wait_event_timeout(waitlist, _cond, _until);\
+})
+
#endif /* _LINUX_CLOSURE_H */
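With closure callbacks now typed as work_struct functions, continuations are declared with CLOSURE_CALLBACK() and recover their container via closure_type(). A hedged sketch of the new style (struct, function and queue names are invented for illustration):

    #include <linux/closure.h>

    struct my_op {
            struct closure  cl;
            int             ret;
    };

    static CLOSURE_CALLBACK(my_op_done)
    {
            closure_type(op, struct my_op, cl);     /* op = container_of(cl, ...) */

            op->ret = 0;
    }

    static void my_op_start(struct my_op *op, struct workqueue_struct *wq)
    {
            closure_init(&op->cl, NULL);
            /* sub-operations would take refs with closure_get()/closure_put() here */
            continue_at(&op->cl, my_op_done, wq);
    }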
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 915a6f88..3ecc3dd1 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -10,6 +10,10 @@
# define __always_inline inline __attribute__((always_inline))
#endif
+#ifndef __attribute_const__
+#define __attribute_const__ __attribute__((__const__))
+#endif
+
#ifdef __ANDROID__
/*
* FIXME: Big hammer to get rid of tons of:
@@ -34,6 +38,7 @@
#define __maybe_unused __attribute__((unused))
#define __always_unused __attribute__((unused))
#define __packed __attribute__((__packed__))
+#define __flatten __attribute__((flatten))
#define __force
#define __nocast
#define __iomem
@@ -42,6 +47,7 @@
#define __builtin_warning(x, y...) (1)
#define __must_hold(x)
#define __acquires(x)
+#define __cond_acquires(x)
#define __releases(x)
#define __acquire(x) (void)0
#define __release(x) (void)0
@@ -59,6 +65,12 @@
#define unlikely(x) __builtin_expect(!!(x), 0)
#define unreachable() __builtin_unreachable()
#define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
+#define fallthrough __attribute__((__fallthrough__))
+#define __noreturn __attribute__((__noreturn__))
+
+#ifndef __counted_by
+#define __counted_by(nr)
+#endif
#define ___PASTE(a,b) a##b
#define __PASTE(a,b) ___PASTE(a,b)
@@ -170,4 +182,9 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
#define CONFIG_X86_64 y
#endif
+#define __is_constexpr(x) \
+ (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8)))
+#define is_signed_type(type) (((type)(-1)) < (__force type)1)
+#define is_unsigned_type(type) (!is_signed_type(type))
+
#endif /* _TOOLS_LINUX_COMPILER_H */
diff --git a/include/linux/console.h b/include/linux/console.h
index d01aa9a2..31aaa087 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -1,7 +1,8 @@
#ifndef _LINUX_CONSOLE_H_
#define _LINUX_CONSOLE_H_
-#define console_lock()
-#define console_unlock()
+#define console_lock() do {} while (0)
+#define console_trylock() true
+#define console_unlock() do {} while (0)
#endif /* _LINUX_CONSOLE_H */
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 024d645c..bfab7ea7 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -10,6 +10,8 @@
#define cpu_present(cpu) ((cpu) == 0)
#define cpu_active(cpu) ((cpu) == 0)
+#define raw_smp_processor_id() 0U
+
#define for_each_cpu(cpu, mask) \
for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
#define for_each_cpu_not(cpu, mask) \
diff --git a/include/linux/crc64.h b/include/linux/crc64.h
new file mode 100644
index 00000000..c756e65a
--- /dev/null
+++ b/include/linux/crc64.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * See lib/crc64.c for the related specification and polynomial arithmetic.
+ */
+#ifndef _LINUX_CRC64_H
+#define _LINUX_CRC64_H
+
+#include <linux/types.h>
+
+u64 __pure crc64_be(u64 crc, const void *p, size_t len);
+#endif /* _LINUX_CRC64_H */
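crc64_be() follows the usual incremental-CRC convention: seed with a previous partial value (0 for a fresh checksum is assumed here; lib/crc64.c documents the exact polynomial and seed convention) and feed buffers in order. Sketch:

    #include <linux/crc64.h>

    static u64 crc_of_two_buffers(const void *a, size_t a_len,
                                  const void *b, size_t b_len)
    {
            u64 crc = 0;

            crc = crc64_be(crc, a, a_len);
            crc = crc64_be(crc, b, b_len);
            return crc;
    }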
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 4db5b3f4..9a78cb16 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -17,34 +17,10 @@
#include <linux/fs.h>
#include <linux/seq_file.h>
-
#include <linux/types.h>
#include <linux/compiler.h>
-struct device;
struct file_operations;
-struct vfsmount;
-struct srcu_struct;
-
-struct debugfs_blob_wrapper {
- void *data;
- unsigned long size;
-};
-
-struct debugfs_reg32 {
- char *name;
- unsigned long offset;
-};
-
-struct debugfs_regset32 {
- const struct debugfs_reg32 *regs;
- int nregs;
- void __iomem *base;
-};
-
-extern struct dentry *arch_debugfs_dir;
-
-extern struct srcu_struct debugfs_srcu;
#include <linux/err.h>
@@ -55,189 +31,16 @@ static inline struct dentry *debugfs_create_file(const char *name, umode_t mode,
return ERR_PTR(-ENODEV);
}
-static inline struct dentry *debugfs_create_file_size(const char *name, umode_t mode,
- struct dentry *parent, void *data,
- const struct file_operations *fops,
- loff_t file_size)
-{
- return ERR_PTR(-ENODEV);
-}
-
static inline struct dentry *debugfs_create_dir(const char *name,
struct dentry *parent)
{
return ERR_PTR(-ENODEV);
}
-static inline struct dentry *debugfs_create_symlink(const char *name,
- struct dentry *parent,
- const char *dest)
-{
- return ERR_PTR(-ENODEV);
-}
-
-static inline struct dentry *debugfs_create_automount(const char *name,
- struct dentry *parent,
- struct vfsmount *(*f)(void *),
- void *data)
-{
- return ERR_PTR(-ENODEV);
-}
-
static inline void debugfs_remove(struct dentry *dentry)
{ }
static inline void debugfs_remove_recursive(struct dentry *dentry)
{ }
-static inline int debugfs_use_file_start(const struct dentry *dentry,
- int *srcu_idx)
- __acquires(&debugfs_srcu)
-{
- return 0;
-}
-
-static inline void debugfs_use_file_finish(int srcu_idx)
- __releases(&debugfs_srcu)
-{ }
-
-#define DEFINE_DEBUGFS_ATTRIBUTE(__fops, __get, __set, __fmt) \
- static const struct file_operations __fops = { 0 }
-
-static inline struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
- struct dentry *new_dir, char *new_name)
-{
- return ERR_PTR(-ENODEV);
-}
-
-static inline struct dentry *debugfs_create_u8(const char *name, umode_t mode,
- struct dentry *parent,
- u8 *value)
-{
- return ERR_PTR(-ENODEV);
-}
-
-static inline struct dentry *debugfs_create_u16(const char *name, umode_t mode,
- struct dentry *parent,
- u16 *value)
-{
- return ERR_PTR(-ENODEV);
-}
-
-static inline struct dentry *debugfs_create_u32(const char *name, umode_t mode,
- struct dentry *parent,
- u32 *value)
-{
- return ERR_PTR(-ENODEV);
-}
-
-static inline struct dentry *debugfs_create_u64(const char *name, umode_t mode,
- struct dentry *parent,
- u64 *value)
-{
- return ERR_PTR(-ENODEV);
-}
-
-static inline struct dentry *debugfs_create_x8(const char *name, umode_t mode,
- struct dentry *parent,
- u8 *value)
-{
- return ERR_PTR(-ENODEV);
-}
-
-static inline struct dentry *debugfs_create_x16(const char *name, umode_t mode,
- struct dentry *parent,
- u16 *value)
-{
- return ERR_PTR(-ENODEV);
-}
-
-static inline struct dentry *debugfs_create_x32(const char *name, umode_t mode,
- struct dentry *parent,
- u32 *value)
-{
- return ERR_PTR(-ENODEV);
-}
-
-static inline struct dentry *debugfs_create_x64(const char *name, umode_t mode,
- struct dentry *parent,
- u64 *value)
-{
- return ERR_PTR(-ENODEV);
-}
-
-static inline struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
- struct dentry *parent,
- size_t *value)
-{
- return ERR_PTR(-ENODEV);
-}
-
-static inline struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
- struct dentry *parent, atomic_t *value)
-{
- return ERR_PTR(-ENODEV);
-}
-
-static inline struct dentry *debugfs_create_bool(const char *name, umode_t mode,
- struct dentry *parent,
- bool *value)
-{
- return ERR_PTR(-ENODEV);
-}
-
-static inline struct dentry *debugfs_create_blob(const char *name, umode_t mode,
- struct dentry *parent,
- struct debugfs_blob_wrapper *blob)
-{
- return ERR_PTR(-ENODEV);
-}
-
-static inline struct dentry *debugfs_create_regset32(const char *name,
- umode_t mode, struct dentry *parent,
- struct debugfs_regset32 *regset)
-{
- return ERR_PTR(-ENODEV);
-}
-
-static inline void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs,
- int nregs, void __iomem *base, char *prefix)
-{
-}
-
-static inline bool debugfs_initialized(void)
-{
- return false;
-}
-
-static inline struct dentry *debugfs_create_u32_array(const char *name, umode_t mode,
- struct dentry *parent,
- u32 *array, u32 elements)
-{
- return ERR_PTR(-ENODEV);
-}
-
-static inline struct dentry *debugfs_create_devm_seqfile(struct device *dev,
- const char *name,
- struct dentry *parent,
- int (*read_fn)(struct seq_file *s,
- void *data))
-{
- return ERR_PTR(-ENODEV);
-}
-
-static inline ssize_t debugfs_read_file_bool(struct file *file,
- char __user *user_buf,
- size_t count, loff_t *ppos)
-{
- return -ENODEV;
-}
-
-static inline ssize_t debugfs_write_file_bool(struct file *file,
- const char __user *user_buf,
- size_t count, loff_t *ppos)
-{
- return -ENODEV;
-}
-
#endif
diff --git a/include/linux/errname.h b/include/linux/errname.h
new file mode 100644
index 00000000..443d5040
--- /dev/null
+++ b/include/linux/errname.h
@@ -0,0 +1,12 @@
+#ifndef _LINUX_ERRNAME_H
+#define _LINUX_ERRNAME_H
+
+#include <stdlib.h>
+#include <string.h>
+
+static inline const char *errname(int err)
+{
+ return strerror(abs(err));
+}
+
+#endif /* _LINUX_ERRNAME_H */
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 2b76d8c8..d90373f3 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -3,5 +3,10 @@
#define try_to_freeze()
#define set_freezable()
+#define freezing(task) false
+#define freezable_schedule() schedule()
+#define freezable_schedule_timeout(_t) schedule_timeout(_t)
+
+static inline void __refrigerator(bool f) {}
#endif /* __TOOLS_LINUX_FREEZER_H */
diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h
new file mode 100644
index 00000000..40c6224a
--- /dev/null
+++ b/include/linux/fs_parser.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Filesystem parameter description and parser
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#ifndef _LINUX_FS_PARSER_H
+#define _LINUX_FS_PARSER_H
+
+struct constant_table {
+ const char *name;
+ int value;
+};
+
+extern int lookup_constant(const struct constant_table tbl[], const char *name, int not_found);
+
+extern const struct constant_table bool_names[];
+
+#endif /* _LINUX_FS_PARSER_H */
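lookup_constant() maps a string onto the value stored in a constant_table, returning not_found when the name is absent; bool_names provides the usual true/false spellings. A tiny sketch (table contents are illustrative, and it is assumed here that the table is terminated by an entry with a NULL name, as in the kernel implementation):

    #include <stddef.h>
    #include <linux/fs_parser.h>

    static const struct constant_table compression_names[] = {
            { "none",       0 },
            { "lz4",        1 },
            { "gzip",       2 },
            { NULL,         0 },    /* assumed sentinel */
    };

    static int parse_compression(const char *arg)
    {
            return lookup_constant(compression_names, arg, -1);
    }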
diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
index 7f637e17..5b51c3d5 100644
--- a/include/linux/generic-radix-tree.h
+++ b/include/linux/generic-radix-tree.h
@@ -1,34 +1,119 @@
#ifndef _LINUX_GENERIC_RADIX_TREE_H
#define _LINUX_GENERIC_RADIX_TREE_H
-/*
- * Generic radix trees/sparse arrays:
+/**
+ * DOC: Generic radix trees/sparse arrays
+ *
+ * Very simple and minimalistic, supporting arbitrary size entries up to
+ * GENRADIX_NODE_SIZE.
+ *
+ * A genradix is defined with the type it will store, like so:
+ *
+ * static GENRADIX(struct foo) foo_genradix;
+ *
+ * The main operations are:
+ *
+ * - genradix_init(radix) - initialize an empty genradix
*
- * A generic radix tree has all nodes of size PAGE_SIZE - both leaves and
- * interior nodes.
+ * - genradix_free(radix) - free all memory owned by the genradix and
+ * reinitialize it
+ *
+ * - genradix_ptr(radix, idx) - gets a pointer to the entry at idx, returning
+ * NULL if that entry does not exist
+ *
+ * - genradix_ptr_alloc(radix, idx, gfp) - gets a pointer to an entry,
+ * allocating it if necessary
+ *
+ * - genradix_for_each(radix, iter, p) - iterate over each entry in a genradix
+ *
+ * The radix tree allocates one node of entries at a time, so entries may exist
+ * that were never explicitly allocated - they will be initialized to all
+ * zeroes.
+ *
+ * Internally, a genradix is just a radix tree of pages, and indexing works in
+ * terms of byte offsets. The wrappers in this header file use sizeof on the
+ * type the radix contains to calculate a byte offset from the index - see
+ * __idx_to_offset.
*/
+#include <asm/page.h>
#include <linux/bug.h>
-#include <linux/kernel.h>
+#include <linux/limits.h>
#include <linux/log2.h>
+#include <linux/math.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+struct genradix_root;
-struct genradix_node;
+#define GENRADIX_NODE_SHIFT 9
+#define GENRADIX_NODE_SIZE (1U << GENRADIX_NODE_SHIFT)
+
+#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *))
+#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY)
+
+/* depth that's needed for a genradix that can address up to ULONG_MAX: */
+#define GENRADIX_MAX_DEPTH \
+ DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT)
+
+#define GENRADIX_DEPTH_MASK \
+ ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
+
+static inline int genradix_depth_shift(unsigned depth)
+{
+ return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth;
+}
+
+/*
+ * Returns size (of data, in bytes) that a tree of a given depth holds:
+ */
+static inline size_t genradix_depth_size(unsigned depth)
+{
+ return 1UL << genradix_depth_shift(depth);
+}
+
+static inline unsigned genradix_root_to_depth(struct genradix_root *r)
+{
+ return (unsigned long) r & GENRADIX_DEPTH_MASK;
+}
+
+static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r)
+{
+ return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
+}
struct __genradix {
- struct genradix_node *root;
- size_t depth;
+ struct genradix_root *root;
+};
+
+struct genradix_node {
+ union {
+ /* Interior node: */
+ struct genradix_node *children[GENRADIX_ARY];
+
+ /* Leaf: */
+ u8 data[GENRADIX_NODE_SIZE];
+ };
};
+static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask)
+{
+ return kzalloc(GENRADIX_NODE_SIZE, gfp_mask);
+}
+
+static inline void genradix_free_node(struct genradix_node *node)
+{
+ kfree(node);
+}
+
/*
- * NOTE: currently, sizeof(_type) must be a power of two and not larger than
- * PAGE_SIZE:
+ * NOTE: currently, sizeof(_type) must not be larger than GENRADIX_NODE_SIZE:
*/
#define __GENRADIX_INITIALIZER \
{ \
.tree = { \
.root = NULL, \
- .depth = 0, \
} \
}
@@ -49,6 +134,12 @@ struct { \
#define DEFINE_GENRADIX(_name, _type) \
GENRADIX(_type) _name = __GENRADIX_INITIALIZER
+/**
+ * genradix_init - initialize a genradix
+ * @_radix: genradix to initialize
+ *
+ * Does not fail
+ */
#define genradix_init(_radix) \
do { \
*(_radix) = (typeof(*_radix)) __GENRADIX_INITIALIZER; \
@@ -56,16 +147,25 @@ do { \
void __genradix_free(struct __genradix *);
+/**
+ * genradix_free - free all memory owned by a genradix
+ * @_radix: the genradix to free
+ *
+ * After freeing, @_radix will be reinitialized and empty
+ */
#define genradix_free(_radix) __genradix_free(&(_radix)->tree)
static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
{
- BUILD_BUG_ON(obj_size > PAGE_SIZE);
+ if (__builtin_constant_p(obj_size))
+ BUILD_BUG_ON(obj_size > GENRADIX_NODE_SIZE);
+ else
+ BUG_ON(obj_size > GENRADIX_NODE_SIZE);
if (!is_power_of_2(obj_size)) {
- size_t objs_per_page = PAGE_SIZE / obj_size;
+ size_t objs_per_page = GENRADIX_NODE_SIZE / obj_size;
- return (idx / objs_per_page) * PAGE_SIZE +
+ return (idx / objs_per_page) * GENRADIX_NODE_SIZE +
(idx % objs_per_page) * obj_size;
} else {
return idx * obj_size;
@@ -74,31 +174,102 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
#define __genradix_cast(_radix) (typeof((_radix)->type[0]) *)
#define __genradix_obj_size(_radix) sizeof((_radix)->type[0])
+#define __genradix_objs_per_page(_radix) \
+ (GENRADIX_NODE_SIZE / sizeof((_radix)->type[0]))
+#define __genradix_page_remainder(_radix) \
+ (GENRADIX_NODE_SIZE % sizeof((_radix)->type[0]))
+
#define __genradix_idx_to_offset(_radix, _idx) \
__idx_to_offset(_idx, __genradix_obj_size(_radix))
+static inline void *__genradix_ptr_inlined(struct __genradix *radix, size_t offset)
+{
+ struct genradix_root *r = READ_ONCE(radix->root);
+ struct genradix_node *n = genradix_root_to_node(r);
+ unsigned level = genradix_root_to_depth(r);
+ unsigned shift = genradix_depth_shift(level);
+
+ if (unlikely(ilog2(offset) >= genradix_depth_shift(level)))
+ return NULL;
+
+ while (n && shift > GENRADIX_NODE_SHIFT) {
+ shift -= GENRADIX_ARY_SHIFT;
+ n = n->children[offset >> shift];
+ offset &= (1UL << shift) - 1;
+ }
+
+ return n ? &n->data[offset] : NULL;
+}
+
+#define genradix_ptr_inlined(_radix, _idx) \
+ (__genradix_cast(_radix) \
+ __genradix_ptr_inlined(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx)))
+
void *__genradix_ptr(struct __genradix *, size_t);
-/* Returns a pointer to element at @_idx */
+/**
+ * genradix_ptr - get a pointer to a genradix entry
+ * @_radix: genradix to access
+ * @_idx: index to fetch
+ *
+ * Returns a pointer to entry at @_idx, or NULL if that entry does not exist.
+ */
#define genradix_ptr(_radix, _idx) \
(__genradix_cast(_radix) \
__genradix_ptr(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx)))
-void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
+void *__genradix_ptr_alloc(struct __genradix *, size_t,
+ struct genradix_node **, gfp_t);
+
+#define genradix_ptr_alloc_inlined(_radix, _idx, _gfp) \
+ (__genradix_cast(_radix) \
+ (__genradix_ptr_inlined(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx)) ?: \
+ __genradix_ptr_alloc(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx), \
+ NULL, _gfp)))
-/* Returns a pointer to element at @_idx, allocating it if necessary */
+#define genradix_ptr_alloc_preallocated_inlined(_radix, _idx, _new_node, _gfp)\
+ (__genradix_cast(_radix) \
+ (__genradix_ptr_inlined(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx)) ?: \
+ __genradix_ptr_alloc(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx), \
+ _new_node, _gfp)))
+
+/**
+ * genradix_ptr_alloc - get a pointer to a genradix entry, allocating it
+ * if necessary
+ * @_radix: genradix to access
+ * @_idx: index to fetch
+ * @_gfp: gfp mask
+ *
+ * Returns a pointer to entry at @_idx, or NULL on allocation failure
+ */
#define genradix_ptr_alloc(_radix, _idx, _gfp) \
(__genradix_cast(_radix) \
__genradix_ptr_alloc(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx), \
- _gfp))
+ NULL, _gfp))
+
+#define genradix_ptr_alloc_preallocated(_radix, _idx, _new_node, _gfp)\
+ (__genradix_cast(_radix) \
+ __genradix_ptr_alloc(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx), \
+ _new_node, _gfp))
struct genradix_iter {
size_t offset;
size_t pos;
};
+/**
+ * genradix_iter_init - initialize a genradix_iter
+ * @_radix: genradix that will be iterated over
+ * @_idx: index to start iterating from
+ */
#define genradix_iter_init(_radix, _idx) \
((struct genradix_iter) { \
.pos = (_idx), \
@@ -107,19 +278,51 @@ struct genradix_iter {
void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t);
+/**
+ * genradix_iter_peek - get first entry at or above iterator's current
+ * position
+ * @_iter: a genradix_iter
+ * @_radix: genradix being iterated over
+ *
+ * If no more entries exist at or above @_iter's current position, returns NULL
+ */
#define genradix_iter_peek(_iter, _radix) \
(__genradix_cast(_radix) \
__genradix_iter_peek(_iter, &(_radix)->tree, \
- PAGE_SIZE / __genradix_obj_size(_radix)))
+ __genradix_objs_per_page(_radix)))
+
+void *__genradix_iter_peek_prev(struct genradix_iter *, struct __genradix *,
+ size_t, size_t);
+
+/**
+ * genradix_iter_peek_prev - get first entry at or below iterator's current
+ * position
+ * @_iter: a genradix_iter
+ * @_radix: genradix being iterated over
+ *
+ * If no more entries exist at or below @_iter's current position, returns NULL
+ */
+#define genradix_iter_peek_prev(_iter, _radix) \
+ (__genradix_cast(_radix) \
+ __genradix_iter_peek_prev(_iter, &(_radix)->tree, \
+ __genradix_objs_per_page(_radix), \
+ __genradix_obj_size(_radix) + \
+ __genradix_page_remainder(_radix)))
static inline void __genradix_iter_advance(struct genradix_iter *iter,
size_t obj_size)
{
+ if (iter->offset + obj_size < iter->offset) {
+ iter->offset = SIZE_MAX;
+ iter->pos = SIZE_MAX;
+ return;
+ }
+
iter->offset += obj_size;
if (!is_power_of_2(obj_size) &&
- (iter->offset & (PAGE_SIZE - 1)) + obj_size > PAGE_SIZE)
- iter->offset = round_up(iter->offset, PAGE_SIZE);
+ (iter->offset & (GENRADIX_NODE_SIZE - 1)) + obj_size > GENRADIX_NODE_SIZE)
+ iter->offset = round_up(iter->offset, GENRADIX_NODE_SIZE);
iter->pos++;
}
@@ -127,4 +330,73 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter,
#define genradix_iter_advance(_iter, _radix) \
__genradix_iter_advance(_iter, __genradix_obj_size(_radix))
+static inline void __genradix_iter_rewind(struct genradix_iter *iter,
+ size_t obj_size)
+{
+ if (iter->offset == 0 ||
+ iter->offset == SIZE_MAX) {
+ iter->offset = SIZE_MAX;
+ return;
+ }
+
+ if ((iter->offset & (GENRADIX_NODE_SIZE - 1)) == 0)
+ iter->offset -= GENRADIX_NODE_SIZE % obj_size;
+
+ iter->offset -= obj_size;
+ iter->pos--;
+}
+
+#define genradix_iter_rewind(_iter, _radix) \
+ __genradix_iter_rewind(_iter, __genradix_obj_size(_radix))
+
+#define genradix_for_each_from(_radix, _iter, _p, _start) \
+ for (_iter = genradix_iter_init(_radix, _start); \
+ (_p = genradix_iter_peek(&_iter, _radix)) != NULL; \
+ genradix_iter_advance(&_iter, _radix))
+
+/**
+ * genradix_for_each - iterate over each entry in a genradix
+ * @_radix: genradix to iterate over
+ * @_iter: a genradix_iter to track current position
+ * @_p: pointer to genradix entry type
+ *
+ * On every iteration, @_p will point to the current entry, and @_iter.pos
+ * will be the current entry's index.
+ */
+#define genradix_for_each(_radix, _iter, _p) \
+ genradix_for_each_from(_radix, _iter, _p, 0)
+
+#define genradix_last_pos(_radix) \
+ (SIZE_MAX / GENRADIX_NODE_SIZE * __genradix_objs_per_page(_radix) - 1)
+
+/**
+ * genradix_for_each_reverse - iterate over each entry in a genradix, in reverse order
+ * @_radix: genradix to iterate over
+ * @_iter: a genradix_iter to track current position
+ * @_p: pointer to genradix entry type
+ *
+ * On every iteration, @_p will point to the current entry, and @_iter.pos
+ * will be the current entry's index.
+ */
+#define genradix_for_each_reverse(_radix, _iter, _p) \
+ for (_iter = genradix_iter_init(_radix, genradix_last_pos(_radix));\
+ (_p = genradix_iter_peek_prev(&_iter, _radix)) != NULL;\
+ genradix_iter_rewind(&_iter, _radix))
+
+int __genradix_prealloc(struct __genradix *, size_t, gfp_t);
+
+/**
+ * genradix_prealloc - preallocate entries in a generic radix tree
+ * @_radix: genradix to preallocate
+ * @_nr: number of entries to preallocate
+ * @_gfp: gfp mask
+ *
+ * Returns 0 on success, -ENOMEM on failure
+ */
+#define genradix_prealloc(_radix, _nr, _gfp) \
+ __genradix_prealloc(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _nr + 1),\
+ _gfp)
+
+
#endif /* _LINUX_GENERIC_RADIX_TREE_H */
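Putting the DOC comment together, a genradix behaves like a sparse array that grows on demand and is indexed like an ordinary array. A short sketch (entry type, indices and the GFP flag usage are illustrative):

    #include <errno.h>
    #include <linux/generic-radix-tree.h>

    struct dev_stats {
            u64     sectors;
            u64     ios;
    };

    static GENRADIX(struct dev_stats) stats;

    static int record_io(unsigned dev, u64 sectors)
    {
            struct dev_stats *s = genradix_ptr_alloc(&stats, dev, GFP_KERNEL);

            if (!s)
                    return -ENOMEM;
            s->sectors += sectors;
            s->ios++;
            return 0;
    }

    static u64 total_sectors(void)
    {
            struct genradix_iter iter;
            struct dev_stats *s;
            u64 ret = 0;

            genradix_for_each(&stats, iter, s)
                    ret += s->sectors;
            return ret;
    }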
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 9b8dd43d..d16ea76f 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -1,6 +1,7 @@
#ifndef _LINUX_JIFFIES_H
#define _LINUX_JIFFIES_H
+#include <time.h>
#include <linux/kernel.h>
#include <linux/time64.h>
#include <linux/typecheck.h>
@@ -42,6 +43,8 @@
(time_after_eq64(a, b) && \
time_before_eq64(a, c))
+#define time_is_before_jiffies(a) time_after(jiffies, a)
+
#define HZ 1000
static inline u64 jiffies_to_nsecs(const unsigned long j)
@@ -68,7 +71,7 @@ static inline u64 sched_clock(void)
{
struct timespec ts;
- clock_gettime(CLOCK_MONOTONIC, &ts);
+ clock_gettime(CLOCK_MONOTONIC_COARSE, &ts);
return ((s64) ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec;
}
@@ -78,6 +81,11 @@ static inline u64 local_clock(void)
return sched_clock();
}
+static inline u64 ktime_get_ns(void)
+{
+ return sched_clock();
+}
+
#define jiffies nsecs_to_jiffies(sched_clock())
#endif
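In this shim, jiffies is recomputed from CLOCK_MONOTONIC_COARSE on every read (with HZ fixed at 1000), so kernel-style deadline arithmetic keeps working. Sketch (helpers are illustrative):

    #include <stdbool.h>
    #include <linux/jiffies.h>

    static unsigned long deadline_in_msecs(unsigned msecs)
    {
            return jiffies + msecs * HZ / 1000;
    }

    /* Poll-style check: true once the deadline has passed. */
    static bool deadline_expired(unsigned long deadline)
    {
            return time_is_before_jiffies(deadline);
    }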
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index a4c8149e..8f1cbc2d 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -11,8 +11,54 @@
#include <linux/bug.h>
#include <linux/byteorder.h>
#include <linux/compiler.h>
+#include <linux/math.h>
+#include <linux/minmax.h>
-#define IS_ENABLED(opt) 0
+#define BIT(nr) (1UL << (nr))
+#define BIT_ULL(nr) (1ULL << (nr))
+
+#define __ARG_PLACEHOLDER_1 0,
+#define __take_second_arg(__ignored, val, ...) val
+
+#define __and(x, y) ___and(x, y)
+#define ___and(x, y) ____and(__ARG_PLACEHOLDER_##x, y)
+#define ____and(arg1_or_junk, y) __take_second_arg(arg1_or_junk y, 0)
+
+#define __or(x, y) ___or(x, y)
+#define ___or(x, y) ____or(__ARG_PLACEHOLDER_##x, y)
+#define ____or(arg1_or_junk, y) __take_second_arg(arg1_or_junk 1, y)
+
+#define __is_defined(x) ___is_defined(x)
+#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val)
+#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)
+
+/*
+ * IS_BUILTIN(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y', 0
+ * otherwise. For boolean options, this is equivalent to
+ * IS_ENABLED(CONFIG_FOO).
+ */
+#define IS_BUILTIN(option) __is_defined(option)
+
+/*
+ * IS_MODULE(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'm', 0
+ * otherwise.
+ */
+#define IS_MODULE(option) __is_defined(option##_MODULE)
+
+/*
+ * IS_REACHABLE(CONFIG_FOO) evaluates to 1 if the currently compiled
+ * code can call a function defined in code compiled based on CONFIG_FOO.
+ * This is similar to IS_ENABLED(), but returns false when invoked from
+ * built-in code when CONFIG_FOO is set to 'm'.
+ */
+#define IS_REACHABLE(option) __or(IS_BUILTIN(option), \
+ __and(IS_MODULE(option), __is_defined(MODULE)))
+
+/*
+ * IS_ENABLED(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y' or 'm',
+ * 0 otherwise.
+ */
+#define IS_ENABLED(option) __or(IS_BUILTIN(option), IS_MODULE(option))
#define EXPORT_SYMBOL(sym)
#define U8_MAX ((u8)~0U)
@@ -37,16 +83,6 @@
#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
-#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
-
-#define mult_frac(x, numer, denom)( \
-{ \
- typeof(x) quot = (x) / (denom); \
- typeof(x) rem = (x) % (denom); \
- (quot * (numer)) + ((rem * (numer)) / (denom)); \
-} \
-)
-
#ifndef offsetof
#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
#endif
@@ -64,52 +100,28 @@
(type *)((char *)__mptr - offsetof(type, member)); })
#endif
-#define __round_mask(x, y) ((__typeof__(x))((y)-1))
-#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
-#define round_down(x, y) ((x) & ~__round_mask(x, y))
-
-#define roundup(x, y) \
-({ \
- const typeof(y) __y = y; \
- (((x) + (__y - 1)) / __y) * __y; \
-})
-
-#define max(x, y) ({ \
- typeof(x) _max1 = (x); \
- typeof(y) _max2 = (y); \
- (void) (&_max1 == &_max2); \
- _max1 > _max2 ? _max1 : _max2; })
-
-#define min(x, y) ({ \
- typeof(x) _min1 = (x); \
- typeof(y) _min2 = (y); \
- (void) (&_min1 == &_min2); \
- _min1 < _min2 ? _min1 : _min2; })
-
-#define min_t(type, x, y) ({ \
- type __min1 = (x); \
- type __min2 = (y); \
- __min1 < __min2 ? __min1: __min2; })
-
-#define max_t(type, x, y) ({ \
- type __max1 = (x); \
- type __max2 = (y); \
- __max1 > __max2 ? __max1: __max2; })
+#ifndef __struct_group
+#define __struct_group(TAG, NAME, ATTRS, MEMBERS...) \
+ union { \
+ struct { MEMBERS } ATTRS; \
+ struct TAG { MEMBERS } ATTRS NAME; \
+ }
+#endif
-#define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi)
+#define struct_group(NAME, MEMBERS...) \
+ __struct_group(/* no tag */, NAME, /* no attrs */, MEMBERS)
-#define swap(a, b) \
- do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
+/* This counts to 12. Any more, it will return 13th argument. */
+#define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n
+#define COUNT_ARGS(X...) __COUNT_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
#define _RET_IP_ (unsigned long)__builtin_return_address(0)
#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; })
#define might_sleep()
-#define NR_CPUS 32
-
-#define cpu_relax() do {} while (0)
-#define cpu_relax_lowlatency() do {} while (0)
+#define cpu_relax() barrier()
+#define cpu_relax_lowlatency() barrier()
#define panic(fmt, ...) \
do { \
@@ -117,11 +129,6 @@ do { \
BUG(); \
} while (0)
-unsigned long simple_strtoul(const char *,char **,unsigned int);
-long simple_strtol(const char *,char **,unsigned int);
-unsigned long long simple_strtoull(const char *,char **,unsigned int);
-long long simple_strtoll(const char *,char **,unsigned int);
-
int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
int __must_check _kstrtol(const char *s, unsigned int base, long *res);
@@ -209,18 +216,18 @@ static inline int __must_check kstrtos32(const char *s, unsigned int base, s32 *
return kstrtoint(s, base, res);
}
-/* Permissions on a sysfs file: you didn't miss the 0 prefix did you? */
-#define VERIFY_OCTAL_PERMISSIONS(perms) \
- (BUILD_BUG_ON_ZERO((perms) < 0) + \
- BUILD_BUG_ON_ZERO((perms) > 0777) + \
- /* USER_READABLE >= GROUP_READABLE >= OTHER_READABLE */ \
- BUILD_BUG_ON_ZERO((((perms) >> 6) & 4) < (((perms) >> 3) & 4)) + \
- BUILD_BUG_ON_ZERO((((perms) >> 3) & 4) < ((perms) & 4)) + \
- /* USER_WRITABLE >= GROUP_WRITABLE */ \
- BUILD_BUG_ON_ZERO((((perms) >> 6) & 2) < (((perms) >> 3) & 2)) + \
- /* OTHER_WRITABLE? Generally considered a bad idea. */ \
- BUILD_BUG_ON_ZERO((perms) & 2) + \
- (perms))
+struct printbuf;
+extern void prt_u64(struct printbuf *out, u64 num);
+
+extern __printf(2, 0) void prt_vprintf(struct printbuf *out, const char *fmt, va_list args);
+extern __printf(2, 3) void prt_printf(struct printbuf *out, const char *fmt, ...);
+
+static const char hex_asc[] = "0123456789abcdef";
+#define hex_asc_lo(x) hex_asc[((x) & 0x0f)]
+#define hex_asc_hi(x) hex_asc[((x) & 0xf0) >> 4]
+static const char hex_asc_upper[] = "0123456789ABCDEF";
+#define hex_asc_upper_lo(x) hex_asc_upper[((x) & 0x0f)]
+#define hex_asc_upper_hi(x) hex_asc_upper[((x) & 0xf0) >> 4]
/* The hash is always the low bits of hash_len */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
@@ -241,4 +248,17 @@ struct qstr {
#define QSTR_INIT(n,l) { { { .len = l } }, .name = n }
+#define POISON_FREE 0x6b
+
+static inline void dump_stack(void) {}
+
+#define unsafe_memcpy(dst, src, bytes, justification) \
+ memcpy(dst, src, bytes)
+
+#ifdef __DECLARE_FLEX_ARRAY
+#define DECLARE_FLEX_ARRAY(TYPE, NAME) __DECLARE_FLEX_ARRAY(TYPE, NAME)
+#else
+#define DECLARE_FLEX_ARRAY(T, member) T member[0]
+#endif
+
#endif
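The IS_ENABLED()/IS_BUILTIN()/IS_MODULE() machinery added above turns a config symbol defined to 1 into the integer constant 1 and an undefined one into 0, entirely in the preprocessor. A hedged sketch; the CONFIG_* definitions below are set by hand purely to show the expansion:

#include <stdio.h>
#include <linux/kernel.h>

/* Hypothetical config symbols, defined here only for illustration: */
#define CONFIG_BCACHEFS_DEBUG 1
/* CONFIG_BCACHEFS_NO_LATENCY_ACCT is deliberately left undefined. */

int main(void)
{
        /* Both expand to integer constant expressions usable in #if or if (): */
        if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
                printf("debug code compiled in\n");

        if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT))
                printf("latency accounting enabled\n");

        return 0;
}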
diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h
new file mode 100644
index 00000000..6a3cd1bf
--- /dev/null
+++ b/include/linux/kmemleak.h
@@ -0,0 +1,121 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * include/linux/kmemleak.h
+ *
+ * Copyright (C) 2008 ARM Limited
+ * Written by Catalin Marinas <catalin.marinas@arm.com>
+ */
+
+#ifndef __KMEMLEAK_H
+#define __KMEMLEAK_H
+
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#ifdef CONFIG_DEBUG_KMEMLEAK
+
+extern void kmemleak_init(void) __init;
+extern void kmemleak_alloc(const void *ptr, size_t size, int min_count,
+ gfp_t gfp) __ref;
+extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
+ gfp_t gfp) __ref;
+extern void kmemleak_vmalloc(const struct vm_struct *area, size_t size,
+ gfp_t gfp) __ref;
+extern void kmemleak_free(const void *ptr) __ref;
+extern void kmemleak_free_part(const void *ptr, size_t size) __ref;
+extern void kmemleak_free_percpu(const void __percpu *ptr) __ref;
+extern void kmemleak_update_trace(const void *ptr) __ref;
+extern void kmemleak_not_leak(const void *ptr) __ref;
+extern void kmemleak_ignore(const void *ptr) __ref;
+extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref;
+extern void kmemleak_no_scan(const void *ptr) __ref;
+extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size,
+ gfp_t gfp) __ref;
+extern void kmemleak_free_part_phys(phys_addr_t phys, size_t size) __ref;
+extern void kmemleak_ignore_phys(phys_addr_t phys) __ref;
+
+static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
+ int min_count, slab_flags_t flags,
+ gfp_t gfp)
+{
+ if (!(flags & SLAB_NOLEAKTRACE))
+ kmemleak_alloc(ptr, size, min_count, gfp);
+}
+
+static inline void kmemleak_free_recursive(const void *ptr, slab_flags_t flags)
+{
+ if (!(flags & SLAB_NOLEAKTRACE))
+ kmemleak_free(ptr);
+}
+
+static inline void kmemleak_erase(void **ptr)
+{
+ *ptr = NULL;
+}
+
+#else
+
+static inline void kmemleak_init(void)
+{
+}
+static inline void kmemleak_alloc(const void *ptr, size_t size, int min_count,
+ gfp_t gfp)
+{
+}
+static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
+ int min_count, slab_flags_t flags,
+ gfp_t gfp)
+{
+}
+static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
+ gfp_t gfp)
+{
+}
+static inline void kmemleak_vmalloc(const struct vm_struct *area, size_t size,
+ gfp_t gfp)
+{
+}
+static inline void kmemleak_free(const void *ptr)
+{
+}
+static inline void kmemleak_free_part(const void *ptr, size_t size)
+{
+}
+static inline void kmemleak_free_recursive(const void *ptr, slab_flags_t flags)
+{
+}
+static inline void kmemleak_free_percpu(const void __percpu *ptr)
+{
+}
+static inline void kmemleak_update_trace(const void *ptr)
+{
+}
+static inline void kmemleak_not_leak(const void *ptr)
+{
+}
+static inline void kmemleak_ignore(const void *ptr)
+{
+}
+static inline void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
+{
+}
+static inline void kmemleak_erase(void **ptr)
+{
+}
+static inline void kmemleak_no_scan(const void *ptr)
+{
+}
+static inline void kmemleak_alloc_phys(phys_addr_t phys, size_t size,
+ gfp_t gfp)
+{
+}
+static inline void kmemleak_free_part_phys(phys_addr_t phys, size_t size)
+{
+}
+static inline void kmemleak_ignore_phys(phys_addr_t phys)
+{
+}
+
+#endif /* CONFIG_DEBUG_KMEMLEAK */
+
+#endif /* __KMEMLEAK_H */
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index 2ec53f8a..c33b2126 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -20,10 +20,8 @@
#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
-#include <linux/kref.h>
#include <linux/sysfs.h>
#include <linux/types.h>
-#include <linux/wait.h>
#include <linux/workqueue.h>
struct kset;
@@ -31,7 +29,7 @@ struct kset;
struct kobj_type {
void (*release)(struct kobject *kobj);
const struct sysfs_ops *sysfs_ops;
- struct attribute **default_attrs;
+ const struct attribute_group **default_groups;
const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj);
const void *(*namespace)(struct kobject *kobj);
};
@@ -50,9 +48,9 @@ struct kobj_attribute {
struct kobject {
struct kobject *parent;
struct kset *kset;
- struct kobj_type *ktype;
+ const struct kobj_type *ktype;
struct kernfs_node *sd; /* sysfs directory entry */
- struct kref kref;
+ atomic_t ref;
unsigned int state_initialized:1;
unsigned int state_in_sysfs:1;
unsigned int state_add_uevent_sent:1;
@@ -64,18 +62,13 @@ struct kset {
struct kobject kobj;
};
-static inline struct kobj_type *get_ktype(struct kobject *kobj)
-{
- return kobj->ktype;
-}
-
#define kobject_add(...) 0
-static inline void kobject_init(struct kobject *kobj, struct kobj_type *ktype)
+static inline void kobject_init(struct kobject *kobj, const struct kobj_type *ktype)
{
memset(kobj, 0, sizeof(*kobj));
- kref_init(&kobj->kref);
+ atomic_set(&kobj->ref, 1);
kobj->ktype = ktype;
kobj->state_initialized = 1;
}
@@ -84,7 +77,7 @@ static inline void kobject_del(struct kobject *kobj);
static inline void kobject_cleanup(struct kobject *kobj)
{
- struct kobj_type *t = get_ktype(kobj);
+ const struct kobj_type *t = kobj->ktype;
/* remove from sysfs if the caller did not do it */
if (kobj->state_in_sysfs)
@@ -94,29 +87,20 @@ static inline void kobject_cleanup(struct kobject *kobj)
t->release(kobj);
}
-static inline void kobject_release(struct kref *kref)
-{
- struct kobject *kobj = container_of(kref, struct kobject, kref);
-
- kobject_cleanup(kobj);
-}
-
static inline void kobject_put(struct kobject *kobj)
{
BUG_ON(!kobj);
BUG_ON(!kobj->state_initialized);
- kref_put(&kobj->kref, kobject_release);
+ if (atomic_dec_and_test(&kobj->ref))
+ kobject_cleanup(kobj);
}
static inline void kobject_del(struct kobject *kobj)
{
- struct kernfs_node *sd;
-
if (!kobj)
return;
- sd = kobj->sd;
kobj->state_in_sysfs = 0;
#if 0
kobj_kset_leave(kobj);
@@ -130,11 +114,14 @@ static inline struct kobject *kobject_get(struct kobject *kobj)
BUG_ON(!kobj);
BUG_ON(!kobj->state_initialized);
- kref_get(&kobj->kref);
+ atomic_inc(&kobj->ref);
return kobj;
}
-static inline void kset_unregister(struct kset *kset) {}
+static inline void kset_unregister(struct kset *kset)
+{
+ kfree(kset);
+}
#define kset_create_and_add(_name, _u, _parent) \
((struct kset *) kzalloc(sizeof(struct kset), GFP_KERNEL))
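With kref gone, a kobject's lifetime is now driven by the bare atomic_t ref: kobject_init() sets it to 1, kobject_get()/kobject_put() adjust it, and the ktype's release() runs when it reaches zero. A minimal sketch under that assumption (sysfs is stubbed out in this port, so only the in-memory refcounting is exercised); struct my_obj is illustrative:

#include <stdio.h>
#include <linux/kobject.h>
#include <linux/slab.h>

struct my_obj {
        struct kobject  kobj;
        int             value;
};

static void my_obj_release(struct kobject *kobj)
{
        struct my_obj *obj = container_of(kobj, struct my_obj, kobj);

        printf("releasing object with value %d\n", obj->value);
        kfree(obj);
}

static const struct kobj_type my_obj_ktype = {
        .release = my_obj_release,
};

int main(void)
{
        struct my_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

        obj->value = 42;
        kobject_init(&obj->kobj, &my_obj_ktype);        /* ref == 1 */

        kobject_get(&obj->kobj);                        /* ref == 2 */
        kobject_put(&obj->kobj);                        /* ref == 1 */
        kobject_put(&obj->kobj);                        /* ref == 0: my_obj_release() runs */
        return 0;
}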
diff --git a/include/linux/kref.h b/include/linux/kref.h
deleted file mode 100644
index e15828fd..00000000
--- a/include/linux/kref.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * kref.h - library routines for handling generic reference counted objects
- *
- * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com>
- * Copyright (C) 2004 IBM Corp.
- *
- * based on kobject.h which was:
- * Copyright (C) 2002-2003 Patrick Mochel <mochel@osdl.org>
- * Copyright (C) 2002-2003 Open Source Development Labs
- *
- * This file is released under the GPLv2.
- *
- */
-
-#ifndef _KREF_H_
-#define _KREF_H_
-
-#include <linux/bug.h>
-#include <linux/atomic.h>
-#include <linux/kernel.h>
-#include <linux/mutex.h>
-
-struct kref {
- atomic_t refcount;
-};
-
-/**
- * kref_init - initialize object.
- * @kref: object in question.
- */
-static inline void kref_init(struct kref *kref)
-{
- atomic_set(&kref->refcount, 1);
-}
-
-/**
- * kref_get - increment refcount for object.
- * @kref: object.
- */
-static inline void kref_get(struct kref *kref)
-{
- /* If refcount was 0 before incrementing then we have a race
- * condition when this kref is freeing by some other thread right now.
- * In this case one should use kref_get_unless_zero()
- */
- WARN_ON_ONCE(atomic_inc_return(&kref->refcount) < 2);
-}
-
-/**
- * kref_sub - subtract a number of refcounts for object.
- * @kref: object.
- * @count: Number of recounts to subtract.
- * @release: pointer to the function that will clean up the object when the
- * last reference to the object is released.
- * This pointer is required, and it is not acceptable to pass kfree
- * in as this function. If the caller does pass kfree to this
- * function, you will be publicly mocked mercilessly by the kref
- * maintainer, and anyone else who happens to notice it. You have
- * been warned.
- *
- * Subtract @count from the refcount, and if 0, call release().
- * Return 1 if the object was removed, otherwise return 0. Beware, if this
- * function returns 0, you still can not count on the kref from remaining in
- * memory. Only use the return value if you want to see if the kref is now
- * gone, not present.
- */
-static inline int kref_sub(struct kref *kref, unsigned int count,
- void (*release)(struct kref *kref))
-{
- WARN_ON(release == NULL);
-
- if (atomic_sub_and_test((int) count, &kref->refcount)) {
- release(kref);
- return 1;
- }
- return 0;
-}
-
-/**
- * kref_put - decrement refcount for object.
- * @kref: object.
- * @release: pointer to the function that will clean up the object when the
- * last reference to the object is released.
- * This pointer is required, and it is not acceptable to pass kfree
- * in as this function. If the caller does pass kfree to this
- * function, you will be publicly mocked mercilessly by the kref
- * maintainer, and anyone else who happens to notice it. You have
- * been warned.
- *
- * Decrement the refcount, and if 0, call release().
- * Return 1 if the object was removed, otherwise return 0. Beware, if this
- * function returns 0, you still can not count on the kref from remaining in
- * memory. Only use the return value if you want to see if the kref is now
- * gone, not present.
- */
-static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref))
-{
- return kref_sub(kref, 1, release);
-}
-
-static inline int kref_put_mutex(struct kref *kref,
- void (*release)(struct kref *kref),
- struct mutex *lock)
-{
- WARN_ON(release == NULL);
- if (unlikely(!atomic_add_unless(&kref->refcount, -1, 1))) {
- mutex_lock(lock);
- if (unlikely(!atomic_dec_and_test(&kref->refcount))) {
- mutex_unlock(lock);
- return 0;
- }
- release(kref);
- return 1;
- }
- return 0;
-}
-
-/**
- * kref_get_unless_zero - Increment refcount for object unless it is zero.
- * @kref: object.
- *
- * Return non-zero if the increment succeeded. Otherwise return 0.
- *
- * This function is intended to simplify locking around refcounting for
- * objects that can be looked up from a lookup structure, and which are
- * removed from that lookup structure in the object destructor.
- * Operations on such objects require at least a read lock around
- * lookup + kref_get, and a write lock around kref_put + remove from lookup
- * structure. Furthermore, RCU implementations become extremely tricky.
- * With a lookup followed by a kref_get_unless_zero *with return value check*
- * locking in the kref_put path can be deferred to the actual removal from
- * the lookup structure and RCU lookups become trivial.
- */
-static inline int __must_check kref_get_unless_zero(struct kref *kref)
-{
- return atomic_add_unless(&kref->refcount, 1, 0);
-}
-#endif /* _KREF_H_ */
diff --git a/include/linux/lglock.h b/include/linux/lglock.h
deleted file mode 100644
index a9108bcd..00000000
--- a/include/linux/lglock.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef __TOOLS_LINUX_LGLOCK_H
-#define __TOOLS_LINUX_LGLOCK_H
-
-#include <pthread.h>
-
-struct lglock {
- pthread_mutex_t lock;
-};
-
-#define lg_lock_free(l) do {} while (0)
-#define lg_lock_init(l) pthread_mutex_init(&(l)->lock, NULL)
-
-#define lg_local_lock(l) pthread_mutex_lock(&(l)->lock)
-#define lg_local_unlock(l) pthread_mutex_unlock(&(l)->lock)
-#define lg_global_lock(l) pthread_mutex_lock(&(l)->lock)
-#define lg_global_unlock(l) pthread_mutex_unlock(&(l)->lock)
-
-#endif /* __TOOLS_LINUX_LGLOCK_H */
diff --git a/include/linux/list.h b/include/linux/list.h
index 4a317090..d176d0d3 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -10,6 +10,7 @@
#define list_add(n, h) cds_list_add(n, h)
#define list_add_tail(n, h) cds_list_add_tail(n, h)
#define __list_del_entry(l) cds_list_del(l)
+#define __list_del(p, n) __cds_list_del(p, n)
#define list_del(l) cds_list_del(l)
#define list_del_init(l) cds_list_del_init(l)
#define list_replace(o, n) cds_list_replace(o, n)
@@ -26,7 +27,6 @@
#define list_for_each_entry(p, h, m) cds_list_for_each_entry(p, h, m)
#define list_for_each_entry_reverse(p, h, m) cds_list_for_each_entry_reverse(p, h, m)
#define list_for_each_entry_safe(p, n, h, m) cds_list_for_each_entry_safe(p, n, h, m)
-#define list_for_each_entry_safe_reverse(p, n, h, m) cds_list_for_each_entry_safe_reverse(p, n, h, m)
static inline int list_empty_careful(const struct list_head *head)
{
@@ -54,6 +54,15 @@ static inline void list_splice_init(struct list_head *list,
#define list_first_entry_or_null(ptr, type, member) \
(!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL)
+#define list_prev_entry(pos, member) \
+ list_entry((pos)->member.prev, typeof(*(pos)), member)
+
+#define list_for_each_entry_safe_reverse(pos, n, head, member) \
+ for (pos = list_last_entry(head, typeof(*pos), member), \
+ n = list_prev_entry(pos, member); \
+ &pos->member != (head); \
+ pos = n, n = list_prev_entry(n, member))
+
/* hlists: */
#include <urcu/hlist.h>
@@ -61,4 +70,43 @@ static inline void list_splice_init(struct list_head *list,
#define hlist_head cds_hlist_head
#define hlist_node cds_hlist_node
+#define hlist_add_head(n, h) cds_hlist_add_head(n, h)
+#define hlist_del(n) cds_hlist_del(n)
+#define hlist_del_init(n) cds_hlist_del_init(n)
+
+static inline int hlist_unhashed(const struct hlist_node *h)
+{
+ return !h->prev;
+}
+
+static inline void hlist_del_init(struct hlist_node *n)
+{
+ hlist_del(n);
+ n->prev = NULL;
+ n->next = NULL;
+}
+
+#define hlist_entry(ptr, type, member) container_of(ptr,type,member)
+
+#define hlist_entry_safe(ptr, type, member) \
+ ({ typeof(ptr) ____ptr = (ptr); \
+ ____ptr ? hlist_entry(____ptr, type, member) : NULL; \
+ })
+
+#define hlist_for_each_entry(pos, head, member) \
+ for (pos = hlist_entry_safe((head)->next, typeof(*(pos)), member);\
+ pos; \
+ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
+
+static inline size_t list_count_nodes(struct list_head *head)
+{
+ struct list_head *pos;
+ size_t count = 0;
+
+ list_for_each(pos, head)
+ count++;
+
+ return count;
+}
+
#endif /* _LIST_LIST_H */
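A short sketch of the additions above: list_count_nodes() and the locally defined list_for_each_entry_safe_reverse(), which saves the previous entry so the current one may be deleted and freed during the walk. It assumes the LIST_HEAD()/list_add_tail() mappings earlier in this header; struct item is illustrative:

#include <stdio.h>
#include <linux/list.h>
#include <linux/slab.h>

struct item {
        int                     n;
        struct list_head        list;
};

int main(void)
{
        LIST_HEAD(items);
        struct item *it, *tmp;
        int i;

        for (i = 0; i < 4; i++) {
                it = kmalloc(sizeof(*it), GFP_KERNEL);
                it->n = i;
                list_add_tail(&it->list, &items);
        }

        printf("%zu items on the list\n", list_count_nodes(&items));

        /* Walk newest to oldest; each entry may be deleted and freed: */
        list_for_each_entry_safe_reverse(it, tmp, &items, list) {
                printf("freeing %d\n", it->n);
                list_del(&it->list);
                kfree(it);
        }
        return 0;
}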
diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
index b01fe100..fa6e8471 100644
--- a/include/linux/list_nulls.h
+++ b/include/linux/list_nulls.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LIST_NULLS_H
#define _LINUX_LIST_NULLS_H
@@ -29,6 +30,11 @@ struct hlist_nulls_node {
((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls))
#define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)
+
+#define hlist_nulls_entry_safe(ptr, type, member) \
+ ({ typeof(ptr) ____ptr = (ptr); \
+ !is_a_nulls(____ptr) ? hlist_nulls_entry(____ptr, type, member) : NULL; \
+ })
/**
* ptr_is_a_nulls - Test if a ptr is a nulls
* @ptr: ptr to be tested
@@ -50,11 +56,33 @@ static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
return ((unsigned long)ptr) >> 1;
}
+/**
+ * hlist_nulls_unhashed - Has node been removed and reinitialized?
+ * @h: Node to be checked
+ *
+ * Note that not all removal functions will leave a node in unhashed state.
+ * For example, hlist_del_init_rcu() leaves the node in unhashed state,
+ * but hlist_nulls_del() does not.
+ */
static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h)
{
return !h->pprev;
}
+/**
+ * hlist_nulls_unhashed_lockless - Has node been removed and reinitialized?
+ * @h: Node to be checked
+ *
+ * Note that not all removal functions will leave a node in unhashed state.
+ * For example, hlist_del_init_rcu() leaves the node in unhashed state,
+ * but hlist_nulls_del() does not. Unlike hlist_nulls_unhashed(), this
+ * function may be used locklessly.
+ */
+static inline int hlist_nulls_unhashed_lockless(const struct hlist_nulls_node *h)
+{
+ return !READ_ONCE(h->pprev);
+}
+
static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
{
return is_a_nulls(READ_ONCE(h->first));
@@ -66,10 +94,10 @@ static inline void hlist_nulls_add_head(struct hlist_nulls_node *n,
struct hlist_nulls_node *first = h->first;
n->next = first;
- n->pprev = &h->first;
+ WRITE_ONCE(n->pprev, &h->first);
h->first = n;
if (!is_a_nulls(first))
- first->pprev = &n->next;
+ WRITE_ONCE(first->pprev, &n->next);
}
static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
@@ -79,13 +107,13 @@ static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
WRITE_ONCE(*pprev, next);
if (!is_a_nulls(next))
- next->pprev = pprev;
+ WRITE_ONCE(next->pprev, pprev);
}
static inline void hlist_nulls_del(struct hlist_nulls_node *n)
{
__hlist_nulls_del(n);
- n->pprev = LIST_POISON2;
+ WRITE_ONCE(n->pprev, LIST_POISON2);
}
/**
diff --git a/include/linux/llist.h b/include/linux/llist.h
index 8abc2e02..2e9c7215 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -1,31 +1,36 @@
-#ifndef __TOOLS_LINUX_LLIST_H
-#define __TOOLS_LINUX_LLIST_H
-
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef LLIST_H
+#define LLIST_H
/*
* Lock-less NULL terminated single linked list
*
- * If there are multiple producers and multiple consumers, llist_add
- * can be used in producers and llist_del_all can be used in
- * consumers. They can work simultaneously without lock. But
- * llist_del_first can not be used here. Because llist_del_first
- * depends on list->first->next does not changed if list->first is not
- * changed during its operation, but llist_del_first, llist_add,
- * llist_add (or llist_del_all, llist_add, llist_add) sequence in
- * another consumer may violate that.
- *
- * If there are multiple producers and one consumer, llist_add can be
- * used in producers and llist_del_all or llist_del_first can be used
- * in the consumer.
- *
- * This can be summarized as follow:
+ * Cases where locking is not needed:
+ * If there are multiple producers and multiple consumers, llist_add can be
+ * used in producers and llist_del_all can be used in consumers simultaneously
+ * without locking. Also a single consumer can use llist_del_first while
+ * multiple producers simultaneously use llist_add, without any locking.
+ *
+ * Cases where locking is needed:
+ * If we have multiple consumers with llist_del_first used in one consumer, and
+ * llist_del_first or llist_del_all used in other consumers, then a lock is
+ * needed. This is because llist_del_first depends on list->first->next not
+ * changing, but without lock protection, there's no way to be sure about that
+ * if a preemption happens in the middle of the delete operation and on being
+ * preempted back, the list->first is the same as before causing the cmpxchg in
+ * llist_del_first to succeed. For example, while a llist_del_first operation
+ * is in progress in one consumer, then a llist_del_first, llist_add,
+ * llist_add (or llist_del_all, llist_add, llist_add) sequence in another
+ * consumer may cause violations.
+ *
+ * This can be summarized as follows:
*
* | add | del_first | del_all
* add | - | - | -
* del_first | | L | L
* del_all | | | -
*
- * Where "-" stands for no lock is needed, while "L" stands for lock
- * is needed.
+ * For a particular row's operation running concurrently with a column's
+ * operation, "-" means no lock is needed, while "L" means a lock is needed.
*
* The list entries deleted via llist_del_all can be traversed with
* traversing function such as llist_for_each etc. But the list
@@ -41,19 +46,6 @@
*
* Copyright 2010,2011 Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation;
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/atomic.h>
@@ -89,6 +81,23 @@ static inline void init_llist_head(struct llist_head *list)
container_of(ptr, type, member)
/**
+ * member_address_is_nonnull - check whether the member address is not NULL
+ * @ptr: the object pointer (struct type * that contains the llist_node)
+ * @member: the name of the llist_node within the struct.
+ *
+ * This macro is conceptually the same as
+ * &ptr->member != NULL
+ * but it works around the fact that compilers can decide that taking a member
+ * address is never a NULL pointer.
+ *
+ * Real objects that start at a high address and have a member at NULL are
+ * unlikely to exist, but such pointers may be returned e.g. by the
+ * container_of() macro.
+ */
+#define member_address_is_nonnull(ptr, member) \
+ ((uintptr_t)(ptr) + offsetof(typeof(*(ptr)), member) != 0)
+
+/**
* llist_for_each - iterate over some deleted entries of a lock-less list
* @pos: the &struct llist_node to use as a loop cursor
* @node: the first entry of deleted list entries
@@ -106,6 +115,25 @@ static inline void init_llist_head(struct llist_head *list)
for ((pos) = (node); pos; (pos) = (pos)->next)
/**
+ * llist_for_each_safe - iterate over some deleted entries of a lock-less list
+ * safe against removal of list entry
+ * @pos: the &struct llist_node to use as a loop cursor
+ * @n: another &struct llist_node to use as temporary storage
+ * @node: the first entry of deleted list entries
+ *
+ * In general, some entries of the lock-less list can be traversed
+ * safely only after being deleted from list, so start with an entry
+ * instead of list head.
+ *
+ * If being used on entries deleted from lock-less list directly, the
+ * traverse order is from the newest to the oldest added entry. If
+ * you want to traverse from the oldest to the newest, you must
+ * reverse the order by yourself before traversing.
+ */
+#define llist_for_each_safe(pos, n, node) \
+ for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n))
+
+/**
* llist_for_each_entry - iterate over some deleted entries of lock-less list of given type
* @pos: the type * to use as a loop cursor.
 * @node: the first entry of deleted list entries.
@@ -122,7 +150,7 @@ static inline void init_llist_head(struct llist_head *list)
*/
#define llist_for_each_entry(pos, node, member) \
for ((pos) = llist_entry((node), typeof(*(pos)), member); \
- &(pos)->member != NULL; \
+ member_address_is_nonnull(pos, member); \
(pos) = llist_entry((pos)->member.next, typeof(*(pos)), member))
/**
@@ -144,7 +172,7 @@ static inline void init_llist_head(struct llist_head *list)
*/
#define llist_for_each_entry_safe(pos, n, node, member) \
for (pos = llist_entry((node), typeof(*pos), member); \
- &pos->member != NULL && \
+ member_address_is_nonnull(pos, member) && \
(n = llist_entry(pos->member.next, typeof(*n), member), true); \
pos = n)
@@ -158,7 +186,7 @@ static inline void init_llist_head(struct llist_head *list)
*/
static inline bool llist_empty(const struct llist_head *head)
{
- return ACCESS_ONCE(head->first) == NULL;
+ return READ_ONCE(head->first) == NULL;
}
static inline struct llist_node *llist_next(struct llist_node *node)
@@ -198,4 +226,4 @@ extern struct llist_node *llist_del_first(struct llist_head *head);
struct llist_node *llist_reverse_order(struct llist_node *head);
-#endif /* __TOOLS_LINUX_LLIST_H */
+#endif /* LLIST_H */
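A sketch of the lockless pattern the rewritten comment describes: any number of producers call llist_add() without a lock, and a single consumer detaches the whole batch with llist_del_all() and walks it newest-to-oldest with the new llist_for_each_entry_safe(). It assumes the llist_add_batch()/llist_del_all() implementation in this tree is linked in; struct job is illustrative:

#include <stdio.h>
#include <linux/llist.h>
#include <linux/slab.h>

struct job {
        int                     id;
        struct llist_node       node;
};

int main(void)
{
        struct llist_head pending;
        struct llist_node *batch;
        struct job *job, *tmp;
        int i;

        init_llist_head(&pending);

        /* Producers may call llist_add() concurrently, without a lock: */
        for (i = 0; i < 3; i++) {
                job = kmalloc(sizeof(*job), GFP_KERNEL);
                job->id = i;
                llist_add(&job->node, &pending);
        }

        /* A single consumer detaches the whole batch atomically... */
        batch = llist_del_all(&pending);

        /* ...and traverses it newest-to-oldest, freeing as it goes: */
        llist_for_each_entry_safe(job, tmp, batch, node) {
                printf("job %d\n", job->id);
                kfree(job);
        }
        return 0;
}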
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index d95d8da3..27bf6915 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -5,7 +5,9 @@ struct lock_class_key {};
struct task_struct;
# define lock_acquire(l, s, t, r, c, n, i) do { } while (0)
-# define lock_release(l, n, i) do { } while (0)
+# define lock_acquire_exclusive(...) do { } while (0)
+# define lockdep_set_notrack_class(...) do { } while (0)
+# define lock_release(l, i) do { } while (0)
# define lock_set_class(l, n, k, s, i) do { } while (0)
# define lock_set_subclass(l, s, i) do { } while (0)
# define lockdep_set_current_reclaim_state(g) do { } while (0)
@@ -51,5 +53,10 @@ debug_check_no_locks_held(void)
{
}
+static inline int lock_class_is_held(struct lock_class_key *k)
+{
+ return 0;
+}
+
#endif /* __TOOLS_LINUX_LOCKDEP_H */
diff --git a/include/linux/log2.h b/include/linux/log2.h
index 96f62458..f031ea12 100644
--- a/include/linux/log2.h
+++ b/include/linux/log2.h
@@ -1,30 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Integer base 2 logarithm calculation
*
* Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
-#ifndef _TOOLS_LINUX_LOG2_H
-#define _TOOLS_LINUX_LOG2_H
-
-#include <limits.h>
-#ifndef PAGE_SHIFT
-#define PAGE_SHIFT ilog2(PAGE_SIZE)
-#endif
+#ifndef _LINUX_LOG2_H
+#define _LINUX_LOG2_H
+#include <linux/types.h>
#include <linux/bitops.h>
-#include <linux/compiler.h>
-
-/*
- * deal with unrepresentable constant logarithms
- */
-extern __attribute__((const, noreturn))
-int ____ilog2_NaN(void);
/*
* non-constant log of base 2 calculators
@@ -32,31 +17,39 @@ int ____ilog2_NaN(void);
* more efficiently than using fls() and fls64()
* - the arch is not required to handle n==0 if implementing the fallback
*/
+#ifndef CONFIG_ARCH_HAS_ILOG2_U32
static inline __attribute__((const))
int __ilog2_u32(u32 n)
{
return fls(n) - 1;
}
+#endif
+#ifndef CONFIG_ARCH_HAS_ILOG2_U64
static inline __attribute__((const))
int __ilog2_u64(u64 n)
{
return fls64(n) - 1;
}
+#endif
-/*
- * Determine whether some value is a power of two, where zero is
+/**
+ * is_power_of_2() - check if a value is a power of two
+ * @n: the value to check
+ *
+ * Determine whether some value is a power of two, where zero is
* *not* considered a power of two.
+ * Return: true if @n is a power of 2, otherwise false.
*/
-
static inline __attribute__((const))
bool is_power_of_2(unsigned long n)
{
return (n != 0 && ((n & (n - 1)) == 0));
}
-/*
- * round up to nearest power of two
+/**
+ * __roundup_pow_of_two() - round up to nearest power of two
+ * @n: value to round up
*/
static inline __attribute__((const))
unsigned long __roundup_pow_of_two(unsigned long n)
@@ -64,8 +57,9 @@ unsigned long __roundup_pow_of_two(unsigned long n)
return 1UL << fls_long(n - 1);
}
-/*
- * round down to nearest power of two
+/**
+ * __rounddown_pow_of_two() - round down to nearest power of two
+ * @n: value to round down
*/
static inline __attribute__((const))
unsigned long __rounddown_pow_of_two(unsigned long n)
@@ -74,19 +68,16 @@ unsigned long __rounddown_pow_of_two(unsigned long n)
}
/**
- * ilog2 - log of base 2 of 32-bit or a 64-bit unsigned value
- * @n - parameter
- *
- * constant-capable log of base 2 calculation
- * - this can be used to initialise global variables from constant data, hence
- * the massive ternary operator construction
+ * const_ilog2 - log base 2 of 32-bit or a 64-bit constant unsigned value
+ * @n: parameter
*
- * selects the appropriately-sized optimised version depending on sizeof(n)
+ * Use this where sparse expects a true constant expression, e.g. for array
+ * indices.
*/
-#define ilog2(n) \
+#define const_ilog2(n) \
( \
__builtin_constant_p(n) ? ( \
- (n) < 1 ? ____ilog2_NaN() : \
+ (n) < 2 ? 0 : \
(n) & (1ULL << 63) ? 63 : \
(n) & (1ULL << 62) ? 62 : \
(n) & (1ULL << 61) ? 61 : \
@@ -149,18 +140,31 @@ unsigned long __rounddown_pow_of_two(unsigned long n)
(n) & (1ULL << 4) ? 4 : \
(n) & (1ULL << 3) ? 3 : \
(n) & (1ULL << 2) ? 2 : \
- (n) & (1ULL << 1) ? 1 : \
- (n) & (1ULL << 0) ? 0 : \
- ____ilog2_NaN() \
- ) : \
- (sizeof(n) <= 4) ? \
- __ilog2_u32(n) : \
- __ilog2_u64(n) \
+ 1) : \
+ -1)
+
+/**
+ * ilog2 - log base 2 of 32-bit or a 64-bit unsigned value
+ * @n: parameter
+ *
+ * constant-capable log of base 2 calculation
+ * - this can be used to initialise global variables from constant data, hence
+ * the massive ternary operator construction
+ *
+ * selects the appropriately-sized optimised version depending on sizeof(n)
+ */
+#define ilog2(n) \
+( \
+ __builtin_constant_p(n) ? \
+ const_ilog2(n) : \
+ (sizeof(n) <= 4) ? \
+ __ilog2_u32(n) : \
+ __ilog2_u64(n) \
)
/**
* roundup_pow_of_two - round the given value up to nearest power of two
- * @n - parameter
+ * @n: parameter
*
* round the given value up to the nearest power of two
* - the result is undefined when n == 0
@@ -177,7 +181,7 @@ unsigned long __rounddown_pow_of_two(unsigned long n)
/**
* rounddown_pow_of_two - round the given value down to nearest power of two
- * @n - parameter
+ * @n: parameter
*
* round the given value down to the nearest power of two
* - the result is undefined when n == 0
@@ -190,29 +194,105 @@ unsigned long __rounddown_pow_of_two(unsigned long n)
__rounddown_pow_of_two(n) \
)
+static inline __attribute_const__
+int __order_base_2(unsigned long n)
+{
+ return n > 1 ? ilog2(n - 1) + 1 : 0;
+}
+
+/**
+ * order_base_2 - calculate the (rounded up) base 2 order of the argument
+ * @n: parameter
+ *
+ * The first few values calculated by this routine:
+ * ob2(0) = 0
+ * ob2(1) = 0
+ * ob2(2) = 1
+ * ob2(3) = 2
+ * ob2(4) = 2
+ * ob2(5) = 3
+ * ... and so on.
+ */
+#define order_base_2(n) \
+( \
+ __builtin_constant_p(n) ? ( \
+ ((n) == 0 || (n) == 1) ? 0 : \
+ ilog2((n) - 1) + 1) : \
+ __order_base_2(n) \
+)
+
static inline __attribute__((const))
-int __get_order(unsigned long size)
+int __bits_per(unsigned long n)
+{
+ if (n < 2)
+ return 1;
+ if (is_power_of_2(n))
+ return order_base_2(n) + 1;
+ return order_base_2(n);
+}
+
+/**
+ * bits_per - calculate the number of bits required for the argument
+ * @n: parameter
+ *
+ * This is constant-capable and can be used for compile time
+ * initializations, e.g bitfields.
+ *
+ * The first few values calculated by this routine:
+ * bf(0) = 1
+ * bf(1) = 1
+ * bf(2) = 2
+ * bf(3) = 2
+ * bf(4) = 3
+ * ... and so on.
+ */
+#define bits_per(n) \
+( \
+ __builtin_constant_p(n) ? ( \
+ ((n) == 0 || (n) == 1) \
+ ? 1 : ilog2(n) + 1 \
+ ) : \
+ __bits_per(n) \
+)
+
+/**
+ * get_order - Determine the allocation order of a memory size
+ * @size: The size for which to get the order
+ *
+ * Determine the allocation order of a particular sized block of memory. This
+ * is on a logarithmic scale, where:
+ *
+ * 0 -> 2^0 * PAGE_SIZE and below
+ * 1 -> 2^1 * PAGE_SIZE to 2^0 * PAGE_SIZE + 1
+ * 2 -> 2^2 * PAGE_SIZE to 2^1 * PAGE_SIZE + 1
+ * 3 -> 2^3 * PAGE_SIZE to 2^2 * PAGE_SIZE + 1
+ * 4 -> 2^4 * PAGE_SIZE to 2^3 * PAGE_SIZE + 1
+ * ...
+ *
+ * The order returned is used to find the smallest allocation granule required
+ * to hold an object of the specified size.
+ *
+ * The result is undefined if the size is 0.
+ */
+static inline __attribute_const__ int get_order(unsigned long size)
{
- int order;
+ if (__builtin_constant_p(size)) {
+ if (!size)
+ return BITS_PER_LONG - PAGE_SHIFT;
+
+ if (size < (1UL << PAGE_SHIFT))
+ return 0;
+
+ return ilog2((size) - 1) - PAGE_SHIFT + 1;
+ }
size--;
size >>= PAGE_SHIFT;
#if BITS_PER_LONG == 32
- order = fls(size);
+ return fls(size);
#else
- order = fls64(size);
+ return fls64(size);
#endif
- return order;
}
-#define get_order(n) \
-( \
- __builtin_constant_p(n) ? ( \
- ((n) == 0UL) ? BITS_PER_LONG - PAGE_SHIFT : \
- (((n) < (1UL << PAGE_SHIFT)) ? 0 : \
- ilog2((n) - 1) - PAGE_SHIFT + 1) \
- ) : \
- __get_order(n) \
-)
-
-#endif /* _TOOLS_LINUX_LOG2_H */
+#endif /* _LINUX_LOG2_H */
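A few concrete values for the helpers above, assuming 4 KiB pages in this shim; the numbers in the comments follow directly from the definitions:

#include <stdio.h>
#include <linux/log2.h>

int main(void)
{
        unsigned long n = 3000;

        printf("ilog2(%lu)              = %d\n",  n, ilog2(n));                /* 11 */
        printf("roundup_pow_of_two(%lu) = %lu\n", n, roundup_pow_of_two(n));   /* 4096 */
        printf("order_base_2(%lu)       = %d\n",  n, order_base_2(n));         /* 12 */
        printf("get_order(%lu)          = %d\n",  n, get_order(n));            /* 0 with 4 KiB pages */
        return 0;
}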
diff --git a/include/linux/lz4.h b/include/linux/lz4.h
index 86e1dde3..f574964a 100644
--- a/include/linux/lz4.h
+++ b/include/linux/lz4.h
@@ -2,4 +2,9 @@
#define LZ4_compress_destSize(src, dst, srclen, dstlen, workspace) \
LZ4_compress_destSize(src, dst, srclen, dstlen)
+
+#define LZ4_compress_HC(src, dst, srclen, dstlen, level, workspace) -1
+
#define LZ4_MEM_COMPRESS 0
+#define LZ4HC_MEM_COMPRESS 0
+#define LZ4HC_MIN_CLEVEL 0
diff --git a/include/linux/math.h b/include/linux/math.h
new file mode 100644
index 00000000..85c8c8aa
--- /dev/null
+++ b/include/linux/math.h
@@ -0,0 +1,171 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MATH_H
+#define _LINUX_MATH_H
+
+#include <linux/kernel.h>
+
+/* abs() */
+#include <stdlib.h>
+
+/*
+ * This looks more complex than it should be. But we need to
+ * get the type for the ~ right in round_down (it needs to be
+ * as wide as the result!), and we want to evaluate the macro
+ * arguments just once each.
+ */
+#define __round_mask(x, y) ((__typeof__(x))((y)-1))
+
+/**
+ * round_up - round up to next specified power of 2
+ * @x: the value to round
+ * @y: multiple to round up to (must be a power of 2)
+ *
+ * Rounds @x up to next multiple of @y (which must be a power of 2).
+ * To perform arbitrary rounding up, use roundup() below.
+ */
+#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
+
+/**
+ * round_down - round down to next specified power of 2
+ * @x: the value to round
+ * @y: multiple to round down to (must be a power of 2)
+ *
+ * Rounds @x down to next multiple of @y (which must be a power of 2).
+ * To perform arbitrary rounding down, use rounddown() below.
+ */
+#define round_down(x, y) ((x) & ~__round_mask(x, y))
+
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+
+#define DIV_ROUND_DOWN_ULL(ll, d) \
+ ({ unsigned long long _tmp = (ll); do_div(_tmp, d); _tmp; })
+
+#define DIV_ROUND_UP_ULL(ll, d) \
+ DIV_ROUND_DOWN_ULL((unsigned long long)(ll) + (d) - 1, (d))
+
+#if BITS_PER_LONG == 32
+# define DIV_ROUND_UP_SECTOR_T(ll,d) DIV_ROUND_UP_ULL(ll, d)
+#else
+# define DIV_ROUND_UP_SECTOR_T(ll,d) DIV_ROUND_UP(ll,d)
+#endif
+
+/**
+ * roundup - round up to the next specified multiple
+ * @x: the value to round up
+ * @y: multiple to round up to
+ *
+ * Rounds @x up to next multiple of @y. If @y will always be a power
+ * of 2, consider using the faster round_up().
+ */
+#define roundup(x, y) ( \
+{ \
+ typeof(y) __y = y; \
+ (((x) + (__y - 1)) / __y) * __y; \
+} \
+)
+/**
+ * rounddown - round down to next specified multiple
+ * @x: the value to round
+ * @y: multiple to round down to
+ *
+ * Rounds @x down to next multiple of @y. If @y will always be a power
+ * of 2, consider using the faster round_down().
+ */
+#define rounddown(x, y) ( \
+{ \
+ typeof(x) __x = (x); \
+ __x - (__x % (y)); \
+} \
+)
+
+/*
+ * Divide positive or negative dividend by positive or negative divisor
+ * and round to closest integer. Result is undefined for negative
+ * divisors if the dividend variable type is unsigned and for negative
+ * dividends if the divisor variable type is unsigned.
+ */
+#define DIV_ROUND_CLOSEST(x, divisor)( \
+{ \
+ typeof(x) __x = x; \
+ typeof(divisor) __d = divisor; \
+ (((typeof(x))-1) > 0 || \
+ ((typeof(divisor))-1) > 0 || \
+ (((__x) > 0) == ((__d) > 0))) ? \
+ (((__x) + ((__d) / 2)) / (__d)) : \
+ (((__x) - ((__d) / 2)) / (__d)); \
+} \
+)
+/*
+ * Same as above but for u64 dividends. divisor must be a 32-bit
+ * number.
+ */
+#define DIV_ROUND_CLOSEST_ULL(x, divisor)( \
+{ \
+ typeof(divisor) __d = divisor; \
+ unsigned long long _tmp = (x) + (__d) / 2; \
+ do_div(_tmp, __d); \
+ _tmp; \
+} \
+)
+
+/*
+ * Multiplies an integer by a fraction, while avoiding unnecessary
+ * overflow or loss of precision.
+ */
+#define mult_frac(x, numer, denom)( \
+{ \
+ typeof(x) quot = (x) / (denom); \
+ typeof(x) rem = (x) % (denom); \
+ (quot * (numer)) + ((rem * (numer)) / (denom)); \
+} \
+)
+
+#define sector_div(a, b) do_div(a, b)
+
+/**
+ * reciprocal_scale - "scale" a value into range [0, ep_ro)
+ * @val: value
+ * @ep_ro: right open interval endpoint
+ *
+ * Perform a "reciprocal multiplication" in order to "scale" a value into
+ * range [0, @ep_ro), where the upper interval endpoint is right-open.
+ * This is useful, e.g. for accessing an index of an array containing
+ * @ep_ro elements. Think of it as a sort of modulus, only that the
+ * result isn't that of modulo. ;) Note that if the initial input is a
+ * small value, the result will be 0.
+ *
+ * Return: a result based on @val in interval [0, @ep_ro).
+ */
+static inline u32 reciprocal_scale(u32 val, u32 ep_ro)
+{
+ return (u32)(((u64) val * ep_ro) >> 32);
+}
+
+u64 int_pow(u64 base, unsigned int exp);
+unsigned long int_sqrt(unsigned long);
+
+#if BITS_PER_LONG < 64
+u32 int_sqrt64(u64 x);
+#else
+static inline u32 int_sqrt64(u64 x)
+{
+ return (u32)int_sqrt(x);
+}
+#endif
+
+#define abs(x) __abs_choose_expr(x, long long, \
+ __abs_choose_expr(x, long, \
+ __abs_choose_expr(x, int, \
+ __abs_choose_expr(x, short, \
+ __abs_choose_expr(x, char, \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(x), char), \
+ (char)({ signed char __x = (x); __x<0?-__x:__x; }), \
+ ((void)0)))))))
+
+#define __abs_choose_expr(x, type, other) __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(x), signed type) || \
+ __builtin_types_compatible_p(typeof(x), unsigned type), \
+ ({ signed type __x = (x); __x < 0 ? -__x : __x; }), other)
+
+#endif /* _LINUX_MATH_H */
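A short sketch contrasting the power-of-two rounding macros (masking) with the arbitrary-multiple ones (division), plus DIV_ROUND_UP(), DIV_ROUND_CLOSEST() and mult_frac(); the values in the comments follow from the definitions above:

#include <stdio.h>
#include <linux/math.h>

int main(void)
{
        unsigned int bytes = 1000;

        /* Power-of-two rounding is done with masking: */
        printf("round_up(%u, 512)       = %u\n", bytes, round_up(bytes, 512));       /* 1024 */
        /* Arbitrary multiples use division: */
        printf("roundup(%u, 300)        = %u\n", bytes, roundup(bytes, 300));        /* 1200 */
        printf("DIV_ROUND_UP(%u, 512)   = %u\n", bytes, DIV_ROUND_UP(bytes, 512));   /* 2 */
        printf("DIV_ROUND_CLOSEST(7, 2) = %d\n", DIV_ROUND_CLOSEST(7, 2));           /* 4 */
        /* Scale by a fraction without overflowing the intermediate product: */
        printf("mult_frac(%u, 9, 10)    = %u\n", bytes, mult_frac(bytes, 9, 10));    /* 900 */
        return 0;
}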
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 37d81492..37325170 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -1,11 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* memory buffer pool support
*/
#ifndef _LINUX_MEMPOOL_H
#define _LINUX_MEMPOOL_H
+#include <linux/wait.h>
#include <linux/compiler.h>
-#include <linux/bug.h>
#include <linux/slab.h>
struct kmem_cache;
@@ -14,74 +15,111 @@ typedef void * (mempool_alloc_t)(gfp_t gfp_mask, void *pool_data);
typedef void (mempool_free_t)(void *element, void *pool_data);
typedef struct mempool_s {
- size_t elem_size;
- void *pool_data;
- mempool_alloc_t *alloc;
- mempool_free_t *free;
+ spinlock_t lock;
+ int min_nr; /* nr of elements at *elements */
+ int curr_nr; /* Current nr of elements at *elements */
+ void **elements;
+
+ void *pool_data;
+ mempool_alloc_t *alloc;
+ mempool_free_t *free;
+ wait_queue_head_t wait;
} mempool_t;
static inline bool mempool_initialized(mempool_t *pool)
{
- return true;
+ return pool->elements != NULL;
}
+void mempool_exit(mempool_t *pool);
+int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data,
+ gfp_t gfp_mask, int node_id);
+int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data);
+
+extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data);
+extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data,
+ gfp_t gfp_mask, int nid);
+
extern int mempool_resize(mempool_t *pool, int new_min_nr);
+extern void mempool_destroy(mempool_t *pool);
+extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc;
+extern void mempool_free(void *element, mempool_t *pool);
+
+/*
+ * A mempool_alloc_t and mempool_free_t that get the memory from
+ * a slab cache that is passed in through pool_data.
+ * Note: the slab cache may not have a ctor function.
+ */
+void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data);
+void mempool_free_slab(void *element, void *pool_data);
-static inline void mempool_free(void *element, mempool_t *pool)
+static inline int
+mempool_init_slab_pool(mempool_t *pool, int min_nr, struct kmem_cache *kc)
{
- free(element);
+ return mempool_init(pool, min_nr, mempool_alloc_slab,
+ mempool_free_slab, (void *) kc);
}
-static inline void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc
+static inline mempool_t *
+mempool_create_slab_pool(int min_nr, struct kmem_cache *kc)
{
- BUG_ON(!pool->elem_size);
- return kmalloc(pool->elem_size, gfp_mask);
+ return mempool_create(min_nr, mempool_alloc_slab, mempool_free_slab,
+ (void *) kc);
}
-static inline void mempool_exit(mempool_t *pool) {}
+/*
+ * a mempool_alloc_t and a mempool_free_t to kmalloc and kfree the
+ * amount of memory specified by pool_data
+ */
+void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data);
+void mempool_kfree(void *element, void *pool_data);
-static inline void mempool_destroy(mempool_t *pool)
+static inline int mempool_init_kmalloc_pool(mempool_t *pool, int min_nr, size_t size)
{
- free(pool);
+ return mempool_init(pool, min_nr, mempool_kmalloc,
+ mempool_kfree, (void *) size);
}
-static inline int
-mempool_init_slab_pool(mempool_t *pool, int min_nr, struct kmem_cache *kc)
+static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size)
{
- pool->elem_size = 0;
- return 0;
+ return mempool_create(min_nr, mempool_kmalloc, mempool_kfree,
+ (void *) size);
}
-static inline mempool_t *
-mempool_create_slab_pool(int min_nr, struct kmem_cache *kc)
+void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data);
+void mempool_kvfree(void *element, void *pool_data);
+
+static inline int mempool_init_kvmalloc_pool(mempool_t *pool, int min_nr, size_t size)
{
- mempool_t *pool = malloc(sizeof(*pool));
- pool->elem_size = 0;
- return pool;
+ return mempool_init(pool, min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size);
}
-static inline int mempool_init_kmalloc_pool(mempool_t *pool, int min_nr, size_t size)
+static inline mempool_t *mempool_create_kvmalloc_pool(int min_nr, size_t size)
{
- pool->elem_size = size;
- return 0;
+ return mempool_create(min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size);
}
+/*
+ * A mempool_alloc_t and mempool_free_t for a simple page allocator that
+ * allocates pages of the order specified by pool_data
+ */
+void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data);
+void mempool_free_pages(void *element, void *pool_data);
+
static inline int mempool_init_page_pool(mempool_t *pool, int min_nr, int order)
{
- pool->elem_size = PAGE_SIZE << order;
- return 0;
+ return mempool_init(pool, min_nr, mempool_alloc_pages,
+ mempool_free_pages, (void *)(long)order);
}
-static inline int mempool_init(mempool_t *pool, int min_nr,
- mempool_alloc_t *alloc_fn,
- mempool_free_t *free_fn,
- void *pool_data)
+static inline mempool_t *mempool_create_page_pool(int min_nr, int order)
{
- pool->elem_size = (size_t) pool_data;
- pool->pool_data = pool_data;
- pool->alloc = alloc_fn;
- pool->free = free_fn;
- return 0;
+ return mempool_create(min_nr, mempool_alloc_pages, mempool_free_pages,
+ (void *)(long)order);
}
#endif /* _LINUX_MEMPOOL_H */
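The shim now behaves like the kernel mempool: a reserve of min_nr elements is kept so allocations can dip into it when the underlying allocator fails. A minimal sketch, assuming the mempool implementation that accompanies this header is linked in; struct extent is illustrative:

#include <stdio.h>
#include <linux/mempool.h>
#include <linux/types.h>

struct extent {
        u64     offset;
        u64     size;
};

int main(void)
{
        mempool_t pool;
        struct extent *e;

        /* Keep at least four pre-allocated extents in reserve: */
        if (mempool_init_kmalloc_pool(&pool, 4, sizeof(struct extent)))
                return 1;

        /* Can dip into the reserve if the underlying allocator fails: */
        e = mempool_alloc(&pool, GFP_KERNEL);
        e->offset = 0;
        e->size   = 4096;
        printf("allocated extent of %llu bytes\n", (unsigned long long) e->size);

        mempool_free(e, &pool);
        mempool_exit(&pool);
        return 0;
}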
diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h
new file mode 100644
index 00000000..43a7b9dc
--- /dev/null
+++ b/include/linux/min_heap.h
@@ -0,0 +1,236 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MIN_HEAP_H
+#define _LINUX_MIN_HEAP_H
+
+#include <linux/bug.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+/**
+ * Data structure to hold a min-heap.
+ * @nr: Number of elements currently in the heap.
+ * @size: Maximum number of elements that can be held in current storage.
+ * @data: Pointer to the start of array holding the heap elements.
+ * @preallocated: Start of the static preallocated array holding the heap elements.
+ */
+#define MIN_HEAP_PREALLOCATED(_type, _name, _nr) \
+struct _name { \
+ int nr; \
+ int size; \
+ _type *data; \
+ _type preallocated[_nr]; \
+}
+
+#define DEFINE_MIN_HEAP(_type, _name) MIN_HEAP_PREALLOCATED(_type, _name, 0)
+
+typedef DEFINE_MIN_HEAP(char, min_heap_char) min_heap_char;
+
+#define __minheap_cast(_heap) (typeof((_heap)->data[0]) *)
+#define __minheap_obj_size(_heap) sizeof((_heap)->data[0])
+
+/**
+ * struct min_heap_callbacks - Data/functions to customise the min_heap.
+ * @less: Partial order function for this heap.
+ * @swp: Swap elements function.
+ */
+struct min_heap_callbacks {
+ bool (*less)(const void *lhs, const void *rhs, void *args);
+ void (*swp)(void *lhs, void *rhs, void *args);
+};
+
+/* Initialize a min-heap. */
+static __always_inline
+void __min_heap_init(min_heap_char *heap, void *data, int size)
+{
+ heap->nr = 0;
+ heap->size = size;
+ if (data)
+ heap->data = data;
+ else
+ heap->data = heap->preallocated;
+}
+
+#define min_heap_init(_heap, _data, _size) \
+ __min_heap_init((min_heap_char *)_heap, _data, _size)
+
+/* Get the minimum element from the heap. */
+static __always_inline
+void *__min_heap_peek(struct min_heap_char *heap)
+{
+ return heap->nr ? heap->data : NULL;
+}
+
+#define min_heap_peek(_heap) \
+ (__minheap_cast(_heap) __min_heap_peek((min_heap_char *)_heap))
+
+/* Check if the heap is full. */
+static __always_inline
+bool __min_heap_full(min_heap_char *heap)
+{
+ return heap->nr == heap->size;
+}
+
+#define min_heap_full(_heap) \
+ __min_heap_full((min_heap_char *)_heap)
+
+/* Sift the element at pos down the heap. */
+static __always_inline
+void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ void *left, *right;
+ void *data = heap->data;
+ void *root = data + pos * elem_size;
+ int i = pos, j;
+
+ /* Find the sift-down path all the way to the leaves. */
+ for (;;) {
+ if (i * 2 + 2 >= heap->nr)
+ break;
+ left = data + (i * 2 + 1) * elem_size;
+ right = data + (i * 2 + 2) * elem_size;
+ i = func->less(left, right, args) ? i * 2 + 1 : i * 2 + 2;
+ }
+
+ /* Special case for the last leaf with no sibling. */
+ if (i * 2 + 2 == heap->nr)
+ i = i * 2 + 1;
+
+ /* Backtrack to the correct location. */
+ while (i != pos && func->less(root, data + i * elem_size, args))
+ i = (i - 1) / 2;
+
+ /* Shift the element into its correct place. */
+ j = i;
+ while (i != pos) {
+ i = (i - 1) / 2;
+ func->swp(data + i * elem_size, data + j * elem_size, args);
+ }
+}
+
+#define min_heap_sift_down(_heap, _pos, _func, _args) \
+ __min_heap_sift_down((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func, _args)
+
+/* Sift up ith element from the heap, O(log2(nr)). */
+static __always_inline
+void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx,
+ const struct min_heap_callbacks *func, void *args)
+{
+ void *data = heap->data;
+ size_t parent;
+
+ while (idx) {
+ parent = (idx - 1) / 2;
+ if (func->less(data + parent * elem_size, data + idx * elem_size, args))
+ break;
+ func->swp(data + parent * elem_size, data + idx * elem_size, args);
+ idx = parent;
+ }
+}
+
+#define min_heap_sift_up(_heap, _idx, _func, _args) \
+ __min_heap_sift_up((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args)
+
+/* Floyd's approach to heapification that is O(nr). */
+static __always_inline
+void __min_heapify_all(min_heap_char *heap, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ int i;
+
+ for (i = heap->nr / 2 - 1; i >= 0; i--)
+ __min_heap_sift_down(heap, i, elem_size, func, args);
+}
+
+#define min_heapify_all(_heap, _func, _args) \
+ __min_heapify_all((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args)
+
+/* Remove minimum element from the heap, O(log2(nr)). */
+static __always_inline
+bool __min_heap_pop(min_heap_char *heap, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ void *data = heap->data;
+
+ if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap"))
+ return false;
+
+ /* Place last element at the root (position 0) and then sift down. */
+ heap->nr--;
+ memcpy(data, data + (heap->nr * elem_size), elem_size);
+ __min_heap_sift_down(heap, 0, elem_size, func, args);
+
+ return true;
+}
+
+#define min_heap_pop(_heap, _func, _args) \
+ __min_heap_pop((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args)
+
+/*
+ * Remove the minimum element and then push the given element. The
+ * implementation performs 1 sift (O(log2(nr))) and is therefore more
+ * efficient than a pop followed by a push that does 2.
+ */
+static __always_inline
+void __min_heap_pop_push(min_heap_char *heap,
+ const void *element, size_t elem_size,
+ const struct min_heap_callbacks *func,
+ void *args)
+{
+ memcpy(heap->data, element, elem_size);
+ __min_heap_sift_down(heap, 0, elem_size, func, args);
+}
+
+#define min_heap_pop_push(_heap, _element, _func, _args) \
+ __min_heap_pop_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args)
+
+/* Push an element on to the heap, O(log2(nr)). */
+static __always_inline
+bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ void *data = heap->data;
+ int pos;
+
+ if (WARN_ONCE(heap->nr >= heap->size, "Pushing on a full heap"))
+ return false;
+
+ /* Place at the end of data. */
+ pos = heap->nr;
+ memcpy(data + (pos * elem_size), element, elem_size);
+ heap->nr++;
+
+ /* Sift child at pos up. */
+ __min_heap_sift_up(heap, elem_size, pos, func, args);
+
+ return true;
+}
+
+#define min_heap_push(_heap, _element, _func, _args) \
+ __min_heap_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args)
+
+/* Remove ith element from the heap, O(log2(nr)). */
+static __always_inline
+bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx,
+ const struct min_heap_callbacks *func, void *args)
+{
+ void *data = heap->data;
+
+ if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap"))
+ return false;
+
+ /* Place last element at the root (position 0) and then sift down. */
+ heap->nr--;
+ if (idx == heap->nr)
+ return true;
+ func->swp(data + (idx * elem_size), data + (heap->nr * elem_size), args);
+ __min_heap_sift_up(heap, elem_size, idx, func, args);
+ __min_heap_sift_down(heap, idx, elem_size, func, args);
+
+ return true;
+}
+
+#define min_heap_del(_heap, _idx, _func, _args) \
+ __min_heap_del((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args)
+
+#endif /* _LINUX_MIN_HEAP_H */
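A minimal sketch of the heap above: DEFINE_MIN_HEAP() declares the container, the callbacks supply the ordering and swap, and pops come back in ascending order. The int heap and the callback names are illustrative:

#include <stdio.h>
#include <linux/min_heap.h>

DEFINE_MIN_HEAP(int, min_heap_int);

static bool int_less(const void *l, const void *r, void *args)
{
        return *(const int *) l < *(const int *) r;
}

static void int_swap(void *l, void *r, void *args)
{
        int tmp = *(int *) l;

        *(int *) l = *(int *) r;
        *(int *) r = tmp;
}

static const struct min_heap_callbacks cb = {
        .less   = int_less,
        .swp    = int_swap,
};

int main(void)
{
        struct min_heap_int heap;
        int backing[8];
        int vals[] = { 5, 1, 4, 2 };
        int i;

        min_heap_init(&heap, backing, 8);

        for (i = 0; i < 4; i++)
                min_heap_push(&heap, &vals[i], &cb, NULL);

        /* Pops come back in ascending order: 1 2 4 5 */
        while (heap.nr) {
                printf("%d\n", *min_heap_peek(&heap));
                min_heap_pop(&heap, &cb, NULL);
        }
        return 0;
}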
diff --git a/include/linux/minmax.h b/include/linux/minmax.h
new file mode 100644
index 00000000..ddc15bf7
--- /dev/null
+++ b/include/linux/minmax.h
@@ -0,0 +1,272 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MINMAX_H
+#define _LINUX_MINMAX_H
+
+#include <linux/compiler.h>
+#include <linux/const.h>
+#include <linux/types.h>
+
+/*
+ * min()/max()/clamp() macros must accomplish three things:
+ *
+ * - Avoid multiple evaluations of the arguments (so side-effects like
+ * "x++" happen only once) when non-constant.
+ * - Retain result as a constant expressions when called with only
+ * constant expressions (to avoid tripping VLA warnings in stack
+ * allocation usage).
+ * - Perform signed v unsigned type-checking (to generate compile
+ * errors instead of nasty runtime surprises).
+ * - Unsigned char/short are always promoted to signed int and can be
+ * compared against signed or unsigned arguments.
+ * - Unsigned arguments can be compared against non-negative signed constants.
+ * - Comparison of a signed argument against an unsigned constant fails
+ * even if the constant is below __INT_MAX__ and could be cast to int.
+ */
+#define __typecheck(x, y) \
+ (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1)))
+
+/* is_signed_type() isn't a constexpr for pointer types */
+#define __is_signed(x) \
+ __builtin_choose_expr(__is_constexpr(is_signed_type(typeof(x))), \
+ is_signed_type(typeof(x)), 0)
+
+/* True for a non-negative signed int constant */
+#define __is_noneg_int(x) \
+ (__builtin_choose_expr(__is_constexpr(x) && __is_signed(x), x, -1) >= 0)
+
+#define __types_ok(x, y) \
+ (__is_signed(x) == __is_signed(y) || \
+ __is_signed((x) + 0) == __is_signed((y) + 0) || \
+ __is_noneg_int(x) || __is_noneg_int(y))
+
+#define __cmp_op_min <
+#define __cmp_op_max >
+
+#define __cmp(op, x, y) ((x) __cmp_op_##op (y) ? (x) : (y))
+
+#define __cmp_once(op, x, y, unique_x, unique_y) ({ \
+ typeof(x) unique_x = (x); \
+ typeof(y) unique_y = (y); \
+ static_assert(__types_ok(x, y), \
+ #op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \
+ __cmp(op, unique_x, unique_y); })
+
+#define __careful_cmp(op, x, y) \
+ __builtin_choose_expr(__is_constexpr((x) - (y)), \
+ __cmp(op, x, y), \
+ __cmp_once(op, x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y)))
+
+#define __clamp(val, lo, hi) \
+ ((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val)))
+
+#define __clamp_once(val, lo, hi, unique_val, unique_lo, unique_hi) ({ \
+ typeof(val) unique_val = (val); \
+ typeof(lo) unique_lo = (lo); \
+ typeof(hi) unique_hi = (hi); \
+ static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), \
+ (lo) <= (hi), true), \
+ "clamp() low limit " #lo " greater than high limit " #hi); \
+ static_assert(__types_ok(val, lo), "clamp() 'lo' signedness error"); \
+ static_assert(__types_ok(val, hi), "clamp() 'hi' signedness error"); \
+ __clamp(unique_val, unique_lo, unique_hi); })
+
+#define __careful_clamp(val, lo, hi) ({ \
+ __builtin_choose_expr(__is_constexpr((val) - (lo) + (hi)), \
+ __clamp(val, lo, hi), \
+ __clamp_once(val, lo, hi, __UNIQUE_ID(__val), \
+ __UNIQUE_ID(__lo), __UNIQUE_ID(__hi))); })
+
+/**
+ * min - return minimum of two values of the same or compatible types
+ * @x: first value
+ * @y: second value
+ */
+#define min(x, y) __careful_cmp(min, x, y)
+
+/**
+ * max - return maximum of two values of the same or compatible types
+ * @x: first value
+ * @y: second value
+ */
+#define max(x, y) __careful_cmp(max, x, y)
+
+/**
+ * umin - return minimum of two non-negative values
+ * Signed types are zero extended to match a larger unsigned type.
+ * @x: first value
+ * @y: second value
+ */
+#define umin(x, y) \
+ __careful_cmp(min, (x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull)
+
+/**
+ * umax - return maximum of two non-negative values
+ * @x: first value
+ * @y: second value
+ */
+#define umax(x, y) \
+ __careful_cmp(max, (x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull)
+
+/**
+ * min3 - return minimum of three values
+ * @x: first value
+ * @y: second value
+ * @z: third value
+ */
+#define min3(x, y, z) min((typeof(x))min(x, y), z)
+
+/**
+ * max3 - return maximum of three values
+ * @x: first value
+ * @y: second value
+ * @z: third value
+ */
+#define max3(x, y, z) max((typeof(x))max(x, y), z)
+
+/**
+ * min_not_zero - return the minimum that is _not_ zero, unless both are zero
+ * @x: value1
+ * @y: value2
+ */
+#define min_not_zero(x, y) ({ \
+ typeof(x) __x = (x); \
+ typeof(y) __y = (y); \
+ __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
+
+/**
+ * clamp - return a value clamped to a given range with strict typechecking
+ * @val: current value
+ * @lo: lowest allowable value
+ * @hi: highest allowable value
+ *
+ * This macro does strict typechecking of @lo/@hi to make sure they are of the
+ * same type as @val. See the unnecessary pointer comparisons.
+ */
+#define clamp(val, lo, hi) __careful_clamp(val, lo, hi)
+
+/*
+ * ..and if you can't take the strict
+ * types, you can specify one yourself.
+ *
+ * Or not use min/max/clamp at all, of course.
+ */
+
+/**
+ * min_t - return minimum of two values, using the specified type
+ * @type: data type to use
+ * @x: first value
+ * @y: second value
+ */
+#define min_t(type, x, y) __careful_cmp(min, (type)(x), (type)(y))
+
+/**
+ * max_t - return maximum of two values, using the specified type
+ * @type: data type to use
+ * @x: first value
+ * @y: second value
+ */
+#define max_t(type, x, y) __careful_cmp(max, (type)(x), (type)(y))
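A brief illustration of the type rules described at the top of this file (editorial sketch, not part of the patch); the variable names and values are arbitrary:

static void minmax_example(void)
{
	size_t want = 16, have = 9;
	size_t n = min(want, have);		/* 9: same types, no static_assert */

	int idx = -1;
	unsigned int limit = 100;
	/*
	 * min(idx, limit) would trip the __types_ok() static_assert
	 * (signed v unsigned); fix the types, or pick one explicitly:
	 */
	int lo = min_t(int, idx, limit);	/* -1: both compared as int */

	u8 level = 250;
	u8 capped = clamp(level, 0, 200);	/* 200: u8 promotes to signed int, so this is fine */

	(void) n; (void) lo; (void) capped;
}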
+
+/*
+ * Do not check the array parameter using __must_be_array().
+ * In the following legitimate use-case, where the "array" passed is a simple
+ * pointer, __must_be_array() would report a failure:
+ * --- 8< ---
+ * int *buff
+ * ...
+ * min = min_array(buff, nb_items);
+ * --- 8< ---
+ *
+ * The first typeof(&(array)[0]) is needed in order to support arrays of both
+ * 'int *buff' and 'int buff[N]' types.
+ *
+ * The array can be an array of const items.
+ * typeof() keeps the const qualifier. Use __unqual_scalar_typeof() in order
+ * to discard the const qualifier for the __element variable.
+ */
+#define __minmax_array(op, array, len) ({ \
+ typeof(&(array)[0]) __array = (array); \
+ typeof(len) __len = (len); \
+ __unqual_scalar_typeof(__array[0]) __element = __array[--__len];\
+ while (__len--) \
+ __element = op(__element, __array[__len]); \
+ __element; })
+
+/**
+ * min_array - return minimum of values present in an array
+ * @array: array
+ * @len: array length
+ *
+ * Note that @len must not be zero (empty array).
+ */
+#define min_array(array, len) __minmax_array(min, array, len)
+
+/**
+ * max_array - return maximum of values present in an array
+ * @array: array
+ * @len: array length
+ *
+ * Note that @len must not be zero (empty array).
+ */
+#define max_array(array, len) __minmax_array(max, array, len)
+
+/**
+ * clamp_t - return a value clamped to a given range using a given type
+ * @type: the type of variable to use
+ * @val: current value
+ * @lo: minimum allowable value
+ * @hi: maximum allowable value
+ *
+ * This macro does no typechecking and uses temporary variables of type
+ * @type to make all the comparisons.
+ */
+#define clamp_t(type, val, lo, hi) __careful_clamp((type)(val), (type)(lo), (type)(hi))
+
+/**
+ * clamp_val - return a value clamped to a given range using val's type
+ * @val: current value
+ * @lo: minimum allowable value
+ * @hi: maximum allowable value
+ *
+ * This macro does no typechecking and uses temporary variables of whatever
+ * type the input argument @val is. This is useful when @val is an unsigned
+ * type and @lo and @hi are literals that will otherwise be assigned a signed
+ * integer type.
+ */
+#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi)
+
+static inline bool in_range64(u64 val, u64 start, u64 len)
+{
+ return (val - start) < len;
+}
+
+static inline bool in_range32(u32 val, u32 start, u32 len)
+{
+ return (val - start) < len;
+}
+
+/**
+ * in_range - Determine if a value lies within a range.
+ * @val: Value to test.
+ * @start: First value in range.
+ * @len: Number of values in range.
+ *
+ * This is more efficient than "if (start <= val && val < (start + len))".
+ * It also gives a different answer if @start + @len overflows the size of
+ * the type by a sufficient amount to encompass @val. Decide for yourself
+ * which behaviour you want, or prove that start + len never overflow.
+ * Do not blindly replace one form with the other.
+ */
+#define in_range(val, start, len) \
+ ((sizeof(start) | sizeof(len) | sizeof(val)) <= sizeof(u32) ? \
+ in_range32(val, start, len) : in_range64(val, start, len))
+
+/**
+ * swap - swap values of @a and @b
+ * @a: first value
+ * @b: second value
+ */
+#define swap(a, b) \
+ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
+
+#endif /* _LINUX_MINMAX_H */
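A small sketch of the in_range() semantics documented above (editorial illustration, not part of the patch), showing the deliberate difference from the open-coded check when @start + @len wraps:

static void in_range_example(void)
{
	u32 start = 0xfffffff0, len = 0x20, val = 4;

	/* start + len wraps to 0x10, so the naive check rejects val ... */
	bool naive = start <= val && val < start + len;		/* false */
	/* ... while in_range() accepts it, since (val - start) < len */
	bool ranged = in_range(val, start, len);		/* true */

	(void) naive; (void) ranged;
}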
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3830bc2f..b7e83af0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1 +1,32 @@
-#include <linux/slab.h>
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _TOOLS_LINUX_MM_H
+#define _TOOLS_LINUX_MM_H
+
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <linux/types.h>
+
+struct sysinfo {
+ long uptime; /* Seconds since boot */
+ unsigned long loads[3]; /* 1, 5, and 15 minute load averages */
+ unsigned long totalram; /* Total usable main memory size */
+ unsigned long freeram; /* Available memory size */
+ unsigned long sharedram; /* Amount of shared memory */
+ unsigned long bufferram; /* Memory used by buffers */
+ unsigned long totalswap; /* Total swap space size */
+ unsigned long freeswap; /* swap space still available */
+ __u16 procs; /* Number of current processes */
+ __u16 pad; /* Explicit padding for m68k */
+ unsigned long totalhigh; /* Total high memory size */
+ unsigned long freehigh; /* Available high memory size */
+ __u32 mem_unit; /* Memory unit size in bytes */
+};
+
+
+
+static inline void si_meminfo(struct sysinfo *val)
+{
+ BUG_ON(syscall(SYS_sysinfo, val));
+}
+
+#endif /* _TOOLS_LINUX_MM_H */
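Usage sketch (not part of the patch): the shim forwards to the sysinfo(2) syscall, so the total amount of memory in bytes is totalram scaled by mem_unit.

static u64 total_ram_bytes(void)
{
	struct sysinfo si;

	si_meminfo(&si);			/* BUGs on syscall failure */
	return (u64) si.totalram * si.mem_unit;
}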
diff --git a/include/linux/module.h b/include/linux/module.h
index 812aa350..42d4e18a 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -3,7 +3,6 @@
#include <linux/stat.h>
#include <linux/compiler.h>
-#include <linux/moduleparam.h>
#include <linux/export.h>
struct module;
@@ -43,4 +42,7 @@ static inline void module_put(struct module *module)
{
}
+#define module_param_named(name, value, type, perm)
+#define MODULE_PARM_DESC(_parm, desc)
+
#endif /* _LINUX_MODULE_H */
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
deleted file mode 100644
index 6002673f..00000000
--- a/include/linux/moduleparam.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _LINUX_MODULE_PARAMS_H
-#define _LINUX_MODULE_PARAMS_H
-
-#define module_param_named(name, value, type, perm)
-#define MODULE_PARM_DESC(_parm, desc)
-
-#endif /* _LINUX_MODULE_PARAMS_H */
diff --git a/include/linux/overflow.h b/include/linux/overflow.h
new file mode 100644
index 00000000..ba30f77e
--- /dev/null
+++ b/include/linux/overflow.h
@@ -0,0 +1,345 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+#ifndef __LINUX_OVERFLOW_H
+#define __LINUX_OVERFLOW_H
+
+#include <linux/compiler.h>
+#include <linux/limits.h>
+
+/*
+ * In the fallback code below, we need to compute the minimum and
+ * maximum values representable in a given type. These macros may also
+ * be useful elsewhere, so we provide them outside the
+ * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block.
+ *
+ * It would seem more obvious to do something like
+ *
+ * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0)
+ * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0)
+ *
+ * Unfortunately, the middle expressions, strictly speaking, have
+ * undefined behaviour, and at least some versions of gcc warn about
+ * the type_max expression (but not if -fsanitize=undefined is in
+ * effect; in that case, the warning is deferred to runtime...).
+ *
+ * The slightly excessive casting in type_min is to make sure the
+ * macros also produce sensible values for the exotic type _Bool. [The
+ * overflow checkers only almost work for _Bool, but that's
+ * a-feature-not-a-bug, since people shouldn't be doing arithmetic on
+ * _Bools. Besides, the gcc builtins don't allow _Bool* as third
+ * argument.]
+ *
+ * Idea stolen from
+ * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html -
+ * credit to Christian Biere.
+ */
+#define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type)))
+#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T)))
+#define type_min(T) ((T)((T)-type_max(T)-(T)1))
+
+/*
+ * Avoids triggering -Wtype-limits compilation warning,
+ * while using unsigned data types to check a < 0.
+ */
+#define is_non_negative(a) ((a) > 0 || (a) == 0)
+#define is_negative(a) (!(is_non_negative(a)))
+
+/*
+ * Allows for effectively applying __must_check to a macro so we can have
+ * both the type-agnostic benefits of the macros while also being able to
+ * enforce that the return value is, in fact, checked.
+ */
+static inline bool __must_check __must_check_overflow(bool overflow)
+{
+ return unlikely(overflow);
+}
+
+#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW
+/*
+ * For simplicity and code hygiene, the fallback code below insists on
+ * a, b and *d having the same type (similar to the min() and max()
+ * macros), whereas gcc's type-generic overflow checkers accept
+ * different types. Hence we don't just make check_add_overflow an
+ * alias for __builtin_add_overflow, but add type checks similar to
+ * below.
+ */
+#define check_add_overflow(a, b, d) __must_check_overflow(({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ __builtin_add_overflow(__a, __b, __d); \
+}))
+
+#define check_sub_overflow(a, b, d) __must_check_overflow(({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ __builtin_sub_overflow(__a, __b, __d); \
+}))
+
+#define check_mul_overflow(a, b, d) __must_check_overflow(({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ __builtin_mul_overflow(__a, __b, __d); \
+}))
+
+#else
+
+
+/* Checking for unsigned overflow is relatively easy without causing UB. */
+#define __unsigned_add_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = __a + __b; \
+ *__d < __a; \
+})
+#define __unsigned_sub_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = __a - __b; \
+ __a < __b; \
+})
+/*
+ * If one of a or b is a compile-time constant, this avoids a division.
+ */
+#define __unsigned_mul_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = __a * __b; \
+ __builtin_constant_p(__b) ? \
+ __b > 0 && __a > type_max(typeof(__a)) / __b : \
+ __a > 0 && __b > type_max(typeof(__b)) / __a; \
+})
+
+/*
+ * For signed types, detecting overflow is much harder, especially if
+ * we want to avoid UB. But the interface of these macros is such that
+ * we must provide a result in *d, and in fact we must produce the
+ * result promised by gcc's builtins, which is simply the possibly
+ * wrapped-around value. Fortunately, we can just formally do the
+ * operations in the widest relevant unsigned type (u64) and then
+ * truncate the result - gcc is smart enough to generate the same code
+ * with and without the (u64) casts.
+ */
+
+/*
+ * Adding two signed integers can overflow only if they have the same
+ * sign, and overflow has happened iff the result has the opposite
+ * sign.
+ */
+#define __signed_add_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = (u64)__a + (u64)__b; \
+ (((~(__a ^ __b)) & (*__d ^ __a)) \
+ & type_min(typeof(__a))) != 0; \
+})
+
+/*
+ * Subtraction is similar, except that overflow can now happen only
+ * when the signs are opposite. In this case, overflow has happened if
+ * the result has the opposite sign of a.
+ */
+#define __signed_sub_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = (u64)__a - (u64)__b; \
+ ((((__a ^ __b)) & (*__d ^ __a)) \
+ & type_min(typeof(__a))) != 0; \
+})
+
+/*
+ * Signed multiplication is rather hard. gcc always follows C99, so
+ * division is truncated towards 0. This means that we can write the
+ * overflow check like this:
+ *
+ * (a > 0 && (b > MAX/a || b < MIN/a)) ||
+ * (a < -1 && (b > MIN/a || b < MAX/a) ||
+ * (a == -1 && b == MIN)
+ *
+ * The redundant casts of -1 are to silence an annoying -Wtype-limits
+ * (included in -Wextra) warning: When the type is u8 or u16, the
+ * __b_c_e in check_mul_overflow obviously selects
+ * __unsigned_mul_overflow, but unfortunately gcc still parses this
+ * code and warns about the limited range of __b.
+ */
+
+#define __signed_mul_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ typeof(a) __tmax = type_max(typeof(a)); \
+ typeof(a) __tmin = type_min(typeof(a)); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = (u64)__a * (u64)__b; \
+ (__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \
+ (__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \
+ (__b == (typeof(__b))-1 && __a == __tmin); \
+})
+
+
+#define check_add_overflow(a, b, d) __must_check_overflow( \
+ __builtin_choose_expr(is_signed_type(typeof(a)), \
+ __signed_add_overflow(a, b, d), \
+ __unsigned_add_overflow(a, b, d)))
+
+#define check_sub_overflow(a, b, d) __must_check_overflow( \
+ __builtin_choose_expr(is_signed_type(typeof(a)), \
+ __signed_sub_overflow(a, b, d), \
+ __unsigned_sub_overflow(a, b, d)))
+
+#define check_mul_overflow(a, b, d) __must_check_overflow( \
+ __builtin_choose_expr(is_signed_type(typeof(a)), \
+ __signed_mul_overflow(a, b, d), \
+ __unsigned_mul_overflow(a, b, d)))
+
+#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */
+
+/** check_shl_overflow() - Calculate a left-shifted value and check overflow
+ *
+ * @a: Value to be shifted
+ * @s: How many bits left to shift
+ * @d: Pointer to where to store the result
+ *
+ * Computes *@d = (@a << @s)
+ *
+ * Returns true if '*d' cannot hold the result or when 'a << s' doesn't
+ * make sense. Example conditions:
+ * - 'a << s' causes bits to be lost when stored in *d.
+ * - 's' is garbage (e.g. negative) or so large that the result of
+ * 'a << s' is guaranteed to be 0.
+ * - 'a' is negative.
+ * - 'a << s' sets the sign bit, if any, in '*d'.
+ *
+ * '*d' will hold the results of the attempted shift, but is not
+ * considered "safe for use" if false is returned.
+ */
+#define check_shl_overflow(a, s, d) __must_check_overflow(({ \
+ typeof(a) _a = a; \
+ typeof(s) _s = s; \
+ typeof(d) _d = d; \
+ u64 _a_full = _a; \
+ unsigned int _to_shift = \
+ is_non_negative(_s) && _s < 8 * sizeof(*d) ? _s : 0; \
+ *_d = (_a_full << _to_shift); \
+ (_to_shift != _s || is_negative(*_d) || is_negative(_a) || \
+ (*_d >> _to_shift) != _a); \
+}))
+
+/**
+ * array_size() - Calculate size of 2-dimensional array.
+ *
+ * @a: dimension one
+ * @b: dimension two
+ *
+ * Calculates size of 2-dimensional array: @a * @b.
+ *
+ * Returns: number of bytes needed to represent the array or SIZE_MAX on
+ * overflow.
+ */
+static inline __must_check size_t array_size(size_t a, size_t b)
+{
+ size_t bytes;
+
+ if (check_mul_overflow(a, b, &bytes))
+ return SIZE_MAX;
+
+ return bytes;
+}
+
+/**
+ * array3_size() - Calculate size of 3-dimensional array.
+ *
+ * @a: dimension one
+ * @b: dimension two
+ * @c: dimension three
+ *
+ * Calculates size of 3-dimensional array: @a * @b * @c.
+ *
+ * Returns: number of bytes needed to represent the array or SIZE_MAX on
+ * overflow.
+ */
+static inline __must_check size_t array3_size(size_t a, size_t b, size_t c)
+{
+ size_t bytes;
+
+ if (check_mul_overflow(a, b, &bytes))
+ return SIZE_MAX;
+ if (check_mul_overflow(bytes, c, &bytes))
+ return SIZE_MAX;
+
+ return bytes;
+}
+
+/*
+ * Compute a*b+c, returning SIZE_MAX on overflow. Internal helper for
+ * struct_size() below.
+ */
+static inline __must_check size_t __ab_c_size(size_t a, size_t b, size_t c)
+{
+ size_t bytes;
+
+ if (check_mul_overflow(a, b, &bytes))
+ return SIZE_MAX;
+ if (check_add_overflow(bytes, c, &bytes))
+ return SIZE_MAX;
+
+ return bytes;
+}
+
+/**
+ * struct_size() - Calculate size of structure with trailing array.
+ * @p: Pointer to the structure.
+ * @member: Name of the array member.
+ * @count: Number of elements in the array.
+ *
+ * Calculates size of memory needed for structure @p followed by an
+ * array of @count number of @member elements.
+ *
+ * Return: number of bytes needed or SIZE_MAX on overflow.
+ */
+#define struct_size(p, member, count) \
+ __ab_c_size(count, \
+ sizeof(*(p)->member) + __must_be_array((p)->member),\
+ sizeof(*(p)))
+
+/**
+ * flex_array_size() - Calculate size of a flexible array member
+ * within an enclosing structure.
+ *
+ * @p: Pointer to the structure.
+ * @member: Name of the flexible array member.
+ * @count: Number of elements in the array.
+ *
+ * Calculates size of a flexible array of @count number of @member
+ * elements, at the end of structure @p.
+ *
+ * Return: number of bytes needed or SIZE_MAX on overflow.
+ */
+#define flex_array_size(p, member, count) \
+ array_size(count, \
+ sizeof(*(p)->member) + __must_be_array((p)->member))
+
+#endif /* __LINUX_OVERFLOW_H */
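A hedged sketch of typical callers of the helpers above (not part of the patch). The struct and function names are illustrative, and kzalloc()/GFP_KERNEL are assumed from the slab shim elsewhere in this tree:

struct sample_buf {
	u32		nr;
	u64		entries[];	/* flexible array member */
};

static struct sample_buf *sample_buf_alloc(size_t nr)
{
	/* sizeof(struct sample_buf) + nr * sizeof(u64), or SIZE_MAX on overflow */
	size_t bytes = struct_size((struct sample_buf *) NULL, entries, nr);

	if (bytes == SIZE_MAX)
		return NULL;
	return kzalloc(bytes, GFP_KERNEL);
}

static bool sum_fits(u32 a, u32 b, u32 *res)
{
	/*
	 * check_add_overflow() returns true on overflow (and *res then holds
	 * the wrapped value), so this returns true only when *res is usable.
	 */
	return !check_add_overflow(a, b, res);
}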
diff --git a/include/linux/page.h b/include/linux/page.h
index 87be064f..111e5e68 100644
--- a/include/linux/page.h
+++ b/include/linux/page.h
@@ -12,6 +12,11 @@ struct page;
#endif
+#ifndef PAGE_SHIFT
+#define PAGE_SHIFT 12
+#endif
+
+
#define virt_to_page(p) \
((struct page *) (((unsigned long) (p)) & PAGE_MASK))
#define offset_in_page(p) ((unsigned long) (p) & ~PAGE_MASK)
@@ -21,6 +26,11 @@ struct page;
#define kmap_atomic(page) page_address(page)
#define kunmap_atomic(addr) do {} while (0)
+#define kmap_local_page(page) page_address(page)
+#define kunmap_local(addr) do {} while (0)
+
+#define PageHighMem(page) false
+
static const char zero_page[PAGE_SIZE];
#define ZERO_PAGE(o) ((struct page *) &zero_page[0])
diff --git a/include/linux/path.h b/include/linux/path.h
deleted file mode 100644
index d1372186..00000000
--- a/include/linux/path.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef _LINUX_PATH_H
-#define _LINUX_PATH_H
-
-struct dentry;
-struct vfsmount;
-
-struct path {
- struct vfsmount *mnt;
- struct dentry *dentry;
-};
-
-extern void path_get(const struct path *);
-extern void path_put(const struct path *);
-
-static inline int path_equal(const struct path *path1, const struct path *path2)
-{
- return path1->mnt == path2->mnt && path1->dentry == path2->dentry;
-}
-
-#endif /* _LINUX_PATH_H */
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index c233e3ce..153251c0 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -7,63 +7,49 @@
#include <linux/preempt.h>
struct percpu_rw_semaphore {
- pthread_rwlock_t lock;
+ pthread_mutex_t lock;
};
-#define DEFINE_STATIC_PERCPU_RWSEM(name) \
-static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name); \
-static struct percpu_rw_semaphore name = { \
- .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC), \
- .read_count = &__percpu_rwsem_rc_##name, \
- .rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \
- .writer = __RCUWAIT_INITIALIZER(name.writer), \
-}
-
-extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
-extern void __percpu_up_read(struct percpu_rw_semaphore *);
-
static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *sem)
{
- pthread_rwlock_rdlock(&sem->lock);
- preempt_disable();
+ pthread_mutex_lock(&sem->lock);
}
static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
{
- pthread_rwlock_rdlock(&sem->lock);
+ pthread_mutex_lock(&sem->lock);
}
static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
- return !pthread_rwlock_tryrdlock(&sem->lock);
+ return !pthread_mutex_trylock(&sem->lock);
}
static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem)
{
- preempt_enable();
- pthread_rwlock_unlock(&sem->lock);
+ pthread_mutex_unlock(&sem->lock);
}
static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
{
- pthread_rwlock_unlock(&sem->lock);
+ pthread_mutex_unlock(&sem->lock);
}
static inline void percpu_down_write(struct percpu_rw_semaphore *sem)
{
- pthread_rwlock_wrlock(&sem->lock);
+ pthread_mutex_lock(&sem->lock);
}
static inline void percpu_up_write(struct percpu_rw_semaphore *sem)
{
- pthread_rwlock_unlock(&sem->lock);
+ pthread_mutex_unlock(&sem->lock);
}
static inline void percpu_free_rwsem(struct percpu_rw_semaphore *sem) {}
static inline int percpu_init_rwsem(struct percpu_rw_semaphore *sem)
{
- pthread_rwlock_init(&sem->lock, NULL);
+ pthread_mutex_init(&sem->lock, NULL);
return 0;
}
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index ad249776..740d8332 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -1,6 +1,8 @@
#ifndef __TOOLS_LINUX_PERCPU_H
#define __TOOLS_LINUX_PERCPU_H
+#include <linux/cpumask.h>
+
#define __percpu
#define free_percpu(percpu) free(percpu)
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 51334ede..331a9a99 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_POISON_H
#define _LINUX_POISON_H
@@ -20,21 +21,13 @@
* non-initialized list entries.
*/
#define LIST_POISON1 ((void *) 0x100 + POISON_POINTER_DELTA)
-#define LIST_POISON2 ((void *) 0x200 + POISON_POINTER_DELTA)
+#define LIST_POISON2 ((void *) 0x122 + POISON_POINTER_DELTA)
/********** include/linux/timer.h **********/
-/*
- * Magic number "tsta" to indicate a static timer initializer
- * for the object debugging code.
- */
#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA)
-/********** mm/debug-pagealloc.c **********/
-#ifdef CONFIG_PAGE_POISONING_ZERO
-#define PAGE_POISON 0x00
-#else
+/********** mm/page_poison.c **********/
#define PAGE_POISON 0xaa
-#endif
/********** mm/page_alloc.c ************/
@@ -45,11 +38,8 @@
* Magic nums for obj red zoning.
* Placed in the first word before and the first word after an obj.
*/
-#define RED_INACTIVE 0x09F911029D74E35BULL /* when obj is inactive */
-#define RED_ACTIVE 0xD84156C5635688C0ULL /* when obj is active */
-
-#define SLUB_RED_INACTIVE 0xbb
-#define SLUB_RED_ACTIVE 0xcc
+#define SLUB_RED_INACTIVE 0xbb /* when obj is inactive */
+#define SLUB_RED_ACTIVE 0xcc /* when obj is active */
/* ...and for poisoning */
#define POISON_INUSE 0x5a /* for use-uninitialised poisoning */
@@ -59,12 +49,6 @@
/********** arch/$ARCH/mm/init.c **********/
#define POISON_FREE_INITMEM 0xcc
-/********** arch/ia64/hp/common/sba_iommu.c **********/
-/*
- * arch/ia64/hp/common/sba_iommu.c uses a 16-byte poison string with a
- * value of "SBAIOMMU POISON\0" for spill-over poisoning.
- */
-
/********** fs/jbd/journal.c **********/
#define JBD_POISON_FREE 0x5b
#define JBD2_POISON_FREE 0x5c
@@ -80,11 +64,26 @@
/********** kernel/mutexes **********/
#define MUTEX_DEBUG_INIT 0x11
#define MUTEX_DEBUG_FREE 0x22
-
-/********** lib/flex_array.c **********/
-#define FLEX_ARRAY_FREE 0x6c /* for use-after-free poisoning */
+#define MUTEX_POISON_WW_CTX ((void *) 0x500 + POISON_POINTER_DELTA)
/********** security/ **********/
#define KEY_DESTROY 0xbd
+/********** net/core/page_pool.c **********/
+#define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA)
+
+/********** net/core/skbuff.c **********/
+#define SKB_LIST_POISON_NEXT ((void *)(0x800 + POISON_POINTER_DELTA))
+/********** net/ **********/
+#define NET_PTR_POISON ((void *)(0x801 + POISON_POINTER_DELTA))
+
+/********** kernel/bpf/ **********/
+#define BPF_PTR_POISON ((void *)(0xeB9FUL + POISON_POINTER_DELTA))
+
+/********** VFS **********/
+#define VFS_PTR_POISON ((void *)(0xF5 + POISON_POINTER_DELTA))
+
+/********** lib/stackdepot.c **********/
+#define STACK_DEPOT_POISON ((void *)(0xD390 + POISON_POINTER_DELTA))
+
#endif
diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h
index 65beeb14..a8dad160 100644
--- a/include/linux/posix_acl_xattr.h
+++ b/include/linux/posix_acl_xattr.h
@@ -28,7 +28,7 @@ typedef struct {
posix_acl_xattr_entry a_entries[0];
} posix_acl_xattr_header;
-extern const struct xattr_handler posix_acl_access_xattr_handler;
-extern const struct xattr_handler posix_acl_default_xattr_handler;
+extern const struct xattr_handler nop_posix_acl_access;
+extern const struct xattr_handler nop_posix_acl_default;
#endif /* _POSIX_ACL_XATTR_H */
diff --git a/include/linux/prandom.h b/include/linux/prandom.h
new file mode 100644
index 00000000..9aea22dc
--- /dev/null
+++ b/include/linux/prandom.h
@@ -0,0 +1,33 @@
+#ifndef _LINUX_PRANDOM_H
+#define _LINUX_PRANDOM_H
+
+#include <linux/random.h>
+
+static inline void prandom_bytes(void *buf, int nbytes)
+{
+ return get_random_bytes(buf, nbytes);
+}
+
+#define prandom_type(type) \
+static inline type prandom_##type(void) \
+{ \
+ type v; \
+ \
+ prandom_bytes(&v, sizeof(v)); \
+ return v; \
+}
+
+prandom_type(int);
+prandom_type(long);
+prandom_type(u32);
+prandom_type(u64);
+#undef prandom_type
+
+static inline u32 prandom_u32_max(u32 max)
+{
+ return prandom_u32() % max;
+
+}
+
+#endif /* _LINUX_PRANDOM_H */
+
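Usage sketch (not part of the patch). Note that prandom_u32_max() is a plain modulo, so it has modulo bias whenever @max does not evenly divide 2^32; callers that care should prefer get_random_u32_below() from <linux/random.h>.

static u32 pick_bucket(u32 nr_buckets)
{
	/* pseudo-random index in [0, nr_buckets); slight modulo bias is acceptable here */
	return prandom_u32_max(nr_buckets);
}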
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 06186016..dbc7c24d 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -1,15 +1,16 @@
#ifndef __LINUX_PREEMPT_H
#define __LINUX_PREEMPT_H
-#define preempt_disable() barrier()
-#define sched_preempt_enable_no_resched() barrier()
-#define preempt_enable_no_resched() barrier()
-#define preempt_enable() barrier()
+extern void preempt_disable(void);
+extern void preempt_enable(void);
+
+#define sched_preempt_enable_no_resched() preempt_enable()
+#define preempt_enable_no_resched() preempt_enable()
#define preempt_check_resched() do { } while (0)
-#define preempt_disable_notrace() barrier()
-#define preempt_enable_no_resched_notrace() barrier()
-#define preempt_enable_notrace() barrier()
+#define preempt_disable_notrace() preempt_disable()
+#define preempt_enable_no_resched_notrace() preempt_enable()
+#define preempt_enable_notrace() preempt_enable()
#define preemptible() 0
#endif /* __LINUX_PREEMPT_H */
diff --git a/include/linux/prefetch.h b/include/linux/prefetch.h
index 13cb826d..b14fbe93 100644
--- a/include/linux/prefetch.h
+++ b/include/linux/prefetch.h
@@ -4,4 +4,7 @@
#define prefetch(p) \
({ __maybe_unused typeof(p) __var = (p); })
+#define prefetchw(p) \
+ ({ __maybe_unused typeof(p) __var = (p); })
+
#endif /* _LINUX_PREFETCH_H */
diff --git a/include/linux/pretty-printers.h b/include/linux/pretty-printers.h
new file mode 100644
index 00000000..f39d8edf
--- /dev/null
+++ b/include/linux/pretty-printers.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+/* Copyright (C) 2022 Kent Overstreet */
+
+#ifndef _LINUX_PRETTY_PRINTERS_H
+#define _LINUX_PRETTY_PRINTERS_H
+
+void prt_string_option(struct printbuf *, const char * const[], size_t);
+void prt_bitflags(struct printbuf *, const char * const[], u64);
+
+#endif /* _LINUX_PRETTY_PRINTERS_H */
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 4e29af49..cdafb9af 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -5,6 +5,7 @@
#define pr_fmt(fmt) fmt
#endif
+#include <linux/compiler.h>
#include <stdarg.h>
#include <stdio.h>
@@ -18,29 +19,35 @@
#define KERN_DEBUG ""
#define KERN_DEFAULT ""
#define KERN_CONT ""
+#define KERN_SOH "\001"
static inline int vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
{
- int i = vsnprintf(buf, size, fmt, args);
- ssize_t ssize = size;
+ int i;
- return (i >= ssize) ? (ssize - 1) : i;
+ i = vsnprintf(buf, size, fmt, args);
+
+ if (likely(i < size))
+ return i;
+ if (size != 0)
+ return size - 1;
+ return 0;
}
static inline int scnprintf(char * buf, size_t size, const char * fmt, ...)
{
- ssize_t ssize = size;
- va_list args;
- int i;
+ va_list args;
+ int i;
- va_start(args, fmt);
- i = vsnprintf(buf, size, fmt, args);
- va_end(args);
+ va_start(args, fmt);
+ i = vscnprintf(buf, size, fmt, args);
+ va_end(args);
- return (i >= ssize) ? (ssize - 1) : i;
+ return i;
}
#define printk(...) printf(__VA_ARGS__)
+#define vprintk(...) vprintf(__VA_ARGS__)
#define no_printk(fmt, ...) \
({ \
@@ -164,7 +171,6 @@ static inline int scnprintf(char * buf, size_t size, const char * fmt, ...)
* ratelimited messages with local ratelimit_state,
* no local ratelimit_state used in the !PRINTK case
*/
-#ifdef CONFIG_PRINTK
#define printk_ratelimited(fmt, ...) \
({ \
static DEFINE_RATELIMIT_STATE(_rs, \
@@ -174,10 +180,6 @@ static inline int scnprintf(char * buf, size_t size, const char * fmt, ...)
if (__ratelimit(&_rs)) \
printk(fmt, ##__VA_ARGS__); \
})
-#else
-#define printk_ratelimited(fmt, ...) \
- no_printk(fmt, ##__VA_ARGS__)
-#endif
#define pr_emerg_ratelimited(fmt, ...) \
printk_ratelimited(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
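A short sketch of the scnprintf() truncation semantics fixed above (editorial illustration, not part of the patch): unlike snprintf(), the return value is the number of bytes actually written into the buffer, never more.

static void scnprintf_example(void)
{
	char buf[8];
	int n;

	n = snprintf(buf, sizeof(buf), "%s", "hello world");	/* n == 11, buf == "hello w" */
	n = scnprintf(buf, sizeof(buf), "%s", "hello world");	/* n == 7,  buf == "hello w" */
	(void) n;
}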
diff --git a/include/linux/random.h b/include/linux/random.h
index 90fe5749..3203d13c 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -9,8 +9,9 @@
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bug.h>
+#include <linux/log2.h>
-#ifdef __NR_getrandom
+#ifdef SYS_getrandom
static inline int getrandom(void *buf, size_t buflen, unsigned int flags)
{
return syscall(SYS_getrandom, buf, buflen, flags);
@@ -29,11 +30,6 @@ static inline void get_random_bytes(void *buf, int nbytes)
BUG_ON(getrandom(buf, nbytes, 0) != nbytes);
}
-static inline void prandom_bytes(void *buf, int nbytes)
-{
- return get_random_bytes(buf, nbytes);
-}
-
#define get_random_type(type) \
static inline type get_random_##type(void) \
{ \
@@ -45,6 +41,30 @@ static inline type get_random_##type(void) \
get_random_type(int);
get_random_type(long);
+get_random_type(u8);
+get_random_type(u16);
+get_random_type(u32);
get_random_type(u64);
+static inline u32 get_random_u32_below(u32 ceil)
+{
+ if (ceil <= 1)
+ return 0;
+ for (;;) {
+ if (ceil <= 1U << 8) {
+ u32 mult = ceil * get_random_u8();
+ if (likely(is_power_of_2(ceil) || (u8)mult >= (1U << 8) % ceil))
+ return mult >> 8;
+ } else if (ceil <= 1U << 16) {
+ u32 mult = ceil * get_random_u16();
+ if (likely(is_power_of_2(ceil) || (u16)mult >= (1U << 16) % ceil))
+ return mult >> 16;
+ } else {
+ u64 mult = (u64)ceil * get_random_u32();
+ if (likely(is_power_of_2(ceil) || (u32)mult >= -ceil % ceil))
+ return mult >> 32;
+ }
+ }
+}
+
#endif /* _LINUX_RANDOM_H */
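Usage sketch (not part of the patch): get_random_u32_below() uses the multiply-and-reject construction shown above (Lemire's method), giving an unbiased result in [0, ceil) without a modulo. One common caller is an in-place shuffle:

static void shuffle_u32(u32 *a, unsigned int n)
{
	/* Fisher-Yates shuffle; each index is drawn uniformly, no modulo bias */
	while (n > 1) {
		u32 j = get_random_u32_below(n--);
		u32 tmp = a[n];

		a[n] = a[j];
		a[j] = tmp;
	}
}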
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index c99d78a8..6b7785e6 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -4,13 +4,46 @@
#include <urcu.h>
#include <linux/compiler.h>
+#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
+
#define rcu_dereference_check(p, c) rcu_dereference(p)
#define rcu_dereference_raw(p) rcu_dereference(p)
#define rcu_dereference_protected(p, c) rcu_dereference(p)
#define rcu_access_pointer(p) READ_ONCE(p)
#define kfree_rcu(ptr, rcu_head) kfree(ptr) /* XXX */
+#define kfree_rcu_mightsleep(ptr) kfree(ptr) /* XXX */
+#define kvfree_rcu(ptr, rcu_head) kfree(ptr) /* XXX */
+#define kvfree_rcu_mightsleep(ptr) kfree(ptr) /* XXX */
#define RCU_INIT_POINTER(p, v) WRITE_ONCE(p, v)
+/* Has the specified rcu_head structure been handed to call_rcu()? */
+
+/**
+ * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu()
+ * @rhp: The rcu_head structure to initialize.
+ *
+ * If you intend to invoke rcu_head_after_call_rcu() to test whether a
+ * given rcu_head structure has already been passed to call_rcu(), then
+ * you must also invoke this rcu_head_init() function on it just after
+ * allocating that structure. Calls to this function must not race with
+ * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation.
+ */
+static inline void rcu_head_init(struct rcu_head *rhp)
+{
+ rhp->func = (void *)~0L;
+}
+
+static inline bool
+rcu_head_after_call_rcu(struct rcu_head *rhp,
+ void (*f)(struct rcu_head *head))
+{
+ void (*func)(struct rcu_head *head) = READ_ONCE(rhp->func);
+
+ if (func == f)
+ return true;
+ return false;
+}
+
#endif /* __TOOLS_LINUX_RCUPDATE_H */
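A hedged sketch of the rcu_head_after_call_rcu() pattern documented above (not part of the patch); the object type and free callback are illustrative, and call_rcu() is assumed from the urcu backend this header wraps:

struct cached_obj {
	struct rcu_head	rcu;
	/* ... payload ... */
};

static void cached_obj_free_rcu(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct cached_obj, rcu));
}

static void cached_obj_init(struct cached_obj *obj)
{
	/* must happen right after allocation for the check below to be valid */
	rcu_head_init(&obj->rcu);
}

static void cached_obj_release(struct cached_obj *obj)
{
	/* skip if this rcu_head was already handed to call_rcu() */
	if (!rcu_head_after_call_rcu(&obj->rcu, cached_obj_free_rcu))
		call_rcu(&obj->rcu, cached_obj_free_rcu);
}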
diff --git a/include/linux/refcount.h b/include/linux/refcount.h
new file mode 100644
index 00000000..ddeec986
--- /dev/null
+++ b/include/linux/refcount.h
@@ -0,0 +1,352 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Variant of atomic_t specialized for reference counts.
+ *
+ * The interface matches the atomic_t interface (to aid in porting) but only
+ * provides the few functions one should use for reference counting.
+ *
+ * Saturation semantics
+ * ====================
+ *
+ * refcount_t differs from atomic_t in that the counter saturates at
+ * REFCOUNT_SATURATED and will not move once there. This avoids wrapping the
+ * counter and causing 'spurious' use-after-free issues. In order to avoid the
+ * cost associated with introducing cmpxchg() loops into all of the saturating
+ * operations, we temporarily allow the counter to take on an unchecked value
+ * and then explicitly set it to REFCOUNT_SATURATED on detecting that underflow
+ * or overflow has occurred. Although this is racy when multiple threads
+ * access the refcount concurrently, by placing REFCOUNT_SATURATED roughly
+ * equidistant from 0 and INT_MAX we minimise the scope for error:
+ *
+ * INT_MAX REFCOUNT_SATURATED UINT_MAX
+ * 0 (0x7fff_ffff) (0xc000_0000) (0xffff_ffff)
+ * +--------------------------------+----------------+----------------+
+ * <---------- bad value! ---------->
+ *
+ * (in a signed view of the world, the "bad value" range corresponds to
+ * a negative counter value).
+ *
+ * As an example, consider a refcount_inc() operation that causes the counter
+ * to overflow:
+ *
+ * int old = atomic_fetch_add_relaxed(r);
+ * // old is INT_MAX, refcount now INT_MIN (0x8000_0000)
+ * if (old < 0)
+ * atomic_set(r, REFCOUNT_SATURATED);
+ *
+ * If another thread also performs a refcount_inc() operation between the two
+ * atomic operations, then the count will continue to edge closer to 0. If it
+ * reaches a value of 1 before /any/ of the threads reset it to the saturated
+ * value, then a concurrent refcount_dec_and_test() may erroneously free the
+ * underlying object.
+ * Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently
+ * 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK).
+ * With the current PID limit, if no batched refcounting operations are used and
+ * the attacker can't repeatedly trigger kernel oopses in the middle of refcount
+ * operations, this makes it impossible for a saturated refcount to leave the
+ * saturation range, even if it is possible for multiple uses of the same
+ * refcount to nest in the context of a single task:
+ *
+ * (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT =
+ * 0x40000000 / 0x400000 = 0x100 = 256
+ *
+ * If hundreds of references are added/removed with a single refcounting
+ * operation, it may potentially be possible to leave the saturation range; but
+ * given the precise timing details involved with the round-robin scheduling of
+ * each thread manipulating the refcount and the need to hit the race multiple
+ * times in succession, there doesn't appear to be a practical avenue of attack
+ * even if using refcount_add() operations with larger increments.
+ *
+ * Memory ordering
+ * ===============
+ *
+ * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
+ * and provide only what is strictly required for refcounts.
+ *
+ * The increments are fully relaxed; these will not provide ordering. The
+ * rationale is that whatever is used to obtain the object we're increasing the
+ * reference count on will provide the ordering. For locked data structures,
+ * it's the lock acquire; for RCU/lockless data structures it's the dependent
+ * load.
+ *
+ * Do note that inc_not_zero() provides a control dependency which will order
+ * future stores against the inc; this ensures we'll never modify the
+ * if we did not in fact acquire a reference.
+ *
+ * The decrements will provide release order, such that all the prior loads and
+ * stores will be issued before; it also provides a control dependency, which
+ * will order us against the subsequent free().
+ *
+ * The control dependency is against the load of the cmpxchg (ll/sc) that
+ * succeeded. This means the stores aren't fully ordered, but this is fine
+ * because the 1->0 transition indicates no concurrency.
+ *
+ * Note that the allocator is responsible for ordering things between free()
+ * and alloc().
+ *
+ * The decrements dec_and_test() and sub_and_test() also provide acquire
+ * ordering on success.
+ *
+ */
+
+#ifndef _LINUX_REFCOUNT_H
+#define _LINUX_REFCOUNT_H
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/compiler.h>
+#include <linux/limits.h>
+
+struct mutex;
+
+/**
+ * typedef refcount_t - variant of atomic_t specialized for reference counts
+ * @refs: atomic_t counter field
+ *
+ * The counter saturates at REFCOUNT_SATURATED and will not move once
+ * there. This avoids wrapping the counter and causing 'spurious'
+ * use-after-free bugs.
+ */
+typedef struct refcount_struct {
+ atomic_t refs;
+} refcount_t;
+
+#define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), }
+#define REFCOUNT_MAX INT_MAX
+#define REFCOUNT_SATURATED (INT_MIN / 2)
+
+enum refcount_saturation_type {
+ REFCOUNT_ADD_NOT_ZERO_OVF,
+ REFCOUNT_ADD_OVF,
+ REFCOUNT_ADD_UAF,
+ REFCOUNT_SUB_UAF,
+ REFCOUNT_DEC_LEAK,
+};
+
+/**
+ * refcount_set - set a refcount's value
+ * @r: the refcount
+ * @n: value to which the refcount will be set
+ */
+static inline void refcount_set(refcount_t *r, int n)
+{
+ atomic_set(&r->refs, n);
+}
+
+/**
+ * refcount_read - get a refcount's value
+ * @r: the refcount
+ *
+ * Return: the refcount's value
+ */
+static inline unsigned int refcount_read(const refcount_t *r)
+{
+ return atomic_read(&r->refs);
+}
+
+static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp)
+{
+ int old = refcount_read(r);
+
+ do {
+ if (!old)
+ break;
+ } while (!atomic_try_cmpxchg_acquire(&r->refs, &old, old + i));
+
+ if (oldp)
+ *oldp = old;
+
+ return old;
+}
+
+/**
+ * refcount_add_not_zero - add a value to a refcount unless it is 0
+ * @i: the value to add to the refcount
+ * @r: the refcount
+ *
+ * Will saturate at REFCOUNT_SATURATED and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller has guaranteed the
+ * object memory to be stable (RCU, etc.). It does provide a control dependency
+ * and thereby orders future stores. See the comment on top.
+ *
+ * Use of this function is not recommended for the normal reference counting
+ * use case in which references are taken and released one at a time. In these
+ * cases, refcount_inc(), or one of its variants, should instead be used to
+ * increment a reference count.
+ *
+ * Return: false if the passed refcount is 0, true otherwise
+ */
+static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r)
+{
+ return __refcount_add_not_zero(i, r, NULL);
+}
+
+static inline void __refcount_add(int i, refcount_t *r, int *oldp)
+{
+ int old = atomic_add_return(i, &r->refs);
+
+ if (oldp)
+ *oldp = old;
+}
+
+/**
+ * refcount_add - add a value to a refcount
+ * @i: the value to add to the refcount
+ * @r: the refcount
+ *
+ * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller has guaranteed the
+ * object memory to be stable (RCU, etc.). It does provide a control dependency
+ * and thereby orders future stores. See the comment on top.
+ *
+ * Use of this function is not recommended for the normal reference counting
+ * use case in which references are taken and released one at a time. In these
+ * cases, refcount_inc(), or one of its variants, should instead be used to
+ * increment a reference count.
+ */
+static inline void refcount_add(int i, refcount_t *r)
+{
+ __refcount_add(i, r, NULL);
+}
+
+static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp)
+{
+ return __refcount_add_not_zero(1, r, oldp);
+}
+
+/**
+ * refcount_inc_not_zero - increment a refcount unless it is 0
+ * @r: the refcount to increment
+ *
+ * Similar to atomic_inc_not_zero(), but will saturate at REFCOUNT_SATURATED
+ * and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller has guaranteed the
+ * object memory to be stable (RCU, etc.). It does provide a control dependency
+ * and thereby orders future stores. See the comment on top.
+ *
+ * Return: true if the increment was successful, false otherwise
+ */
+static inline __must_check bool refcount_inc_not_zero(refcount_t *r)
+{
+ return __refcount_inc_not_zero(r, NULL);
+}
+
+static inline void __refcount_inc(refcount_t *r, int *oldp)
+{
+ __refcount_add(1, r, oldp);
+}
+
+/**
+ * refcount_inc - increment a refcount
+ * @r: the refcount to increment
+ *
+ * Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller already has a
+ * reference on the object.
+ *
+ * Will WARN if the refcount is 0, as this represents a possible use-after-free
+ * condition.
+ */
+static inline void refcount_inc(refcount_t *r)
+{
+ __refcount_inc(r, NULL);
+}
+
+static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp)
+{
+ int old = atomic_sub_return_release(i, &r->refs);
+
+ if (oldp)
+ *oldp = old;
+
+ if (old == i) {
+ smp_acquire__after_ctrl_dep();
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * refcount_sub_and_test - subtract from a refcount and test if it is 0
+ * @i: amount to subtract from the refcount
+ * @r: the refcount
+ *
+ * Similar to atomic_dec_and_test(), but it will WARN, return false and
+ * ultimately leak on underflow and will fail to decrement when saturated
+ * at REFCOUNT_SATURATED.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides an acquire ordering on success such that free()
+ * must come after.
+ *
+ * Use of this function is not recommended for the normal reference counting
+ * use case in which references are taken and released one at a time. In these
+ * cases, refcount_dec(), or one of its variants, should instead be used to
+ * decrement a reference count.
+ *
+ * Return: true if the resulting refcount is 0, false otherwise
+ */
+static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r)
+{
+ return __refcount_sub_and_test(i, r, NULL);
+}
+
+static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp)
+{
+ return __refcount_sub_and_test(1, r, oldp);
+}
+
+/**
+ * refcount_dec_and_test - decrement a refcount and test if it is 0
+ * @r: the refcount
+ *
+ * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to
+ * decrement when saturated at REFCOUNT_SATURATED.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides an acquire ordering on success such that free()
+ * must come after.
+ *
+ * Return: true if the resulting refcount is 0, false otherwise
+ */
+static inline __must_check bool refcount_dec_and_test(refcount_t *r)
+{
+ return __refcount_dec_and_test(r, NULL);
+}
+
+static inline void __refcount_dec(refcount_t *r, int *oldp)
+{
+ int old = atomic_sub_return_release(1, &r->refs);
+
+ if (oldp)
+ *oldp = old;
+}
+
+/**
+ * refcount_dec - decrement a refcount
+ * @r: the refcount
+ *
+ * Similar to atomic_dec(), it will WARN on underflow and fail to decrement
+ * when saturated at REFCOUNT_SATURATED.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before.
+ */
+static inline void refcount_dec(refcount_t *r)
+{
+ __refcount_dec(r, NULL);
+}
+
+extern __must_check bool refcount_dec_if_one(refcount_t *r);
+extern __must_check bool refcount_dec_not_one(refcount_t *r);
+extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) __cond_acquires(lock);
+extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) __cond_acquires(lock);
+extern __must_check bool refcount_dec_and_lock_irqsave(refcount_t *r,
+ spinlock_t *lock,
+ unsigned long *flags) __cond_acquires(lock);
+#endif /* _LINUX_REFCOUNT_H */
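The canonical get/put pattern for the API above, as a sketch (not part of the patch); the object type is illustrative and kzalloc()/kfree() are assumed from the slab shim:

struct cache_entry {
	refcount_t	ref;
	/* ... */
};

static struct cache_entry *cache_entry_alloc(void)
{
	struct cache_entry *e = kzalloc(sizeof(*e), GFP_KERNEL);

	if (e)
		refcount_set(&e->ref, 1);	/* caller holds the initial reference */
	return e;
}

static void cache_entry_get(struct cache_entry *e)
{
	refcount_inc(&e->ref);
}

static void cache_entry_put(struct cache_entry *e)
{
	/* release ordering before the test, acquire on the final 1->0 transition */
	if (refcount_dec_and_test(&e->ref))
		kfree(e);
}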
diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
new file mode 100644
index 00000000..57467cbf
--- /dev/null
+++ b/include/linux/rhashtable-types.h
@@ -0,0 +1,135 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Resizable, Scalable, Concurrent Hash Table
+ *
+ * Simple structures that might be needed in include
+ * files.
+ */
+
+#ifndef _LINUX_RHASHTABLE_TYPES_H
+#define _LINUX_RHASHTABLE_TYPES_H
+
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+
+struct rhash_head {
+ struct rhash_head __rcu *next;
+};
+
+struct rhlist_head {
+ struct rhash_head rhead;
+ struct rhlist_head __rcu *next;
+};
+
+struct bucket_table;
+
+/**
+ * struct rhashtable_compare_arg - Key for the function rhashtable_compare
+ * @ht: Hash table
+ * @key: Key to compare against
+ */
+struct rhashtable_compare_arg {
+ struct rhashtable *ht;
+ const void *key;
+};
+
+typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed);
+typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed);
+typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
+ const void *obj);
+
+/**
+ * struct rhashtable_params - Hash table construction parameters
+ * @nelem_hint: Hint on number of elements, should be 75% of desired size
+ * @key_len: Length of key
+ * @key_offset: Offset of key in struct to be hashed
+ * @head_offset: Offset of rhash_head in struct to be hashed
+ * @max_size: Maximum size while expanding
+ * @min_size: Minimum size while shrinking
+ * @automatic_shrinking: Enable automatic shrinking of tables
+ * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
+ * @obj_hashfn: Function to hash object
+ * @obj_cmpfn: Function to compare key with object
+ */
+struct rhashtable_params {
+ u16 nelem_hint;
+ u16 key_len;
+ u16 key_offset;
+ u16 head_offset;
+ unsigned int max_size;
+ u16 min_size;
+ bool automatic_shrinking;
+ rht_hashfn_t hashfn;
+ rht_obj_hashfn_t obj_hashfn;
+ rht_obj_cmpfn_t obj_cmpfn;
+};
+
+/**
+ * struct rhashtable - Hash table handle
+ * @tbl: Bucket table
+ * @key_len: Key length for hashfn
+ * @max_elems: Maximum number of elements in table
+ * @p: Configuration parameters
+ * @rhlist: True if this is an rhltable
+ * @run_work: Deferred worker to expand/shrink asynchronously
+ * @mutex: Mutex to protect current/future table swapping
+ * @lock: Spin lock to protect walker list
+ * @nelems: Number of elements in table
+ */
+struct rhashtable {
+ struct bucket_table __rcu *tbl;
+ unsigned int key_len;
+ unsigned int max_elems;
+ struct rhashtable_params p;
+ bool rhlist;
+ struct work_struct run_work;
+ struct mutex mutex;
+ spinlock_t lock;
+ atomic_t nelems;
+};
+
+/**
+ * struct rhltable - Hash table with duplicate objects in a list
+ * @ht: Underlying rhtable
+ */
+struct rhltable {
+ struct rhashtable ht;
+};
+
+/**
+ * struct rhashtable_walker - Hash table walker
+ * @list: List entry on list of walkers
+ * @tbl: The table that we were walking over
+ */
+struct rhashtable_walker {
+ struct list_head list;
+ struct bucket_table *tbl;
+};
+
+/**
+ * struct rhashtable_iter - Hash table iterator
+ * @ht: Table to iterate through
+ * @p: Current pointer
+ * @list: Current hash list pointer
+ * @walker: Associated rhashtable walker
+ * @slot: Current slot
+ * @skip: Number of entries to skip in slot
+ */
+struct rhashtable_iter {
+ struct rhashtable *ht;
+ struct rhash_head *p;
+ struct rhlist_head *list;
+ struct rhashtable_walker walker;
+ unsigned int slot;
+ unsigned int skip;
+ bool end_of_table;
+};
+
+int rhashtable_init(struct rhashtable *ht,
+ const struct rhashtable_params *params);
+int rhltable_init(struct rhltable *hlt,
+ const struct rhashtable_params *params);
+
+#endif /* _LINUX_RHASHTABLE_TYPES_H */
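A hedged example of filling in rhashtable_params for the structures above (not part of the patch); the keyed object is illustrative:

struct inode_entry {
	u64			inum;	/* lookup key */
	struct rhash_head	hash;	/* table linkage */
};

static const struct rhashtable_params inode_entry_params = {
	.key_len		= sizeof(u64),
	.key_offset		= offsetof(struct inode_entry, inum),
	.head_offset		= offsetof(struct inode_entry, hash),
	.automatic_shrinking	= true,
};

static int inode_table_init(struct rhashtable *ht)
{
	/* key_len is a multiple of 4, so the default hash function is jhash2 */
	return rhashtable_init(ht, &inode_entry_params);
}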
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index f3faea17..1c6dbdc8 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Resizable, Scalable, Concurrent Hash Table
*
- * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
+ * Copyright (c) 2015-2016 Herbert Xu <herbert@gondor.apana.org.au>
* Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
* Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
*
@@ -17,103 +18,97 @@
#ifndef _LINUX_RHASHTABLE_H
#define _LINUX_RHASHTABLE_H
-#include <linux/atomic.h>
-#include <linux/cache.h>
-#include <linux/compiler.h>
-#include <linux/cpumask.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/jhash.h>
#include <linux/list_nulls.h>
-#include <linux/workqueue.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+#include <linux/rculist.h>
+#include <linux/bit_spinlock.h>
-#define RHT_BASE_BITS 4
-#define RHT_HASH_BITS 27
-#define RHT_BASE_SHIFT RHT_HASH_BITS
-#define RHT_HASH_RESERVED_SPACE (RHT_BASE_BITS + 1)
-
-struct rhash_head {
- struct rhash_head __rcu *next;
-};
+#include <linux/rhashtable-types.h>
+/*
+ * Objects in an rhashtable have an embedded struct rhash_head
+ * which is linked into a hash chain from the hash table - or one
+ * of two or more hash tables when the rhashtable is being resized.
+ * The end of the chain is marked with a special nulls marker which has
+ * the least significant bit set but otherwise stores the address of
+ * the hash bucket. This allows us to be sure we've found the end
+ * of the right list.
+ * The value stored in the hash bucket has BIT(0) used as a lock bit.
+ * This bit must be atomically set before any changes are made to
+ * the chain. To avoid dereferencing this pointer without clearing
+ * the bit first, we use an opaque 'struct rhash_lock_head *' for the
+ * pointer stored in the bucket. This struct needs to be defined so
+ * that rcu_dereference() works on it, but it has no content so a
+ * cast is needed for it to be useful. This ensures it isn't
+ * used by mistake without clearing the lock bit first.
+ */
+struct rhash_lock_head {};
+/* Maximum chain length before rehash
+ *
+ * The maximum (not average) chain length grows with the size of the hash
+ * table, at a rate of (log N)/(log log N).
+ *
+ * The value of 16 is selected so that even if the hash table grew to
+ * 2^32 you would not expect the maximum chain length to exceed it
+ * unless we are under attack (or extremely unlucky).
+ *
+ * As this limit is only to detect attacks, we don't need to set it to a
+ * lower value as you'd need the chain length to vastly exceed 16 to have
+ * any real effect on the system.
+ */
+#define RHT_ELASTICITY 16u
+
+/**
+ * struct bucket_table - Table of hash buckets
+ * @size: Number of hash buckets
+ * @nest: Number of bits of first-level nested table.
+ * @rehash: Current bucket being rehashed
+ * @hash_rnd: Random seed to fold into hash
+ * @walkers: List of active walkers
+ * @rcu: RCU structure for freeing the table
+ * @future_tbl: Table under construction during rehashing
+ * @ntbl: Nested table used when out of memory.
+ * @buckets: size * hash buckets
+ */
struct bucket_table {
unsigned int size;
- unsigned int rehash;
+ unsigned int nest;
u32 hash_rnd;
- unsigned int locks_mask;
- spinlock_t *locks;
struct list_head walkers;
struct rcu_head rcu;
struct bucket_table __rcu *future_tbl;
- struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp;
+ struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp;
};
-struct rhashtable_compare_arg {
- struct rhashtable *ht;
- const void *key;
-};
-
-typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed);
-typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed);
-typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
- const void *obj);
-
-struct rhashtable_params {
- size_t nelem_hint;
- size_t key_len;
- size_t key_offset;
- size_t head_offset;
- unsigned int insecure_max_entries;
- unsigned int max_size;
- unsigned int min_size;
- u32 nulls_base;
- bool insecure_elasticity;
- bool automatic_shrinking;
- size_t locks_mul;
- rht_hashfn_t hashfn;
- rht_obj_hashfn_t obj_hashfn;
- rht_obj_cmpfn_t obj_cmpfn;
-};
-
-struct rhashtable {
- struct bucket_table __rcu *tbl;
- atomic_t nelems;
- unsigned int key_len;
- unsigned int elasticity;
- struct rhashtable_params p;
- struct work_struct run_work;
- struct mutex mutex;
- spinlock_t lock;
-};
-
-struct rhashtable_walker {
- struct list_head list;
- struct bucket_table *tbl;
-};
-
-static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash)
-{
- return NULLS_MARKER(ht->p.nulls_base + hash);
-}
-
-#define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \
- ((ptr) = (typeof(ptr)) rht_marker(ht, hash))
+/*
+ * NULLS_MARKER() expects a hash value with the low
+ * bits mostly likely to be significant, and it discards
+ * the msb.
+ * We give it an address, in which the bottom bit is
+ * always 0, and the msb might be significant.
+ * So we shift the address down one bit to align with
+ * expectations and avoid losing a significant bit.
+ *
+ * We never store the NULLS_MARKER in the hash table
+ * itself as we need the lsb for locking.
+ * Instead we store a NULL.
+ */
+#define RHT_NULLS_MARKER(ptr) \
+ ((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1))
+#define INIT_RHT_NULLS_HEAD(ptr) \
+ ((ptr) = NULL)
static inline bool rht_is_a_nulls(const struct rhash_head *ptr)
{
return ((unsigned long) ptr & 1);
}
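A small illustrative helper (not part of the header) showing how the two facts above combine: an empty bucket stores plain NULL, but rht_ptr_rcu() further down substitutes an odd nulls value derived from the bucket address, so the ordinary end-of-chain test also covers the empty case:

static inline bool bucket_is_empty_sketch(struct rhash_lock_head __rcu *const *bkt)
{
	/* rht_ptr_rcu() never returns NULL: for an empty bucket it returns
	 * RHT_NULLS_MARKER(bkt), which has bit 0 set, so rht_is_a_nulls()
	 * terminates the walk immediately. */
	return rht_is_a_nulls(rht_ptr_rcu(bkt));
}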
-static inline unsigned long rht_get_nulls_value(const struct rhash_head *ptr)
-{
- return ((unsigned long) ptr) >> 1;
-}
-
static inline void *rht_obj(const struct rhashtable *ht,
const struct rhash_head *he)
{
@@ -123,37 +118,45 @@ static inline void *rht_obj(const struct rhashtable *ht,
static inline unsigned int rht_bucket_index(const struct bucket_table *tbl,
unsigned int hash)
{
- return (hash >> RHT_HASH_RESERVED_SPACE) & (tbl->size - 1);
+ return hash & (tbl->size - 1);
}
-static inline unsigned int rht_key_hashfn(
- struct rhashtable *ht, const struct bucket_table *tbl,
- const void *key, const struct rhashtable_params params)
+static inline unsigned int rht_key_get_hash(struct rhashtable *ht,
+ const void *key, const struct rhashtable_params params,
+ unsigned int hash_rnd)
{
unsigned int hash;
/* params must be equal to ht->p if it isn't constant. */
if (!__builtin_constant_p(params.key_len))
- hash = ht->p.hashfn(key, ht->key_len, tbl->hash_rnd);
+ hash = ht->p.hashfn(key, ht->key_len, hash_rnd);
else if (params.key_len) {
unsigned int key_len = params.key_len;
if (params.hashfn)
- hash = params.hashfn(key, key_len, tbl->hash_rnd);
+ hash = params.hashfn(key, key_len, hash_rnd);
else if (key_len & (sizeof(u32) - 1))
- hash = jhash(key, key_len, tbl->hash_rnd);
+ hash = jhash(key, key_len, hash_rnd);
else
- hash = jhash2(key, key_len / sizeof(u32),
- tbl->hash_rnd);
+ hash = jhash2(key, key_len / sizeof(u32), hash_rnd);
} else {
unsigned int key_len = ht->p.key_len;
if (params.hashfn)
- hash = params.hashfn(key, key_len, tbl->hash_rnd);
+ hash = params.hashfn(key, key_len, hash_rnd);
else
- hash = jhash(key, key_len, tbl->hash_rnd);
+ hash = jhash(key, key_len, hash_rnd);
}
+ return hash;
+}
+
+static inline unsigned int rht_key_hashfn(
+ struct rhashtable *ht, const struct bucket_table *tbl,
+ const void *key, const struct rhashtable_params params)
+{
+ unsigned int hash = rht_key_get_hash(ht, key, params, tbl->hash_rnd);
+
return rht_bucket_index(tbl, hash);
}
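To illustrate the index calculation used above (a standalone sketch, not part of the header): table sizes are powers of two, so masking with size - 1 is the same as reducing the hash modulo the table size, and jhash()/jhash2() only differ in whether the key length is a multiple of four bytes.

#include <assert.h>

/* Toy equivalent of rht_bucket_index() for a power-of-two sized table. */
static unsigned int toy_bucket_index(unsigned int hash, unsigned int size)
{
	assert((size & (size - 1)) == 0);	/* size must be a power of two */
	return hash & (size - 1);		/* equivalent to hash % size */
}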
@@ -170,6 +173,11 @@ static inline unsigned int rht_head_hashfn(
rht_key_hashfn(ht, tbl, ptr + params.key_offset, params);
}
+/**
+ * rht_grow_above_75 - returns true if nelems > 0.75 * table-size
+ * @ht: hash table
+ * @tbl: current table
+ */
static inline bool rht_grow_above_75(const struct rhashtable *ht,
const struct bucket_table *tbl)
{
@@ -178,6 +186,11 @@ static inline bool rht_grow_above_75(const struct rhashtable *ht,
(!ht->p.max_size || tbl->size < ht->p.max_size);
}
+/**
+ * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size
+ * @ht: hash table
+ * @tbl: current table
+ */
static inline bool rht_shrink_below_30(const struct rhashtable *ht,
const struct bucket_table *tbl)
{
@@ -186,6 +199,11 @@ static inline bool rht_shrink_below_30(const struct rhashtable *ht,
tbl->size > ht->p.min_size;
}
+/**
+ * rht_grow_above_100 - returns true if nelems > table-size
+ * @ht: hash table
+ * @tbl: current table
+ */
static inline bool rht_grow_above_100(const struct rhashtable *ht,
const struct bucket_table *tbl)
{
@@ -193,62 +211,354 @@ static inline bool rht_grow_above_100(const struct rhashtable *ht,
(!ht->p.max_size || tbl->size < ht->p.max_size);
}
+/**
+ * rht_grow_above_max - returns true if table is above maximum
+ * @ht: hash table
+ * @tbl: current table
+ */
static inline bool rht_grow_above_max(const struct rhashtable *ht,
const struct bucket_table *tbl)
{
- return ht->p.insecure_max_entries &&
- atomic_read(&ht->nelems) >= ht->p.insecure_max_entries;
+ return atomic_read(&ht->nelems) >= ht->max_elems;
+}
+
+#ifdef CONFIG_PROVE_LOCKING
+int lockdep_rht_mutex_is_held(struct rhashtable *ht);
+int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash);
+#else
+static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht)
+{
+ return 1;
+}
+
+static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl,
+ u32 hash)
+{
+ return 1;
}
+#endif /* CONFIG_PROVE_LOCKING */
+
+void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
+ struct rhash_head *obj);
+
+void rhashtable_walk_enter(struct rhashtable *ht,
+ struct rhashtable_iter *iter);
+void rhashtable_walk_exit(struct rhashtable_iter *iter);
+int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU);
-static inline spinlock_t *rht_bucket_lock(const struct bucket_table *tbl,
- unsigned int hash)
+static inline void rhashtable_walk_start(struct rhashtable_iter *iter)
{
- return &tbl->locks[hash & tbl->locks_mask];
+ (void)rhashtable_walk_start_check(iter);
}
-int rhashtable_insert_rehash(struct rhashtable *, struct bucket_table *);
-struct bucket_table *rhashtable_insert_slow(struct rhashtable *,
- const void *,
- struct rhash_head *,
- struct bucket_table *);
+void *rhashtable_walk_next(struct rhashtable_iter *iter);
+void *rhashtable_walk_peek(struct rhashtable_iter *iter);
+void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);
-int rhashtable_init(struct rhashtable *, const struct rhashtable_params *);
-void rhashtable_destroy(struct rhashtable *);
+void rhashtable_free_and_destroy(struct rhashtable *ht,
+ void (*free_fn)(void *ptr, void *arg),
+ void *arg);
+void rhashtable_destroy(struct rhashtable *ht);
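The walker declarations above are normally used as an enter/start/next/stop/exit sequence. A minimal sketch; 'struct my_obj' is a hypothetical element type, and the -EAGAIN handling follows the usual convention for rhashtable_walk_next() when a resize interrupts the walk:

static void walk_all_sketch(struct rhashtable *ht)
{
	struct rhashtable_iter iter;
	struct my_obj *obj;			/* hypothetical element type */

	rhashtable_walk_enter(ht, &iter);
	rhashtable_walk_start(&iter);

	while ((obj = rhashtable_walk_next(&iter)) != NULL) {
		if (IS_ERR(obj)) {
			if (PTR_ERR(obj) == -EAGAIN)	/* table was resized */
				continue;
			break;
		}
		/* ... use obj ... */
	}

	rhashtable_walk_stop(&iter);
	rhashtable_walk_exit(&iter);
}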
-#define rht_dereference(p, ht) rcu_dereference(p)
-#define rht_dereference_rcu(p, ht) rcu_dereference(p)
-#define rht_dereference_bucket(p, tbl, hash) rcu_dereference(p)
-#define rht_dereference_bucket_rcu(p, tbl, hash) rcu_dereference(p)
+struct rhash_lock_head __rcu **rht_bucket_nested(
+ const struct bucket_table *tbl, unsigned int hash);
+struct rhash_lock_head __rcu **__rht_bucket_nested(
+ const struct bucket_table *tbl, unsigned int hash);
+struct rhash_lock_head __rcu **rht_bucket_nested_insert(
+ struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash);
+
+#define rht_dereference(p, ht) \
+ rcu_dereference(p)
+
+#define rht_dereference_rcu(p, ht) \
+ rcu_dereference(p)
+
+#define rht_dereference_bucket(p, tbl, hash) \
+ rcu_dereference(p)
+
+#define rht_dereference_bucket_rcu(p, tbl, hash) \
+ rcu_dereference(p)
#define rht_entry(tpos, pos, member) \
({ tpos = container_of(pos, typeof(*tpos), member); 1; })
-#define rht_for_each_continue(pos, head, tbl, hash) \
- for (pos = rht_dereference_bucket(head, tbl, hash); \
- !rht_is_a_nulls(pos); \
+static inline struct rhash_lock_head __rcu *const *rht_bucket(
+ const struct bucket_table *tbl, unsigned int hash)
+{
+ return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) :
+ &tbl->buckets[hash];
+}
+
+static inline struct rhash_lock_head __rcu **rht_bucket_var(
+ struct bucket_table *tbl, unsigned int hash)
+{
+ return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) :
+ &tbl->buckets[hash];
+}
+
+static inline struct rhash_lock_head __rcu **rht_bucket_insert(
+ struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
+{
+ return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) :
+ &tbl->buckets[hash];
+}
+
+/*
+ * We lock a bucket by setting BIT(0) in the pointer - this is always
+ * zero in real pointers. The NULLS mark is never stored in the bucket,
+ * rather we store NULL if the bucket is empty.
+ * bit_spin_locks do not handle contention well, but the whole point
+ * of the hashtable design is to achieve minimum per-bucket contention.
+ * A nested hash table might not have a bucket pointer. In that case
+ * we cannot get a lock. For remove and replace the bucket cannot be
+ * interesting and doesn't need locking.
+ * For insert we allocate the bucket if this is the last bucket_table,
+ * and then take the lock.
+ * Sometimes we unlock a bucket by writing a new pointer there. In that
+ * case we don't need to unlock, but we do need to reset state such as
+ * local_bh. For that we have rht_assign_unlock(). As rcu_assign_pointer()
+ * provides the same release semantics that bit_spin_unlock() provides,
+ * this is safe.
+ * When we write to a bucket without unlocking, we use rht_assign_locked().
+ */
+
+static inline void rht_lock(struct bucket_table *tbl,
+ struct rhash_lock_head __rcu **bkt)
+{
+ bit_spin_lock(0, (unsigned long *)bkt);
+}
+
+static inline void rht_lock_nested(struct bucket_table *tbl,
+ struct rhash_lock_head __rcu **bucket,
+ unsigned int subclass)
+{
+ bit_spin_lock(0, (unsigned long *)bucket);
+}
+
+static inline void rht_unlock(struct bucket_table *tbl,
+ struct rhash_lock_head __rcu **bkt)
+{
+ bit_spin_unlock(0, (unsigned long *)bkt);
+}
+
+static inline struct rhash_head *__rht_ptr(
+ struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt)
+{
+ return (struct rhash_head *)
+ ((unsigned long)p & ~BIT(0) ?:
+ (unsigned long)RHT_NULLS_MARKER(bkt));
+}
+
+/*
+ * Where 'bkt' is a bucket and might be locked:
+ * rht_ptr_rcu() dereferences that pointer and clears the lock bit.
+ * rht_ptr() dereferences in a context where the bucket is locked.
+ * rht_ptr_exclusive() dereferences in a context where exclusive
+ * access is guaranteed, such as when destroying the table.
+ */
+static inline struct rhash_head *rht_ptr_rcu(
+ struct rhash_lock_head __rcu *const *bkt)
+{
+ return __rht_ptr(rcu_dereference(*bkt), bkt);
+}
+
+static inline struct rhash_head *rht_ptr(
+ struct rhash_lock_head __rcu *const *bkt,
+ struct bucket_table *tbl,
+ unsigned int hash)
+{
+ return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt);
+}
+
+static inline struct rhash_head *rht_ptr_exclusive(
+ struct rhash_lock_head __rcu *const *bkt)
+{
+ return __rht_ptr(rcu_dereference(*bkt), bkt);
+}
+
+static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt,
+ struct rhash_head *obj)
+{
+ if (rht_is_a_nulls(obj))
+ obj = NULL;
+ rcu_assign_pointer(*bkt, (void *)((unsigned long)obj | BIT(0)));
+}
+
+static inline void rht_assign_unlock(struct bucket_table *tbl,
+ struct rhash_lock_head __rcu **bkt,
+ struct rhash_head *obj)
+{
+ if (rht_is_a_nulls(obj))
+ obj = NULL;
+ rcu_assign_pointer(*bkt, (void *)obj);
+ preempt_enable();
+ __release(bitlock);
+ bit_spin_wake(0, (unsigned long *) bkt);
+}
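To make the locking comment above concrete, here is the shape of a locked bucket update, loosely following the insert path further down (a sketch only; allocation failure, duplicate detection and the nelems accounting are omitted):

static void toy_bucket_insert(struct rhashtable *ht, struct bucket_table *tbl,
			      struct rhash_head *obj, unsigned int hash)
{
	struct rhash_lock_head __rcu **bkt = rht_bucket_insert(ht, tbl, hash);

	rht_lock(tbl, bkt);				/* sets BIT(0) in *bkt */
	RCU_INIT_POINTER(obj->next, rht_ptr(bkt, tbl, hash));
	rht_assign_unlock(tbl, bkt, obj);		/* publish head, drop lock */
}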
+
+/**
+ * rht_for_each_from - iterate over hash chain from given head
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @head: the &struct rhash_head to start from
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ */
+#define rht_for_each_from(pos, head, tbl, hash) \
+ for (pos = head; \
+ !rht_is_a_nulls(pos); \
pos = rht_dereference_bucket((pos)->next, tbl, hash))
+/**
+ * rht_for_each - iterate over hash chain
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ */
#define rht_for_each(pos, tbl, hash) \
- rht_for_each_continue(pos, (tbl)->buckets[hash], tbl, hash)
+ rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash), \
+ tbl, hash)
+
+/**
+ * rht_for_each_entry_from - iterate over hash chain from given head
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @head: the &struct rhash_head to start from
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ */
+#define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member) \
+ for (pos = head; \
+ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \
+ pos = rht_dereference_bucket((pos)->next, tbl, hash))
-#define rht_for_each_rcu_continue(pos, head, tbl, hash) \
+/**
+ * rht_for_each_entry - iterate over hash chain of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ */
+#define rht_for_each_entry(tpos, pos, tbl, hash, member) \
+ rht_for_each_entry_from(tpos, pos, \
+ rht_ptr(rht_bucket(tbl, hash), tbl, hash), \
+ tbl, hash, member)
+
+/**
+ * rht_for_each_entry_safe - safely iterate over hash chain of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @next: the &struct rhash_head to use as next in loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive allows for the looped code to
+ * remove the loop cursor from the list.
+ */
+#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \
+ for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash), \
+ next = !rht_is_a_nulls(pos) ? \
+ rht_dereference_bucket(pos->next, tbl, hash) : NULL; \
+ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \
+ pos = next, \
+ next = !rht_is_a_nulls(pos) ? \
+ rht_dereference_bucket(pos->next, tbl, hash) : NULL)
+
+/**
+ * rht_for_each_rcu_from - iterate over rcu hash chain from given head
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @head: the &struct rhash_head to start from
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_rcu_from(pos, head, tbl, hash) \
for (({barrier(); }), \
- pos = rht_dereference_bucket_rcu(head, tbl, hash); \
+ pos = head; \
!rht_is_a_nulls(pos); \
pos = rcu_dereference_raw(pos->next))
-#define rht_for_each_rcu(pos, tbl, hash) \
- rht_for_each_rcu_continue(pos, (tbl)->buckets[hash], tbl, hash)
+/**
+ * rht_for_each_rcu - iterate over rcu hash chain
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_rcu(pos, tbl, hash) \
+ for (({barrier(); }), \
+ pos = rht_ptr_rcu(rht_bucket(tbl, hash)); \
+ !rht_is_a_nulls(pos); \
+ pos = rcu_dereference_raw(pos->next))
-#define rht_for_each_entry_rcu_continue(tpos, pos, head, tbl, hash, member) \
+/**
+ * rht_for_each_entry_rcu_from - iterate over rcu hash chain from given head
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @head: the &struct rhash_head to start from
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \
for (({barrier(); }), \
- pos = rht_dereference_bucket_rcu(head, tbl, hash); \
+ pos = head; \
(!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \
pos = rht_dereference_bucket_rcu(pos->next, tbl, hash))
-#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \
- rht_for_each_entry_rcu_continue(tpos, pos, (tbl)->buckets[hash],\
- tbl, hash, member)
+/**
+ * rht_for_each_entry_rcu - iterate over rcu hash chain of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \
+ rht_for_each_entry_rcu_from(tpos, pos, \
+ rht_ptr_rcu(rht_bucket(tbl, hash)), \
+ tbl, hash, member)
+
+/**
+ * rhl_for_each_rcu - iterate over rcu hash table list
+ * @pos: the &struct rlist_head to use as a loop cursor.
+ * @list: the head of the list
+ *
+ * This hash chain list-traversal primitive should be used on the
+ * list returned by rhltable_lookup.
+ */
+#define rhl_for_each_rcu(pos, list) \
+ for (pos = list; pos; pos = rcu_dereference_raw(pos->next))
+
+/**
+ * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rlist_head to use as a loop cursor.
+ * @list: the head of the list
+ * @member: name of the &struct rlist_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive should be used on the
+ * list returned by rhltable_lookup.
+ */
+#define rhl_for_each_entry_rcu(tpos, pos, list, member) \
+ for (pos = list; pos && rht_entry(tpos, pos, member); \
+ pos = rcu_dereference_raw(pos->next))
static inline int rhashtable_compare(struct rhashtable_compare_arg *arg,
const void *obj)
@@ -259,7 +569,8 @@ static inline int rhashtable_compare(struct rhashtable_compare_arg *arg,
return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len);
}
-static inline void *rhashtable_lookup_fast(
+/* Internal function, do not use. */
+static inline struct rhash_head *__rhashtable_lookup(
struct rhashtable *ht, const void *key,
const struct rhashtable_params params)
{
@@ -267,23 +578,27 @@ static inline void *rhashtable_lookup_fast(
.ht = ht,
.key = key,
};
- const struct bucket_table *tbl;
+ struct rhash_lock_head __rcu *const *bkt;
+ struct bucket_table *tbl;
struct rhash_head *he;
unsigned int hash;
- rcu_read_lock();
-
tbl = rht_dereference_rcu(ht->tbl, ht);
restart:
hash = rht_key_hashfn(ht, tbl, key, params);
- rht_for_each_rcu(he, tbl, hash) {
- if (params.obj_cmpfn ?
- params.obj_cmpfn(&arg, rht_obj(ht, he)) :
- rhashtable_compare(&arg, rht_obj(ht, he)))
- continue;
- rcu_read_unlock();
- return rht_obj(ht, he);
- }
+ bkt = rht_bucket(tbl, hash);
+ do {
+ rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) {
+ if (params.obj_cmpfn ?
+ params.obj_cmpfn(&arg, rht_obj(ht, he)) :
+ rhashtable_compare(&arg, rht_obj(ht, he)))
+ continue;
+ return he;
+ }
+ /* An object might have been moved to a different hash chain,
+ * while we walk along it - better check and retry.
+ */
+ } while (he != RHT_NULLS_MARKER(bkt));
/* Ensure we see any new tables. */
smp_rmb();
@@ -291,150 +606,594 @@ restart:
tbl = rht_dereference_rcu(tbl->future_tbl, ht);
if (unlikely(tbl))
goto restart;
- rcu_read_unlock();
return NULL;
}
-static inline int __rhashtable_insert_fast(
- struct rhashtable *ht, const void *key, struct rhash_head *obj,
+/**
+ * rhashtable_lookup - search hash table
+ * @ht: hash table
+ * @key: the pointer to the key
+ * @params: hash table parameters
+ *
+ * Computes the hash value for the key and traverses the bucket chain looking
+ * for an entry with an identical key. The first matching entry is returned.
+ *
+ * This must only be called under the RCU read lock.
+ *
+ * Returns the first entry on which the compare function returned true.
+ */
+static inline void *rhashtable_lookup(
+ struct rhashtable *ht, const void *key,
const struct rhashtable_params params)
{
+ struct rhash_head *he = __rhashtable_lookup(ht, key, params);
+
+ return he ? rht_obj(ht, he) : NULL;
+}
+
+/**
+ * rhashtable_lookup_fast - search hash table, without RCU read lock
+ * @ht: hash table
+ * @key: the pointer to the key
+ * @params: hash table parameters
+ *
+ * Computes the hash value for the key and traverses the bucket chain looking
+ * for an entry with an identical key. The first matching entry is returned.
+ *
+ * Only use this function when you have other mechanisms guaranteeing
+ * that the object won't go away after the RCU read lock is released.
+ *
+ * Returns the first entry on which the compare function returned true.
+ */
+static inline void *rhashtable_lookup_fast(
+ struct rhashtable *ht, const void *key,
+ const struct rhashtable_params params)
+{
+ void *obj;
+
+ rcu_read_lock();
+ obj = rhashtable_lookup(ht, key, params);
+ rcu_read_unlock();
+
+ return obj;
+}
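For reference, a minimal caller might look like the following; 'struct item' and 'item_params' are hypothetical, with only the usual fixed-key fields filled in:

struct item {
	u32			key;
	struct rhash_head	node;
};

static const struct rhashtable_params item_params = {
	.key_len	= sizeof(u32),
	.key_offset	= offsetof(struct item, key),
	.head_offset	= offsetof(struct item, node),
};

static struct item *item_find(struct rhashtable *ht, u32 key)
{
	/* rhashtable_lookup_fast() takes and drops the RCU read lock itself. */
	return rhashtable_lookup_fast(ht, &key, item_params);
}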
+
+/**
+ * rhltable_lookup - search hash list table
+ * @hlt: hash table
+ * @key: the pointer to the key
+ * @params: hash table parameters
+ *
+ * Computes the hash value for the key and traverses the bucket chain looking
+ * for an entry with an identical key. All matching entries are returned
+ * in a list.
+ *
+ * This must only be called under the RCU read lock.
+ *
+ * Returns the list of entries that match the given key.
+ */
+static inline struct rhlist_head *rhltable_lookup(
+ struct rhltable *hlt, const void *key,
+ const struct rhashtable_params params)
+{
+ struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params);
+
+ return he ? container_of(he, struct rhlist_head, rhead) : NULL;
+}
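A hedged sketch of consuming the duplicate list returned here, using rhl_for_each_entry_rcu() from above; 'struct dup' (with an rhlist_head member named 'list') and 'dup_params' are hypothetical:

static void visit_duplicates_sketch(struct rhltable *hlt, const void *key)
{
	struct rhlist_head *head, *pos;
	struct dup *d;

	rcu_read_lock();
	head = rhltable_lookup(hlt, key, dup_params);
	rhl_for_each_entry_rcu(d, pos, head, list) {
		/* ... each 'd' shares the same key ... */
	}
	rcu_read_unlock();
}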
+
+/* Internal function, please use rhashtable_insert_fast() instead. This
+ * function returns the existing element already in hashes if there is a clash,
+ * otherwise it returns an error via ERR_PTR().
+ */
+static inline void *__rhashtable_insert_fast(
+ struct rhashtable *ht, const void *key, struct rhash_head *obj,
+ const struct rhashtable_params params, bool rhlist)
+{
struct rhashtable_compare_arg arg = {
.ht = ht,
.key = key,
};
- struct bucket_table *tbl, *new_tbl;
+ struct rhash_lock_head __rcu **bkt;
+ struct rhash_head __rcu **pprev;
+ struct bucket_table *tbl;
struct rhash_head *head;
- spinlock_t *lock;
- unsigned int elasticity;
unsigned int hash;
- int err;
+ int elasticity;
+ void *data;
-restart:
rcu_read_lock();
tbl = rht_dereference_rcu(ht->tbl, ht);
+ hash = rht_head_hashfn(ht, tbl, obj, params);
+ elasticity = RHT_ELASTICITY;
+ bkt = rht_bucket_insert(ht, tbl, hash);
+ data = ERR_PTR(-ENOMEM);
+ if (!bkt)
+ goto out;
+ pprev = NULL;
+ rht_lock(tbl, bkt);
- /* All insertions must grab the oldest table containing
- * the hashed bucket that is yet to be rehashed.
- */
- for (;;) {
- hash = rht_head_hashfn(ht, tbl, obj, params);
- lock = rht_bucket_lock(tbl, hash);
- spin_lock_bh(lock);
+ if (unlikely(rcu_access_pointer(tbl->future_tbl))) {
+slow_path:
+ rht_unlock(tbl, bkt);
+ rcu_read_unlock();
+ return rhashtable_insert_slow(ht, key, obj);
+ }
- if (tbl->rehash <= hash)
- break;
+ rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
+ struct rhlist_head *plist;
+ struct rhlist_head *list;
- spin_unlock_bh(lock);
- tbl = rht_dereference_rcu(tbl->future_tbl, ht);
- }
+ elasticity--;
+ if (!key ||
+ (params.obj_cmpfn ?
+ params.obj_cmpfn(&arg, rht_obj(ht, head)) :
+ rhashtable_compare(&arg, rht_obj(ht, head)))) {
+ pprev = &head->next;
+ continue;
+ }
- new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
- if (unlikely(new_tbl)) {
- tbl = rhashtable_insert_slow(ht, key, obj, new_tbl);
- if (!IS_ERR_OR_NULL(tbl))
- goto slow_path;
+ data = rht_obj(ht, head);
- err = PTR_ERR(tbl);
- goto out;
- }
+ if (!rhlist)
+ goto out_unlock;
- err = -E2BIG;
- if (unlikely(rht_grow_above_max(ht, tbl)))
- goto out;
- if (unlikely(rht_grow_above_100(ht, tbl))) {
-slow_path:
- spin_unlock_bh(lock);
- err = rhashtable_insert_rehash(ht, tbl);
- rcu_read_unlock();
- if (err)
- return err;
+ list = container_of(obj, struct rhlist_head, rhead);
+ plist = container_of(head, struct rhlist_head, rhead);
- goto restart;
+ RCU_INIT_POINTER(list->next, plist);
+ head = rht_dereference_bucket(head->next, tbl, hash);
+ RCU_INIT_POINTER(list->rhead.next, head);
+ if (pprev) {
+ rcu_assign_pointer(*pprev, obj);
+ rht_unlock(tbl, bkt);
+ } else
+ rht_assign_unlock(tbl, bkt, obj);
+ data = NULL;
+ goto out;
}
- err = -EEXIST;
- elasticity = ht->elasticity;
- rht_for_each(head, tbl, hash) {
- if (key &&
- unlikely(!(params.obj_cmpfn ?
- params.obj_cmpfn(&arg, rht_obj(ht, head)) :
- rhashtable_compare(&arg, rht_obj(ht, head)))))
- goto out;
- if (!--elasticity)
- goto slow_path;
- }
+ if (elasticity <= 0)
+ goto slow_path;
+
+ data = ERR_PTR(-E2BIG);
+ if (unlikely(rht_grow_above_max(ht, tbl)))
+ goto out_unlock;
- err = 0;
+ if (unlikely(rht_grow_above_100(ht, tbl)))
+ goto slow_path;
- head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
+ /* Inserting at head of list makes unlocking free. */
+ head = rht_ptr(bkt, tbl, hash);
RCU_INIT_POINTER(obj->next, head);
+ if (rhlist) {
+ struct rhlist_head *list;
- rcu_assign_pointer(tbl->buckets[hash], obj);
+ list = container_of(obj, struct rhlist_head, rhead);
+ RCU_INIT_POINTER(list->next, NULL);
+ }
atomic_inc(&ht->nelems);
+ rht_assign_unlock(tbl, bkt, obj);
+
if (rht_grow_above_75(ht, tbl))
schedule_work(&ht->run_work);
+ data = NULL;
out:
- spin_unlock_bh(lock);
rcu_read_unlock();
- return err;
+ return data;
+
+out_unlock:
+ rht_unlock(tbl, bkt);
+ goto out;
}
+/**
+ * rhashtable_insert_fast - insert object into hash table
+ * @ht: hash table
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Will take the per bucket bitlock to protect against mutual mutations
+ * on the same bucket. Multiple insertions may occur in parallel unless
+ * they map to the same bucket.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
+static inline int rhashtable_insert_fast(
+ struct rhashtable *ht, struct rhash_head *obj,
+ const struct rhashtable_params params)
+{
+ void *ret;
+
+ ret = __rhashtable_insert_fast(ht, NULL, obj, params, false);
+ if (IS_ERR(ret))
+ return PTR_ERR(ret);
+
+ return ret == NULL ? 0 : -EEXIST;
+}
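A usage sketch, reusing the hypothetical 'struct item'/'item_params' from the lookup example above:

static int item_add_sketch(struct rhashtable *ht, struct item *it)
{
	/* 0 on success; a negative errno such as -E2BIG when the table is
	 * already at its configured maximum. */
	return rhashtable_insert_fast(ht, &it->node, item_params);
}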
+
+/**
+ * rhltable_insert_key - insert object into hash list table
+ * @hlt: hash list table
+ * @key: the pointer to the key
+ * @list: pointer to hash list head inside object
+ * @params: hash table parameters
+ *
+ * Will take the per bucket bitlock to protect against mutual mutations
+ * on the same bucket. Multiple insertions may occur in parallel unless
+ * they map to the same bucket.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
+static inline int rhltable_insert_key(
+ struct rhltable *hlt, const void *key, struct rhlist_head *list,
+ const struct rhashtable_params params)
+{
+ return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead,
+ params, true));
+}
+
+/**
+ * rhltable_insert - insert object into hash list table
+ * @hlt: hash list table
+ * @list: pointer to hash list head inside object
+ * @params: hash table parameters
+ *
+ * Will take the per bucket bitlock to protect against mutual mutations
+ * on the same bucket. Multiple insertions may occur in parallel unless
+ * they map to the same bucket.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
+static inline int rhltable_insert(
+ struct rhltable *hlt, struct rhlist_head *list,
+ const struct rhashtable_params params)
+{
+ const char *key = rht_obj(&hlt->ht, &list->rhead);
+
+ key += params.key_offset;
+
+ return rhltable_insert_key(hlt, key, list, params);
+}
+
+/**
+ * rhashtable_lookup_insert_fast - lookup and insert object into hash table
+ * @ht: hash table
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * This lookup function may only be used for a fixed-key hash table (key_len
+ * parameter set). It will BUG() if used inappropriately.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
static inline int rhashtable_lookup_insert_fast(
struct rhashtable *ht, struct rhash_head *obj,
const struct rhashtable_params params)
{
const char *key = rht_obj(ht, obj);
+ void *ret;
BUG_ON(ht->p.obj_hashfn);
- return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj,
- params);
+ ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params,
+ false);
+ if (IS_ERR(ret))
+ return PTR_ERR(ret);
+
+ return ret == NULL ? 0 : -EEXIST;
}
-static inline int __rhashtable_remove_fast(
+/**
+ * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table
+ * @ht: hash table
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Just like rhashtable_lookup_insert_fast(), but this function returns the
+ * object if it exists, NULL if it does not and the insertion was successful,
+ * and an ERR_PTR otherwise.
+ */
+static inline void *rhashtable_lookup_get_insert_fast(
+ struct rhashtable *ht, struct rhash_head *obj,
+ const struct rhashtable_params params)
+{
+ const char *key = rht_obj(ht, obj);
+
+ BUG_ON(ht->p.obj_hashfn);
+
+ return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params,
+ false);
+}
+
+/**
+ * rhashtable_lookup_insert_key - search and insert object to hash table
+ * with explicit key
+ * @ht: hash table
+ * @key: key
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Lookups may occur in parallel with hashtable mutations and resizing.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ *
+ * Returns zero on success.
+ */
+static inline int rhashtable_lookup_insert_key(
+ struct rhashtable *ht, const void *key, struct rhash_head *obj,
+ const struct rhashtable_params params)
+{
+ void *ret;
+
+ BUG_ON(!ht->p.obj_hashfn || !key);
+
+ ret = __rhashtable_insert_fast(ht, key, obj, params, false);
+ if (IS_ERR(ret))
+ return PTR_ERR(ret);
+
+ return ret == NULL ? 0 : -EEXIST;
+}
+
+/**
+ * rhashtable_lookup_get_insert_key - lookup and insert object into hash table
+ * @ht: hash table
+ * @key: key
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Just like rhashtable_lookup_insert_key(), but this function returns the
+ * object if it exists, NULL if it does not and the insertion was successful,
+ * and an ERR_PTR otherwise.
+ */
+static inline void *rhashtable_lookup_get_insert_key(
+ struct rhashtable *ht, const void *key, struct rhash_head *obj,
+ const struct rhashtable_params params)
+{
+ BUG_ON(!ht->p.obj_hashfn || !key);
+
+ return __rhashtable_insert_fast(ht, key, obj, params, false);
+}
+
+/* Internal function, please use rhashtable_remove_fast() instead */
+static inline int __rhashtable_remove_fast_one(
struct rhashtable *ht, struct bucket_table *tbl,
- struct rhash_head *obj, const struct rhashtable_params params)
+ struct rhash_head *obj, const struct rhashtable_params params,
+ bool rhlist)
{
+ struct rhash_lock_head __rcu **bkt;
struct rhash_head __rcu **pprev;
struct rhash_head *he;
- spinlock_t * lock;
unsigned int hash;
int err = -ENOENT;
hash = rht_head_hashfn(ht, tbl, obj, params);
- lock = rht_bucket_lock(tbl, hash);
+ bkt = rht_bucket_var(tbl, hash);
+ if (!bkt)
+ return -ENOENT;
+ pprev = NULL;
+ rht_lock(tbl, bkt);
+
+ rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
+ struct rhlist_head *list;
- spin_lock_bh(lock);
+ list = container_of(he, struct rhlist_head, rhead);
- pprev = &tbl->buckets[hash];
- rht_for_each(he, tbl, hash) {
if (he != obj) {
+ struct rhlist_head __rcu **lpprev;
+
pprev = &he->next;
- continue;
+
+ if (!rhlist)
+ continue;
+
+ do {
+ lpprev = &list->next;
+ list = rht_dereference_bucket(list->next,
+ tbl, hash);
+ } while (list && obj != &list->rhead);
+
+ if (!list)
+ continue;
+
+ list = rht_dereference_bucket(list->next, tbl, hash);
+ RCU_INIT_POINTER(*lpprev, list);
+ err = 0;
+ break;
+ }
+
+ obj = rht_dereference_bucket(obj->next, tbl, hash);
+ err = 1;
+
+ if (rhlist) {
+ list = rht_dereference_bucket(list->next, tbl, hash);
+ if (list) {
+ RCU_INIT_POINTER(list->rhead.next, obj);
+ obj = &list->rhead;
+ err = 0;
+ }
+ }
+
+ if (pprev) {
+ rcu_assign_pointer(*pprev, obj);
+ rht_unlock(tbl, bkt);
+ } else {
+ rht_assign_unlock(tbl, bkt, obj);
}
+ goto unlocked;
+ }
- rcu_assign_pointer(*pprev, obj->next);
+ rht_unlock(tbl, bkt);
+unlocked:
+ if (err > 0) {
+ atomic_dec(&ht->nelems);
+ if (unlikely(ht->p.automatic_shrinking &&
+ rht_shrink_below_30(ht, tbl)))
+ schedule_work(&ht->run_work);
err = 0;
- break;
}
- spin_unlock_bh(lock);
+ return err;
+}
+
+/* Internal function, please use rhashtable_remove_fast() instead */
+static inline int __rhashtable_remove_fast(
+ struct rhashtable *ht, struct rhash_head *obj,
+ const struct rhashtable_params params, bool rhlist)
+{
+ struct bucket_table *tbl;
+ int err;
+
+ rcu_read_lock();
+
+ tbl = rht_dereference_rcu(ht->tbl, ht);
+
+ /* Because we have already taken (and released) the bucket
+ * lock in old_tbl, if we find that future_tbl is not yet
+ * visible then that guarantees the entry to still be in
+ * the old tbl if it exists.
+ */
+ while ((err = __rhashtable_remove_fast_one(ht, tbl, obj, params,
+ rhlist)) &&
+ (tbl = rht_dereference_rcu(tbl->future_tbl, ht)))
+ ;
+
+ rcu_read_unlock();
return err;
}
+/**
+ * rhashtable_remove_fast - remove object from hash table
+ * @ht: hash table
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Since the hash chain is singly linked, the removal operation needs to
+ * walk the bucket chain upon removal. The removal operation is thus
+ * considerably slower if the hash table is not correctly sized.
+ *
+ * Will automatically shrink the table if permitted when residency drops
+ * below 30%.
+ *
+ * Returns zero on success, -ENOENT if the entry could not be found.
+ */
static inline int rhashtable_remove_fast(
struct rhashtable *ht, struct rhash_head *obj,
const struct rhashtable_params params)
{
+ return __rhashtable_remove_fast(ht, obj, params, false);
+}
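And the matching removal, again with the hypothetical 'struct item'/'item_params' from the earlier sketches:

static int item_del_sketch(struct rhashtable *ht, struct item *it)
{
	/* Walks the bucket chain; -ENOENT if 'it' was never linked. */
	return rhashtable_remove_fast(ht, &it->node, item_params);
}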
+
+/**
+ * rhltable_remove - remove object from hash list table
+ * @hlt: hash list table
+ * @list: pointer to hash list head inside object
+ * @params: hash table parameters
+ *
+ * Since the hash chain is singly linked, the removal operation needs to
+ * walk the bucket chain upon removal. The removal operation is thus
+ * considerably slower if the hash table is not correctly sized.
+ *
+ * Will automatically shrink the table if permitted when residency drops
+ * below 30%.
+ *
+ * Returns zero on success, -ENOENT if the entry could not be found.
+ */
+static inline int rhltable_remove(
+ struct rhltable *hlt, struct rhlist_head *list,
+ const struct rhashtable_params params)
+{
+ return __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true);
+}
+
+/* Internal function, please use rhashtable_replace_fast() instead */
+static inline int __rhashtable_replace_fast(
+ struct rhashtable *ht, struct bucket_table *tbl,
+ struct rhash_head *obj_old, struct rhash_head *obj_new,
+ const struct rhashtable_params params)
+{
+ struct rhash_lock_head __rcu **bkt;
+ struct rhash_head __rcu **pprev;
+ struct rhash_head *he;
+ unsigned int hash;
+ int err = -ENOENT;
+
+ /* Minimally, the old and new objects must have same hash
+ * (which should mean identifiers are the same).
+ */
+ hash = rht_head_hashfn(ht, tbl, obj_old, params);
+ if (hash != rht_head_hashfn(ht, tbl, obj_new, params))
+ return -EINVAL;
+
+ bkt = rht_bucket_var(tbl, hash);
+ if (!bkt)
+ return -ENOENT;
+
+ pprev = NULL;
+ rht_lock(tbl, bkt);
+
+ rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
+ if (he != obj_old) {
+ pprev = &he->next;
+ continue;
+ }
+
+ rcu_assign_pointer(obj_new->next, obj_old->next);
+ if (pprev) {
+ rcu_assign_pointer(*pprev, obj_new);
+ rht_unlock(tbl, bkt);
+ } else {
+ rht_assign_unlock(tbl, bkt, obj_new);
+ }
+ err = 0;
+ goto unlocked;
+ }
+
+ rht_unlock(tbl, bkt);
+
+unlocked:
+ return err;
+}
+
+/**
+ * rhashtable_replace_fast - replace an object in hash table
+ * @ht: hash table
+ * @obj_old: pointer to hash head inside object being replaced
+ * @obj_new: pointer to hash head inside object which is new
+ * @params: hash table parameters
+ *
+ * Replacing an object doesn't affect the number of elements in the hash table
+ * or bucket, so we don't need to worry about shrinking or expanding the
+ * table here.
+ *
+ * Returns zero on success, -ENOENT if the entry could not be found,
+ * -EINVAL if hash is not the same for the old and new objects.
+ */
+static inline int rhashtable_replace_fast(
+ struct rhashtable *ht, struct rhash_head *obj_old,
+ struct rhash_head *obj_new,
+ const struct rhashtable_params params)
+{
struct bucket_table *tbl;
int err;
@@ -447,22 +1206,62 @@ static inline int rhashtable_remove_fast(
* visible then that guarantees the entry to still be in
* the old tbl if it exists.
*/
- while ((err = __rhashtable_remove_fast(ht, tbl, obj, params)) &&
+ while ((err = __rhashtable_replace_fast(ht, tbl, obj_old,
+ obj_new, params)) &&
(tbl = rht_dereference_rcu(tbl->future_tbl, ht)))
;
- if (err)
- goto out;
-
- atomic_dec(&ht->nelems);
- if (unlikely(ht->p.automatic_shrinking &&
- rht_shrink_below_30(ht, tbl)))
- schedule_work(&ht->run_work);
-
-out:
rcu_read_unlock();
return err;
}
+/**
+ * rhltable_walk_enter - Initialise an iterator
+ * @hlt: Table to walk over
+ * @iter: Hash table Iterator
+ *
+ * This function prepares a hash table walk.
+ *
+ * Note that if you restart a walk after rhashtable_walk_stop you
+ * may see the same object twice. Also, you may miss objects if
+ * there are removals in between rhashtable_walk_stop and the next
+ * call to rhashtable_walk_start.
+ *
+ * For a completely stable walk you should construct your own data
+ * structure outside the hash table.
+ *
+ * This function may be called from any process context, including
+ * non-preemptable context, but cannot be called from softirq or
+ * hardirq context.
+ *
+ * You must call rhashtable_walk_exit after this function returns.
+ */
+static inline void rhltable_walk_enter(struct rhltable *hlt,
+ struct rhashtable_iter *iter)
+{
+ return rhashtable_walk_enter(&hlt->ht, iter);
+}
+
+/**
+ * rhltable_free_and_destroy - free elements and destroy hash list table
+ * @hlt: the hash list table to destroy
+ * @free_fn: callback to release resources of element
+ * @arg: pointer passed to free_fn
+ *
+ * See documentation for rhashtable_free_and_destroy.
+ */
+static inline void rhltable_free_and_destroy(struct rhltable *hlt,
+ void (*free_fn)(void *ptr,
+ void *arg),
+ void *arg)
+{
+ return rhashtable_free_and_destroy(&hlt->ht, free_fn, arg);
+}
+
+static inline void rhltable_destroy(struct rhltable *hlt)
+{
+ return rhltable_free_and_destroy(hlt, NULL, NULL);
+}
+
#endif /* _LINUX_RHASHTABLE_H */
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 9d70e6e2..f851d6a2 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -19,6 +19,7 @@ static inline void init_rwsem(struct rw_semaphore *lock)
}
#define down_read(l) pthread_rwlock_rdlock(&(l)->lock)
+#define down_read_killable(l) (pthread_rwlock_rdlock(&(l)->lock), 0)
#define down_read_trylock(l) (!pthread_rwlock_tryrdlock(&(l)->lock))
#define up_read(l) pthread_rwlock_unlock(&(l)->lock)
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 04bf59df..1e4395c5 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -1,10 +1,8 @@
#ifndef _LINUX_SCATTERLIST_H
#define _LINUX_SCATTERLIST_H
-#include <linux/string.h>
-#include <linux/types.h>
#include <linux/bug.h>
-#include <linux/mm.h>
+#include <linux/slab.h>
struct scatterlist {
unsigned long page_link;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 38a5fecb..153bd73d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -7,6 +7,7 @@
#include <linux/bug.h>
#include <linux/completion.h>
#include <linux/jiffies.h>
+#include <linux/rwsem.h>
#include <linux/time64.h>
#define TASK_RUNNING 0
@@ -27,6 +28,7 @@
#define TASK_NEW 2048
#define TASK_IDLE_WORKER 4096
#define TASK_STATE_MAX 8192
+#define TASK_FREEZABLE (1U << 14)
/* Convenience macros for the sake of set_task_state */
#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
@@ -58,10 +60,13 @@
#define PF_FROZEN 0x00010000 /* frozen for system suspend */
#define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */
#define PF_KSWAPD 0x00040000 /* I am kswapd */
+#define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nofs_save() */
#define PF_MEMALLOC_NOIO 0x00080000 /* Allocating memory without IO involved */
#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */
+#define PF_MEMALLOC_NORECLAIM 0x00800000 /* All allocation requests will clear __GFP_DIRECT_RECLAIM */
+#define PF_MEMALLOC_NOWARN 0x01000000 /* All allocation requests will inherit __GFP_NOWARN */
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
@@ -85,7 +90,13 @@ struct task_struct {
bool on_cpu;
char comm[TASK_COMM_LEN];
+ pid_t pid;
+
struct bio_list *bio_list;
+
+ struct signal_struct {
+ struct rw_semaphore exec_update_lock;
+ } *signal, _signal;
};
extern __thread struct task_struct *current;
@@ -99,7 +110,12 @@ extern __thread struct task_struct *current;
#define set_current_state(state_value) \
smp_store_mb(current->state, (state_value))
-#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
+static inline struct task_struct *get_task_struct(struct task_struct *task)
+{
+ atomic_inc(&task->usage);
+ return task;
+}
extern void __put_task_struct(struct task_struct *t);
@@ -109,7 +125,7 @@ static inline void put_task_struct(struct task_struct *t)
__put_task_struct(t);
}
-#define cond_resched()
+static inline void cond_resched(void) {}
#define need_resched() 0
void schedule(void);
@@ -138,14 +154,38 @@ static inline u64 ktime_get_seconds(void)
return ts.tv_sec;
}
-static inline struct timespec current_kernel_time(void)
+static inline u64 ktime_get_real_ns(void)
{
struct timespec ts;
- clock_gettime(CLOCK_MONOTONIC, &ts);
- return ts;
+ clock_gettime(CLOCK_REALTIME, &ts);
+ return timespec_to_ns(&ts);
+}
+
+static inline u64 ktime_get_real_seconds(void)
+{
+ struct timespec ts;
+
+ clock_gettime(CLOCK_REALTIME, &ts);
+
+ return ts.tv_sec;
+}
+
+static inline void ktime_get_coarse_real_ts64(struct timespec64 *ts)
+{
+ clock_gettime(CLOCK_REALTIME_COARSE, ts);
}
+#define current_kernel_time64() current_kernel_time()
#define CURRENT_TIME (current_kernel_time())
+#define sched_annotate_sleep() do {} while (0)
+
+static inline unsigned int stack_trace_save_tsk(struct task_struct *task,
+ unsigned long *store, unsigned int size,
+ unsigned int skipnr)
+{
+ return 0;
+}
+
#endif /* __TOOLS_LINUX_SCHED_H */
diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/include/linux/sched/debug.h
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
new file mode 100644
index 00000000..34045ce0
--- /dev/null
+++ b/include/linux/sched/mm.h
@@ -0,0 +1,126 @@
+#ifndef _LINUX_SCHED_MM_H
+#define _LINUX_SCHED_MM_H
+
+#define PF_MEMALLOC 0x00000800 /* Allocating memory */
+#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */
+
+/**
+ * memalloc_flags_save - Add a PF_* flag to current->flags, save old value
+ *
+ * This allows PF_* flags to be conveniently added, irrespective of current
+ * value, and then the old version restored with memalloc_flags_restore().
+ */
+static inline unsigned memalloc_flags_save(unsigned flags)
+{
+ unsigned oldflags = ~current->flags & flags;
+ current->flags |= flags;
+ return oldflags;
+}
+
+static inline void memalloc_flags_restore(unsigned flags)
+{
+ current->flags &= ~flags;
+}
+
+/**
+ * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope.
+ *
+ * This function marks the beginning of the GFP_NOIO allocation scope.
+ * All further allocations will implicitly drop __GFP_IO flag and so
+ * they are safe for the IO critical section from the allocation recursion
+ * point of view. Use memalloc_noio_restore to end the scope with flags
+ * returned by this function.
+ *
+ * Context: This function is safe to be used from any context.
+ * Return: The saved flags to be passed to memalloc_noio_restore.
+ */
+static inline unsigned int memalloc_noio_save(void)
+{
+ return memalloc_flags_save(PF_MEMALLOC_NOIO);
+}
+
+/**
+ * memalloc_noio_restore - Ends the implicit GFP_NOIO scope.
+ * @flags: Flags to restore.
+ *
+ * Ends the implicit GFP_NOIO scope started by memalloc_noio_save function.
+ * Always make sure that the given flags is the return value from the
+ * pairing memalloc_noio_save call.
+ */
+static inline void memalloc_noio_restore(unsigned int flags)
+{
+ memalloc_flags_restore(flags);
+}
+
+/**
+ * memalloc_nofs_save - Marks implicit GFP_NOFS allocation scope.
+ *
+ * This function marks the beginning of the GFP_NOFS allocation scope.
+ * All further allocations will implicitly drop __GFP_FS flag and so
+ * they are safe for the FS critical section from the allocation recursion
+ * point of view. Use memalloc_nofs_restore to end the scope with flags
+ * returned by this function.
+ *
+ * Context: This function is safe to be used from any context.
+ * Return: The saved flags to be passed to memalloc_nofs_restore.
+ */
+static inline unsigned int memalloc_nofs_save(void)
+{
+ return memalloc_flags_save(PF_MEMALLOC_NOFS);
+}
+
+/**
+ * memalloc_nofs_restore - Ends the implicit GFP_NOFS scope.
+ * @flags: Flags to restore.
+ *
+ * Ends the implicit GFP_NOFS scope started by memalloc_nofs_save function.
+ * Always make sure that the given flags is the return value from the
+ * pairing memalloc_nofs_save call.
+ */
+static inline void memalloc_nofs_restore(unsigned int flags)
+{
+ memalloc_flags_restore(flags);
+}
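A short usage sketch of the save/restore pairing above (in this userspace shim the flag is mostly bookkeeping, but the discipline is the same as in the kernel):

static void *alloc_in_fs_context_sketch(size_t size)
{
	unsigned int flags = memalloc_nofs_save();
	void *p = kmalloc(size, GFP_KERNEL);	/* treated as GFP_NOFS inside the scope */

	memalloc_nofs_restore(flags);
	return p;
}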
+
+/**
+ * memalloc_noreclaim_save - Marks implicit __GFP_MEMALLOC scope.
+ *
+ * This function marks the beginning of the __GFP_MEMALLOC allocation scope.
+ * All further allocations will implicitly add the __GFP_MEMALLOC flag, which
+ * prevents entering reclaim and allows access to all memory reserves. This
+ * should only be used when the caller guarantees the allocation will allow more
+ * memory to be freed very shortly, i.e. it needs to allocate some memory in
+ * the process of freeing memory, and cannot reclaim due to potential recursion.
+ *
+ * Users of this scope have to be extremely careful to not deplete the reserves
+ * completely and implement a throttling mechanism which controls the
+ * consumption of the reserve based on the amount of freed memory. Usage of a
+ * pre-allocated pool (e.g. mempool) should be always considered before using
+ * this scope.
+ *
+ * Individual allocations under the scope can opt out using __GFP_NOMEMALLOC
+ *
+ * Context: This function should not be used in an interrupt context as that one
+ * does not give PF_MEMALLOC access to reserves.
+ * See __gfp_pfmemalloc_flags().
+ * Return: The saved flags to be passed to memalloc_noreclaim_restore.
+ */
+static inline unsigned int memalloc_noreclaim_save(void)
+{
+ return memalloc_flags_save(PF_MEMALLOC);
+}
+
+/**
+ * memalloc_noreclaim_restore - Ends the implicit __GFP_MEMALLOC scope.
+ * @flags: Flags to restore.
+ *
+ * Ends the implicit __GFP_MEMALLOC scope started by memalloc_noreclaim_save
+ * function. Always make sure that the given flags is the return value from the
+ * pairing memalloc_noreclaim_save call.
+ */
+static inline void memalloc_noreclaim_restore(unsigned int flags)
+{
+ memalloc_flags_restore(flags);
+}
+
+#endif /* _LINUX_SCHED_MM_H */
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
new file mode 100644
index 00000000..20bdc050
--- /dev/null
+++ b/include/linux/sched/signal.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_SIGNAL_H
+#define _LINUX_SCHED_SIGNAL_H
+
+static inline int fatal_signal_pending(struct task_struct *p)
+{
+ return 0;
+}
+
+#endif /* _LINUX_SCHED_SIGNAL_H */
+
diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/include/linux/sched/task_stack.h
diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h
new file mode 100644
index 00000000..498e717a
--- /dev/null
+++ b/include/linux/semaphore.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2008 Intel Corporation
+ * Author: Matthew Wilcox <willy@linux.intel.com>
+ *
+ * Please see kernel/locking/semaphore.c for documentation of these functions
+ */
+#ifndef __LINUX_SEMAPHORE_H
+#define __LINUX_SEMAPHORE_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+/* Please don't access any members of this structure directly */
+struct semaphore {
+ raw_spinlock_t lock;
+ unsigned int count;
+ struct list_head wait_list;
+};
+
+#define __SEMAPHORE_INITIALIZER(name, n) \
+{ \
+ .lock = __RAW_SPIN_LOCK_UNLOCKED((name).lock), \
+ .count = n, \
+ .wait_list = LIST_HEAD_INIT((name).wait_list), \
+}
+
+#define DEFINE_SEMAPHORE(name) \
+ struct semaphore name = __SEMAPHORE_INITIALIZER(name, 1)
+
+static inline void sema_init(struct semaphore *sem, int val)
+{
+ *sem = (struct semaphore) __SEMAPHORE_INITIALIZER(*sem, val);
+}
+
+extern void down(struct semaphore *sem);
+extern int __must_check down_interruptible(struct semaphore *sem);
+extern int __must_check down_killable(struct semaphore *sem);
+extern int __must_check down_trylock(struct semaphore *sem);
+extern int __must_check down_timeout(struct semaphore *sem, long);
+extern void up(struct semaphore *sem);
+
+#endif /* __LINUX_SEMAPHORE_H */
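A minimal usage sketch for the API declared above; DEFINE_SEMAPHORE() here initialises the count to 1, so this behaves like a sleeping mutex:

static DEFINE_SEMAPHORE(one_slot_sketch);

static void guarded_work_sketch(void)
{
	down(&one_slot_sketch);		/* may sleep until the slot is free */
	/* ... exclusive section ... */
	up(&one_slot_sketch);
}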
diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h
new file mode 100644
index 00000000..8c9c0dd7
--- /dev/null
+++ b/include/linux/seq_buf.h
@@ -0,0 +1,153 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SEQ_BUF_H
+#define _LINUX_SEQ_BUF_H
+
+#include <linux/kernel.h>
+#include <stdarg.h>
+#include <string.h>
+
+/*
+ * Trace sequences are used to allow a function to call several other functions
+ * to create a string of data to use.
+ */
+
+/**
+ * seq_buf - seq buffer structure
+ * @buffer: pointer to the buffer
+ * @size: size of the buffer
+ * @len: the amount of data inside the buffer
+ * @readpos: The next position to read in the buffer.
+ */
+struct seq_buf {
+ char *buffer;
+ size_t size;
+ size_t len;
+ loff_t readpos;
+};
+
+static inline void seq_buf_clear(struct seq_buf *s)
+{
+ s->len = 0;
+ s->readpos = 0;
+}
+
+static inline void
+seq_buf_init(struct seq_buf *s, char *buf, unsigned int size)
+{
+ s->buffer = buf;
+ s->size = size;
+ seq_buf_clear(s);
+}
+
+/*
+ * A seq_buf has a buffer that might overflow. When this happens,
+ * len is set past size.
+ */
+static inline bool
+seq_buf_has_overflowed(struct seq_buf *s)
+{
+ return s->len > s->size;
+}
+
+static inline void
+seq_buf_set_overflow(struct seq_buf *s)
+{
+ s->len = s->size + 1;
+}
+
+/*
+ * How much buffer is left on the seq_buf?
+ */
+static inline unsigned int
+seq_buf_buffer_left(struct seq_buf *s)
+{
+ if (seq_buf_has_overflowed(s))
+ return 0;
+
+ return s->size - s->len;
+}
+
+/* How much buffer was written? */
+static inline unsigned int seq_buf_used(struct seq_buf *s)
+{
+ return min(s->len, s->size);
+}
+
+/**
+ * seq_buf_terminate - Make sure buffer is nul terminated
+ * @s: the seq_buf descriptor to terminate.
+ *
+ * This makes sure that the buffer in @s is nul terminated and
+ * safe to read as a string.
+ *
+ * Note, if this is called when the buffer has overflowed, then
+ * the last byte of the buffer is zeroed, and the len will still
+ * point past it.
+ *
+ * After this function is called, s->buffer is safe to use
+ * in string operations.
+ */
+static inline void seq_buf_terminate(struct seq_buf *s)
+{
+ if (WARN_ON(s->size == 0))
+ return;
+
+ if (seq_buf_buffer_left(s))
+ s->buffer[s->len] = 0;
+ else
+ s->buffer[s->size - 1] = 0;
+}
+
+/**
+ * seq_buf_get_buf - get buffer to write arbitrary data to
+ * @s: the seq_buf handle
+ * @bufp: the beginning of the buffer is stored here
+ *
+ * Return the number of bytes available in the buffer, or zero if
+ * there's no space.
+ */
+static inline size_t seq_buf_get_buf(struct seq_buf *s, char **bufp)
+{
+ WARN_ON(s->len > s->size + 1);
+
+ if (s->len < s->size) {
+ *bufp = s->buffer + s->len;
+ return s->size - s->len;
+ }
+
+ *bufp = NULL;
+ return 0;
+}
+
+/**
+ * seq_buf_commit - commit data to the buffer
+ * @s: the seq_buf handle
+ * @num: the number of bytes to commit
+ *
+ * Commit @num bytes of data written to a buffer previously acquired
+ * by seq_buf_get_buf(). To signal an error condition, or that the data
+ * didn't fit in the available space, pass a negative @num value.
+ */
+static inline void seq_buf_commit(struct seq_buf *s, int num)
+{
+ if (num < 0) {
+ seq_buf_set_overflow(s);
+ } else {
+ /* num must be negative on overflow */
+ BUG_ON(s->len + num > s->size);
+ s->len += num;
+ }
+}
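A sketch of the reserve/write/commit protocol described above; the hex formatting is only an example payload:

static void emit_hex_sketch(struct seq_buf *s, const void *data, size_t len)
{
	static const char hex[] = "0123456789abcdef";
	char *p;
	size_t i, avail = seq_buf_get_buf(s, &p);

	if (avail < len * 2) {
		seq_buf_commit(s, -1);		/* mark the buffer overflowed */
		return;
	}

	for (i = 0; i < len; i++) {
		unsigned char b = ((const unsigned char *)data)[i];

		p[i * 2]     = hex[b >> 4];
		p[i * 2 + 1] = hex[b & 0xf];
	}
	seq_buf_commit(s, (int)(len * 2));
}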
+
+extern __printf(2, 3)
+int seq_buf_printf(struct seq_buf *s, const char *fmt, ...);
+extern __printf(2, 0)
+int seq_buf_vprintf(struct seq_buf *s, const char *fmt, va_list args);
+extern int seq_buf_to_user(struct seq_buf *s, char __user *ubuf,
+ int cnt);
+extern int seq_buf_puts(struct seq_buf *s, const char *str);
+extern int seq_buf_putc(struct seq_buf *s, unsigned char c);
+
+void seq_buf_human_readable_u64(struct seq_buf *, u64);
+
+#endif /* _LINUX_SEQ_BUF_H */
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 70478387..b455ebca 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -4,9 +4,6 @@
#include <linux/types.h>
#include <linux/fs.h>
-struct seq_operations;
-struct path;
-
struct seq_file {
char *buf;
size_t size;
@@ -16,7 +13,6 @@ struct seq_file {
loff_t index;
loff_t read_pos;
u64 version;
- const struct seq_operations *op;
int poll_event;
const struct file *file;
void *private;
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 7a081377..435420fe 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -1,78 +1,18 @@
#ifndef __LINUX_SEQLOCK_H
#define __LINUX_SEQLOCK_H
-/*
- * Reader/writer consistent mechanism without starving writers. This type of
- * lock for data where the reader wants a consistent set of information
- * and is willing to retry if the information changes. There are two types
- * of readers:
- * 1. Sequence readers which never block a writer but they may have to retry
- * if a writer is in progress by detecting change in sequence number.
- * Writers do not wait for a sequence reader.
- * 2. Locking readers which will wait if a writer or another locking reader
- * is in progress. A locking reader in progress will also block a writer
- * from going forward. Unlike the regular rwlock, the read lock here is
- * exclusive so that only one locking reader can get it.
- *
- * This is not as cache friendly as brlock. Also, this may not work well
- * for data that contains pointers, because any writer could
- * invalidate a pointer that a reader was following.
- *
- * Expected non-blocking reader usage:
- * do {
- * seq = read_seqbegin(&foo);
- * ...
- * } while (read_seqretry(&foo, seq));
- *
- *
- * On non-SMP the spin locks disappear but the writer still needs
- * to increment the sequence variables because an interrupt routine could
- * change the state of the data.
- *
- * Based on x86_64 vsyscall gettimeofday
- * by Keith Owens and Andrea Arcangeli
- */
-#include <linux/spinlock.h>
-#include <linux/lockdep.h>
#include <linux/compiler.h>
-/*
- * Version using sequence counter only.
- * This can be used when code has its own mutex protecting the
- * updating starting before the write_seqcountbeqin() and ending
- * after the write_seqcount_end().
- */
typedef struct seqcount {
unsigned sequence;
} seqcount_t;
-static inline void __seqcount_init(seqcount_t *s, const char *name,
- struct lock_class_key *key)
+static inline void seqcount_init(seqcount_t *s)
{
s->sequence = 0;
}
-# define SEQCOUNT_DEP_MAP_INIT(lockname)
-# define seqcount_init(s) __seqcount_init(s, NULL, NULL)
-# define seqcount_lockdep_reader_access(x)
-
-#define SEQCNT_ZERO(lockname) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(lockname)}
-
-
-/**
- * __read_seqcount_begin - begin a seq-read critical section (without barrier)
- * @s: pointer to seqcount_t
- * Returns: count to be passed to read_seqcount_retry
- *
- * __read_seqcount_begin is like read_seqcount_begin, but has no smp_rmb()
- * barrier. Callers should ensure that smp_rmb() or equivalent ordering is
- * provided before actually loading any of the variables that are to be
- * protected in this critical section.
- *
- * Use carefully, only in critical code, and comment how the barrier is
- * provided.
- */
-static inline unsigned __read_seqcount_begin(const seqcount_t *s)
+static inline unsigned read_seqcount_begin(const seqcount_t *s)
{
unsigned ret;
@@ -82,486 +22,26 @@ repeat:
cpu_relax();
goto repeat;
}
- return ret;
-}
-
-/**
- * raw_read_seqcount - Read the raw seqcount
- * @s: pointer to seqcount_t
- * Returns: count to be passed to read_seqcount_retry
- *
- * raw_read_seqcount opens a read critical section of the given
- * seqcount without any lockdep checking and without checking or
- * masking the LSB. Calling code is responsible for handling that.
- */
-static inline unsigned raw_read_seqcount(const seqcount_t *s)
-{
- unsigned ret = READ_ONCE(s->sequence);
smp_rmb();
return ret;
}
-/**
- * raw_read_seqcount_begin - start seq-read critical section w/o lockdep
- * @s: pointer to seqcount_t
- * Returns: count to be passed to read_seqcount_retry
- *
- * raw_read_seqcount_begin opens a read critical section of the given
- * seqcount, but without any lockdep checking. Validity of the critical
- * section is tested by checking read_seqcount_retry function.
- */
-static inline unsigned raw_read_seqcount_begin(const seqcount_t *s)
-{
- unsigned ret = __read_seqcount_begin(s);
- smp_rmb();
- return ret;
-}
-
-/**
- * read_seqcount_begin - begin a seq-read critical section
- * @s: pointer to seqcount_t
- * Returns: count to be passed to read_seqcount_retry
- *
- * read_seqcount_begin opens a read critical section of the given seqcount.
- * Validity of the critical section is tested by checking read_seqcount_retry
- * function.
- */
-static inline unsigned read_seqcount_begin(const seqcount_t *s)
-{
- seqcount_lockdep_reader_access(s);
- return raw_read_seqcount_begin(s);
-}
-
-/**
- * raw_seqcount_begin - begin a seq-read critical section
- * @s: pointer to seqcount_t
- * Returns: count to be passed to read_seqcount_retry
- *
- * raw_seqcount_begin opens a read critical section of the given seqcount.
- * Validity of the critical section is tested by checking read_seqcount_retry
- * function.
- *
- * Unlike read_seqcount_begin(), this function will not wait for the count
- * to stabilize. If a writer is active when we begin, we will fail the
- * read_seqcount_retry() instead of stabilizing at the beginning of the
- * critical section.
- */
-static inline unsigned raw_seqcount_begin(const seqcount_t *s)
-{
- unsigned ret = READ_ONCE(s->sequence);
- smp_rmb();
- return ret & ~1;
-}
-
-/**
- * __read_seqcount_retry - end a seq-read critical section (without barrier)
- * @s: pointer to seqcount_t
- * @start: count, from read_seqcount_begin
- * Returns: 1 if retry is required, else 0
- *
- * __read_seqcount_retry is like read_seqcount_retry, but has no smp_rmb()
- * barrier. Callers should ensure that smp_rmb() or equivalent ordering is
- * provided before actually loading any of the variables that are to be
- * protected in this critical section.
- *
- * Use carefully, only in critical code, and comment how the barrier is
- * provided.
- */
-static inline int __read_seqcount_retry(const seqcount_t *s, unsigned start)
-{
- return unlikely(s->sequence != start);
-}
-
-/**
- * read_seqcount_retry - end a seq-read critical section
- * @s: pointer to seqcount_t
- * @start: count, from read_seqcount_begin
- * Returns: 1 if retry is required, else 0
- *
- * read_seqcount_retry closes a read critical section of the given seqcount.
- * If the critical section was invalid, it must be ignored (and typically
- * retried).
- */
static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
{
smp_rmb();
- return __read_seqcount_retry(s, start);
-}
-
-
-
-static inline void raw_write_seqcount_begin(seqcount_t *s)
-{
- s->sequence++;
- smp_wmb();
-}
-
-static inline void raw_write_seqcount_end(seqcount_t *s)
-{
- smp_wmb();
- s->sequence++;
+ return unlikely(s->sequence != start);
}
-/**
- * raw_write_seqcount_barrier - do a seq write barrier
- * @s: pointer to seqcount_t
- *
- * This can be used to provide an ordering guarantee instead of the
- * usual consistency guarantee. It is one wmb cheaper, because we can
- * collapse the two back-to-back wmb()s.
- *
- * seqcount_t seq;
- * bool X = true, Y = false;
- *
- * void read(void)
- * {
- * bool x, y;
- *
- * do {
- * int s = read_seqcount_begin(&seq);
- *
- * x = X; y = Y;
- *
- * } while (read_seqcount_retry(&seq, s));
- *
- * BUG_ON(!x && !y);
- * }
- *
- * void write(void)
- * {
- * Y = true;
- *
- * raw_write_seqcount_barrier(seq);
- *
- * X = false;
- * }
- */
-static inline void raw_write_seqcount_barrier(seqcount_t *s)
+static inline void write_seqcount_begin(seqcount_t *s)
{
s->sequence++;
smp_wmb();
- s->sequence++;
-}
-
-static inline int raw_read_seqcount_latch(seqcount_t *s)
-{
- int seq = READ_ONCE(s->sequence);
- /* Pairs with the first smp_wmb() in raw_write_seqcount_latch() */
- smp_read_barrier_depends();
- return seq;
-}
-
-/**
- * raw_write_seqcount_latch - redirect readers to even/odd copy
- * @s: pointer to seqcount_t
- *
- * The latch technique is a multiversion concurrency control method that allows
- * queries during non-atomic modifications. If you can guarantee queries never
- * interrupt the modification -- e.g. the concurrency is strictly between CPUs
- * -- you most likely do not need this.
- *
- * Where the traditional RCU/lockless data structures rely on atomic
- * modifications to ensure queries observe either the old or the new state the
- * latch allows the same for non-atomic updates. The trade-off is doubling the
- * cost of storage; we have to maintain two copies of the entire data
- * structure.
- *
- * Very simply put: we first modify one copy and then the other. This ensures
- * there is always one copy in a stable state, ready to give us an answer.
- *
- * The basic form is a data structure like:
- *
- * struct latch_struct {
- * seqcount_t seq;
- * struct data_struct data[2];
- * };
- *
- * Where a modification, which is assumed to be externally serialized, does the
- * following:
- *
- * void latch_modify(struct latch_struct *latch, ...)
- * {
- * smp_wmb(); <- Ensure that the last data[1] update is visible
- * latch->seq++;
- * smp_wmb(); <- Ensure that the seqcount update is visible
- *
- * modify(latch->data[0], ...);
- *
- * smp_wmb(); <- Ensure that the data[0] update is visible
- * latch->seq++;
- * smp_wmb(); <- Ensure that the seqcount update is visible
- *
- * modify(latch->data[1], ...);
- * }
- *
- * The query will have a form like:
- *
- * struct entry *latch_query(struct latch_struct *latch, ...)
- * {
- * struct entry *entry;
- * unsigned seq, idx;
- *
- * do {
- * seq = raw_read_seqcount_latch(&latch->seq);
- *
- * idx = seq & 0x01;
- * entry = data_query(latch->data[idx], ...);
- *
- * smp_rmb();
- * } while (seq != latch->seq);
- *
- * return entry;
- * }
- *
- * So during the modification, queries are first redirected to data[1]. Then we
- * modify data[0]. When that is complete, we redirect queries back to data[0]
- * and we can modify data[1].
- *
- * NOTE: The non-requirement for atomic modifications does _NOT_ include
- * the publishing of new entries in the case where data is a dynamic
- * data structure.
- *
- * An iteration might start in data[0] and get suspended long enough
- * to miss an entire modification sequence, once it resumes it might
- * observe the new entry.
- *
- * NOTE: When data is a dynamic data structure; one should use regular RCU
- * patterns to manage the lifetimes of the objects within.
- */
-static inline void raw_write_seqcount_latch(seqcount_t *s)
-{
- smp_wmb(); /* prior stores before incrementing "sequence" */
- s->sequence++;
- smp_wmb(); /* increment "sequence" before following stores */
-}
-
-/*
- * Sequence counter only version assumes that callers are using their
- * own mutexing.
- */
-static inline void write_seqcount_begin_nested(seqcount_t *s, int subclass)
-{
- raw_write_seqcount_begin(s);
-}
-
-static inline void write_seqcount_begin(seqcount_t *s)
-{
- write_seqcount_begin_nested(s, 0);
}
static inline void write_seqcount_end(seqcount_t *s)
{
- raw_write_seqcount_end(s);
-}
-
-/**
- * write_seqcount_invalidate - invalidate in-progress read-side seq operations
- * @s: pointer to seqcount_t
- *
- * After write_seqcount_invalidate, no read-side seq operations will complete
- * successfully and see data older than this.
- */
-static inline void write_seqcount_invalidate(seqcount_t *s)
-{
smp_wmb();
- s->sequence+=2;
-}
-
-typedef struct {
- struct seqcount seqcount;
- spinlock_t lock;
-} seqlock_t;
-
-/*
- * These macros triggered gcc-3.x compile-time problems. We think these are
- * OK now. Be cautious.
- */
-#define __SEQLOCK_UNLOCKED(lockname) \
- { \
- .seqcount = SEQCNT_ZERO(lockname), \
- .lock = __SPIN_LOCK_UNLOCKED(lockname) \
- }
-
-#define seqlock_init(x) \
- do { \
- seqcount_init(&(x)->seqcount); \
- spin_lock_init(&(x)->lock); \
- } while (0)
-
-#define DEFINE_SEQLOCK(x) \
- seqlock_t x = __SEQLOCK_UNLOCKED(x)
-
-/*
- * Read side functions for starting and finalizing a read side section.
- */
-static inline unsigned read_seqbegin(const seqlock_t *sl)
-{
- return read_seqcount_begin(&sl->seqcount);
-}
-
-static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
-{
- return read_seqcount_retry(&sl->seqcount, start);
-}
-
-/*
- * Lock out other writers and update the count.
- * Acts like a normal spin_lock/unlock.
- * Don't need preempt_disable() because that is in the spin_lock already.
- */
-static inline void write_seqlock(seqlock_t *sl)
-{
- spin_lock(&sl->lock);
- write_seqcount_begin(&sl->seqcount);
-}
-
-static inline void write_sequnlock(seqlock_t *sl)
-{
- write_seqcount_end(&sl->seqcount);
- spin_unlock(&sl->lock);
-}
-
-static inline void write_seqlock_bh(seqlock_t *sl)
-{
- spin_lock_bh(&sl->lock);
- write_seqcount_begin(&sl->seqcount);
-}
-
-static inline void write_sequnlock_bh(seqlock_t *sl)
-{
- write_seqcount_end(&sl->seqcount);
- spin_unlock_bh(&sl->lock);
-}
-
-static inline void write_seqlock_irq(seqlock_t *sl)
-{
- spin_lock_irq(&sl->lock);
- write_seqcount_begin(&sl->seqcount);
-}
-
-static inline void write_sequnlock_irq(seqlock_t *sl)
-{
- write_seqcount_end(&sl->seqcount);
- spin_unlock_irq(&sl->lock);
-}
-
-static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&sl->lock, flags);
- write_seqcount_begin(&sl->seqcount);
- return flags;
-}
-
-#define write_seqlock_irqsave(lock, flags) \
- do { flags = __write_seqlock_irqsave(lock); } while (0)
-
-static inline void
-write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
-{
- write_seqcount_end(&sl->seqcount);
- spin_unlock_irqrestore(&sl->lock, flags);
-}
-
-/*
- * A locking reader exclusively locks out other writers and locking readers,
- * but doesn't update the sequence number. Acts like a normal spin_lock/unlock.
- * Don't need preempt_disable() because that is in the spin_lock already.
- */
-static inline void read_seqlock_excl(seqlock_t *sl)
-{
- spin_lock(&sl->lock);
-}
-
-static inline void read_sequnlock_excl(seqlock_t *sl)
-{
- spin_unlock(&sl->lock);
-}
-
-/**
- * read_seqbegin_or_lock - begin a sequence number check or locking block
- * @lock: sequence lock
- * @seq : sequence number to be checked
- *
- * First try it once optimistically without taking the lock. If that fails,
- * take the lock. The sequence number is also used as a marker for deciding
- * whether to be a reader (even) or writer (odd).
- * N.B. seq must be initialized to an even number to begin with.
- */
-static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
-{
- if (!(*seq & 1)) /* Even */
- *seq = read_seqbegin(lock);
- else /* Odd */
- read_seqlock_excl(lock);
-}
-
-static inline int need_seqretry(seqlock_t *lock, int seq)
-{
- return !(seq & 1) && read_seqretry(lock, seq);
-}
-
-static inline void done_seqretry(seqlock_t *lock, int seq)
-{
- if (seq & 1)
- read_sequnlock_excl(lock);
-}
-
-static inline void read_seqlock_excl_bh(seqlock_t *sl)
-{
- spin_lock_bh(&sl->lock);
-}
-
-static inline void read_sequnlock_excl_bh(seqlock_t *sl)
-{
- spin_unlock_bh(&sl->lock);
-}
-
-static inline void read_seqlock_excl_irq(seqlock_t *sl)
-{
- spin_lock_irq(&sl->lock);
-}
-
-static inline void read_sequnlock_excl_irq(seqlock_t *sl)
-{
- spin_unlock_irq(&sl->lock);
-}
-
-static inline unsigned long __read_seqlock_excl_irqsave(seqlock_t *sl)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&sl->lock, flags);
- return flags;
-}
-
-#define read_seqlock_excl_irqsave(lock, flags) \
- do { flags = __read_seqlock_excl_irqsave(lock); } while (0)
-
-static inline void
-read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags)
-{
- spin_unlock_irqrestore(&sl->lock, flags);
-}
-
-static inline unsigned long
-read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq)
-{
- unsigned long flags = 0;
-
- if (!(*seq & 1)) /* Even */
- *seq = read_seqbegin(lock);
- else /* Odd */
- read_seqlock_excl_irqsave(lock, flags);
-
- return flags;
+ s->sequence++;
}
-static inline void
-done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags)
-{
- if (seq & 1)
- read_sequnlock_excl_irqrestore(lock, flags);
-}
#endif /* __LINUX_SEQLOCK_H */
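
The seqcount API above is reduced to a begin/retry reader pair and a begin/end writer pair. For reference, a minimal usage sketch of that retained interface; the counter and the fields it protects are hypothetical, and the writer is assumed to be serialized externally (for example by a mutex), as the simplified header requires:

    #include <linux/seqlock.h>
    #include <linux/types.h>

    static seqcount_t stats_seq;        /* zero-initialized, same as seqcount_init() */
    static u64 stats_bytes, stats_ops;  /* example fields protected by stats_seq */

    /* Writer side: callers provide their own mutual exclusion. */
    static void stats_update(u64 bytes)
    {
            write_seqcount_begin(&stats_seq);
            stats_bytes += bytes;
            stats_ops++;
            write_seqcount_end(&stats_seq);
    }

    /* Reader side: retry if a writer raced with the read. */
    static u64 stats_bytes_read(void)
    {
            unsigned seq;
            u64 ret;

            do {
                    seq = read_seqcount_begin(&stats_seq);
                    ret = stats_bytes;
            } while (read_seqcount_retry(&stats_seq, seq));

            return ret;
    }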
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 626b768c..d0a84794 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -11,20 +11,25 @@ struct shrink_control {
#define SHRINK_STOP (~0UL)
+struct seq_buf;
struct shrinker {
unsigned long (*count_objects)(struct shrinker *,
struct shrink_control *sc);
unsigned long (*scan_objects)(struct shrinker *,
struct shrink_control *sc);
+ void (*to_text)(struct seq_buf *, struct shrinker *);
int seeks; /* seeks to recreate an obj */
long batch; /* reclaim batch size, 0 = default */
struct list_head list;
+ void *private_data;
};
-int register_shrinker(struct shrinker *);
-void unregister_shrinker(struct shrinker *);
+void shrinker_free(struct shrinker *);
+struct shrinker *shrinker_alloc(unsigned int, const char *, ...);
-void run_shrinkers(void);
+int shrinker_register(struct shrinker *);
+
+void run_shrinkers(gfp_t gfp_mask, bool);
#endif /* __TOOLS_LINUX_SHRINKER_H */
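
The shrinker shim now follows the kernel's newer allocate/register/free split instead of register_shrinker()/unregister_shrinker(). A rough usage sketch against the declarations above; the callbacks, the cache they would drain, and the counts they report are made up for illustration:

    #include <errno.h>
    #include <linux/shrinker.h>

    static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc)
    {
            return 128;     /* pretend 128 objects could be reclaimed */
    }

    static unsigned long demo_scan(struct shrinker *s, struct shrink_control *sc)
    {
            return 0;       /* report that nothing was freed in this pass */
    }

    static struct shrinker *demo_shrinker;

    static int demo_shrinker_init(void)
    {
            demo_shrinker = shrinker_alloc(0, "demo");
            if (!demo_shrinker)
                    return -ENOMEM;

            demo_shrinker->count_objects = demo_count;
            demo_shrinker->scan_objects  = demo_scan;
            return shrinker_register(demo_shrinker);
    }

    static void demo_shrinker_exit(void)
    {
            shrinker_free(demo_shrinker);
    }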
diff --git a/include/linux/siphash.h b/include/linux/siphash.h
new file mode 100644
index 00000000..bf21591a
--- /dev/null
+++ b/include/linux/siphash.h
@@ -0,0 +1,145 @@
+/* Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.
+ *
+ * SipHash: a fast short-input PRF
+ * https://131002.net/siphash/
+ *
+ * This implementation is specifically for SipHash2-4 for a secure PRF
+ * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for
+ * hashtables.
+ */
+
+#ifndef _LINUX_SIPHASH_H
+#define _LINUX_SIPHASH_H
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+
+#define SIPHASH_ALIGNMENT __alignof__(u64)
+typedef struct {
+ u64 key[2];
+} siphash_key_t;
+
+static inline bool siphash_key_is_zero(const siphash_key_t *key)
+{
+ return !(key->key[0] | key->key[1]);
+}
+
+u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key);
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key);
+#endif
+
+u64 siphash_1u64(const u64 a, const siphash_key_t *key);
+u64 siphash_2u64(const u64 a, const u64 b, const siphash_key_t *key);
+u64 siphash_3u64(const u64 a, const u64 b, const u64 c,
+ const siphash_key_t *key);
+u64 siphash_4u64(const u64 a, const u64 b, const u64 c, const u64 d,
+ const siphash_key_t *key);
+u64 siphash_1u32(const u32 a, const siphash_key_t *key);
+u64 siphash_3u32(const u32 a, const u32 b, const u32 c,
+ const siphash_key_t *key);
+
+static inline u64 siphash_2u32(const u32 a, const u32 b,
+ const siphash_key_t *key)
+{
+ return siphash_1u64((u64)b << 32 | a, key);
+}
+static inline u64 siphash_4u32(const u32 a, const u32 b, const u32 c,
+ const u32 d, const siphash_key_t *key)
+{
+ return siphash_2u64((u64)b << 32 | a, (u64)d << 32 | c, key);
+}
+
+
+static inline u64 ___siphash_aligned(const __le64 *data, size_t len,
+ const siphash_key_t *key)
+{
+ if (__builtin_constant_p(len) && len == 4)
+ return siphash_1u32(le32_to_cpup((const __le32 *)data), key);
+ if (__builtin_constant_p(len) && len == 8)
+ return siphash_1u64(le64_to_cpu(data[0]), key);
+ if (__builtin_constant_p(len) && len == 16)
+ return siphash_2u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]),
+ key);
+ if (__builtin_constant_p(len) && len == 24)
+ return siphash_3u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]),
+ le64_to_cpu(data[2]), key);
+ if (__builtin_constant_p(len) && len == 32)
+ return siphash_4u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]),
+ le64_to_cpu(data[2]), le64_to_cpu(data[3]),
+ key);
+ return __siphash_aligned(data, len, key);
+}
+
+/**
+ * siphash - compute 64-bit siphash PRF value
+ * @data: buffer to hash
+ * @len: length of @data
+ * @key: the siphash key
+ */
+static inline u64 siphash(const void *data, size_t len,
+ const siphash_key_t *key)
+{
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ if (!IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT))
+ return __siphash_unaligned(data, len, key);
+#endif
+ return ___siphash_aligned(data, len, key);
+}
+
+#define HSIPHASH_ALIGNMENT __alignof__(unsigned long)
+typedef struct {
+ unsigned long key[2];
+} hsiphash_key_t;
+
+u32 __hsiphash_aligned(const void *data, size_t len,
+ const hsiphash_key_t *key);
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u32 __hsiphash_unaligned(const void *data, size_t len,
+ const hsiphash_key_t *key);
+#endif
+
+u32 hsiphash_1u32(const u32 a, const hsiphash_key_t *key);
+u32 hsiphash_2u32(const u32 a, const u32 b, const hsiphash_key_t *key);
+u32 hsiphash_3u32(const u32 a, const u32 b, const u32 c,
+ const hsiphash_key_t *key);
+u32 hsiphash_4u32(const u32 a, const u32 b, const u32 c, const u32 d,
+ const hsiphash_key_t *key);
+
+static inline u32 ___hsiphash_aligned(const __le32 *data, size_t len,
+ const hsiphash_key_t *key)
+{
+ if (__builtin_constant_p(len) && len == 4)
+ return hsiphash_1u32(le32_to_cpu(data[0]), key);
+ if (__builtin_constant_p(len) && len == 8)
+ return hsiphash_2u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]),
+ key);
+ if (__builtin_constant_p(len) && len == 12)
+ return hsiphash_3u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]),
+ le32_to_cpu(data[2]), key);
+ if (__builtin_constant_p(len) && len == 16)
+ return hsiphash_4u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]),
+ le32_to_cpu(data[2]), le32_to_cpu(data[3]),
+ key);
+ return __hsiphash_aligned(data, len, key);
+}
+
+/**
+ * hsiphash - compute 32-bit hsiphash PRF value
+ * @data: buffer to hash
+ * @len: length of @data
+ * @key: the hsiphash key
+ */
+static inline u32 hsiphash(const void *data, size_t len,
+ const hsiphash_key_t *key)
+{
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ if (!IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT))
+ return __hsiphash_unaligned(data, len, key);
+#endif
+ return ___hsiphash_aligned(data, len, key);
+}
+
+#endif /* _LINUX_SIPHASH_H */
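
For illustration, a small sketch of how the helpers declared above are typically called; the key bytes and the field layout are arbitrary examples, not anything bcachefs actually hashes:

    #include <linux/siphash.h>

    static const siphash_key_t demo_key = {
            { 0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL }
    };

    /* Fixed-width inputs: the specialized helpers skip length dispatch. */
    static u64 hash_inode(u64 inum, u32 generation)
    {
            return siphash_2u64(inum, generation, &demo_key);
    }

    /* Variable-length input: the generic entry point handles unaligned data. */
    static u64 hash_name(const void *name, size_t len)
    {
            return siphash(name, len, &demo_key);
    }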
diff --git a/include/linux/slab.h b/include/linux/slab.h
index c19f190b..24df2cc5 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -6,75 +6,143 @@
#include <string.h>
#include <linux/kernel.h>
+#include <linux/log2.h>
+#include <linux/overflow.h>
#include <linux/page.h>
#include <linux/shrinker.h>
#include <linux/types.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+
+#define alloc_hooks(_do, ...) _do
+
#define ARCH_KMALLOC_MINALIGN 16
+#define ARCH_SLAB_MINALIGN 16
#define KMALLOC_MAX_SIZE SIZE_MAX
-static inline void *kmalloc(size_t size, gfp_t flags)
+#define MAX_PAGE_ORDER 10
+
+static inline size_t kmalloc_size_roundup(size_t s)
{
- void *p;
+ return roundup_pow_of_two(s);
+}
- run_shrinkers();
+static inline void *kmalloc_noprof(size_t size, gfp_t flags)
+{
+ unsigned i;
+ void *p;
- p = malloc(size);
- if (p && (flags & __GFP_ZERO))
- memset(p, 0, size);
+ for (i = 0; i < 10; i++) {
+ if (size) {
+ size_t alignment = min_t(size_t, PAGE_SIZE,
+ rounddown_pow_of_two(size));
+ alignment = max(sizeof(void *), alignment);
+ if (posix_memalign(&p, alignment, size))
+ p = NULL;
+ } else {
+ p = malloc(0);
+ }
+
+ if (p) {
+ if (flags & __GFP_ZERO)
+ memset(p, 0, size);
+ break;
+ }
+
+ run_shrinkers(flags, true);
+ }
return p;
}
+#define kmalloc kmalloc_noprof
static inline void *krealloc(void *old, size_t size, gfp_t flags)
{
void *new;
- run_shrinkers();
-
- new = malloc(size);
+ new = kmalloc(size, flags);
if (!new)
return NULL;
if (flags & __GFP_ZERO)
memset(new, 0, size);
- memcpy(new, old,
- min(malloc_usable_size(old),
- malloc_usable_size(new)));
- free(old);
+ if (old) {
+ memcpy(new, old,
+ min(malloc_usable_size(old),
+ malloc_usable_size(new)));
+ free(old);
+ }
return new;
}
+static inline void *krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t flags)
+{
+ size_t bytes;
+
+ if (unlikely(check_mul_overflow(new_n, new_size, &bytes)))
+ return NULL;
+
+ return krealloc(p, bytes, flags);
+}
+
#define kzalloc(size, flags) kmalloc(size, flags|__GFP_ZERO)
-#define kmalloc_array(n, size, flags) \
+
+static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
+{
+ size_t bytes;
+
+ if (unlikely(check_mul_overflow(n, size, &bytes)))
+ return NULL;
+ return kmalloc(bytes, flags);
+}
+
+#define kvmalloc_array(n, size, flags) \
((size) != 0 && (n) > SIZE_MAX / (size) \
- ? NULL : kmalloc(n * size, flags))
+ ? NULL : kmalloc((n) * (size), flags))
+
+#define kvmalloc_array_noprof(...) kvmalloc_array(__VA_ARGS__)
#define kcalloc(n, size, flags) kmalloc_array(n, size, flags|__GFP_ZERO)
-#define kfree(p) free(p)
-#define kvfree(p) free(p)
-#define kzfree(p) free(p)
+#define kfree(p) free((void *) p)
+#define kzfree(p) free((void *) p)
-static inline struct page *alloc_pages(gfp_t flags, unsigned int order)
+#define kvmalloc(size, flags) kmalloc(size, flags)
+#define kvmalloc_noprof(size, flags) kmalloc(size, flags)
+#define kvzalloc(size, flags) kzalloc(size, flags)
+#define kvfree(p) kfree(p)
+
+static inline struct page *alloc_pages_noprof(gfp_t flags, unsigned int order)
{
size_t size = PAGE_SIZE << order;
+ unsigned i;
void *p;
- run_shrinkers();
+ for (i = 0; i < 10; i++) {
+ p = aligned_alloc(PAGE_SIZE, size);
+
+ if (p) {
+ if (flags & __GFP_ZERO)
+ memset(p, 0, size);
+ break;
+ }
- p = aligned_alloc(PAGE_SIZE, size);
- if (p && (flags & __GFP_ZERO))
- memset(p, 0, size);
+ run_shrinkers(flags, true);
+ }
return p;
}
+#define alloc_pages alloc_pages_noprof
#define alloc_page(gfp) alloc_pages(gfp, 0)
+#define _get_free_pages(gfp, order) ((unsigned long) alloc_pages(gfp, order))
#define __get_free_pages(gfp, order) ((unsigned long) alloc_pages(gfp, order))
+#define get_free_pages_noprof(gfp, order) \
+ ((unsigned long) alloc_pages(gfp, order))
#define __get_free_page(gfp) __get_free_pages(gfp, 0)
#define __free_pages(page, order) \
@@ -122,4 +190,96 @@ static inline void *kmemdup(const void *src, size_t len, gfp_t gfp)
return p;
}
+struct kmem_cache {
+ size_t obj_size;
+};
+
+static inline void *kmem_cache_alloc(struct kmem_cache *c, gfp_t gfp)
+{
+ return kmalloc(c->obj_size, gfp);
+}
+
+static inline void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t gfp)
+{
+ return kzalloc(c->obj_size, gfp);
+}
+
+static inline void kmem_cache_free(struct kmem_cache *c, void *p)
+{
+ kfree(p);
+}
+
+static inline void kmem_cache_destroy(struct kmem_cache *p)
+{
+ kfree(p);
+}
+
+static inline struct kmem_cache *kmem_cache_create(size_t obj_size)
+{
+ struct kmem_cache *p = kmalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return NULL;
+
+ p->obj_size = obj_size;
+ return p;
+}
+
+#define KMEM_CACHE(_struct, _flags) kmem_cache_create(sizeof(struct _struct))
+
+#define PAGE_KERNEL 0
+#define PAGE_KERNEL_EXEC 1
+
+#define vfree(p) free(p)
+
+static inline void *__vmalloc_noprof(unsigned long size, gfp_t flags)
+{
+ unsigned i;
+ void *p;
+
+ size = round_up(size, PAGE_SIZE);
+
+ for (i = 0; i < 10; i++) {
+ p = aligned_alloc(PAGE_SIZE, size);
+
+ if (p) {
+ if (flags & __GFP_ZERO)
+ memset(p, 0, size);
+ break;
+ }
+
+ run_shrinkers(flags, true);
+ }
+
+ return p;
+}
+#define __vmalloc __vmalloc_noprof
+
+static inline void *vmalloc_exec(unsigned long size, gfp_t gfp_mask)
+{
+ void *p;
+
+ p = __vmalloc(size, gfp_mask);
+ if (!p)
+ return NULL;
+
+ if (mprotect(p, size, PROT_READ|PROT_WRITE|PROT_EXEC)) {
+ vfree(p);
+ return NULL;
+ }
+
+ return p;
+}
+
+static inline void *vmalloc(unsigned long size)
+{
+ return __vmalloc(size, GFP_KERNEL);
+}
+
+#define vmalloc_noprof(...) vmalloc(__VA_ARGS__)
+
+static inline void *vzalloc(unsigned long size)
+{
+ return __vmalloc(size, GFP_KERNEL|__GFP_ZERO);
+}
+
#endif /* __TOOLS_LINUX_SLAB_H */
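
In this userspace shim a kmem_cache is just a recorded object size, so the usual kernel calling pattern still works; a brief sketch with a hypothetical object type:

    #include <errno.h>
    #include <linux/slab.h>

    struct demo_obj {
            u64     offset;
            u32     flags;
    };

    static struct kmem_cache *demo_cache;

    static int demo_cache_init(void)
    {
            demo_cache = KMEM_CACHE(demo_obj, 0);   /* kmem_cache_create(sizeof(struct demo_obj)) */
            return demo_cache ? 0 : -ENOMEM;
    }

    static struct demo_obj *demo_obj_alloc(void)
    {
            return kmem_cache_zalloc(demo_cache, GFP_KERNEL);
    }

    static void demo_obj_free(struct demo_obj *obj)
    {
            kmem_cache_free(demo_cache, obj);
    }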
diff --git a/include/linux/sort.h b/include/linux/sort.h
index d534da2b..17c6ba34 100644
--- a/include/linux/sort.h
+++ b/include/linux/sort.h
@@ -1,10 +1,19 @@
#ifndef _LINUX_SORT_H
#define _LINUX_SORT_H
+#include <stdlib.h>
#include <linux/types.h>
-void sort(void *base, size_t num, size_t size,
- int (*cmp)(const void *, const void *),
- void (*swap)(void *, void *, int));
+void sort_r(void *base, size_t num, size_t size,
+ cmp_r_func_t cmp_func,
+ swap_r_func_t swap_func,
+ const void *priv);
+
+static inline void sort(void *base, size_t num, size_t size,
+ int (*cmp_func)(const void *, const void *),
+ void (*swap_func)(void *, void *, int size))
+{
+ return qsort(base, num, size, cmp_func);
+}
#endif
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index c9be6b61..28ce667b 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -1,60 +1 @@
-#ifndef __TOOLS_LINUX_SPINLOCK_H
-#define __TOOLS_LINUX_SPINLOCK_H
-
-#include <linux/atomic.h>
-
-typedef struct {
- int count;
-} raw_spinlock_t;
-
-#define __RAW_SPIN_LOCK_UNLOCKED(name) (raw_spinlock_t) { .count = 0 }
-
-static inline void raw_spin_lock_init(raw_spinlock_t *lock)
-{
- smp_store_release(&lock->count, 0);
-}
-
-static inline void raw_spin_lock(raw_spinlock_t *lock)
-{
- while (xchg_acquire(&lock->count, 1))
- ;
-}
-
-static inline void raw_spin_unlock(raw_spinlock_t *lock)
-{
- smp_store_release(&lock->count, 0);
-}
-
-#define raw_spin_lock_irq(lock) raw_spin_lock(lock)
-#define raw_spin_unlock_irq(lock) raw_spin_unlock(lock)
-
-#define raw_spin_lock_irqsave(lock, flags) \
-do { \
- flags = 0; \
- raw_spin_lock(lock); \
-} while (0)
-
-#define raw_spin_unlock_irqrestore(lock, flags) raw_spin_unlock(lock)
-
-typedef raw_spinlock_t spinlock_t;
-
-#define __SPIN_LOCK_UNLOCKED(name) __RAW_SPIN_LOCK_UNLOCKED(name)
-
-#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
-
-#define spin_lock_init(lock) raw_spin_lock_init(lock)
-#define spin_lock(lock) raw_spin_lock(lock)
-#define spin_unlock(lock) raw_spin_unlock(lock)
-
-#define spin_lock_nested(lock, n) spin_lock(lock)
-
-#define spin_lock_bh(lock) raw_spin_lock(lock)
-#define spin_unlock_bh(lock) raw_spin_unlock(lock)
-
-#define spin_lock_irq(lock) raw_spin_lock(lock)
-#define spin_unlock_irq(lock) raw_spin_unlock(lock)
-
-#define spin_lock_irqsave(lock, flags) raw_spin_lock_irqsave(lock, flags)
-#define spin_unlock_irqrestore(lock, flags) raw_spin_unlock_irqrestore(lock, flags)
-
-#endif /* __TOOLS_LINUX_SPINLOCK_H */
+#include "linux/spinlock_types.h"
diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
new file mode 100644
index 00000000..6c4a623c
--- /dev/null
+++ b/include/linux/spinlock_types.h
@@ -0,0 +1,65 @@
+#ifndef __TOOLS_LINUX_SPINLOCK_H
+#define __TOOLS_LINUX_SPINLOCK_H
+
+#include <linux/atomic.h>
+#include <pthread.h>
+
+typedef struct {
+ pthread_mutex_t lock;
+} raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED(name) (raw_spinlock_t) { .lock = PTHREAD_MUTEX_INITIALIZER }
+
+static inline void raw_spin_lock_init(raw_spinlock_t *lock)
+{
+ pthread_mutex_init(&lock->lock, NULL);
+}
+
+static inline bool raw_spin_trylock(raw_spinlock_t *lock)
+{
+ return !pthread_mutex_trylock(&lock->lock);
+}
+
+static inline void raw_spin_lock(raw_spinlock_t *lock)
+{
+ pthread_mutex_lock(&lock->lock);
+}
+
+static inline void raw_spin_unlock(raw_spinlock_t *lock)
+{
+ pthread_mutex_unlock(&lock->lock);
+}
+
+#define raw_spin_lock_irq(lock) raw_spin_lock(lock)
+#define raw_spin_unlock_irq(lock) raw_spin_unlock(lock)
+
+#define raw_spin_lock_irqsave(lock, flags) \
+do { \
+ flags = 0; \
+ raw_spin_lock(lock); \
+} while (0)
+
+#define raw_spin_unlock_irqrestore(lock, flags) raw_spin_unlock(lock)
+
+typedef raw_spinlock_t spinlock_t;
+
+#define __SPIN_LOCK_UNLOCKED(name) __RAW_SPIN_LOCK_UNLOCKED(name)
+
+#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
+
+#define spin_lock_init(lock) raw_spin_lock_init(lock)
+#define spin_lock(lock) raw_spin_lock(lock)
+#define spin_unlock(lock) raw_spin_unlock(lock)
+
+#define spin_lock_nested(lock, n) spin_lock(lock)
+
+#define spin_lock_bh(lock) raw_spin_lock(lock)
+#define spin_unlock_bh(lock) raw_spin_unlock(lock)
+
+#define spin_lock_irq(lock) raw_spin_lock(lock)
+#define spin_unlock_irq(lock) raw_spin_unlock(lock)
+
+#define spin_lock_irqsave(lock, flags) raw_spin_lock_irqsave(lock, flags)
+#define spin_unlock_irqrestore(lock, flags) raw_spin_unlock_irqrestore(lock, flags)
+
+#endif /* __TOOLS_LINUX_SPINLOCK_H */
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
new file mode 100644
index 00000000..b93cb8e3
--- /dev/null
+++ b/include/linux/srcu.h
@@ -0,0 +1,69 @@
+#ifndef __TOOLS_LINUX_SRCU_H
+#define __TOOLS_LINUX_SRCU_H
+
+#include <linux/rcupdate.h>
+
+#define NUM_ACTIVE_RCU_POLL_OLDSTATE 2
+
+typedef void (*rcu_callback_t)(struct rcu_head *head);
+
+struct srcu_struct {
+};
+
+static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) {}
+
+static inline int srcu_read_lock(struct srcu_struct *ssp)
+{
+ return 0;
+}
+
+static inline bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+ return true;
+}
+
+static inline unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
+{
+ return 0;
+}
+
+static inline unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
+{
+ return 0;
+}
+
+#undef poll_state_synchronize_rcu
+static inline bool poll_state_synchronize_rcu(unsigned long cookie)
+{
+ return false;
+}
+
+#undef start_poll_synchronize_rcu
+static inline unsigned long start_poll_synchronize_rcu()
+{
+ return 0;
+}
+
+static inline unsigned long get_state_synchronize_rcu()
+{
+ return 0;
+}
+
+static inline void synchronize_srcu_expedited(struct srcu_struct *ssp) {}
+
+static inline void srcu_barrier(struct srcu_struct *ssp) {}
+
+static inline void cleanup_srcu_struct(struct srcu_struct *ssp) {}
+
+static inline void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
+ rcu_callback_t func)
+{
+ call_rcu(rhp, func);
+}
+
+static inline int init_srcu_struct(struct srcu_struct *ssp)
+{
+ return 0;
+}
+
+#endif /* __TOOLS_LINUX_SRCU_H */
diff --git a/include/linux/string.h b/include/linux/string.h
index ec35b8df..f6ce8dde 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -6,11 +6,13 @@
#include <linux/types.h> /* for size_t */
extern size_t strlcpy(char *dest, const char *src, size_t size);
-extern char *skip_spaces(const char *);
+extern ssize_t strscpy(char *dest, const char *src, size_t count);
extern char *strim(char *);
extern void memzero_explicit(void *, size_t);
int match_string(const char * const *, size_t, const char *);
+extern void * memscan(void *,int, size_t);
#define kstrndup(s, n, gfp) strndup(s, n)
+#define kstrdup(s, gfp) strdup(s)
#endif /* _LINUX_STRING_H_ */
diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h
new file mode 100644
index 00000000..120ca0f2
--- /dev/null
+++ b/include/linux/string_choices.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_STRING_CHOICES_H_
+#define _LINUX_STRING_CHOICES_H_
+
+/*
+ * This header provides helpers in the str_$TRUE_$FALSE format (you can
+ * also expand some helpers as needed), where $TRUE and $FALSE are their
+ * corresponding literal strings. These helpers can be used in the printing
+ * and also in other places where constant strings are required. Using these
+ * helpers offers the following benefits:
+ * 1) Reducing the hardcoding of strings, which makes the code more elegant
+ * through these simple literal-meaning helpers.
+ * 2) Unifying the output, which prevents the same string from being printed
+ * in various forms, such as enable/disable, enabled/disabled, en/dis.
+ * 3) Deduping by the linker, which results in a smaller binary file.
+ */
+
+#include <linux/types.h>
+
+static inline const char *str_enable_disable(bool v)
+{
+ return v ? "enable" : "disable";
+}
+#define str_disable_enable(v) str_enable_disable(!(v))
+
+static inline const char *str_enabled_disabled(bool v)
+{
+ return v ? "enabled" : "disabled";
+}
+#define str_disabled_enabled(v) str_enabled_disabled(!(v))
+
+static inline const char *str_hi_lo(bool v)
+{
+ return v ? "hi" : "lo";
+}
+#define str_lo_hi(v) str_hi_lo(!(v))
+
+static inline const char *str_high_low(bool v)
+{
+ return v ? "high" : "low";
+}
+#define str_low_high(v) str_high_low(!(v))
+
+static inline const char *str_read_write(bool v)
+{
+ return v ? "read" : "write";
+}
+#define str_write_read(v) str_read_write(!(v))
+
+static inline const char *str_on_off(bool v)
+{
+ return v ? "on" : "off";
+}
+#define str_off_on(v) str_on_off(!(v))
+
+static inline const char *str_yes_no(bool v)
+{
+ return v ? "yes" : "no";
+}
+#define str_no_yes(v) str_yes_no(!(v))
+
+static inline const char *str_up_down(bool v)
+{
+ return v ? "up" : "down";
+}
+#define str_down_up(v) str_up_down(!(v))
+
+static inline const char *str_true_false(bool v)
+{
+ return v ? "true" : "false";
+}
+#define str_false_true(v) str_true_false(!(v))
+
+/**
+ * str_plural - Return the simple pluralization based on English counts
+ * @num: Number used for deciding pluralization
+ *
+ * If @num is 1, returns an empty string, otherwise returns "s".
+ */
+static inline const char *str_plural(size_t num)
+{
+ return num == 1 ? "" : "s";
+}
+
+#endif
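
Typical use is in message formatting; a purely illustrative example using two of the helpers above:

    #include <stdbool.h>
    #include <stdio.h>
    #include <linux/string_choices.h>

    static void report(size_t nr_errors, bool degraded_mode)
    {
            printf("found %zu error%s, degraded mode %s\n",
                   nr_errors, str_plural(nr_errors),
                   str_enabled_disabled(degraded_mode));
    }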
diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
new file mode 100644
index 00000000..af587706
--- /dev/null
+++ b/include/linux/string_helpers.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_STRING_HELPERS_H_
+#define _LINUX_STRING_HELPERS_H_
+
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+
+/* Descriptions of the types of units to
+ * print in */
+enum string_size_units {
+ STRING_UNITS_10, /* use powers of 10^3 (standard SI) */
+ STRING_UNITS_2, /* use binary powers of 2^10 */
+};
+
+int string_get_size(u64 size, u64 blk_size, enum string_size_units units,
+ char *buf, int len);
+
+#endif
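
A short sketch of calling string_get_size() as declared above; the sector count and 512-byte block size are illustrative:

    #include <stdio.h>
    #include <linux/string_helpers.h>

    static void print_dev_size(u64 nr_sectors)
    {
            char buf[32];

            /* 512-byte blocks, printed with binary (2^10-based) units */
            string_get_size(nr_sectors, 512, STRING_UNITS_2, buf, sizeof(buf));
            printf("device size: %s\n", buf);
    }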
diff --git a/include/linux/stringify.h b/include/linux/stringify.h
deleted file mode 100644
index 841cec8e..00000000
--- a/include/linux/stringify.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef __LINUX_STRINGIFY_H
-#define __LINUX_STRINGIFY_H
-
-/* Indirect stringification. Doing two levels allows the parameter to be a
- * macro itself. For example, compile with -DFOO=bar, __stringify(FOO)
- * converts to "bar".
- */
-
-#define __stringify_1(x...) #x
-#define __stringify(x...) __stringify_1(x)
-
-#endif /* !__LINUX_STRINGIFY_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
new file mode 100644
index 00000000..81864222
--- /dev/null
+++ b/include/linux/swap.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SWAP_H
+#define _LINUX_SWAP_H
+
+static inline void mm_account_reclaimed_pages(unsigned long pages) {}
+
+#endif /* _LINUX_SWAP_H */
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index dde16922..cb75d88b 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -2,7 +2,6 @@
#define _SYSFS_H_
#include <linux/compiler.h>
-#include <linux/stringify.h>
struct kobject;
@@ -11,11 +10,9 @@ struct attribute {
umode_t mode;
};
-#define __ATTR(_name, _mode, _show, _store) { \
- .attr = {.name = __stringify(_name), .mode = _mode }, \
- .show = _show, \
- .store = _store, \
-}
+struct attribute_group {
+ struct attribute **attrs;
+};
struct sysfs_ops {
ssize_t (*show)(struct kobject *, struct attribute *, char *);
diff --git a/include/linux/thread_with_file.h b/include/linux/thread_with_file.h
new file mode 100644
index 00000000..2a66e762
--- /dev/null
+++ b/include/linux/thread_with_file.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev>
+ */
+#ifndef _LINUX_THREAD_WITH_FILE_H
+#define _LINUX_THREAD_WITH_FILE_H
+
+struct stdio_redirect;
+
+__printf(3, 0)
+static inline void stdio_redirect_vprintf(struct stdio_redirect *s, bool nonblocking, const char *msg, va_list args) {}
+__printf(3, 4)
+static inline void stdio_redirect_printf(struct stdio_redirect *s, bool nonblocking, const char *msg, ...) {}
+
+#endif /* _LINUX_THREAD_WITH_FILE_H */
diff --git a/include/linux/thread_with_file_types.h b/include/linux/thread_with_file_types.h
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/include/linux/thread_with_file_types.h
diff --git a/include/linux/time.h b/include/linux/time.h
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/include/linux/time.h
diff --git a/include/linux/time64.h b/include/linux/time64.h
index fd59a9a6..0cef3f8c 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -3,6 +3,8 @@
#include <linux/types.h>
+#define timespec64 timespec
+
typedef __s64 time64_t;
/* Parameters used to convert the timespec values: */
@@ -42,4 +44,22 @@ static inline struct timespec timespec_trunc(struct timespec t, unsigned gran)
return t;
}
+static inline void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec)
+{
+ while (nsec >= NSEC_PER_SEC) {
+ nsec -= NSEC_PER_SEC;
+ ++sec;
+ }
+ while (nsec < 0) {
+ nsec += NSEC_PER_SEC;
+ --sec;
+ }
+ ts->tv_sec = sec;
+ ts->tv_nsec = nsec;
+}
+
+#define ns_to_timespec64 ns_to_timespec
+#define timespec64_to_ns timespec_to_ns
+#define timespec64_trunc timespec_trunc
+
#endif /* _LINUX_TIME64_H */
diff --git a/include/linux/types.h b/include/linux/types.h
index ee94a222..5ee5ebc6 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -6,11 +6,17 @@
#include <stdint.h>
#include <fcntl.h>
+#include <sys/stat.h>
#include <sys/types.h>
+#include <linux/posix_types.h>
+#ifndef __SANE_USERSPACE_TYPES__
#define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */
+#endif
#include <asm/types.h>
+#include <linux/cache.h>
+
#define BITS_PER_LONG __BITS_PER_LONG
struct page;
@@ -22,15 +28,19 @@ typedef unsigned short umode_t;
typedef unsigned gfp_t;
-#define GFP_KERNEL 0
#define GFP_ATOMIC 0
#define GFP_NOFS 0
#define GFP_NOIO 0
#define GFP_NOWAIT 0
+#define __GFP_FS 0
#define __GFP_IO 0
#define __GFP_NOWARN 0
#define __GFP_NORETRY 0
+#define __GFP_NOFAIL 0
+#define __GFP_ACCOUNT 0
+#define __GFP_RECLAIMABLE 0
#define __GFP_ZERO 1
+#define GFP_KERNEL 2
#define PAGE_ALLOC_COSTLY_ORDER 6
@@ -72,4 +82,15 @@ typedef __u64 __bitwise __be64;
typedef u64 sector_t;
+typedef void (*swap_r_func_t)(void *a, void *b, int size, const void *priv);
+typedef void (*swap_func_t)(void *a, void *b, int size);
+
+typedef int (*cmp_r_func_t)(const void *a, const void *b, const void *priv);
+typedef int (*cmp_func_t)(const void *a, const void *b);
+
+typedef unsigned int __bitwise slab_flags_t;
+typedef u64 phys_addr_t;
+struct vm_struct;
+struct mnt_idmap;
+
#endif /* _TOOLS_LINUX_TYPES_H_ */
diff --git a/include/linux/unaligned.h b/include/linux/unaligned.h
new file mode 100644
index 00000000..f6d94a85
--- /dev/null
+++ b/include/linux/unaligned.h
@@ -0,0 +1 @@
+#include <asm/unaligned.h>
diff --git a/include/linux/unaligned/access_ok.h b/include/linux/unaligned/access_ok.h
deleted file mode 100644
index 33383ca2..00000000
--- a/include/linux/unaligned/access_ok.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef _LINUX_UNALIGNED_ACCESS_OK_H
-#define _LINUX_UNALIGNED_ACCESS_OK_H
-
-#include <linux/kernel.h>
-#include <asm/byteorder.h>
-
-static __always_inline u16 get_unaligned_le16(const void *p)
-{
- return le16_to_cpup((__le16 *)p);
-}
-
-static __always_inline u32 get_unaligned_le32(const void *p)
-{
- return le32_to_cpup((__le32 *)p);
-}
-
-static __always_inline u64 get_unaligned_le64(const void *p)
-{
- return le64_to_cpup((__le64 *)p);
-}
-
-static __always_inline u16 get_unaligned_be16(const void *p)
-{
- return be16_to_cpup((__be16 *)p);
-}
-
-static __always_inline u32 get_unaligned_be32(const void *p)
-{
- return be32_to_cpup((__be32 *)p);
-}
-
-static __always_inline u64 get_unaligned_be64(const void *p)
-{
- return be64_to_cpup((__be64 *)p);
-}
-
-static __always_inline void put_unaligned_le16(u16 val, void *p)
-{
- *((__le16 *)p) = cpu_to_le16(val);
-}
-
-static __always_inline void put_unaligned_le32(u32 val, void *p)
-{
- *((__le32 *)p) = cpu_to_le32(val);
-}
-
-static __always_inline void put_unaligned_le64(u64 val, void *p)
-{
- *((__le64 *)p) = cpu_to_le64(val);
-}
-
-static __always_inline void put_unaligned_be16(u16 val, void *p)
-{
- *((__be16 *)p) = cpu_to_be16(val);
-}
-
-static __always_inline void put_unaligned_be32(u32 val, void *p)
-{
- *((__be32 *)p) = cpu_to_be32(val);
-}
-
-static __always_inline void put_unaligned_be64(u64 val, void *p)
-{
- *((__be64 *)p) = cpu_to_be64(val);
-}
-
-#endif /* _LINUX_UNALIGNED_ACCESS_OK_H */
diff --git a/include/linux/unaligned/be_memmove.h b/include/linux/unaligned/be_memmove.h
deleted file mode 100644
index c2a76c5c..00000000
--- a/include/linux/unaligned/be_memmove.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef _LINUX_UNALIGNED_BE_MEMMOVE_H
-#define _LINUX_UNALIGNED_BE_MEMMOVE_H
-
-#include <linux/unaligned/memmove.h>
-
-static inline u16 get_unaligned_be16(const void *p)
-{
- return __get_unaligned_memmove16((const u8 *)p);
-}
-
-static inline u32 get_unaligned_be32(const void *p)
-{
- return __get_unaligned_memmove32((const u8 *)p);
-}
-
-static inline u64 get_unaligned_be64(const void *p)
-{
- return __get_unaligned_memmove64((const u8 *)p);
-}
-
-static inline void put_unaligned_be16(u16 val, void *p)
-{
- __put_unaligned_memmove16(val, p);
-}
-
-static inline void put_unaligned_be32(u32 val, void *p)
-{
- __put_unaligned_memmove32(val, p);
-}
-
-static inline void put_unaligned_be64(u64 val, void *p)
-{
- __put_unaligned_memmove64(val, p);
-}
-
-#endif /* _LINUX_UNALIGNED_LE_MEMMOVE_H */
diff --git a/include/linux/unaligned/le_memmove.h b/include/linux/unaligned/le_memmove.h
deleted file mode 100644
index 269849be..00000000
--- a/include/linux/unaligned/le_memmove.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef _LINUX_UNALIGNED_LE_MEMMOVE_H
-#define _LINUX_UNALIGNED_LE_MEMMOVE_H
-
-#include <linux/unaligned/memmove.h>
-
-static inline u16 get_unaligned_le16(const void *p)
-{
- return __get_unaligned_memmove16((const u8 *)p);
-}
-
-static inline u32 get_unaligned_le32(const void *p)
-{
- return __get_unaligned_memmove32((const u8 *)p);
-}
-
-static inline u64 get_unaligned_le64(const void *p)
-{
- return __get_unaligned_memmove64((const u8 *)p);
-}
-
-static inline void put_unaligned_le16(u16 val, void *p)
-{
- __put_unaligned_memmove16(val, p);
-}
-
-static inline void put_unaligned_le32(u32 val, void *p)
-{
- __put_unaligned_memmove32(val, p);
-}
-
-static inline void put_unaligned_le64(u64 val, void *p)
-{
- __put_unaligned_memmove64(val, p);
-}
-
-#endif /* _LINUX_UNALIGNED_LE_MEMMOVE_H */
diff --git a/include/linux/unaligned/memmove.h b/include/linux/unaligned/memmove.h
deleted file mode 100644
index eeb5a779..00000000
--- a/include/linux/unaligned/memmove.h
+++ /dev/null
@@ -1,45 +0,0 @@
-#ifndef _LINUX_UNALIGNED_MEMMOVE_H
-#define _LINUX_UNALIGNED_MEMMOVE_H
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-
-/* Use memmove here, so gcc does not insert a __builtin_memcpy. */
-
-static inline u16 __get_unaligned_memmove16(const void *p)
-{
- u16 tmp;
- memmove(&tmp, p, 2);
- return tmp;
-}
-
-static inline u32 __get_unaligned_memmove32(const void *p)
-{
- u32 tmp;
- memmove(&tmp, p, 4);
- return tmp;
-}
-
-static inline u64 __get_unaligned_memmove64(const void *p)
-{
- u64 tmp;
- memmove(&tmp, p, 8);
- return tmp;
-}
-
-static inline void __put_unaligned_memmove16(u16 val, void *p)
-{
- memmove(p, &val, 2);
-}
-
-static inline void __put_unaligned_memmove32(u32 val, void *p)
-{
- memmove(p, &val, 4);
-}
-
-static inline void __put_unaligned_memmove64(u64 val, void *p)
-{
- memmove(p, &val, 8);
-}
-
-#endif /* _LINUX_UNALIGNED_MEMMOVE_H */
diff --git a/include/linux/uuid.h b/include/linux/uuid.h
index b81992dd..a9990902 100644
--- a/include/linux/uuid.h
+++ b/include/linux/uuid.h
@@ -16,12 +16,26 @@
#ifndef _LINUX_UUID_H_
#define _LINUX_UUID_H_
-#include <uapi/linux/uuid.h>
#include <string.h>
+#include <asm/types.h>
+#include <stdbool.h>
-static inline int uuid_le_cmp(const uuid_le u1, const uuid_le u2)
+#define UUID_SIZE 16
+
+typedef struct {
+ __u8 b[UUID_SIZE];
+} __uuid_t;
+
+#define UUID_INIT(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \
+((__uuid_t) \
+{{ ((a) >> 24) & 0xff, ((a) >> 16) & 0xff, ((a) >> 8) & 0xff, (a) & 0xff, \
+ ((b) >> 8) & 0xff, (b) & 0xff, \
+ ((c) >> 8) & 0xff, (c) & 0xff, \
+ (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }})
+
+static inline bool uuid_equal(const __uuid_t *u1, const __uuid_t *u2)
{
- return memcmp(&u1, &u2, sizeof(uuid_le));
+ return memcmp(u1, u2, sizeof(__uuid_t)) == 0;
}
#endif
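
A small sketch of the replacement helpers; the constant here is an arbitrary example value, not a real bcachefs UUID:

    #include <linux/uuid.h>

    static bool is_demo_uuid(const __uuid_t *u)
    {
            const __uuid_t demo = UUID_INIT(0x12345678, 0x9abc, 0xdef0,
                                            0x01, 0x23, 0x45, 0x67,
                                            0x89, 0xab, 0xcd, 0xef);

            return uuid_equal(u, &demo);
    }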
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index c91e3a80..55fffb59 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -1,47 +1,6 @@
#ifndef __TOOLS_LINUX_VMALLOC_H
#define __TOOLS_LINUX_VMALLOC_H
-#include <stdlib.h>
-#include <sys/mman.h>
-
#include "linux/slab.h"
-#include "tools-util.h"
-
-#define PAGE_KERNEL 0
-#define PAGE_KERNEL_EXEC 1
-
-#define vfree(p) free(p)
-
-static inline void *__vmalloc(unsigned long size, gfp_t gfp_mask, unsigned prot)
-{
- void *p;
-
- run_shrinkers();
-
- p = aligned_alloc(PAGE_SIZE, size);
- if (!p)
- return NULL;
-
- if (prot == PAGE_KERNEL_EXEC &&
- mprotect(p, size, PROT_READ|PROT_WRITE|PROT_EXEC)) {
- vfree(p);
- return NULL;
- }
-
- if (gfp_mask & __GFP_ZERO)
- memset(p, 0, size);
-
- return p;
-}
-
-static inline void *vmalloc(unsigned long size)
-{
- return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
-}
-
-static inline void *vzalloc(unsigned long size)
-{
- return __vmalloc(size, GFP_KERNEL|__GFP_ZERO, PAGE_KERNEL);
-}
#endif /* __TOOLS_LINUX_VMALLOC_H */
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 62d15e5d..d0fd3dca 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -18,12 +18,15 @@ struct __wait_queue {
struct list_head task_list;
};
-typedef struct {
+struct wait_queue_head {
spinlock_t lock;
struct list_head task_list;
-} wait_queue_head_t;
+};
+
+typedef struct wait_queue_head wait_queue_head_t;
void wake_up(wait_queue_head_t *);
+void wake_up_all(wait_queue_head_t *);
void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state);
void finish_wait(wait_queue_head_t *q, wait_queue_t *wait);
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
@@ -41,7 +44,7 @@ int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *ke
.task_list = { &(name).task_list, &(name).task_list } }
#define DECLARE_WAIT_QUEUE_HEAD(name) \
- wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
+ struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
static inline void init_waitqueue_head(wait_queue_head_t *q)
{
@@ -90,7 +93,10 @@ do { \
__wait_event(wq, condition); \
} while (0)
+#define wait_event_freezable(wq, condition) ({wait_event(wq, condition); 0; })
#define wait_event_killable(wq, condition) ({wait_event(wq, condition); 0; })
+#define wait_event_interruptible(wq, condition) ({wait_event(wq, condition); 0; })
+#define wait_event_state(wq, condition, state) ({wait_event(wq, condition); 0; })
#define __wait_event_timeout(wq, condition, timeout) \
___wait_event(wq, ___wait_cond_timeout(condition), \
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 1406c958..5d2ca5f8 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -151,7 +151,7 @@ extern void workqueue_set_max_active(struct workqueue_struct *wq,
extern bool current_is_workqueue_rescuer(void);
extern bool workqueue_congested(int cpu, struct workqueue_struct *wq);
extern unsigned int work_busy(struct work_struct *work);
-extern __printf(1, 2) void set_worker_desc(const char *fmt, ...);
+static inline __printf(1, 2) void set_worker_desc(const char *fmt, ...) {}
extern void print_worker_info(const char *log_lvl, struct task_struct *task);
extern void show_workqueue_state(void);
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index d7fade73..1cfdb6cf 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -16,8 +16,17 @@
#include <linux/spinlock.h>
#include <uapi/linux/xattr.h>
+#ifndef XATTR_CREATE
+#define XATTR_CREATE 0x1
+#endif
+
+#ifndef XATTR_REPLACE
+#define XATTR_REPLACE 0x2
+#endif
+
struct inode;
struct dentry;
+struct user_namespace;
/*
* struct xattr_handler: When @name is set, match attributes with exactly that
@@ -32,11 +41,18 @@ struct xattr_handler {
int (*get)(const struct xattr_handler *, struct dentry *dentry,
struct inode *inode, const char *name, void *buffer,
size_t size);
- int (*set)(const struct xattr_handler *, struct dentry *dentry,
+ int (*set)(const struct xattr_handler *,
+ struct mnt_idmap *idmap, struct dentry *dentry,
struct inode *inode, const char *name, const void *buffer,
size_t size, int flags);
};
+static inline bool xattr_handler_can_list(const struct xattr_handler *handler,
+ struct dentry *dentry)
+{
+ return handler && (!handler->list || handler->list(dentry));
+}
+
const char *xattr_full_name(const struct xattr_handler *, const char *);
struct xattr {
diff --git a/include/linux/xxhash.h b/include/linux/xxhash.h
new file mode 100644
index 00000000..df425114
--- /dev/null
+++ b/include/linux/xxhash.h
@@ -0,0 +1,259 @@
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Copyright (C) 2012-2016, Yann Collet.
+ *
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ *
+ * You can contact the author at:
+ * - xxHash homepage: https://cyan4973.github.io/xxHash/
+ * - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+/*
+ * Notice extracted from xxHash homepage:
+ *
+ * xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
+ * It also successfully passes all tests from the SMHasher suite.
+ *
+ * Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2
+ * Duo @3GHz)
+ *
+ * Name Speed Q.Score Author
+ * xxHash 5.4 GB/s 10
+ * CrapWow 3.2 GB/s 2 Andrew
+ * MurmurHash 3a 2.7 GB/s 10 Austin Appleby
+ * SpookyHash 2.0 GB/s 10 Bob Jenkins
+ * SBox 1.4 GB/s 9 Bret Mulvey
+ * Lookup3 1.2 GB/s 9 Bob Jenkins
+ * SuperFastHash 1.2 GB/s 1 Paul Hsieh
+ * CityHash64 1.05 GB/s 10 Pike & Alakuijala
+ * FNV 0.55 GB/s 5 Fowler, Noll, Vo
+ * CRC32 0.43 GB/s 9
+ * MD5-32 0.33 GB/s 10 Ronald L. Rivest
+ * SHA1-32 0.28 GB/s 10
+ *
+ * Q.Score is a measure of quality of the hash function.
+ * It depends on successfully passing SMHasher test set.
+ * 10 is a perfect score.
+ *
+ * A 64-bit version, named xxh64, offers much better speed,
+ * but for 64-bit applications only.
+ * Name Speed on 64 bits Speed on 32 bits
+ * xxh64 13.8 GB/s 1.9 GB/s
+ * xxh32 6.8 GB/s 6.0 GB/s
+ */
+
+#ifndef XXHASH_H
+#define XXHASH_H
+
+#include <linux/types.h>
+
+/*-****************************
+ * Simple Hash Functions
+ *****************************/
+
+/**
+ * xxh32() - calculate the 32-bit hash of the input with a given seed.
+ *
+ * @input: The data to hash.
+ * @length: The length of the data to hash.
+ * @seed: The seed can be used to alter the result predictably.
+ *
+ * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+ *
+ * Return: The 32-bit hash of the data.
+ */
+uint32_t xxh32(const void *input, size_t length, uint32_t seed);
+
+/**
+ * xxh64() - calculate the 64-bit hash of the input with a given seed.
+ *
+ * @input: The data to hash.
+ * @length: The length of the data to hash.
+ * @seed: The seed can be used to alter the result predictably.
+ *
+ * This function runs 2x faster on 64-bit systems, but slower on 32-bit systems.
+ *
+ * Return: The 64-bit hash of the data.
+ */
+uint64_t xxh64(const void *input, size_t length, uint64_t seed);
+
+/**
+ * xxhash() - calculate wordsize hash of the input with a given seed
+ * @input: The data to hash.
+ * @length: The length of the data to hash.
+ * @seed: The seed can be used to alter the result predictably.
+ *
+ * If the hash does not need to be comparable between machines with
+ * different word sizes, this function will call whichever of xxh32()
+ * or xxh64() is faster.
+ *
+ * Return: wordsize hash of the data.
+ */
+
+static inline unsigned long xxhash(const void *input, size_t length,
+ uint64_t seed)
+{
+#if BITS_PER_LONG == 64
+ return xxh64(input, length, seed);
+#else
+ return xxh32(input, length, seed);
+#endif
+}
+
+/*-****************************
+ * Streaming Hash Functions
+ *****************************/
+
+/*
+ * These definitions are only meant to allow allocation of XXH state
+ * statically, on stack, or in a struct for example.
+ * Do not use members directly.
+ */
+
+/**
+ * struct xxh32_state - private xxh32 state, do not use members directly
+ */
+struct xxh32_state {
+ uint32_t total_len_32;
+ uint32_t large_len;
+ uint32_t v1;
+ uint32_t v2;
+ uint32_t v3;
+ uint32_t v4;
+ uint32_t mem32[4];
+ uint32_t memsize;
+};
+
+/**
+ * struct xxh64_state - private xxh64 state, do not use members directly
+ */
+struct xxh64_state {
+ uint64_t total_len;
+ uint64_t v1;
+ uint64_t v2;
+ uint64_t v3;
+ uint64_t v4;
+ uint64_t mem64[4];
+ uint32_t memsize;
+};
+
+/**
+ * xxh32_reset() - reset the xxh32 state to start a new hashing operation
+ *
+ * @state: The xxh32 state to reset.
+ * @seed: Initialize the hash state with this seed.
+ *
+ * Call this function on any xxh32_state to prepare for a new hashing operation.
+ */
+void xxh32_reset(struct xxh32_state *state, uint32_t seed);
+
+/**
+ * xxh32_update() - hash the data given and update the xxh32 state
+ *
+ * @state: The xxh32 state to update.
+ * @input: The data to hash.
+ * @length: The length of the data to hash.
+ *
+ * After calling xxh32_reset() call xxh32_update() as many times as necessary.
+ *
+ * Return: Zero on success, otherwise an error code.
+ */
+int xxh32_update(struct xxh32_state *state, const void *input, size_t length);
+
+/**
+ * xxh32_digest() - produce the current xxh32 hash
+ *
+ * @state: Produce the current xxh32 hash of this state.
+ *
+ * A hash value can be produced at any time. It is still possible to continue
+ * inserting input into the hash state after a call to xxh32_digest(), and
+ * generate new hashes later on, by calling xxh32_digest() again.
+ *
+ * Return: The xxh32 hash stored in the state.
+ */
+uint32_t xxh32_digest(const struct xxh32_state *state);
+
+/**
+ * xxh64_reset() - reset the xxh64 state to start a new hashing operation
+ *
+ * @state: The xxh64 state to reset.
+ * @seed: Initialize the hash state with this seed.
+ */
+void xxh64_reset(struct xxh64_state *state, uint64_t seed);
+
+/**
+ * xxh64_update() - hash the data given and update the xxh64 state
+ * @state: The xxh64 state to update.
+ * @input: The data to hash.
+ * @length: The length of the data to hash.
+ *
+ * After calling xxh64_reset() call xxh64_update() as many times as necessary.
+ *
+ * Return: Zero on success, otherwise an error code.
+ */
+int xxh64_update(struct xxh64_state *state, const void *input, size_t length);
+
+/**
+ * xxh64_digest() - produce the current xxh64 hash
+ *
+ * @state: Produce the current xxh64 hash of this state.
+ *
+ * A hash value can be produced at any time. It is still possible to continue
+ * inserting input into the hash state after a call to xxh64_digest(), and
+ * generate new hashes later on, by calling xxh64_digest() again.
+ *
+ * Return: The xxh64 hash stored in the state.
+ */
+uint64_t xxh64_digest(const struct xxh64_state *state);
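+
+/*
+ * Illustrative usage sketch (not part of the upstream header): hashing a
+ * buffer in fixed-size chunks with the streaming API. The chunk size and
+ * buffer names are assumptions for the example only.
+ *
+ *	struct xxh64_state state;
+ *	size_t i, n;
+ *
+ *	xxh64_reset(&state, 0);
+ *	for (i = 0; i < buf_len; i += n) {
+ *		n = min_t(size_t, buf_len - i, 4096);
+ *		if (xxh64_update(&state, buf + i, n))
+ *			return -EINVAL;
+ *	}
+ *	hash = xxh64_digest(&state);
+ */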
+
+/*-**************************
+ * Utils
+ ***************************/
+
+/**
+ * xxh32_copy_state() - copy the source state into the destination state
+ *
+ * @src: The source xxh32 state.
+ * @dst: The destination xxh32 state.
+ */
+void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src);
+
+/**
+ * xxh64_copy_state() - copy the source state into the destination state
+ *
+ * @src: The source xxh64 state.
+ * @dst: The destination xxh64 state.
+ */
+void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src);
+
+#endif /* XXHASH_H */
diff --git a/include/linux/zstd.h b/include/linux/zstd.h
index 0dd1b023..b0fa1eda 100644
--- a/include/linux/zstd.h
+++ b/include/linux/zstd.h
@@ -1,10 +1,447 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+/*
+ * Copyright (c) Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of https://github.com/facebook/zstd) and
+ * the GPLv2 (found in the COPYING file in the root directory of
+ * https://github.com/facebook/zstd). You may select, at your option, one of the
+ * above-listed licenses.
+ */
+
+#ifndef LINUX_ZSTD_H
+#define LINUX_ZSTD_H
+
+/**
+ * This is a kernel-style API that wraps the upstream zstd API, which cannot be
+ * used directly because the symbols aren't exported. It exposes the minimal
+ * functionality which is currently required by users of zstd in the kernel.
+ * Expose extra functions from lib/zstd/zstd.h as needed.
+ */
+
+/* ====== Dependency ====== */
+#include <linux/types.h>
#include <zstd.h>
+#include <linux/zstd_errors.h>
+
+/* ====== Helper Functions ====== */
+/**
+ * zstd_compress_bound() - maximum compressed size in worst case scenario
+ * @src_size: The size of the data to compress.
+ *
+ * Return: The maximum compressed size in the worst case scenario.
+ */
+size_t zstd_compress_bound(size_t src_size);
+
+/**
+ * zstd_is_error() - tells if a size_t function result is an error code
+ * @code: The function result to check for error.
+ *
+ * Return: Non-zero iff the code is an error.
+ */
+unsigned int zstd_is_error(size_t code);
+
+/**
+ * enum zstd_error_code - zstd error codes
+ */
+typedef ZSTD_ErrorCode zstd_error_code;
+
+/**
+ * zstd_get_error_code() - translates an error function result to an error code
+ * @code: The function result for which zstd_is_error(code) is true.
+ *
+ * Return: A unique error code for this error.
+ */
+zstd_error_code zstd_get_error_code(size_t code);
+
+/**
+ * zstd_get_error_name() - translates an error function result to a string
+ * @code: The function result for which zstd_is_error(code) is true.
+ *
+ * Return: An error string corresponding to the error code.
+ */
+const char *zstd_get_error_name(size_t code);
+
+/**
+ * zstd_min_clevel() - minimum allowed compression level
+ *
+ * Return: The minimum allowed compression level.
+ */
+int zstd_min_clevel(void);
+
+/**
+ * zstd_max_clevel() - maximum allowed compression level
+ *
+ * Return: The maximum allowed compression level.
+ */
+int zstd_max_clevel(void);
+
+/* ====== Parameter Selection ====== */
+
+/**
+ * enum zstd_strategy - zstd compression search strategy
+ *
+ * From faster to stronger. See zstd_lib.h.
+ */
+typedef ZSTD_strategy zstd_strategy;
+
+/**
+ * struct zstd_compression_parameters - zstd compression parameters
+ * @windowLog: Log of the largest match distance. Larger means more
+ * compression, and more memory needed during decompression.
+ * @chainLog: Fully searched segment. Larger means more compression,
+ * slower, and more memory (useless for fast).
+ * @hashLog: Dispatch table. Larger means more compression,
+ * slower, and more memory.
+ * @searchLog: Number of searches. Larger means more compression and slower.
+ * @searchLength: Match length searched. Larger means faster decompression,
+ * sometimes less compression.
+ * @targetLength: Acceptable match size for optimal parser (only). Larger means
+ * more compression, and slower.
+ * @strategy: The zstd compression strategy.
+ *
+ * See zstd_lib.h.
+ */
+typedef ZSTD_compressionParameters zstd_compression_parameters;
+
+/**
+ * struct zstd_frame_parameters - zstd frame parameters
+ * @contentSizeFlag: Controls whether content size will be present in the
+ * frame header (when known).
+ * @checksumFlag: Controls whether a 32-bit checksum is generated at the
+ * end of the frame for error detection.
+ * @noDictIDFlag: Controls whether dictID will be saved into the frame
+ * header when using dictionary compression.
+ *
+ * The default value is all fields set to 0. See zstd_lib.h.
+ */
+typedef ZSTD_frameParameters zstd_frame_parameters;
+
+/**
+ * struct zstd_parameters - zstd parameters
+ * @cParams: The compression parameters.
+ * @fParams: The frame parameters.
+ */
+typedef ZSTD_parameters zstd_parameters;
+
+/**
+ * zstd_get_params() - returns zstd_parameters for selected level
+ * @level: The compression level
+ * @estimated_src_size: The estimated source size to compress or 0
+ * if unknown.
+ *
+ * Return: The selected zstd_parameters.
+ */
+zstd_parameters zstd_get_params(int level,
+ unsigned long long estimated_src_size);
+
+/* ====== Single-pass Compression ====== */
+
+typedef ZSTD_CCtx zstd_cctx;
+
+/**
+ * zstd_cctx_workspace_bound() - max memory needed to initialize a zstd_cctx
+ * @parameters: The compression parameters to be used.
+ *
+ * If multiple compression parameters might be used, the caller must call
+ * zstd_cctx_workspace_bound() for each set of parameters and use the maximum
+ * size.
+ *
+ * Return: A lower bound on the size of the workspace that is passed to
+ * zstd_init_cctx().
+ */
+size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *parameters);
+
+/**
+ * zstd_init_cctx() - initialize a zstd compression context
+ * @workspace: The workspace to emplace the context into. It must outlive
+ * the returned context.
+ * @workspace_size: The size of workspace. Use zstd_cctx_workspace_bound() to
+ * determine how large the workspace must be.
+ *
+ * Return: A zstd compression context or NULL on error.
+ */
+zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size);
+
+/**
+ * zstd_compress_cctx() - compress src into dst with the initialized parameters
+ * @cctx: The context. Must have been initialized with zstd_init_cctx().
+ * @dst: The buffer to compress src into.
+ * @dst_capacity: The size of the destination buffer. May be any size, but
+ * zstd_compress_bound(src_size) is guaranteed to be large enough.
+ * @src: The data to compress.
+ * @src_size: The size of the data to compress.
+ * @parameters: The compression parameters to be used.
+ *
+ * Return: The compressed size or an error, which can be checked using
+ * zstd_is_error().
+ */
+size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity,
+ const void *src, size_t src_size, const zstd_parameters *parameters);
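+
+/*
+ * Illustrative usage sketch (not part of the upstream header): one-shot
+ * compression with a caller-provided workspace. The compression level,
+ * buffer names and vmalloc()-based allocation are assumptions for the
+ * example only.
+ *
+ *	zstd_parameters params = zstd_get_params(3, src_size);
+ *	size_t ws_size = zstd_cctx_workspace_bound(&params.cParams);
+ *	void *ws = vmalloc(ws_size);
+ *	zstd_cctx *cctx = zstd_init_cctx(ws, ws_size);
+ *	size_t ret = zstd_compress_cctx(cctx, dst, dst_capacity,
+ *					src, src_size, &params);
+ *	if (zstd_is_error(ret))
+ *		pr_err("zstd: %s\n", zstd_get_error_name(ret));
+ */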
+
+/* ====== Single-pass Decompression ====== */
+
+typedef ZSTD_DCtx zstd_dctx;
+
+/**
+ * zstd_dctx_workspace_bound() - max memory needed to initialize a zstd_dctx
+ *
+ * Return: A lower bound on the size of the workspace that is passed to
+ * zstd_init_dctx().
+ */
+size_t zstd_dctx_workspace_bound(void);
+
+/**
+ * zstd_init_dctx() - initialize a zstd decompression context
+ * @workspace: The workspace to emplace the context into. It must outlive
+ * the returned context.
+ * @workspace_size: The size of workspace. Use zstd_dctx_workspace_bound() to
+ * determine how large the workspace must be.
+ *
+ * Return: A zstd decompression context or NULL on error.
+ */
+zstd_dctx *zstd_init_dctx(void *workspace, size_t workspace_size);
+
+/**
+ * zstd_decompress_dctx() - decompress zstd compressed src into dst
+ * @dctx: The decompression context.
+ * @dst: The buffer to decompress src into.
+ * @dst_capacity: The size of the destination buffer. Must be at least as large
+ * as the decompressed size. If the caller cannot upper bound the
+ * decompressed size, then it's better to use the streaming API.
+ * @src: The zstd compressed data to decompress. Multiple concatenated
+ * frames and skippable frames are allowed.
+ * @src_size: The exact size of the data to decompress.
+ *
+ * Return: The decompressed size or an error, which can be checked using
+ * zstd_is_error().
+ */
+size_t zstd_decompress_dctx(zstd_dctx *dctx, void *dst, size_t dst_capacity,
+ const void *src, size_t src_size);
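+
+/*
+ * Illustrative usage sketch (not part of the upstream header): one-shot
+ * decompression when an upper bound on the decompressed size is known.
+ * The buffer and workspace names are assumptions for the example only.
+ *
+ *	size_t ws_size = zstd_dctx_workspace_bound();
+ *	zstd_dctx *dctx = zstd_init_dctx(ws, ws_size);
+ *	size_t ret = zstd_decompress_dctx(dctx, dst, dst_capacity,
+ *					  src, src_size);
+ *	if (zstd_is_error(ret))
+ *		pr_err("zstd: %s\n", zstd_get_error_name(ret));
+ */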
+
+/* ====== Streaming Buffers ====== */
+
+/**
+ * struct zstd_in_buffer - input buffer for streaming
+ * @src: Start of the input buffer.
+ * @size: Size of the input buffer.
+ * @pos: Position where reading stopped. Will be updated.
+ * Necessarily 0 <= pos <= size.
+ *
+ * See zstd_lib.h.
+ */
+typedef ZSTD_inBuffer zstd_in_buffer;
+
+/**
+ * struct zstd_out_buffer - output buffer for streaming
+ * @dst: Start of the output buffer.
+ * @size: Size of the output buffer.
+ * @pos: Position where writing stopped. Will be updated.
+ * Necessarily 0 <= pos <= size.
+ *
+ * See zstd_lib.h.
+ */
+typedef ZSTD_outBuffer zstd_out_buffer;
+
+/* ====== Streaming Compression ====== */
+
+typedef ZSTD_CStream zstd_cstream;
+
+/**
+ * zstd_cstream_workspace_bound() - memory needed to initialize a zstd_cstream
+ * @cparams: The compression parameters to be used for compression.
+ *
+ * Return: A lower bound on the size of the workspace that is passed to
+ * zstd_init_cstream().
+ */
+size_t zstd_cstream_workspace_bound(const zstd_compression_parameters *cparams);
+
+/**
+ * zstd_init_cstream() - initialize a zstd streaming compression context
+ * @parameters: The zstd parameters to use for compression.
+ * @pledged_src_size: If params.fParams.contentSizeFlag == 1 then the caller
+ * must pass the source size (zero means empty source).
+ * Otherwise, the caller may optionally pass the source
+ * size, or zero if unknown.
+ * @workspace: The workspace to emplace the context into. It must outlive
+ * the returned context.
+ * @workspace_size: The size of workspace.
+ * Use zstd_cstream_workspace_bound(params->cparams) to
+ * determine how large the workspace must be.
+ *
+ * Return: The zstd streaming compression context or NULL on error.
+ */
+zstd_cstream *zstd_init_cstream(const zstd_parameters *parameters,
+ unsigned long long pledged_src_size, void *workspace, size_t workspace_size);
+
+/**
+ * zstd_reset_cstream() - reset the context using parameters from creation
+ * @cstream: The zstd streaming compression context to reset.
+ * @pledged_src_size: Optionally the source size, or zero if unknown.
+ *
+ * Resets the context using the parameters from creation. Skips dictionary
+ * loading, since it can be reused. If `pledged_src_size` is non-zero the frame
+ * content size is always written into the frame header.
+ *
+ * Return: Zero or an error, which can be checked using
+ * zstd_is_error().
+ */
+size_t zstd_reset_cstream(zstd_cstream *cstream,
+ unsigned long long pledged_src_size);
+
+/**
+ * zstd_compress_stream() - streaming compress some of input into output
+ * @cstream: The zstd streaming compression context.
+ * @output: Destination buffer. `output->pos` is updated to indicate how much
+ * compressed data was written.
+ * @input: Source buffer. `input->pos` is updated to indicate how much data
+ * was read. Note that it may not consume the entire input, in which
+ * case `input->pos < input->size`, and it's up to the caller to
+ * present remaining data again.
+ *
+ * The `input` and `output` buffers may be any size. Guaranteed to make some
+ * forward progress if `input` and `output` are not empty.
+ *
+ * Return: A hint for the number of bytes to use as the input for the next
+ * function call or an error, which can be checked using
+ * zstd_is_error().
+ */
+size_t zstd_compress_stream(zstd_cstream *cstream, zstd_out_buffer *output,
+ zstd_in_buffer *input);
+
+/**
+ * zstd_flush_stream() - flush internal buffers into output
+ * @cstream: The zstd streaming compression context.
+ * @output: Destination buffer. `output->pos` is updated to indicate how much
+ * compressed data was written.
+ *
+ * zstd_flush_stream() must be called until it returns 0, meaning all the data
+ * has been flushed. Since zstd_flush_stream() causes a block to be ended,
+ * calling it too often will degrade the compression ratio.
+ *
+ * Return: The number of bytes still present within internal buffers or an
+ * error, which can be checked using zstd_is_error().
+ */
+size_t zstd_flush_stream(zstd_cstream *cstream, zstd_out_buffer *output);
+
+/**
+ * zstd_end_stream() - flush internal buffers into output and end the frame
+ * @cstream: The zstd streaming compression context.
+ * @output: Destination buffer. `output->pos` is updated to indicate how much
+ * compressed data was written.
+ *
+ * zstd_end_stream() must be called until it returns 0, meaning all the data has
+ * been flushed and the frame epilogue has been written.
+ *
+ * Return: The number of bytes still present within internal buffers or an
+ * error, which can be checked using zstd_is_error().
+ */
+size_t zstd_end_stream(zstd_cstream *cstream, zstd_out_buffer *output);
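+
+/*
+ * Illustrative usage sketch (not part of the upstream header): a streaming
+ * compression loop over an in-memory buffer. The buffer names are assumptions
+ * for the example only; `cstream` is assumed to come from zstd_init_cstream().
+ *
+ *	zstd_in_buffer in = { .src = src, .size = src_size, .pos = 0 };
+ *	zstd_out_buffer out = { .dst = dst, .size = dst_capacity, .pos = 0 };
+ *	size_t ret;
+ *
+ *	while (in.pos < in.size) {
+ *		ret = zstd_compress_stream(cstream, &out, &in);
+ *		if (zstd_is_error(ret))
+ *			return -EIO;
+ *	}
+ *	do {
+ *		ret = zstd_end_stream(cstream, &out);
+ *		if (zstd_is_error(ret))
+ *			return -EIO;
+ *	} while (ret != 0);
+ */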
+
+/* ====== Streaming Decompression ====== */
+
+typedef ZSTD_DStream zstd_dstream;
+
+/**
+ * zstd_dstream_workspace_bound() - memory needed to initialize a zstd_dstream
+ * @max_window_size: The maximum window size allowed for compressed frames.
+ *
+ * Return: A lower bound on the size of the workspace that is passed
+ * to zstd_init_dstream().
+ */
+size_t zstd_dstream_workspace_bound(size_t max_window_size);
+
+/**
+ * zstd_init_dstream() - initialize a zstd streaming decompression context
+ * @max_window_size: The maximum window size allowed for compressed frames.
+ * @workspace: The workspace to emplace the context into. It must outlive
+ * the returned context.
+ * @workspace_size: The size of workspace.
+ * Use zstd_dstream_workspace_bound(max_window_size) to
+ * determine how large the workspace must be.
+ *
+ * Return: The zstd streaming decompression context.
+ */
+zstd_dstream *zstd_init_dstream(size_t max_window_size, void *workspace,
+ size_t workspace_size);
+
+/**
+ * zstd_reset_dstream() - reset the context using parameters from creation
+ * @dstream: The zstd streaming decompression context to reset.
+ *
+ * Resets the context using the parameters from creation. Skips dictionary
+ * loading, since it can be reused.
+ *
+ * Return: Zero or an error, which can be checked using zstd_is_error().
+ */
+size_t zstd_reset_dstream(zstd_dstream *dstream);
+
+/**
+ * zstd_decompress_stream() - streaming decompress some of input into output
+ * @dstream: The zstd streaming decompression context.
+ * @output: Destination buffer. `output.pos` is updated to indicate how much
+ * decompressed data was written.
+ * @input: Source buffer. `input.pos` is updated to indicate how much data was
+ * read. Note that it may not consume the entire input, in which case
+ * `input.pos < input.size`, and it's up to the caller to present
+ * remaining data again.
+ *
+ * The `input` and `output` buffers may be any size. Guaranteed to make some
+ * forward progress if `input` and `output` are not empty.
+ * zstd_decompress_stream() will not consume the last byte of the frame until
+ * the entire frame is flushed.
+ *
+ * Return: Returns 0 iff a frame is completely decoded and fully flushed.
+ * Otherwise returns a hint for the number of bytes to use as the
+ * input for the next function call or an error, which can be checked
+ * using zstd_is_error(). The size hint will never load more than the
+ * frame.
+ */
+size_t zstd_decompress_stream(zstd_dstream *dstream, zstd_out_buffer *output,
+ zstd_in_buffer *input);
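+
+/*
+ * Illustrative usage sketch (not part of the upstream header): a streaming
+ * decompression loop over a single frame. The buffer names are assumptions
+ * for the example only; `dstream` is assumed to come from zstd_init_dstream().
+ *
+ *	zstd_in_buffer in = { .src = src, .size = src_size, .pos = 0 };
+ *	zstd_out_buffer out = { .dst = dst, .size = dst_capacity, .pos = 0 };
+ *	size_t ret;
+ *
+ *	do {
+ *		ret = zstd_decompress_stream(dstream, &out, &in);
+ *		if (zstd_is_error(ret))
+ *			return -EIO;
+ *	} while (ret != 0);
+ */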
+
+/* ====== Frame Inspection Functions ====== */
+
+/**
+ * zstd_find_frame_compressed_size() - returns the size of a compressed frame
+ * @src: Source buffer. It should point to the start of a zstd encoded
+ * frame or a skippable frame.
+ * @src_size: The size of the source buffer. It must be at least as large as the
+ * size of the frame.
+ *
+ * Return: The compressed size of the frame pointed to by `src` or an error,
+ * which can be checked using zstd_is_error().
+ * Suitable to pass to ZSTD_decompress() or similar functions.
+ */
+size_t zstd_find_frame_compressed_size(const void *src, size_t src_size);
-#define ZSTD_initDCtx(w, s) ZSTD_initStaticDCtx(w, s)
-#define ZSTD_initCCtx(w, s) ZSTD_initStaticCCtx(w, s)
+/**
+ * struct zstd_frame_params - zstd frame parameters stored in the frame header
+ * @frameContentSize: The frame content size, or ZSTD_CONTENTSIZE_UNKNOWN if not
+ * present.
+ * @windowSize: The window size, or 0 if the frame is a skippable frame.
+ * @blockSizeMax: The maximum block size.
+ * @frameType: The frame type (zstd or skippable)
+ * @headerSize: The size of the frame header.
+ * @dictID: The dictionary id, or 0 if not present.
+ * @checksumFlag: Whether a checksum was used.
+ *
+ * See zstd_lib.h.
+ */
+typedef ZSTD_frameHeader zstd_frame_header;
-#define ZSTD_compressCCtx(w, dst, d_len, src, src_len, params) \
- ZSTD_compressCCtx(w, dst, d_len, src, src_len, 0)
+/**
+ * zstd_get_frame_header() - extracts parameters from a zstd or skippable frame
+ * @params: On success the frame parameters are written here.
+ * @src: The source buffer. It must point to a zstd or skippable frame.
+ * @src_size: The size of the source buffer.
+ *
+ * Return: 0 on success. If more data is required it returns how many bytes
+ * must be provided to make forward progress. Otherwise it returns
+ * an error, which can be checked using zstd_is_error().
+ */
+size_t zstd_get_frame_header(zstd_frame_header *params, const void *src,
+ size_t src_size);
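+
+/*
+ * Illustrative usage sketch (not part of the upstream header): sizing the
+ * destination buffer from the frame header before decompressing. The names
+ * are assumptions for the example only. A nonzero return from
+ * zstd_get_frame_header() means an error or that more input is needed.
+ *
+ *	zstd_frame_header fh;
+ *
+ *	if (zstd_get_frame_header(&fh, src, src_size) != 0)
+ *		return -EINVAL;
+ *	if (fh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN)
+ *		dst_capacity = fh.frameContentSize;
+ */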
-#define ZSTD_CCtxWorkspaceBound(p) ZSTD_estimateCCtxSize(0)
-#define ZSTD_DCtxWorkspaceBound() ZSTD_estimateDCtxSize()
+#endif /* LINUX_ZSTD_H */
diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h
new file mode 100644
index 00000000..58b6dd45
--- /dev/null
+++ b/include/linux/zstd_errors.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_ERRORS_H_398273423
+#define ZSTD_ERRORS_H_398273423
+
+
+/*===== dependency =====*/
+#include <linux/types.h> /* size_t */
+
+
+/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */
+#define ZSTDERRORLIB_VISIBILITY
+#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY
+
+/*-*********************************************
+ * Error codes list
+ *-*********************************************
+ * Error codes _values_ are pinned down since v1.3.1 only.
+ * Therefore, don't rely on values if you may link to any version < v1.3.1.
+ *
+ * Only values < 100 are considered stable.
+ *
+ * note 1 : this API shall be used with static linking only.
+ * dynamic linking is not yet officially supported.
+ * note 2 : Prefer relying on the enum than on its value whenever possible
+ * This is the only supported way to use the error list < v1.3.1
+ * note 3 : ZSTD_isError() is always correct, whatever the library version.
+ **********************************************/
+typedef enum {
+ ZSTD_error_no_error = 0,
+ ZSTD_error_GENERIC = 1,
+ ZSTD_error_prefix_unknown = 10,
+ ZSTD_error_version_unsupported = 12,
+ ZSTD_error_frameParameter_unsupported = 14,
+ ZSTD_error_frameParameter_windowTooLarge = 16,
+ ZSTD_error_corruption_detected = 20,
+ ZSTD_error_checksum_wrong = 22,
+ ZSTD_error_dictionary_corrupted = 30,
+ ZSTD_error_dictionary_wrong = 32,
+ ZSTD_error_dictionaryCreation_failed = 34,
+ ZSTD_error_parameter_unsupported = 40,
+ ZSTD_error_parameter_outOfBound = 42,
+ ZSTD_error_tableLog_tooLarge = 44,
+ ZSTD_error_maxSymbolValue_tooLarge = 46,
+ ZSTD_error_maxSymbolValue_tooSmall = 48,
+ ZSTD_error_stage_wrong = 60,
+ ZSTD_error_init_missing = 62,
+ ZSTD_error_memory_allocation = 64,
+ ZSTD_error_workSpace_tooSmall = 66,
+ ZSTD_error_dstSize_tooSmall = 70,
+ ZSTD_error_srcSize_wrong = 72,
+ ZSTD_error_dstBuffer_null = 74,
+ /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */
+ ZSTD_error_frameIndex_tooLarge = 100,
+ ZSTD_error_seekableIO = 102,
+ ZSTD_error_dstBuffer_wrong = 104,
+ ZSTD_error_srcBuffer_wrong = 105,
+ ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
+} ZSTD_ErrorCode;
+
+/*! ZSTD_getErrorCode() :
+ convert a `size_t` function result into a `ZSTD_ErrorCode` enum type,
+ which can be used to compare with enum list published above */
+ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult);
+ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /*< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */
+
+
+
+#endif /* ZSTD_ERRORS_H_398273423 */
diff --git a/include/linux/zutil.h b/include/linux/zutil.h
deleted file mode 100644
index 8caa7d3c..00000000
--- a/include/linux/zutil.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/* zutil.h -- internal interface and configuration of the compression library
- * Copyright (C) 1995-1998 Jean-loup Gailly.
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-/* WARNING: this file should *not* be used by applications. It is
- part of the implementation of the compression library and is
- subject to change. Applications should only use zlib.h.
- */
-
-/* @(#) $Id: zutil.h,v 1.1 2000/01/01 03:32:23 davem Exp $ */
-
-#ifndef _Z_UTIL_H
-#define _Z_UTIL_H
-
-#include <stdlib.h>
-#include <string.h>
-#include <linux/zlib.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-
-typedef unsigned char uch;
-typedef unsigned short ush;
-typedef unsigned long ulg;
-
- /* common constants */
-
-#define STORED_BLOCK 0
-#define STATIC_TREES 1
-#define DYN_TREES 2
-/* The three kinds of block type */
-
-#define MIN_MATCH 3
-#define MAX_MATCH 258
-/* The minimum and maximum match lengths */
-
-#define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */
-
- /* target dependencies */
-
- /* Common defaults */
-
-#ifndef OS_CODE
-# define OS_CODE 0x03 /* assume Unix */
-#endif
-
- /* functions */
-
-typedef uLong (*check_func) (uLong check, const Byte *buf,
- uInt len);
-
-
- /* checksum functions */
-
-#define BASE 65521L /* largest prime smaller than 65536 */
-#define NMAX 5552
-/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
-
-#define DO1(buf,i) {s1 += buf[i]; s2 += s1;}
-#define DO2(buf,i) DO1(buf,i); DO1(buf,i+1);
-#define DO4(buf,i) DO2(buf,i); DO2(buf,i+2);
-#define DO8(buf,i) DO4(buf,i); DO4(buf,i+4);
-#define DO16(buf) DO8(buf,0); DO8(buf,8);
-
-/* ========================================================================= */
-/*
- Update a running Adler-32 checksum with the bytes buf[0..len-1] and
- return the updated checksum. If buf is NULL, this function returns
- the required initial value for the checksum.
- An Adler-32 checksum is almost as reliable as a CRC32 but can be computed
- much faster. Usage example:
-
- uLong adler = zlib_adler32(0L, NULL, 0);
-
- while (read_buffer(buffer, length) != EOF) {
- adler = zlib_adler32(adler, buffer, length);
- }
- if (adler != original_adler) error();
-*/
-static inline uLong zlib_adler32(uLong adler,
- const Byte *buf,
- uInt len)
-{
- unsigned long s1 = adler & 0xffff;
- unsigned long s2 = (adler >> 16) & 0xffff;
- int k;
-
- if (buf == NULL) return 1L;
-
- while (len > 0) {
- k = len < NMAX ? len : NMAX;
- len -= k;
- while (k >= 16) {
- DO16(buf);
- buf += 16;
- k -= 16;
- }
- if (k != 0) do {
- s1 += *buf++;
- s2 += s1;
- } while (--k);
- s1 %= BASE;
- s2 %= BASE;
- }
- return (s2 << 16) | s1;
-}
-
-#endif /* _Z_UTIL_H */
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
deleted file mode 100644
index 13264b82..00000000
--- a/include/trace/events/bcachefs.h
+++ /dev/null
@@ -1,529 +0,0 @@
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM bcachefs
-
-#if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_BCACHE_H
-
-#include <linux/tracepoint.h>
-
-DECLARE_EVENT_CLASS(bpos,
- TP_PROTO(struct bpos p),
- TP_ARGS(p),
-
- TP_STRUCT__entry(
- __field(u64, inode )
- __field(u64, offset )
- ),
-
- TP_fast_assign(
- __entry->inode = p.inode;
- __entry->offset = p.offset;
- ),
-
- TP_printk("%llu:%llu", __entry->inode, __entry->offset)
-);
-
-DECLARE_EVENT_CLASS(bkey,
- TP_PROTO(const struct bkey *k),
- TP_ARGS(k),
-
- TP_STRUCT__entry(
- __field(u64, inode )
- __field(u64, offset )
- __field(u32, size )
- ),
-
- TP_fast_assign(
- __entry->inode = k->p.inode;
- __entry->offset = k->p.offset;
- __entry->size = k->size;
- ),
-
- TP_printk("%llu:%llu len %u", __entry->inode,
- __entry->offset, __entry->size)
-);
-
-DECLARE_EVENT_CLASS(bch_dev,
- TP_PROTO(struct bch_dev *ca),
- TP_ARGS(ca),
-
- TP_STRUCT__entry(
- __array(char, uuid, 16 )
- ),
-
- TP_fast_assign(
- memcpy(__entry->uuid, ca->uuid.b, 16);
- ),
-
- TP_printk("%pU", __entry->uuid)
-);
-
-DECLARE_EVENT_CLASS(bch_fs,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c),
-
- TP_STRUCT__entry(
- __array(char, uuid, 16 )
- ),
-
- TP_fast_assign(
- memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
- ),
-
- TP_printk("%pU", __entry->uuid)
-);
-
-DECLARE_EVENT_CLASS(bio,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(sector_t, sector )
- __field(unsigned int, nr_sector )
- __array(char, rwbs, 6 )
- ),
-
- TP_fast_assign(
- __entry->dev = bio->bi_disk ? bio_dev(bio) : 0;
- __entry->sector = bio->bi_iter.bi_sector;
- __entry->nr_sector = bio->bi_iter.bi_size >> 9;
- blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
- ),
-
- TP_printk("%d,%d %s %llu + %u",
- MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
- (unsigned long long)__entry->sector, __entry->nr_sector)
-);
-
-/* io.c: */
-
-DEFINE_EVENT(bio, read_split,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bio, read_bounce,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bio, read_retry,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bio, promote,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-/* Journal */
-
-DEFINE_EVENT(bch_fs, journal_full,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, journal_entry_full,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bio, journal_write,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-/* bset.c: */
-
-DEFINE_EVENT(bpos, bkey_pack_pos_fail,
- TP_PROTO(struct bpos p),
- TP_ARGS(p)
-);
-
-/* Btree */
-
-DECLARE_EVENT_CLASS(btree_node,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b),
-
- TP_STRUCT__entry(
- __array(char, uuid, 16 )
- __field(u8, level )
- __field(u8, id )
- __field(u64, inode )
- __field(u64, offset )
- ),
-
- TP_fast_assign(
- memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
- __entry->level = b->level;
- __entry->id = b->btree_id;
- __entry->inode = b->key.k.p.inode;
- __entry->offset = b->key.k.p.offset;
- ),
-
- TP_printk("%pU %u id %u %llu:%llu",
- __entry->uuid, __entry->level, __entry->id,
- __entry->inode, __entry->offset)
-);
-
-DEFINE_EVENT(btree_node, btree_read,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
-);
-
-TRACE_EVENT(btree_write,
- TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors),
- TP_ARGS(b, bytes, sectors),
-
- TP_STRUCT__entry(
- __field(enum bkey_type, type)
- __field(unsigned, bytes )
- __field(unsigned, sectors )
- ),
-
- TP_fast_assign(
- __entry->type = btree_node_type(b);
- __entry->bytes = bytes;
- __entry->sectors = sectors;
- ),
-
- TP_printk("bkey type %u bytes %u sectors %u",
- __entry->type , __entry->bytes, __entry->sectors)
-);
-
-DEFINE_EVENT(btree_node, btree_node_alloc,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_free,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_reap,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
-);
-
-DECLARE_EVENT_CLASS(btree_node_cannibalize_lock,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c),
-
- TP_STRUCT__entry(
- __array(char, uuid, 16 )
- ),
-
- TP_fast_assign(
- memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
- ),
-
- TP_printk("%pU", __entry->uuid)
-);
-
-DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock_fail,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-TRACE_EVENT(btree_reserve_get_fail,
- TP_PROTO(struct bch_fs *c, size_t required, struct closure *cl),
- TP_ARGS(c, required, cl),
-
- TP_STRUCT__entry(
- __array(char, uuid, 16 )
- __field(size_t, required )
- __field(struct closure *, cl )
- ),
-
- TP_fast_assign(
- memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
- __entry->required = required;
- __entry->cl = cl;
- ),
-
- TP_printk("%pU required %zu by %p", __entry->uuid,
- __entry->required, __entry->cl)
-);
-
-TRACE_EVENT(btree_insert_key,
- TP_PROTO(struct bch_fs *c, struct btree *b, struct bkey_i *k),
- TP_ARGS(c, b, k),
-
- TP_STRUCT__entry(
- __field(u8, id )
- __field(u64, inode )
- __field(u64, offset )
- __field(u32, size )
- ),
-
- TP_fast_assign(
- __entry->id = b->btree_id;
- __entry->inode = k->k.p.inode;
- __entry->offset = k->k.p.offset;
- __entry->size = k->k.size;
- ),
-
- TP_printk("btree %u: %llu:%llu len %u", __entry->id,
- __entry->inode, __entry->offset, __entry->size)
-);
-
-DEFINE_EVENT(btree_node, btree_split,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(btree_node, btree_compact,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(btree_node, btree_merge,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(btree_node, btree_set_root,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
-);
-
-/* Garbage collection */
-
-DEFINE_EVENT(btree_node, btree_gc_coalesce,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
-);
-
-TRACE_EVENT(btree_gc_coalesce_fail,
- TP_PROTO(struct bch_fs *c, int reason),
- TP_ARGS(c, reason),
-
- TP_STRUCT__entry(
- __field(u8, reason )
- __array(char, uuid, 16 )
- ),
-
- TP_fast_assign(
- __entry->reason = reason;
- memcpy(__entry->uuid, c->disk_sb.sb->user_uuid.b, 16);
- ),
-
- TP_printk("%pU: %u", __entry->uuid, __entry->reason)
-);
-
-DEFINE_EVENT(btree_node, btree_gc_rewrite_node,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(btree_node, btree_gc_rewrite_node_fail,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(bch_fs, gc_start,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, gc_end,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, gc_coalesce_start,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, gc_coalesce_end,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_dev, sectors_saturated,
- TP_PROTO(struct bch_dev *ca),
- TP_ARGS(ca)
-);
-
-DEFINE_EVENT(bch_fs, gc_sectors_saturated,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, gc_cannot_inc_gens,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-/* Allocator */
-
-TRACE_EVENT(alloc_batch,
- TP_PROTO(struct bch_dev *ca, size_t free, size_t total),
- TP_ARGS(ca, free, total),
-
- TP_STRUCT__entry(
- __array(char, uuid, 16 )
- __field(size_t, free )
- __field(size_t, total )
- ),
-
- TP_fast_assign(
- memcpy(__entry->uuid, ca->uuid.b, 16);
- __entry->free = free;
- __entry->total = total;
- ),
-
- TP_printk("%pU free %zu total %zu",
- __entry->uuid, __entry->free, __entry->total)
-);
-
-TRACE_EVENT(invalidate,
- TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors),
- TP_ARGS(ca, offset, sectors),
-
- TP_STRUCT__entry(
- __field(unsigned, sectors )
- __field(dev_t, dev )
- __field(__u64, offset )
- ),
-
- TP_fast_assign(
- __entry->dev = ca->disk_sb.bdev->bd_dev;
- __entry->offset = offset,
- __entry->sectors = sectors;
- ),
-
- TP_printk("invalidated %u sectors at %d,%d sector=%llu",
- __entry->sectors, MAJOR(__entry->dev),
- MINOR(__entry->dev), __entry->offset)
-);
-
-DEFINE_EVENT(bch_fs, rescale_prios,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DECLARE_EVENT_CLASS(bucket_alloc,
- TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
- TP_ARGS(ca, reserve),
-
- TP_STRUCT__entry(
- __array(char, uuid, 16)
- __field(enum alloc_reserve, reserve )
- ),
-
- TP_fast_assign(
- memcpy(__entry->uuid, ca->uuid.b, 16);
- __entry->reserve = reserve;
- ),
-
- TP_printk("%pU reserve %d", __entry->uuid, __entry->reserve)
-);
-
-DEFINE_EVENT(bucket_alloc, bucket_alloc,
- TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
- TP_ARGS(ca, reserve)
-);
-
-DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
- TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
- TP_ARGS(ca, reserve)
-);
-
-DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail,
- TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
- TP_ARGS(ca, reserve)
-);
-
-/* Moving IO */
-
-DEFINE_EVENT(bkey, move_extent,
- TP_PROTO(const struct bkey *k),
- TP_ARGS(k)
-);
-
-DEFINE_EVENT(bkey, move_alloc_fail,
- TP_PROTO(const struct bkey *k),
- TP_ARGS(k)
-);
-
-DEFINE_EVENT(bkey, move_race,
- TP_PROTO(const struct bkey *k),
- TP_ARGS(k)
-);
-
-TRACE_EVENT(move_data,
- TP_PROTO(struct bch_fs *c, u64 sectors_moved,
- u64 keys_moved),
- TP_ARGS(c, sectors_moved, keys_moved),
-
- TP_STRUCT__entry(
- __array(char, uuid, 16 )
- __field(u64, sectors_moved )
- __field(u64, keys_moved )
- ),
-
- TP_fast_assign(
- memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
- __entry->sectors_moved = sectors_moved;
- __entry->keys_moved = keys_moved;
- ),
-
- TP_printk("%pU sectors_moved %llu keys_moved %llu",
- __entry->uuid, __entry->sectors_moved, __entry->keys_moved)
-);
-
-TRACE_EVENT(copygc,
- TP_PROTO(struct bch_dev *ca,
- u64 sectors_moved, u64 sectors_not_moved,
- u64 buckets_moved, u64 buckets_not_moved),
- TP_ARGS(ca,
- sectors_moved, sectors_not_moved,
- buckets_moved, buckets_not_moved),
-
- TP_STRUCT__entry(
- __array(char, uuid, 16 )
- __field(u64, sectors_moved )
- __field(u64, sectors_not_moved )
- __field(u64, buckets_moved )
- __field(u64, buckets_not_moved )
- ),
-
- TP_fast_assign(
- memcpy(__entry->uuid, ca->uuid.b, 16);
- __entry->sectors_moved = sectors_moved;
- __entry->sectors_not_moved = sectors_not_moved;
- __entry->buckets_moved = buckets_moved;
- __entry->buckets_not_moved = buckets_moved;
- ),
-
- TP_printk("%pU sectors moved %llu remain %llu buckets moved %llu remain %llu",
- __entry->uuid,
- __entry->sectors_moved, __entry->sectors_not_moved,
- __entry->buckets_moved, __entry->buckets_not_moved)
-);
-
-#endif /* _TRACE_BCACHE_H */
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h
new file mode 100644
index 00000000..9ebd081e
--- /dev/null
+++ b/include/trace/events/lock.h
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM lock
+
+#if !defined(_TRACE_LOCK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_LOCK_H
+
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+/* flags for lock:contention_begin */
+#define LCB_F_SPIN (1U << 0)
+#define LCB_F_READ (1U << 1)
+#define LCB_F_WRITE (1U << 2)
+#define LCB_F_RT (1U << 3)
+#define LCB_F_PERCPU (1U << 4)
+#define LCB_F_MUTEX (1U << 5)
+
+
+#ifdef CONFIG_LOCKDEP
+
+#include <linux/lockdep.h>
+
+TRACE_EVENT(lock_acquire,
+
+ TP_PROTO(struct lockdep_map *lock, unsigned int subclass,
+ int trylock, int read, int check,
+ struct lockdep_map *next_lock, unsigned long ip),
+
+ TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, flags)
+ __string(name, lock->name)
+ __field(void *, lockdep_addr)
+ ),
+
+ TP_fast_assign(
+ __entry->flags = (trylock ? 1 : 0) | (read ? 2 : 0);
+ __assign_str(name, lock->name);
+ __entry->lockdep_addr = lock;
+ ),
+
+ TP_printk("%p %s%s%s", __entry->lockdep_addr,
+ (__entry->flags & 1) ? "try " : "",
+ (__entry->flags & 2) ? "read " : "",
+ __get_str(name))
+);
+
+DECLARE_EVENT_CLASS(lock,
+
+ TP_PROTO(struct lockdep_map *lock, unsigned long ip),
+
+ TP_ARGS(lock, ip),
+
+ TP_STRUCT__entry(
+ __string( name, lock->name )
+ __field( void *, lockdep_addr )
+ ),
+
+ TP_fast_assign(
+ __assign_str(name, lock->name);
+ __entry->lockdep_addr = lock;
+ ),
+
+ TP_printk("%p %s", __entry->lockdep_addr, __get_str(name))
+);
+
+DEFINE_EVENT(lock, lock_release,
+
+ TP_PROTO(struct lockdep_map *lock, unsigned long ip),
+
+ TP_ARGS(lock, ip)
+);
+
+#ifdef CONFIG_LOCK_STAT
+
+DEFINE_EVENT(lock, lock_contended,
+
+ TP_PROTO(struct lockdep_map *lock, unsigned long ip),
+
+ TP_ARGS(lock, ip)
+);
+
+DEFINE_EVENT(lock, lock_acquired,
+
+ TP_PROTO(struct lockdep_map *lock, unsigned long ip),
+
+ TP_ARGS(lock, ip)
+);
+
+#endif /* CONFIG_LOCK_STAT */
+#endif /* CONFIG_LOCKDEP */
+
+TRACE_EVENT(contention_begin,
+
+ TP_PROTO(void *lock, unsigned int flags),
+
+ TP_ARGS(lock, flags),
+
+ TP_STRUCT__entry(
+ __field(void *, lock_addr)
+ __field(unsigned int, flags)
+ ),
+
+ TP_fast_assign(
+ __entry->lock_addr = lock;
+ __entry->flags = flags;
+ ),
+
+ TP_printk("%p (flags=%s)", __entry->lock_addr,
+ __print_flags(__entry->flags, "|",
+ { LCB_F_SPIN, "SPIN" },
+ { LCB_F_READ, "READ" },
+ { LCB_F_WRITE, "WRITE" },
+ { LCB_F_RT, "RT" },
+ { LCB_F_PERCPU, "PERCPU" },
+ { LCB_F_MUTEX, "MUTEX" }
+ ))
+);
+
+TRACE_EVENT(contention_end,
+
+ TP_PROTO(void *lock, int ret),
+
+ TP_ARGS(lock, ret),
+
+ TP_STRUCT__entry(
+ __field(void *, lock_addr)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->lock_addr = lock;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("%p (ret=%d)", __entry->lock_addr, __entry->ret)
+);
+
+#endif /* _TRACE_LOCK_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
new file mode 100644
index 00000000..bb575f3a
--- /dev/null
+++ b/include/uapi/linux/magic.h
@@ -0,0 +1,107 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __LINUX_MAGIC_H__
+#define __LINUX_MAGIC_H__
+
+#define ADFS_SUPER_MAGIC 0xadf5
+#define AFFS_SUPER_MAGIC 0xadff
+#define AFS_SUPER_MAGIC 0x5346414F
+#define AUTOFS_SUPER_MAGIC 0x0187
+#define CEPH_SUPER_MAGIC 0x00c36400
+#define CODA_SUPER_MAGIC 0x73757245
+#define CRAMFS_MAGIC 0x28cd3d45 /* some random number */
+#define CRAMFS_MAGIC_WEND 0x453dcd28 /* magic number with the wrong endianness */
+#define DEBUGFS_MAGIC 0x64626720
+#define SECURITYFS_MAGIC 0x73636673
+#define SELINUX_MAGIC 0xf97cff8c
+#define SMACK_MAGIC 0x43415d53 /* "SMAC" */
+#define RAMFS_MAGIC 0x858458f6 /* some random number */
+#define TMPFS_MAGIC 0x01021994
+#define HUGETLBFS_MAGIC 0x958458f6 /* some random number */
+#define SQUASHFS_MAGIC 0x73717368
+#define ECRYPTFS_SUPER_MAGIC 0xf15f
+#define EFS_SUPER_MAGIC 0x414A53
+#define EROFS_SUPER_MAGIC_V1 0xE0F5E1E2
+#define EXT2_SUPER_MAGIC 0xEF53
+#define EXT3_SUPER_MAGIC 0xEF53
+#define XENFS_SUPER_MAGIC 0xabba1974
+#define EXT4_SUPER_MAGIC 0xEF53
+#define BTRFS_SUPER_MAGIC 0x9123683E
+#define NILFS_SUPER_MAGIC 0x3434
+#define F2FS_SUPER_MAGIC 0xF2F52010
+#define HPFS_SUPER_MAGIC 0xf995e849
+#define ISOFS_SUPER_MAGIC 0x9660
+#define JFFS2_SUPER_MAGIC 0x72b6
+#define XFS_SUPER_MAGIC 0x58465342 /* "XFSB" */
+#define PSTOREFS_MAGIC 0x6165676C
+#define EFIVARFS_MAGIC 0xde5e81e4
+#define HOSTFS_SUPER_MAGIC 0x00c0ffee
+#define OVERLAYFS_SUPER_MAGIC 0x794c7630
+#define FUSE_SUPER_MAGIC 0x65735546
+#define BCACHEFS_SUPER_MAGIC 0xca451a4e
+
+#define MINIX_SUPER_MAGIC 0x137F /* minix v1 fs, 14 char names */
+#define MINIX_SUPER_MAGIC2 0x138F /* minix v1 fs, 30 char names */
+#define MINIX2_SUPER_MAGIC 0x2468 /* minix v2 fs, 14 char names */
+#define MINIX2_SUPER_MAGIC2 0x2478 /* minix v2 fs, 30 char names */
+#define MINIX3_SUPER_MAGIC 0x4d5a /* minix v3 fs, 60 char names */
+
+#define MSDOS_SUPER_MAGIC 0x4d44 /* MD */
+#define EXFAT_SUPER_MAGIC 0x2011BAB0
+#define NCP_SUPER_MAGIC 0x564c /* Guess, what 0x564c is :-) */
+#define NFS_SUPER_MAGIC 0x6969
+#define OCFS2_SUPER_MAGIC 0x7461636f
+#define OPENPROM_SUPER_MAGIC 0x9fa1
+#define QNX4_SUPER_MAGIC 0x002f /* qnx4 fs detection */
+#define QNX6_SUPER_MAGIC 0x68191122 /* qnx6 fs detection */
+#define AFS_FS_MAGIC 0x6B414653
+
+
+#define REISERFS_SUPER_MAGIC 0x52654973 /* used by gcc */
+ /* used by file system utilities that
+ look at the superblock, etc. */
+#define REISERFS_SUPER_MAGIC_STRING "ReIsErFs"
+#define REISER2FS_SUPER_MAGIC_STRING "ReIsEr2Fs"
+#define REISER2FS_JR_SUPER_MAGIC_STRING "ReIsEr3Fs"
+
+#define SMB_SUPER_MAGIC 0x517B
+#define CIFS_SUPER_MAGIC 0xFF534D42 /* the first four bytes of SMB PDUs */
+#define SMB2_SUPER_MAGIC 0xFE534D42
+
+#define CGROUP_SUPER_MAGIC 0x27e0eb
+#define CGROUP2_SUPER_MAGIC 0x63677270
+
+#define RDTGROUP_SUPER_MAGIC 0x7655821
+
+#define STACK_END_MAGIC 0x57AC6E9D
+
+#define TRACEFS_MAGIC 0x74726163
+
+#define V9FS_MAGIC 0x01021997
+
+#define BDEVFS_MAGIC 0x62646576
+#define DAXFS_MAGIC 0x64646178
+#define BINFMTFS_MAGIC 0x42494e4d
+#define DEVPTS_SUPER_MAGIC 0x1cd1
+#define BINDERFS_SUPER_MAGIC 0x6c6f6f70
+#define FUTEXFS_SUPER_MAGIC 0xBAD1DEA
+#define PIPEFS_MAGIC 0x50495045
+#define PROC_SUPER_MAGIC 0x9fa0
+#define SOCKFS_MAGIC 0x534F434B
+#define SYSFS_MAGIC 0x62656572
+#define USBDEVICE_SUPER_MAGIC 0x9fa2
+#define MTD_INODE_FS_MAGIC 0x11307854
+#define ANON_INODE_FS_MAGIC 0x09041934
+#define BTRFS_TEST_MAGIC 0x73727279
+#define NSFS_MAGIC 0x6e736673
+#define BPF_FS_MAGIC 0xcafe4a11
+#define AAFS_MAGIC 0x5a3c69f0
+#define ZONEFS_MAGIC 0x5a4f4653
+
+/* Since UDF 2.01 is ISO 13346 based... */
+#define UDF_SUPER_MAGIC 0x15013346
+#define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */
+#define DEVMEM_MAGIC 0x454d444d /* "DMEM" */
+#define SECRETMEM_MAGIC 0x5345434d /* "SECM" */
+#define PID_FS_MAGIC 0x50494446 /* "PIDF" */
+
+#endif /* __LINUX_MAGIC_H__ */
diff --git a/include/uapi/linux/uuid.h b/include/uapi/linux/uuid.h
deleted file mode 100644
index 6e978107..00000000
--- a/include/uapi/linux/uuid.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * UUID/GUID definition
- *
- * Copyright (C) 2010, Intel Corp.
- * Huang Ying <ying.huang@intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation;
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- */
-
-#ifndef _UAPI_LINUX_UUID_H_
-#define _UAPI_LINUX_UUID_H_
-
-#include <asm/types.h>
-
-typedef struct {
- __u8 b[16];
-} uuid_le;
-
-typedef struct {
- __u8 b[16];
-} uuid_be;
-
-#define UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \
-((uuid_le) \
-{{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
- (b) & 0xff, ((b) >> 8) & 0xff, \
- (c) & 0xff, ((c) >> 8) & 0xff, \
- (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }})
-
-#define UUID_BE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \
-((uuid_be) \
-{{ ((a) >> 24) & 0xff, ((a) >> 16) & 0xff, ((a) >> 8) & 0xff, (a) & 0xff, \
- ((b) >> 8) & 0xff, (b) & 0xff, \
- ((c) >> 8) & 0xff, (c) & 0xff, \
- (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }})
-
-#define NULL_UUID_LE \
- UUID_LE(0x00000000, 0x0000, 0x0000, 0x00, 0x00, 0x00, 0x00, \
- 0x00, 0x00, 0x00, 0x00)
-
-#define NULL_UUID_BE \
- UUID_BE(0x00000000, 0x0000, 0x0000, 0x00, 0x00, 0x00, 0x00, \
- 0x00, 0x00, 0x00, 0x00)
-
-
-#endif /* _UAPI_LINUX_UUID_H_ */
diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h
index c95882b3..1590c49c 100644
--- a/include/uapi/linux/xattr.h
+++ b/include/uapi/linux/xattr.h
@@ -13,8 +13,12 @@
#ifndef _UAPI_LINUX_XATTR_H
#define _UAPI_LINUX_XATTR_H
+#if __UAPI_DEF_XATTR
+#define __USE_KERNEL_XATTR_DEFS
+
#define XATTR_CREATE 0x1 /* set value, fail if attr already exists */
#define XATTR_REPLACE 0x2 /* set value, fail if attr does not exist */
+#endif
/* Namespaces */
#define XATTR_OS2_PREFIX "os2."
diff --git a/initramfs/hook b/initramfs/hook
index aa91469e..00ab52fa 100755
--- a/initramfs/hook
+++ b/initramfs/hook
@@ -16,3 +16,13 @@ esac
. /usr/share/initramfs-tools/hook-functions
+manual_add_modules 'bcachefs'
+
+# chacha20 and poly1305 are used for encrypted bcachefs filesystems.
+add_loaded_modules 'chacha20[-_]*'
+add_loaded_modules 'poly1305[-_]*'
+
+# Add the bcachefs utility to the initramfs
+# Note: make install replaces this with the install path, so it must be last
+#copy_exec /usr/local/sbin/bcachefs /sbin/bcachefs
+#copy_exec /usr/local/sbin/mount.bcachefs /sbin/mount.bcachefs
diff --git a/libbcachefs.c b/libbcachefs.c
deleted file mode 100644
index 49790d89..00000000
--- a/libbcachefs.c
+++ /dev/null
@@ -1,824 +0,0 @@
-#include <ctype.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <stdbool.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/stat.h>
-#include <sys/sysmacros.h>
-#include <sys/types.h>
-#include <time.h>
-#include <unistd.h>
-
-#include <uuid/uuid.h>
-
-#include "libbcachefs.h"
-#include "crypto.h"
-#include "libbcachefs/bcachefs_format.h"
-#include "libbcachefs/btree_cache.h"
-#include "libbcachefs/checksum.h"
-#include "libbcachefs/disk_groups.h"
-#include "libbcachefs/opts.h"
-#include "libbcachefs/replicas.h"
-#include "libbcachefs/super-io.h"
-
-#define NSEC_PER_SEC 1000000000L
-
-#define BCH_MIN_NR_NBUCKETS (1 << 10)
-
-/* minimum size filesystem we can create, given a bucket size: */
-static u64 min_size(unsigned bucket_size)
-{
- return BCH_MIN_NR_NBUCKETS * bucket_size;
-}
-
-static void init_layout(struct bch_sb_layout *l, unsigned block_size,
- u64 start, u64 end)
-{
- unsigned sb_size;
- u64 backup; /* offset of 2nd sb */
-
- memset(l, 0, sizeof(*l));
-
- if (start != BCH_SB_SECTOR)
- start = round_up(start, block_size);
- end = round_down(end, block_size);
-
- if (start >= end)
- die("insufficient space for superblocks");
-
- /*
- * Create two superblocks in the allowed range: reserve a maximum of 64k
- */
- sb_size = min_t(u64, 128, end - start / 2);
-
- backup = start + sb_size;
- backup = round_up(backup, block_size);
-
- backup = min(backup, end);
-
- sb_size = min(end - backup, backup- start);
- sb_size = rounddown_pow_of_two(sb_size);
-
- if (sb_size < 8)
- die("insufficient space for superblocks");
-
- l->magic = BCACHE_MAGIC;
- l->layout_type = 0;
- l->nr_superblocks = 2;
- l->sb_max_size_bits = ilog2(sb_size);
- l->sb_offset[0] = cpu_to_le64(start);
- l->sb_offset[1] = cpu_to_le64(backup);
-}
-
-void bch2_pick_bucket_size(struct format_opts opts, struct dev_opts *dev)
-{
- if (!dev->sb_offset) {
- dev->sb_offset = BCH_SB_SECTOR;
- dev->sb_end = BCH_SB_SECTOR + 256;
- }
-
- if (!dev->size)
- dev->size = get_size(dev->path, dev->fd) >> 9;
-
- if (!dev->bucket_size) {
- if (dev->size < min_size(opts.block_size))
- die("cannot format %s, too small (%llu sectors, min %llu)",
- dev->path, dev->size, min_size(opts.block_size));
-
- /* Bucket size must be >= block size: */
- dev->bucket_size = opts.block_size;
-
- /* Bucket size must be >= btree node size: */
- dev->bucket_size = max(dev->bucket_size, opts.btree_node_size);
-
- /* Want a bucket size of at least 128k, if possible: */
- dev->bucket_size = max(dev->bucket_size, 256U);
-
- if (dev->size >= min_size(dev->bucket_size)) {
- unsigned scale = max(1,
- ilog2(dev->size / min_size(dev->bucket_size)) / 4);
-
- scale = rounddown_pow_of_two(scale);
-
- /* max bucket size 1 mb */
- dev->bucket_size = min(dev->bucket_size * scale, 1U << 11);
- } else {
- do {
- dev->bucket_size /= 2;
- } while (dev->size < min_size(dev->bucket_size));
- }
- }
-
- dev->nbuckets = dev->size / dev->bucket_size;
-
- if (dev->bucket_size < opts.block_size)
- die("Bucket size cannot be smaller than block size");
-
- if (dev->bucket_size < opts.btree_node_size)
- die("Bucket size cannot be smaller than btree node size");
-
- if (dev->nbuckets < BCH_MIN_NR_NBUCKETS)
- die("Not enough buckets: %llu, need %u (bucket size %u)",
- dev->nbuckets, BCH_MIN_NR_NBUCKETS, dev->bucket_size);
-
-}
-
-static unsigned parse_target(struct bch_sb_handle *sb,
- struct dev_opts *devs, size_t nr_devs,
- const char *s)
-{
- struct dev_opts *i;
- int idx;
-
- if (!s)
- return 0;
-
- for (i = devs; i < devs + nr_devs; i++)
- if (!strcmp(s, i->path))
- return dev_to_target(i - devs);
-
- idx = bch2_disk_path_find(sb, s);
- if (idx >= 0)
- return group_to_target(idx);
-
- die("Invalid target %s", s);
- return 0;
-}
-
-struct bch_sb *bch2_format(struct format_opts opts,
- struct dev_opts *devs, size_t nr_devs)
-{
- struct bch_sb_handle sb = { NULL };
- struct dev_opts *i;
- struct bch_sb_field_members *mi;
-
- /* calculate block size: */
- if (!opts.block_size)
- for (i = devs; i < devs + nr_devs; i++)
- opts.block_size = max(opts.block_size,
- get_blocksize(i->path, i->fd));
-
- /* calculate bucket sizes: */
- for (i = devs; i < devs + nr_devs; i++)
- bch2_pick_bucket_size(opts, i);
-
- /* calculate btree node size: */
- if (!opts.btree_node_size) {
- /* 256k default btree node size */
- opts.btree_node_size = 512;
-
- for (i = devs; i < devs + nr_devs; i++)
- opts.btree_node_size =
- min(opts.btree_node_size, i->bucket_size);
- }
-
- if (!is_power_of_2(opts.block_size))
- die("block size must be power of 2");
-
- if (!is_power_of_2(opts.btree_node_size))
- die("btree node size must be power of 2");
-
- if (uuid_is_null(opts.uuid.b))
- uuid_generate(opts.uuid.b);
-
- if (bch2_sb_realloc(&sb, 0))
- die("insufficient memory");
-
- sb.sb->version = cpu_to_le64(BCH_SB_VERSION_MAX);
- sb.sb->magic = BCACHE_MAGIC;
- sb.sb->block_size = cpu_to_le16(opts.block_size);
- sb.sb->user_uuid = opts.uuid;
- sb.sb->nr_devices = nr_devs;
-
- uuid_generate(sb.sb->uuid.b);
-
- if (opts.label)
- strncpy((char *) sb.sb->label, opts.label, sizeof(sb.sb->label));
-
- SET_BCH_SB_CSUM_TYPE(sb.sb, opts.meta_csum_type);
- SET_BCH_SB_META_CSUM_TYPE(sb.sb, opts.meta_csum_type);
- SET_BCH_SB_DATA_CSUM_TYPE(sb.sb, opts.data_csum_type);
- SET_BCH_SB_COMPRESSION_TYPE(sb.sb, opts.compression_type);
- SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(sb.sb,
- opts.background_compression_type);
-
- SET_BCH_SB_BTREE_NODE_SIZE(sb.sb, opts.btree_node_size);
- SET_BCH_SB_GC_RESERVE(sb.sb, 8);
- SET_BCH_SB_META_REPLICAS_WANT(sb.sb, opts.meta_replicas);
- SET_BCH_SB_META_REPLICAS_REQ(sb.sb, opts.meta_replicas_required);
- SET_BCH_SB_DATA_REPLICAS_WANT(sb.sb, opts.data_replicas);
- SET_BCH_SB_DATA_REPLICAS_REQ(sb.sb, opts.data_replicas_required);
- SET_BCH_SB_ERROR_ACTION(sb.sb, opts.on_error_action);
- SET_BCH_SB_STR_HASH_TYPE(sb.sb, BCH_STR_HASH_SIPHASH);
- SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb.sb,ilog2(opts.encoded_extent_max));
-
- SET_BCH_SB_POSIX_ACL(sb.sb, 1);
-
- struct timespec now;
- if (clock_gettime(CLOCK_REALTIME, &now))
- die("error getting current time: %m");
-
- sb.sb->time_base_lo = cpu_to_le64(now.tv_sec * NSEC_PER_SEC + now.tv_nsec);
- sb.sb->time_precision = cpu_to_le32(1);
-
- /* Member info: */
- mi = bch2_sb_resize_members(&sb,
- (sizeof(*mi) + sizeof(struct bch_member) *
- nr_devs) / sizeof(u64));
-
- for (i = devs; i < devs + nr_devs; i++) {
- struct bch_member *m = mi->members + (i - devs);
-
- uuid_generate(m->uuid.b);
- m->nbuckets = cpu_to_le64(i->nbuckets);
- m->first_bucket = 0;
- m->bucket_size = cpu_to_le16(i->bucket_size);
-
- SET_BCH_MEMBER_REPLACEMENT(m, CACHE_REPLACEMENT_LRU);
- SET_BCH_MEMBER_DISCARD(m, i->discard);
- SET_BCH_MEMBER_DATA_ALLOWED(m, i->data_allowed);
- SET_BCH_MEMBER_DURABILITY(m, i->durability + 1);
- }
-
- /* Disk groups */
- for (i = devs; i < devs + nr_devs; i++) {
- struct bch_member *m = mi->members + (i - devs);
- int idx;
-
- if (!i->group)
- continue;
-
- idx = bch2_disk_path_find_or_create(&sb, i->group);
- if (idx < 0)
- die("error creating disk path: %s", idx);
-
- SET_BCH_MEMBER_GROUP(m, idx + 1);
- }
-
- SET_BCH_SB_FOREGROUND_TARGET(sb.sb,
- parse_target(&sb, devs, nr_devs, opts.foreground_target));
- SET_BCH_SB_BACKGROUND_TARGET(sb.sb,
- parse_target(&sb, devs, nr_devs, opts.background_target));
- SET_BCH_SB_PROMOTE_TARGET(sb.sb,
- parse_target(&sb, devs, nr_devs, opts.promote_target));
-
- /* Crypt: */
- if (opts.encrypted) {
- struct bch_sb_field_crypt *crypt =
- bch2_sb_resize_crypt(&sb, sizeof(*crypt) / sizeof(u64));
-
- bch_sb_crypt_init(sb.sb, crypt, opts.passphrase);
- SET_BCH_SB_ENCRYPTION_TYPE(sb.sb, 1);
- }
-
- for (i = devs; i < devs + nr_devs; i++) {
- sb.sb->dev_idx = i - devs;
-
- init_layout(&sb.sb->layout, opts.block_size,
- i->sb_offset, i->sb_end);
-
- if (i->sb_offset == BCH_SB_SECTOR) {
- /* Zero start of disk */
- static const char zeroes[BCH_SB_SECTOR << 9];
-
- xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0);
- }
-
- bch2_super_write(i->fd, sb.sb);
- close(i->fd);
- }
-
- return sb.sb;
-}
-
-void bch2_super_write(int fd, struct bch_sb *sb)
-{
- struct nonce nonce = { 0 };
-
- unsigned i;
- for (i = 0; i < sb->layout.nr_superblocks; i++) {
- sb->offset = sb->layout.sb_offset[i];
-
- if (sb->offset == BCH_SB_SECTOR) {
- /* Write backup layout */
- xpwrite(fd, &sb->layout, sizeof(sb->layout),
- BCH_SB_LAYOUT_SECTOR << 9);
- }
-
- sb->csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb), nonce, sb);
- xpwrite(fd, sb, vstruct_bytes(sb),
- le64_to_cpu(sb->offset) << 9);
- }
-
- fsync(fd);
-}
-
-struct bch_sb *__bch2_super_read(int fd, u64 sector)
-{
- struct bch_sb sb, *ret;
-
- xpread(fd, &sb, sizeof(sb), sector << 9);
-
- if (memcmp(&sb.magic, &BCACHE_MAGIC, sizeof(sb.magic)))
- die("not a bcachefs superblock");
-
- size_t bytes = vstruct_bytes(&sb);
-
- ret = malloc(bytes);
-
- xpread(fd, ret, bytes, sector << 9);
-
- return ret;
-}
-
-static unsigned get_dev_has_data(struct bch_sb *sb, unsigned dev)
-{
- struct bch_sb_field_replicas *replicas;
- struct bch_replicas_entry *r;
- unsigned i, data_has = 0;
-
- replicas = bch2_sb_get_replicas(sb);
-
- if (replicas)
- for_each_replicas_entry(replicas, r)
- for (i = 0; i < r->nr; i++)
- if (r->devs[i] == dev)
- data_has |= 1 << r->data_type;
-
- return data_has;
-}
-
-/* superblock printing: */
-
-static void bch2_sb_print_layout(struct bch_sb *sb, enum units units)
-{
- struct bch_sb_layout *l = &sb->layout;
- unsigned i;
-
- printf(" type: %u\n"
- " superblock max size: %s\n"
- " nr superblocks: %u\n"
- " Offsets: ",
- l->layout_type,
- pr_units(1 << l->sb_max_size_bits, units),
- l->nr_superblocks);
-
- for (i = 0; i < l->nr_superblocks; i++) {
- if (i)
- printf(", ");
- printf("%llu", le64_to_cpu(l->sb_offset[i]));
- }
- putchar('\n');
-}
-
-static void bch2_sb_print_journal(struct bch_sb *sb, struct bch_sb_field *f,
- enum units units)
-{
- struct bch_sb_field_journal *journal = field_to_type(f, journal);
- unsigned i, nr = bch2_nr_journal_buckets(journal);
-
- printf(" Buckets: ");
- for (i = 0; i < nr; i++) {
- if (i)
- putchar(' ');
- printf("%llu", le64_to_cpu(journal->buckets[i]));
- }
- putchar('\n');
-}
-
-static void bch2_sb_print_members(struct bch_sb *sb, struct bch_sb_field *f,
- enum units units)
-{
- struct bch_sb_field_members *mi = field_to_type(f, members);
- struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb);
- unsigned i;
-
- for (i = 0; i < sb->nr_devices; i++) {
- struct bch_member *m = mi->members + i;
- time_t last_mount = le64_to_cpu(m->last_mount);
- char member_uuid_str[40];
- char data_allowed_str[100];
- char data_has_str[100];
- char group[64];
-
- if (!bch2_member_exists(m))
- continue;
-
- uuid_unparse(m->uuid.b, member_uuid_str);
-
- if (BCH_MEMBER_GROUP(m)) {
- unsigned idx = BCH_MEMBER_GROUP(m) - 1;
-
- if (idx < disk_groups_nr(gi)) {
- memcpy(group, gi->entries[idx].label,
- BCH_SB_LABEL_SIZE);
- group[BCH_SB_LABEL_SIZE] = '\0';
- } else {
-				strcpy(group, "(bad disk groups section)");
- }
- }
-
- bch2_scnprint_flag_list(data_allowed_str,
- sizeof(data_allowed_str),
- bch2_data_types,
- BCH_MEMBER_DATA_ALLOWED(m));
- if (!data_allowed_str[0])
- strcpy(data_allowed_str, "(none)");
-
- bch2_scnprint_flag_list(data_has_str,
- sizeof(data_has_str),
- bch2_data_types,
- get_dev_has_data(sb, i));
- if (!data_has_str[0])
- strcpy(data_has_str, "(none)");
-
- printf(" Device %u:\n"
- " UUID: %s\n"
- " Size: %s\n"
- " Bucket size: %s\n"
- " First bucket: %u\n"
- " Buckets: %llu\n"
- " Last mount: %s\n"
- " State: %s\n"
- " Group: %s\n"
- " Data allowed: %s\n"
-
- " Has data: %s\n"
-
- " Replacement policy: %s\n"
- " Discard: %llu\n",
- i, member_uuid_str,
- pr_units(le16_to_cpu(m->bucket_size) *
- le64_to_cpu(m->nbuckets), units),
- pr_units(le16_to_cpu(m->bucket_size), units),
- le16_to_cpu(m->first_bucket),
- le64_to_cpu(m->nbuckets),
- last_mount ? ctime(&last_mount) : "(never)",
-
- BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR
- ? bch2_dev_state[BCH_MEMBER_STATE(m)]
- : "unknown",
-
- group,
- data_allowed_str,
- data_has_str,
-
- BCH_MEMBER_REPLACEMENT(m) < CACHE_REPLACEMENT_NR
- ? bch2_cache_replacement_policies[BCH_MEMBER_REPLACEMENT(m)]
- : "unknown",
-
- BCH_MEMBER_DISCARD(m));
- }
-}
-
-static void bch2_sb_print_crypt(struct bch_sb *sb, struct bch_sb_field *f,
- enum units units)
-{
- struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
-
-	printf("  KDF:               %llu\n"
- " scrypt n: %llu\n"
- " scrypt r: %llu\n"
- " scrypt p: %llu\n",
- BCH_CRYPT_KDF_TYPE(crypt),
- BCH_KDF_SCRYPT_N(crypt),
- BCH_KDF_SCRYPT_R(crypt),
- BCH_KDF_SCRYPT_P(crypt));
-}
-
-static void bch2_sb_print_replicas(struct bch_sb *sb, struct bch_sb_field *f,
- enum units units)
-{
- struct bch_sb_field_replicas *replicas = field_to_type(f, replicas);
- struct bch_replicas_entry *e;
- unsigned i;
-
- for_each_replicas_entry(replicas, e) {
- printf_pad(32, " %s:", bch2_data_types[e->data_type]);
-
- putchar('[');
- for (i = 0; i < e->nr; i++) {
- if (i)
- putchar(' ');
- printf("%u", e->devs[i]);
- }
- printf("]\n");
- }
-}
-
-static void bch2_sb_print_quota(struct bch_sb *sb, struct bch_sb_field *f,
- enum units units)
-{
-}
-
-static void bch2_sb_print_disk_groups(struct bch_sb *sb, struct bch_sb_field *f,
- enum units units)
-{
-}
-
-static void bch2_sb_print_clean(struct bch_sb *sb, struct bch_sb_field *f,
- enum units units)
-{
-}
-
-typedef void (*sb_field_print_fn)(struct bch_sb *, struct bch_sb_field *, enum units);
-
-struct bch_sb_field_toolops {
- sb_field_print_fn print;
-};
-
-static const struct bch_sb_field_toolops bch2_sb_field_ops[] = {
-#define x(f, nr) \
- [BCH_SB_FIELD_##f] = { \
- .print = bch2_sb_print_##f, \
- },
- BCH_SB_FIELDS()
-#undef x
-};
-
-static inline void bch2_sb_field_print(struct bch_sb *sb,
- struct bch_sb_field *f,
- enum units units)
-{
- unsigned type = le32_to_cpu(f->type);
-
- if (type < BCH_SB_FIELD_NR)
- bch2_sb_field_ops[type].print(sb, f, units);
- else
- printf("(unknown field %u)\n", type);
-}
-
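
bch2_sb_field_ops[] above is filled in by an X-macro: BCH_SB_FIELDS() expands x(name, nr) once per superblock field type, so each type's bch2_sb_print_<name>() is wired into the dispatch table automatically and unknown types fall through to a default message. The snippet below is a self-contained illustration of that X-macro plus ops-table idiom; the field names and printers are invented, not the real BCH_SB_FIELDS() list.

#include <stdio.h>

/* Invented field list; the real code generates this from BCH_SB_FIELDS(). */
#define FIELD_TYPES()		\
	x(journal,	0)	\
	x(members,	1)	\
	x(crypt,	2)

enum field_type {
#define x(f, nr)	FIELD_##f = nr,
	FIELD_TYPES()
#undef x
	FIELD_NR
};

static void print_journal(void)	{ printf("journal field\n"); }
static void print_members(void)	{ printf("members field\n"); }
static void print_crypt(void)	{ printf("crypt field\n"); }

static void (*const field_print[])(void) = {
#define x(f, nr)	[FIELD_##f] = print_##f,
	FIELD_TYPES()
#undef x
};

static void print_field(unsigned type)
{
	if (type < FIELD_NR && field_print[type])
		field_print[type]();
	else
		printf("(unknown field %u)\n", type);
}

int main(void)
{
	print_field(FIELD_members);
	print_field(42);
	return 0;
}
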
-void bch2_sb_print(struct bch_sb *sb, bool print_layout,
- unsigned fields, enum units units)
-{
- struct bch_sb_field_members *mi;
- char user_uuid_str[40], internal_uuid_str[40];
- char fields_have_str[200];
- char label[BCH_SB_LABEL_SIZE + 1];
- struct bch_sb_field *f;
- u64 fields_have = 0;
- unsigned nr_devices = 0;
-
- memset(label, 0, sizeof(label));
- memcpy(label, sb->label, sizeof(sb->label));
- uuid_unparse(sb->user_uuid.b, user_uuid_str);
- uuid_unparse(sb->uuid.b, internal_uuid_str);
-
- mi = bch2_sb_get_members(sb);
- if (mi) {
- struct bch_member *m;
-
- for (m = mi->members;
- m < mi->members + sb->nr_devices;
- m++)
- nr_devices += bch2_member_exists(m);
- }
-
- vstruct_for_each(sb, f)
- fields_have |= 1 << le32_to_cpu(f->type);
- bch2_scnprint_flag_list(fields_have_str, sizeof(fields_have_str),
- bch2_sb_fields, fields_have);
-
- printf("External UUID: %s\n"
- "Internal UUID: %s\n"
- "Label: %s\n"
- "Version: %llu\n"
- "Block_size: %s\n"
- "Btree node size: %s\n"
- "Error action: %s\n"
- "Clean: %llu\n"
-
- "Metadata replicas: %llu\n"
- "Data replicas: %llu\n"
-
- "Metadata checksum type: %s (%llu)\n"
- "Data checksum type: %s (%llu)\n"
- "Compression type: %s (%llu)\n"
-
- "Foreground write target: %llu\n"
- "Background write target: %llu\n"
- "Promote target: %llu\n"
-
- "String hash type: %s (%llu)\n"
- "32 bit inodes: %llu\n"
- "GC reserve percentage: %llu%%\n"
- "Root reserve percentage: %llu%%\n"
-
- "Devices: %u live, %u total\n"
- "Sections: %s\n"
- "Superblock size: %llu\n",
- user_uuid_str,
- internal_uuid_str,
- label,
- le64_to_cpu(sb->version),
- pr_units(le16_to_cpu(sb->block_size), units),
- pr_units(BCH_SB_BTREE_NODE_SIZE(sb), units),
-
- BCH_SB_ERROR_ACTION(sb) < BCH_NR_ERROR_ACTIONS
- ? bch2_error_actions[BCH_SB_ERROR_ACTION(sb)]
- : "unknown",
-
- BCH_SB_CLEAN(sb),
-
- BCH_SB_META_REPLICAS_WANT(sb),
- BCH_SB_DATA_REPLICAS_WANT(sb),
-
- BCH_SB_META_CSUM_TYPE(sb) < BCH_CSUM_OPT_NR
- ? bch2_csum_types[BCH_SB_META_CSUM_TYPE(sb)]
- : "unknown",
- BCH_SB_META_CSUM_TYPE(sb),
-
- BCH_SB_DATA_CSUM_TYPE(sb) < BCH_CSUM_OPT_NR
- ? bch2_csum_types[BCH_SB_DATA_CSUM_TYPE(sb)]
- : "unknown",
- BCH_SB_DATA_CSUM_TYPE(sb),
-
- BCH_SB_COMPRESSION_TYPE(sb) < BCH_COMPRESSION_OPT_NR
- ? bch2_compression_types[BCH_SB_COMPRESSION_TYPE(sb)]
- : "unknown",
- BCH_SB_COMPRESSION_TYPE(sb),
-
- BCH_SB_FOREGROUND_TARGET(sb),
- BCH_SB_BACKGROUND_TARGET(sb),
- BCH_SB_PROMOTE_TARGET(sb),
-
- BCH_SB_STR_HASH_TYPE(sb) < BCH_STR_HASH_NR
- ? bch2_str_hash_types[BCH_SB_STR_HASH_TYPE(sb)]
- : "unknown",
- BCH_SB_STR_HASH_TYPE(sb),
-
- BCH_SB_INODE_32BIT(sb),
- BCH_SB_GC_RESERVE(sb),
- BCH_SB_ROOT_RESERVE(sb),
-
- nr_devices, sb->nr_devices,
- fields_have_str,
- vstruct_bytes(sb));
-
- if (print_layout) {
- printf("\n"
- "Layout:\n");
- bch2_sb_print_layout(sb, units);
- }
-
- vstruct_for_each(sb, f) {
- unsigned type = le32_to_cpu(f->type);
- char name[60];
-
- if (!(fields & (1 << type)))
- continue;
-
- if (type < BCH_SB_FIELD_NR) {
- scnprintf(name, sizeof(name), "%s", bch2_sb_fields[type]);
- name[0] = toupper(name[0]);
- } else {
- scnprintf(name, sizeof(name), "(unknown field %u)", type);
- }
-
- printf("\n%s (size %llu):\n", name, vstruct_bytes(f));
- if (type < BCH_SB_FIELD_NR)
- bch2_sb_field_print(sb, f, units);
- }
-}
-
-/* ioctl interface: */
-
-/* Global control device: */
-int bcachectl_open(void)
-{
- return xopen("/dev/bcachefs-ctl", O_RDWR);
-}
-
-/* Filesystem handles (ioctl, sysfs dir): */
-
-#define SYSFS_BASE "/sys/fs/bcachefs/"
-
-void bcache_fs_close(struct bchfs_handle fs)
-{
- close(fs.ioctl_fd);
- close(fs.sysfs_fd);
-}
-
-struct bchfs_handle bcache_fs_open(const char *path)
-{
- struct bchfs_handle ret;
-
- if (!uuid_parse(path, ret.uuid.b)) {
- /* It's a UUID, look it up in sysfs: */
- char *sysfs = mprintf(SYSFS_BASE "%s", path);
- ret.sysfs_fd = xopen(sysfs, O_RDONLY);
-
- char *minor = read_file_str(ret.sysfs_fd, "minor");
- char *ctl = mprintf("/dev/bcachefs%s-ctl", minor);
- ret.ioctl_fd = xopen(ctl, O_RDWR);
-
- free(sysfs);
- free(minor);
- free(ctl);
- } else {
- /* It's a path: */
- ret.ioctl_fd = xopen(path, O_RDONLY);
-
- struct bch_ioctl_query_uuid uuid;
- if (ioctl(ret.ioctl_fd, BCH_IOCTL_QUERY_UUID, &uuid) < 0)
- die("error opening %s: not a bcachefs filesystem", path);
-
- ret.uuid = uuid.uuid;
-
- char uuid_str[40];
- uuid_unparse(uuid.uuid.b, uuid_str);
-
- char *sysfs = mprintf(SYSFS_BASE "%s", uuid_str);
- ret.sysfs_fd = xopen(sysfs, O_RDONLY);
- free(sysfs);
- }
-
- return ret;
-}
-
-/*
- * Given a path to a block device, open the filesystem it belongs to; also
- * return the device's idx:
- */
-struct bchfs_handle bchu_fs_open_by_dev(const char *path, unsigned *idx)
-{
- char buf[1024], *uuid_str;
-
- struct stat stat = xstat(path);
-
- if (!S_ISBLK(stat.st_mode))
- die("%s is not a block device", path);
-
- char *sysfs = mprintf("/sys/dev/block/%u:%u/bcachefs",
- major(stat.st_dev),
- minor(stat.st_dev));
- ssize_t len = readlink(sysfs, buf, sizeof(buf));
- free(sysfs);
-
- if (len > 0) {
- char *p = strrchr(buf, '/');
- if (!p || sscanf(p + 1, "dev-%u", idx) != 1)
- die("error parsing sysfs");
-
- *p = '\0';
- p = strrchr(buf, '/');
- uuid_str = p + 1;
- } else {
- struct bch_opts opts = bch2_opts_empty();
-
- opt_set(opts, noexcl, true);
- opt_set(opts, nochanges, true);
-
- struct bch_sb_handle sb;
- int ret = bch2_read_super(path, &opts, &sb);
- if (ret)
- die("Error opening %s: %s", path, strerror(-ret));
-
- *idx = sb.sb->dev_idx;
- uuid_str = buf;
- uuid_unparse(sb.sb->user_uuid.b, uuid_str);
-
- bch2_free_super(&sb);
- }
-
- return bcache_fs_open(uuid_str);
-}
-
-int bchu_data(struct bchfs_handle fs, struct bch_ioctl_data cmd)
-{
- int progress_fd = xioctl(fs.ioctl_fd, BCH_IOCTL_DATA, &cmd);
-
- while (1) {
- struct bch_ioctl_data_progress p;
-
- if (read(progress_fd, &p, sizeof(p)) != sizeof(p))
- die("error reading from progress fd");
-
- if (p.data_type == U8_MAX)
- break;
-
- printf("\33[2K\r");
-
- printf("%llu%% complete: current position %s",
- p.sectors_done * 100 / p.sectors_total,
- bch2_data_types[p.data_type]);
-
- switch (p.data_type) {
- case BCH_DATA_BTREE:
- case BCH_DATA_USER:
- printf(" %s:%llu:%llu",
- bch2_btree_ids[p.btree_id],
- p.pos.inode,
- p.pos.offset);
- }
-
- sleep(1);
- }
- printf("\nDone\n");
-
- close(progress_fd);
- return 0;
-}
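
bchu_data() polls the progress fd once per second and redraws a single status line in place: "\33[2K" is the ANSI escape that erases the whole line and "\r" returns the cursor to column 0, so each iteration overwrites the previous output. A tiny standalone demo of that redraw technique (stdout has to be flushed explicitly because nothing prints a newline until the end):

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	for (int pct = 0; pct <= 100; pct += 10) {
		printf("\33[2K\r%d%% complete", pct);
		fflush(stdout);		/* no '\n' yet, so force the redraw */
		sleep(1);
	}
	printf("\nDone\n");
	return 0;
}
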
diff --git a/libbcachefs.h b/libbcachefs.h
deleted file mode 100644
index 7f914d22..00000000
--- a/libbcachefs.h
+++ /dev/null
@@ -1,214 +0,0 @@
-#ifndef _LIBBCACHE_H
-#define _LIBBCACHE_H
-
-#include <linux/uuid.h>
-#include <stdbool.h>
-
-#include "libbcachefs/bcachefs_format.h"
-#include "libbcachefs/bcachefs_ioctl.h"
-#include "tools-util.h"
-#include "libbcachefs/vstructs.h"
-
-struct format_opts {
- char *label;
- uuid_le uuid;
-
- unsigned on_error_action;
-
- unsigned block_size;
- unsigned btree_node_size;
- unsigned encoded_extent_max;
-
- unsigned meta_replicas;
- unsigned data_replicas;
-
- unsigned meta_replicas_required;
- unsigned data_replicas_required;
-
- const char *foreground_target;
- const char *background_target;
- const char *promote_target;
-
- unsigned meta_csum_type;
- unsigned data_csum_type;
- unsigned compression_type;
- unsigned background_compression_type;
-
- bool encrypted;
- char *passphrase;
-};
-
-static inline struct format_opts format_opts_default()
-{
- return (struct format_opts) {
- .on_error_action = BCH_ON_ERROR_RO,
- .encoded_extent_max = 128,
- .meta_csum_type = BCH_CSUM_OPT_CRC32C,
- .data_csum_type = BCH_CSUM_OPT_CRC32C,
- .meta_replicas = 1,
- .data_replicas = 1,
- .meta_replicas_required = 1,
- .data_replicas_required = 1,
- };
-}
-
-struct dev_opts {
- int fd;
- char *path;
- u64 size; /* 512 byte sectors */
- unsigned bucket_size;
- const char *group;
- unsigned data_allowed;
- unsigned durability;
- bool discard;
-
- u64 nbuckets;
-
- u64 sb_offset;
- u64 sb_end;
-};
-
-static inline struct dev_opts dev_opts_default()
-{
- return (struct dev_opts) {
- .data_allowed = ~0U << 2,
- .durability = 1,
- };
-}
-
-void bch2_pick_bucket_size(struct format_opts, struct dev_opts *);
-struct bch_sb *bch2_format(struct format_opts, struct dev_opts *, size_t);
-
-void bch2_super_write(int, struct bch_sb *);
-struct bch_sb *__bch2_super_read(int, u64);
-
-void bch2_sb_print(struct bch_sb *, bool, unsigned, enum units);
-
-/* ioctl interface: */
-
-int bcachectl_open(void);
-
-struct bchfs_handle {
- uuid_le uuid;
- int ioctl_fd;
- int sysfs_fd;
-};
-
-void bcache_fs_close(struct bchfs_handle);
-struct bchfs_handle bcache_fs_open(const char *);
-struct bchfs_handle bchu_fs_open_by_dev(const char *, unsigned *);
-
-static inline void bchu_disk_add(struct bchfs_handle fs, char *dev)
-{
- struct bch_ioctl_disk i = { .dev = (unsigned long) dev, };
-
- xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_ADD, &i);
-}
-
-static inline void bchu_disk_remove(struct bchfs_handle fs, unsigned dev_idx,
- unsigned flags)
-{
- struct bch_ioctl_disk i = {
- .flags = flags|BCH_BY_INDEX,
- .dev = dev_idx,
- };
-
- xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_REMOVE, &i);
-}
-
-static inline void bchu_disk_online(struct bchfs_handle fs, char *dev)
-{
- struct bch_ioctl_disk i = { .dev = (unsigned long) dev, };
-
- xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_ONLINE, &i);
-}
-
-static inline void bchu_disk_offline(struct bchfs_handle fs, unsigned dev_idx,
- unsigned flags)
-{
- struct bch_ioctl_disk i = {
- .flags = flags|BCH_BY_INDEX,
- .dev = dev_idx,
- };
-
- xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_OFFLINE, &i);
-}
-
-static inline void bchu_disk_set_state(struct bchfs_handle fs, unsigned dev,
- unsigned new_state, unsigned flags)
-{
- struct bch_ioctl_disk_set_state i = {
- .flags = flags|BCH_BY_INDEX,
- .new_state = new_state,
- .dev = dev,
- };
-
- xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_SET_STATE, &i);
-}
-
-static inline struct bch_ioctl_usage *bchu_usage(struct bchfs_handle fs)
-{
- struct bch_ioctl_usage *u = NULL;
- unsigned nr_devices = 4;
-
- while (1) {
- u = xrealloc(u, sizeof(*u) + sizeof(u->devs[0]) * nr_devices);
- u->nr_devices = nr_devices;
-
- if (!ioctl(fs.ioctl_fd, BCH_IOCTL_USAGE, u))
- return u;
-
- if (errno != ENOSPC)
- die("BCH_IOCTL_USAGE error: %m");
- nr_devices *= 2;
- }
-}
-
-static inline struct bch_sb *bchu_read_super(struct bchfs_handle fs, unsigned idx)
-{
- size_t size = 4096;
- struct bch_sb *sb = NULL;
-
- while (1) {
- sb = xrealloc(sb, size);
- struct bch_ioctl_read_super i = {
- .size = size,
- .sb = (unsigned long) sb,
- };
-
- if (idx != -1) {
- i.flags |= BCH_READ_DEV|BCH_BY_INDEX;
- i.dev = idx;
- }
-
- if (!ioctl(fs.ioctl_fd, BCH_IOCTL_READ_SUPER, &i))
- return sb;
- if (errno != ERANGE)
- die("BCH_IOCTL_READ_SUPER error: %m");
- size *= 2;
- }
-}
-
-static inline unsigned bchu_disk_get_idx(struct bchfs_handle fs, dev_t dev)
-{
- struct bch_ioctl_disk_get_idx i = { .dev = dev };
-
- return xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_GET_IDX, &i);
-}
-
-static inline void bchu_disk_resize(struct bchfs_handle fs,
- unsigned idx,
- u64 nbuckets)
-{
- struct bch_ioctl_disk_resize i = {
- .flags = BCH_BY_INDEX,
- .dev = idx,
- .nbuckets = nbuckets,
- };
-
- xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_RESIZE, &i);
-}
-
-int bchu_data(struct bchfs_handle, struct bch_ioctl_data);
-
-#endif /* _LIBBCACHE_H */
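
bchu_usage() and bchu_read_super() above share a grow-and-retry convention: issue the ioctl with a guessed buffer size and, while the kernel reports the buffer is too small (ENOSPC or ERANGE respectively), double the allocation and try again. A generic standalone sketch of that pattern follows; fill_buffer() is a stand-in for the ioctl and is invented for the example.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy producer standing in for the ioctl: needs at least 10000 bytes. */
static int fill_buffer(void *buf, size_t size)
{
	if (size < 10000) {
		errno = ERANGE;
		return -1;
	}
	memset(buf, 0, 10000);
	return 0;
}

static void *get_grown_buffer(size_t *size_out)
{
	size_t size = 4096;
	void *buf = NULL;

	while (1) {
		void *new = realloc(buf, size);

		if (!new) {
			free(buf);
			return NULL;
		}
		buf = new;

		if (!fill_buffer(buf, size)) {
			*size_out = size;
			return buf;
		}

		if (errno != ERANGE) {		/* real failure, give up */
			free(buf);
			return NULL;
		}
		size *= 2;			/* too small: double and retry */
	}
}

int main(void)
{
	size_t size;
	void *buf = get_grown_buffer(&size);

	if (buf)
		printf("got a %zu byte buffer\n", size);
	free(buf);
	return 0;
}
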
diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c
index 534ea94e..99487727 100644
--- a/libbcachefs/acl.c
+++ b/libbcachefs/acl.c
@@ -1,17 +1,71 @@
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include <linux/fs.h>
+#include "acl.h"
+#include "xattr.h"
+
#include <linux/posix_acl.h>
+
+static const char * const acl_types[] = {
+ [ACL_USER_OBJ] = "user_obj",
+ [ACL_USER] = "user",
+ [ACL_GROUP_OBJ] = "group_obj",
+ [ACL_GROUP] = "group",
+ [ACL_MASK] = "mask",
+ [ACL_OTHER] = "other",
+ NULL,
+};
+
+void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size)
+{
+ const void *p, *end = value + size;
+
+ if (!value ||
+ size < sizeof(bch_acl_header) ||
+ ((bch_acl_header *)value)->a_version != cpu_to_le32(BCH_ACL_VERSION))
+ return;
+
+ p = value + sizeof(bch_acl_header);
+ while (p < end) {
+ const bch_acl_entry *in = p;
+ unsigned tag = le16_to_cpu(in->e_tag);
+
+ prt_str(out, acl_types[tag]);
+
+ switch (tag) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ p += sizeof(bch_acl_entry_short);
+ break;
+ case ACL_USER:
+ prt_printf(out, " uid %u", le32_to_cpu(in->e_id));
+ p += sizeof(bch_acl_entry);
+ break;
+ case ACL_GROUP:
+ prt_printf(out, " gid %u", le32_to_cpu(in->e_id));
+ p += sizeof(bch_acl_entry);
+ break;
+ }
+
+ prt_printf(out, " %o", le16_to_cpu(in->e_perm));
+
+ if (p != end)
+ prt_char(out, ' ');
+ }
+}
+
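
The on-disk ACL walked by bch2_acl_to_text() above packs variable-size entries back to back: ACL_USER and ACL_GROUP entries carry a 32-bit id and use the long form, the other tags use the short form, so the cursor advances by a per-tag size. Below is a standalone sketch of walking packed records of that shape; the structs, tag values and permissions are invented and host-endian, unlike the little-endian fields above.

#include <stdint.h>
#include <stdio.h>

/* Invented record formats: a short and a long variant, selected by tag. */
struct ent_short { uint16_t tag; uint16_t perm; };
struct ent_long  { uint16_t tag; uint16_t perm; uint32_t id; };

enum { TAG_OWNER = 1, TAG_USER = 2, TAG_GROUP = 4, TAG_OTHER = 32 };

static void walk_entries(const void *buf, size_t size)
{
	const unsigned char *p = buf, *end = p + size;

	while (p < end) {
		const struct ent_short *s = (const void *) p;

		switch (s->tag) {
		case TAG_USER:
		case TAG_GROUP: {
			const struct ent_long *l = (const void *) p;

			printf("tag %u id %u perm %o\n", l->tag, l->id, l->perm);
			p += sizeof(*l);
			break;
		}
		default:
			printf("tag %u perm %o\n", s->tag, s->perm);
			p += sizeof(*s);
			break;
		}
	}
}

int main(void)
{
	struct {
		struct ent_short owner;
		struct ent_long	 user;
		struct ent_short other;
	} acl = {
		{ TAG_OWNER, 0700 },
		{ TAG_USER,  0500, 1000 },
		{ TAG_OTHER, 0400 },
	};

	walk_entries(&acl, sizeof(acl));
	return 0;
}
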
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+
+#include "fs.h"
+
+#include <linux/fs.h>
#include <linux/posix_acl_xattr.h>
#include <linux/sched.h>
#include <linux/slab.h>
-#include "acl.h"
-#include "fs.h"
-#include "xattr.h"
-
static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
{
return sizeof(bch_acl_header) +
@@ -23,9 +77,9 @@ static inline int acl_to_xattr_type(int type)
{
switch (type) {
case ACL_TYPE_ACCESS:
- return BCH_XATTR_INDEX_POSIX_ACL_ACCESS;
+ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS;
case ACL_TYPE_DEFAULT:
- return BCH_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT;
default:
BUG();
}
@@ -34,12 +88,14 @@ static inline int acl_to_xattr_type(int type)
/*
* Convert from filesystem to in-memory representation.
*/
-static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size)
+static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans,
+ const void *value, size_t size)
{
const void *p, *end = value + size;
struct posix_acl *acl;
struct posix_acl_entry *out;
unsigned count = 0;
+ int ret;
if (!value)
return NULL;
@@ -80,9 +136,14 @@ static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size)
if (!count)
return NULL;
- acl = posix_acl_alloc(count, GFP_KERNEL);
+ acl = allocate_dropping_locks(trans, ret,
+ posix_acl_alloc(count, _gfp));
if (!acl)
return ERR_PTR(-ENOMEM);
+ if (ret) {
+ kfree(acl);
+ return ERR_PTR(ret);
+ }
out = acl->a_entries;
@@ -123,11 +184,6 @@ invalid:
return ERR_PTR(-EINVAL);
}
-#define acl_for_each_entry(acl, acl_e) \
- for (acl_e = acl->a_entries; \
- acl_e < acl->a_entries + acl->a_count; \
- acl_e++)
-
/*
* Convert from in-memory to filesystem representation.
*/
@@ -138,11 +194,11 @@ bch2_acl_to_xattr(struct btree_trans *trans,
{
struct bkey_i_xattr *xattr;
bch_acl_header *acl_header;
- const struct posix_acl_entry *acl_e;
+ const struct posix_acl_entry *acl_e, *pe;
void *outptr;
unsigned nr_short = 0, nr_long = 0, acl_len, u64s;
- acl_for_each_entry(acl, acl_e) {
+ FOREACH_ACL_ENTRY(acl_e, acl, pe) {
switch (acl_e->e_tag) {
case ACL_USER:
case ACL_GROUP:
@@ -172,7 +228,7 @@ bch2_acl_to_xattr(struct btree_trans *trans,
bkey_xattr_init(&xattr->k_i);
xattr->k.u64s = u64s;
xattr->v.x_type = acl_to_xattr_type(type);
- xattr->v.x_name_len = 0,
+ xattr->v.x_name_len = 0;
xattr->v.x_val_len = cpu_to_le16(acl_len);
acl_header = xattr_val(&xattr->v);
@@ -180,7 +236,7 @@ bch2_acl_to_xattr(struct btree_trans *trans,
outptr = (void *) acl_header + sizeof(*acl_header);
- acl_for_each_entry(acl, acl_e) {
+ FOREACH_ACL_ENTRY(acl_e, acl, pe) {
bch_acl_entry *entry = outptr;
entry->e_tag = cpu_to_le16(acl_e->e_tag);
@@ -211,49 +267,52 @@ bch2_acl_to_xattr(struct btree_trans *trans,
return xattr;
}
-struct posix_acl *bch2_get_acl(struct inode *vinode, int type)
+struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu)
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_s_c_xattr xattr;
+ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
+ struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
+ struct btree_iter iter = { NULL };
struct posix_acl *acl = NULL;
- bch2_trans_init(&trans, c);
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
+ struct btree_trans *trans = bch2_trans_get(c);
retry:
- bch2_trans_begin(&trans);
-
- iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
- &inode->ei_str_hash, inode->v.i_ino,
- &X_SEARCH(acl_to_xattr_type(type), "", 0),
- 0);
- if (IS_ERR(iter)) {
- if (PTR_ERR(iter) == -EINTR)
- goto retry;
-
- if (PTR_ERR(iter) != -ENOENT)
- acl = ERR_CAST(iter);
- goto out;
- }
+ bch2_trans_begin(trans);
- xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
+ struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
+ &hash, inode_inum(inode), &search, 0);
+ int ret = bkey_err(k);
+ if (ret)
+ goto err;
- acl = bch2_acl_from_disk(xattr_val(xattr.v),
- le16_to_cpu(xattr.v->x_val_len));
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
+ acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
+ le16_to_cpu(xattr.v->x_val_len));
+ ret = PTR_ERR_OR_ZERO(acl);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
- if (!IS_ERR(acl))
+ if (ret)
+ acl = !bch2_err_matches(ret, ENOENT) ? ERR_PTR(ret) : NULL;
+
+ if (!IS_ERR_OR_NULL(acl))
set_cached_acl(&inode->v, type, acl);
-out:
- bch2_trans_exit(&trans);
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
return acl;
}
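
The rewritten bch2_get_acl() above follows the standard transaction shape used throughout this series: get a transaction, bch2_trans_begin() at the retry: label, do the lookups, and if anything fails with a transaction-restart error jump back and redo the whole body; only other errors propagate out. Below is a simplified, standalone analogue of that restart loop; the names (txn_begin, do_lookup, ERR_RESTART) are hypothetical stand-ins, not the real bcachefs API.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical error code meaning "lock conflict, redo the transaction". */
#define ERR_RESTART	1000

struct txn { int attempts; };

static void txn_begin(struct txn *t)
{
	t->attempts++;		/* real code would drop/retake locks here */
}

/* Hypothetical operation: fails with ERR_RESTART the first two times. */
static int do_lookup(struct txn *t, int *result)
{
	if (t->attempts < 3)
		return -ERR_RESTART;
	*result = 42;
	return 0;
}

int main(void)
{
	struct txn t = { 0 };
	int result, ret;
retry:
	txn_begin(&t);

	ret = do_lookup(&t, &result);
	if (ret == -ERR_RESTART)
		goto retry;		/* redo everything since txn_begin() */
	if (ret) {
		fprintf(stderr, "lookup failed: %d\n", ret);
		return EXIT_FAILURE;
	}

	printf("result %d after %d attempts\n", result, t.attempts);
	return 0;
}
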
-int bch2_set_acl_trans(struct btree_trans *trans,
+int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *inode_u,
- const struct bch_hash_info *hash_info,
struct posix_acl *acl, int type)
{
+ struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u);
int ret;
if (type == ACL_TYPE_DEFAULT &&
@@ -266,117 +325,120 @@ int bch2_set_acl_trans(struct btree_trans *trans,
if (IS_ERR(xattr))
return PTR_ERR(xattr);
- ret = __bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
- inode_u->bi_inum, &xattr->k_i, 0);
+ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info,
+ inum, &xattr->k_i, 0);
} else {
struct xattr_search_key search =
X_SEARCH(acl_to_xattr_type(type), "", 0);
- ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info,
- inode_u->bi_inum, &search);
+ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info,
+ inum, &search);
}
- return ret == -ENOENT ? 0 : ret;
+ return bch2_err_matches(ret, ENOENT) ? 0 : ret;
}
-static int inode_update_for_set_acl_fn(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
+int bch2_set_acl(struct mnt_idmap *idmap,
+ struct dentry *dentry,
+ struct posix_acl *_acl, int type)
{
+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct timespec now = current_time(&inode->v);
- umode_t mode = (unsigned long) p;
-
- bi->bi_ctime = timespec_to_bch2_time(c, now);
- bi->bi_mode = mode;
- return 0;
-}
-
-int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
-{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans trans;
+ struct btree_iter inode_iter = { NULL };
struct bch_inode_unpacked inode_u;
- umode_t mode = inode->v.i_mode;
+ struct posix_acl *acl;
+ umode_t mode;
int ret;
- if (type == ACL_TYPE_ACCESS && acl) {
- ret = posix_acl_update_mode(&inode->v, &mode, &acl);
+ mutex_lock(&inode->ei_update_lock);
+ struct btree_trans *trans = bch2_trans_get(c);
+retry:
+ bch2_trans_begin(trans);
+ acl = _acl;
+
+ ret = bch2_subvol_is_ro_trans(trans, inode->ei_inum.subvol) ?:
+ bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
+ BTREE_ITER_intent);
+ if (ret)
+ goto btree_err;
+
+ mode = inode_u.bi_mode;
+
+ if (type == ACL_TYPE_ACCESS) {
+ ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl);
if (ret)
- return ret;
+ goto btree_err;
}
- bch2_trans_init(&trans, c);
-retry:
- bch2_trans_begin(&trans);
-
- ret = bch2_set_acl_trans(&trans,
- &inode->ei_inode,
- &inode->ei_str_hash,
- acl, type) ?:
- bch2_write_inode_trans(&trans, inode, &inode_u,
- inode_update_for_set_acl_fn,
- (void *)(unsigned long) mode) ?:
- bch2_trans_commit(&trans, NULL, NULL,
- &inode->ei_journal_seq,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOUNLOCK);
- if (ret == -EINTR)
+ ret = bch2_set_acl_trans(trans, inode_inum(inode), &inode_u, acl, type);
+ if (ret)
+ goto btree_err;
+
+ inode_u.bi_ctime = bch2_current_time(c);
+ inode_u.bi_mode = mode;
+
+ ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
+btree_err:
+ bch2_trans_iter_exit(trans, &inode_iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
if (unlikely(ret))
goto err;
- bch2_inode_update_after_write(c, inode, &inode_u,
+ bch2_inode_update_after_write(trans, inode, &inode_u,
ATTR_CTIME|ATTR_MODE);
set_cached_acl(&inode->v, type, acl);
err:
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
+ mutex_unlock(&inode->ei_update_lock);
return ret;
}
-int bch2_acl_chmod(struct btree_trans *trans,
- struct bch_inode_info *inode,
+int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
+ struct bch_inode_unpacked *inode,
umode_t mode,
struct posix_acl **new_acl)
{
- struct btree_iter *iter;
- struct bkey_s_c_xattr xattr;
- struct bkey_i_xattr *new;
- struct posix_acl *acl;
- int ret = 0;
+ struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode);
+ struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0);
+ struct btree_iter iter;
+ struct posix_acl *acl = NULL;
- iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc,
- &inode->ei_str_hash, inode->v.i_ino,
- &X_SEARCH(BCH_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
- BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return PTR_ERR(iter) != -ENOENT ? PTR_ERR(iter) : 0;
+ struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
+ &hash_info, inum, &search, BTREE_ITER_intent);
+ int ret = bkey_err(k);
+ if (ret)
+ return bch2_err_matches(ret, ENOENT) ? 0 : ret;
- xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
- acl = bch2_acl_from_disk(xattr_val(xattr.v),
+ acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
le16_to_cpu(xattr.v->x_val_len));
- if (IS_ERR_OR_NULL(acl))
- return PTR_ERR(acl);
+ ret = PTR_ERR_OR_ZERO(acl);
+ if (ret)
+ goto err;
- ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
+ ret = allocate_dropping_locks_errcode(trans, __posix_acl_chmod(&acl, _gfp, mode));
if (ret)
goto err;
- new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
- if (IS_ERR(new)) {
- ret = PTR_ERR(new);
+ struct bkey_i_xattr *new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
goto err;
- }
- bch2_trans_update(trans, iter, &new->k_i, 0);
+ new->k.p = iter.pos;
+ ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
*new_acl = acl;
acl = NULL;
err:
- kfree(acl);
+ bch2_trans_iter_exit(trans, &iter);
+ if (!IS_ERR_OR_NULL(acl))
+ kfree(acl);
return ret;
}
diff --git a/libbcachefs/acl.h b/libbcachefs/acl.h
index e0672430..fe730a6b 100644
--- a/libbcachefs/acl.h
+++ b/libbcachefs/acl.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_ACL_H
#define _BCACHEFS_ACL_H
@@ -6,8 +7,6 @@ struct bch_hash_info;
struct bch_inode_info;
struct posix_acl;
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-
#define BCH_ACL_VERSION 0x0001
typedef struct {
@@ -25,28 +24,31 @@ typedef struct {
__le32 a_version;
} bch_acl_header;
-struct posix_acl *bch2_get_acl(struct inode *, int);
+void bch2_acl_to_text(struct printbuf *, const void *, size_t);
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+
+struct posix_acl *bch2_get_acl(struct inode *, int, bool);
-int bch2_set_acl_trans(struct btree_trans *,
+int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *,
- const struct bch_hash_info *,
struct posix_acl *, int);
-int bch2_set_acl(struct inode *, struct posix_acl *, int);
-int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *,
+int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int);
+int bch2_acl_chmod(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *,
umode_t, struct posix_acl **);
#else
-static inline int bch2_set_acl_trans(struct btree_trans *trans,
+static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *inode_u,
- const struct bch_hash_info *hash_info,
struct posix_acl *acl, int type)
{
return 0;
}
-static inline int bch2_acl_chmod(struct btree_trans *trans,
- struct bch_inode_info *inode,
+static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
+ struct bch_inode_unpacked *inode,
umode_t mode,
struct posix_acl **new_acl)
{
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
deleted file mode 100644
index ac2c7d1f..00000000
--- a/libbcachefs/alloc.c
+++ /dev/null
@@ -1,2205 +0,0 @@
-/*
- * Primary bucket allocation code
- *
- * Copyright 2012 Google, Inc.
- *
- * Allocation in bcache is done in terms of buckets:
- *
- * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in
- * btree pointers - they must match for the pointer to be considered valid.
- *
- * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
- * bucket simply by incrementing its gen.
- *
- * The gens (along with the priorities; it's really the gens are important but
- * the code is named as if it's the priorities) are written in an arbitrary list
- * of buckets on disk, with a pointer to them in the journal header.
- *
- * When we invalidate a bucket, we have to write its new gen to disk and wait
- * for that write to complete before we use it - otherwise after a crash we
- * could have pointers that appeared to be good but pointed to data that had
- * been overwritten.
- *
- * Since the gens and priorities are all stored contiguously on disk, we can
- * batch this up: We fill up the free_inc list with freshly invalidated buckets,
- * call prio_write(), and when prio_write() finishes we pull buckets off the
- * free_inc list and optionally discard them.
- *
- * free_inc isn't the only freelist - if it was, we'd often have to sleep while
- * priorities and gens were being written before we could allocate. c->free is a
- * smaller freelist, and buckets on that list are always ready to be used.
- *
- * If we've got discards enabled, that happens when a bucket moves from the
- * free_inc list to the free list.
- *
- * It's important to ensure that gens don't wrap around - with respect to
- * either the oldest gen in the btree or the gen on disk. This is quite
- * difficult to do in practice, but we explicitly guard against it anyways - if
- * a bucket is in danger of wrapping around we simply skip invalidating it that
- * time around, and we garbage collect or rewrite the priorities sooner than we
- * would have otherwise.
- *
- * bch2_bucket_alloc() allocates a single bucket from a specific device.
- *
- * bch2_bucket_alloc_set() allocates one or more buckets from different devices
- * in a given filesystem.
- *
- * invalidate_buckets() drives all the processes described above. It's called
- * from bch2_bucket_alloc() and a few other places that need to make sure free
- * buckets are ready.
- *
- * invalidate_buckets_(lru|fifo)() find buckets that are available to be
- * invalidated, and then invalidate them and stick them on the free_inc list -
- * in either lru or fifo order.
- */
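
The key invariant in the comment above is that every pointer records the bucket generation it was created against, so bumping a bucket's gen (once the new gen is safely on disk) invalidates every existing pointer into it at once, without having to find those pointers. A minimal standalone sketch of that generation check, with invented struct shapes:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Invented, simplified shapes: */
struct bucket  { uint8_t gen; };
struct pointer { size_t bucket; uint8_t gen; };

/* A pointer is usable only while its gen matches the bucket's current gen. */
static bool ptr_stale(const struct bucket *buckets, struct pointer p)
{
	return buckets[p.bucket].gen != p.gen;
}

int main(void)
{
	struct bucket buckets[4] = { 0 };
	struct pointer p = { .bucket = 2, .gen = buckets[2].gen };

	printf("stale? %d\n", ptr_stale(buckets, p));	/* 0: still valid */

	buckets[2].gen++;	/* "reuse" the bucket: old pointers die */

	printf("stale? %d\n", ptr_stale(buckets, p));	/* 1: invalidated */
	return 0;
}
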
-
-#include "bcachefs.h"
-#include "alloc.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_gc.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "clock.h"
-#include "debug.h"
-#include "disk_groups.h"
-#include "error.h"
-#include "extents.h"
-#include "io.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "super-io.h"
-
-#include <linux/blkdev.h>
-#include <linux/kthread.h>
-#include <linux/math64.h>
-#include <linux/random.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-#include <linux/sched/task.h>
-#include <linux/sort.h>
-#include <trace/events/bcachefs.h>
-
-static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
-
-/* Ratelimiting/PD controllers */
-
-static void pd_controllers_update(struct work_struct *work)
-{
- struct bch_fs *c = container_of(to_delayed_work(work),
- struct bch_fs,
- pd_controllers_update);
- struct bch_dev *ca;
- unsigned i;
-
- for_each_member_device(ca, c, i) {
- struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
-
- u64 free = bucket_to_sector(ca,
- __dev_buckets_free(ca, stats)) << 9;
- /*
- * Bytes of internal fragmentation, which can be
- * reclaimed by copy GC
- */
- s64 fragmented = (bucket_to_sector(ca,
- stats.buckets[BCH_DATA_USER] +
- stats.buckets[BCH_DATA_CACHED]) -
- (stats.sectors[BCH_DATA_USER] +
- stats.sectors[BCH_DATA_CACHED])) << 9;
-
- fragmented = max(0LL, fragmented);
-
- bch2_pd_controller_update(&ca->copygc_pd,
- free, fragmented, -1);
- }
-
- schedule_delayed_work(&c->pd_controllers_update,
- c->pd_controllers_update_seconds * HZ);
-}
-
-/* Persistent alloc info: */
-
-static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
-{
- unsigned bytes = offsetof(struct bch_alloc, data);
-
- if (a->fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
- bytes += 2;
- if (a->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
- bytes += 2;
-
- return DIV_ROUND_UP(bytes, sizeof(u64));
-}
-
-const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
-{
- if (k.k->p.inode >= c->sb.nr_devices ||
- !c->devs[k.k->p.inode])
- return "invalid device";
-
- switch (k.k->type) {
- case BCH_ALLOC: {
- struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
-
- if (bch_alloc_val_u64s(a.v) != bkey_val_u64s(a.k))
- return "incorrect value size";
- break;
- }
- default:
- return "invalid type";
- }
-
- return NULL;
-}
-
-void bch2_alloc_to_text(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c k)
-{
- buf[0] = '\0';
-
- switch (k.k->type) {
- case BCH_ALLOC:
- break;
- }
-}
-
-static inline unsigned get_alloc_field(const u8 **p, unsigned bytes)
-{
- unsigned v;
-
- switch (bytes) {
- case 1:
- v = **p;
- break;
- case 2:
- v = le16_to_cpup((void *) *p);
- break;
- case 4:
- v = le32_to_cpup((void *) *p);
- break;
- default:
- BUG();
- }
-
- *p += bytes;
- return v;
-}
-
-static inline void put_alloc_field(u8 **p, unsigned bytes, unsigned v)
-{
- switch (bytes) {
- case 1:
- **p = v;
- break;
- case 2:
- *((__le16 *) *p) = cpu_to_le16(v);
- break;
- case 4:
- *((__le32 *) *p) = cpu_to_le32(v);
- break;
- default:
- BUG();
- }
-
- *p += bytes;
-}
-
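
get_alloc_field()/put_alloc_field() above treat the alloc key's value as a byte cursor: each optional field occupies 1, 2 or 4 little-endian bytes and the cursor is advanced past whatever was read or written. A standalone sketch of the same cursor-codec idea follows; it assumes a little-endian host and plain memcpy, whereas the real code uses explicit le16/le32 conversions.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Append a 1/2/4 byte field at the cursor and advance it: */
static void put_field(uint8_t **p, unsigned bytes, uint32_t v)
{
	memcpy(*p, &v, bytes);		/* little-endian host: low bytes first */
	*p += bytes;
}

/* Read a 1/2/4 byte field at the cursor and advance it: */
static uint32_t get_field(const uint8_t **p, unsigned bytes)
{
	uint32_t v = 0;

	memcpy(&v, *p, bytes);
	*p += bytes;
	return v;
}

int main(void)
{
	uint8_t buf[16], *out = buf;
	const uint8_t *in = buf;

	put_field(&out, 2, 1234);	/* e.g. read_time  */
	put_field(&out, 2, 5678);	/* e.g. write_time */

	uint32_t read_time  = get_field(&in, 2);
	uint32_t write_time = get_field(&in, 2);

	printf("%u %u (%td bytes encoded)\n", read_time, write_time, out - buf);
	return 0;
}
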
-static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bch_dev *ca;
- struct bkey_s_c_alloc a;
- struct bucket_mark new;
- struct bucket *g;
- const u8 *d;
-
- if (k.k->type != BCH_ALLOC)
- return;
-
- a = bkey_s_c_to_alloc(k);
- ca = bch_dev_bkey_exists(c, a.k->p.inode);
-
- if (a.k->p.offset >= ca->mi.nbuckets)
- return;
-
- percpu_down_read_preempt_disable(&c->usage_lock);
-
- g = bucket(ca, a.k->p.offset);
- bucket_cmpxchg(g, new, ({
- new.gen = a.v->gen;
- new.gen_valid = 1;
- }));
-
- d = a.v->data;
- if (a.v->fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
- g->io_time[READ] = get_alloc_field(&d, 2);
- if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
- g->io_time[WRITE] = get_alloc_field(&d, 2);
-
- percpu_up_read_preempt_enable(&c->usage_lock);
-}
-
-int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
-{
- struct journal_replay *r;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_dev *ca;
- unsigned i;
- int ret;
-
- for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS_MIN, 0, k) {
- bch2_alloc_read_key(c, k);
- bch2_btree_iter_cond_resched(&iter);
- }
-
- ret = bch2_btree_iter_unlock(&iter);
- if (ret)
- return ret;
-
- list_for_each_entry(r, journal_replay_list, list) {
- struct bkey_i *k, *n;
- struct jset_entry *entry;
-
- for_each_jset_key(k, n, entry, &r->j)
- if (entry->btree_id == BTREE_ID_ALLOC)
- bch2_alloc_read_key(c, bkey_i_to_s_c(k));
- }
-
- mutex_lock(&c->bucket_clock[READ].lock);
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- bch2_recalc_oldest_io(c, ca, READ);
- up_read(&ca->bucket_lock);
- }
- mutex_unlock(&c->bucket_clock[READ].lock);
-
- mutex_lock(&c->bucket_clock[WRITE].lock);
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- bch2_recalc_oldest_io(c, ca, WRITE);
- up_read(&ca->bucket_lock);
- }
- mutex_unlock(&c->bucket_clock[WRITE].lock);
-
- return 0;
-}
-
-static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
- size_t b, struct btree_iter *iter,
- u64 *journal_seq, bool nowait)
-{
- struct bucket_mark m;
- __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key;
- struct bucket *g;
- struct bkey_i_alloc *a;
- u8 *d;
- int ret;
- unsigned flags = BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_USE_ALLOC_RESERVE;
-
- if (nowait)
- flags |= BTREE_INSERT_NOWAIT;
-
- bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
-
- do {
- ret = btree_iter_err(bch2_btree_iter_peek_slot(iter));
- if (ret)
- break;
-
- percpu_down_read_preempt_disable(&c->usage_lock);
- g = bucket(ca, b);
-
- /* read mark under btree node lock: */
- m = READ_ONCE(g->mark);
- a = bkey_alloc_init(&alloc_key.k);
- a->k.p = iter->pos;
- a->v.fields = 0;
- a->v.gen = m.gen;
- set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v));
-
- d = a->v.data;
- if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
- put_alloc_field(&d, 2, g->io_time[READ]);
- if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
- put_alloc_field(&d, 2, g->io_time[WRITE]);
- percpu_up_read_preempt_enable(&c->usage_lock);
-
- ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, flags,
- BTREE_INSERT_ENTRY(iter, &a->k_i));
- bch2_btree_iter_cond_resched(iter);
- } while (ret == -EINTR);
-
- return ret;
-}
-
-int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
-{
- struct bch_dev *ca;
- struct btree_iter iter;
- int ret;
-
- if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
- return 0;
-
- ca = bch_dev_bkey_exists(c, pos.inode);
-
- if (pos.offset >= ca->mi.nbuckets)
- return 0;
-
- bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
- ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter,
- NULL, false);
- bch2_btree_iter_unlock(&iter);
- return ret;
-}
-
-int bch2_alloc_write(struct bch_fs *c)
-{
- struct bch_dev *ca;
- unsigned i;
- int ret = 0;
-
- for_each_rw_member(ca, c, i) {
- struct btree_iter iter;
- unsigned long bucket;
-
- bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
- down_read(&ca->bucket_lock);
- for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) {
- ret = __bch2_alloc_write_key(c, ca, bucket, &iter,
- NULL, false);
- if (ret)
- break;
-
- clear_bit(bucket, ca->buckets_dirty);
- }
- up_read(&ca->bucket_lock);
- bch2_btree_iter_unlock(&iter);
-
- if (ret) {
- percpu_ref_put(&ca->io_ref);
- break;
- }
- }
-
- return ret;
-}
-
-/* Bucket IO clocks: */
-
-static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
-{
- struct bucket_clock *clock = &c->bucket_clock[rw];
- struct bucket_array *buckets = bucket_array(ca);
- struct bucket *g;
- u16 max_last_io = 0;
- unsigned i;
-
- lockdep_assert_held(&c->bucket_clock[rw].lock);
-
- /* Recalculate max_last_io for this device: */
- for_each_bucket(g, buckets)
- max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
-
- ca->max_last_bucket_io[rw] = max_last_io;
-
- /* Recalculate global max_last_io: */
- max_last_io = 0;
-
- for_each_member_device(ca, c, i)
- max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);
-
- clock->max_last_io = max_last_io;
-}
-
-static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
-{
- struct bucket_clock *clock = &c->bucket_clock[rw];
- struct bucket_array *buckets;
- struct bch_dev *ca;
- struct bucket *g;
- unsigned i;
-
- trace_rescale_prios(c);
-
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- for_each_bucket(g, buckets)
- g->io_time[rw] = clock->hand -
- bucket_last_io(c, g, rw) / 2;
-
- bch2_recalc_oldest_io(c, ca, rw);
-
- up_read(&ca->bucket_lock);
- }
-}
-
-static void bch2_inc_clock_hand(struct io_timer *timer)
-{
- struct bucket_clock *clock = container_of(timer,
- struct bucket_clock, rescale);
- struct bch_fs *c = container_of(clock,
- struct bch_fs, bucket_clock[clock->rw]);
- struct bch_dev *ca;
- u64 capacity;
- unsigned i;
-
- mutex_lock(&clock->lock);
-
- /* if clock cannot be advanced more, rescale prio */
- if (clock->max_last_io >= U16_MAX - 2)
- bch2_rescale_bucket_io_times(c, clock->rw);
-
- BUG_ON(clock->max_last_io >= U16_MAX - 2);
-
- for_each_member_device(ca, c, i)
- ca->max_last_bucket_io[clock->rw]++;
- clock->max_last_io++;
- clock->hand++;
-
- mutex_unlock(&clock->lock);
-
- capacity = READ_ONCE(c->capacity);
-
- if (!capacity)
- return;
-
- /*
- * we only increment when 0.1% of the filesystem capacity has been read
-	 * or written to; this determines if it's time
- *
- * XXX: we shouldn't really be going off of the capacity of devices in
- * RW mode (that will be 0 when we're RO, yet we can still service
- * reads)
- */
- timer->expire += capacity >> 10;
-
- bch2_io_timer_add(&c->io_clock[clock->rw], timer);
-}
-
-static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
-{
- struct bucket_clock *clock = &c->bucket_clock[rw];
-
- clock->hand = 1;
- clock->rw = rw;
- clock->rescale.fn = bch2_inc_clock_hand;
- clock->rescale.expire = c->capacity >> 10;
- mutex_init(&clock->lock);
-}
-
-/* Background allocator thread: */
-
-/*
- * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
- * (marking them as invalidated on disk), then optionally issues discard
- * commands to the newly free buckets, then puts them on the various freelists.
- */
-
-static void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
- size_t bucket)
-{
- if (expensive_debug_checks(c) &&
- test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) {
- size_t iter;
- long i;
- unsigned j;
-
- for (j = 0; j < RESERVE_NR; j++)
- fifo_for_each_entry(i, &ca->free[j], iter)
- BUG_ON(i == bucket);
- fifo_for_each_entry(i, &ca->free_inc, iter)
- BUG_ON(i == bucket);
- }
-}
-
-#define BUCKET_GC_GEN_MAX 96U
-
-/**
- * wait_buckets_available - wait on reclaimable buckets
- *
- * If there aren't enough available buckets to fill up free_inc, wait until
- * there are.
- */
-static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
-{
- unsigned long gc_count = c->gc_count;
- int ret = 0;
-
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop()) {
- ret = 1;
- break;
- }
-
- if (gc_count != c->gc_count)
- ca->inc_gen_really_needs_gc = 0;
-
- if ((ssize_t) (dev_buckets_available(c, ca) -
- ca->inc_gen_really_needs_gc) >=
- (ssize_t) fifo_free(&ca->free_inc))
- break;
-
- up_read(&c->gc_lock);
- schedule();
- try_to_freeze();
- down_read(&c->gc_lock);
- }
-
- __set_current_state(TASK_RUNNING);
- return ret;
-}
-
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
- size_t bucket,
- struct bucket_mark mark)
-{
- u8 gc_gen;
-
- if (!is_available_bucket(mark))
- return false;
-
- gc_gen = bucket_gc_gen(ca, bucket);
-
- if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
- ca->inc_gen_needs_gc++;
-
- if (gc_gen >= BUCKET_GC_GEN_MAX)
- ca->inc_gen_really_needs_gc++;
-
- return gc_gen < BUCKET_GC_GEN_MAX;
-}
-
-static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t bucket)
-{
- struct bucket_mark m;
-
- percpu_down_read_preempt_disable(&c->usage_lock);
- spin_lock(&c->freelist_lock);
-
- if (!bch2_invalidate_bucket(c, ca, bucket, &m)) {
- spin_unlock(&c->freelist_lock);
- percpu_up_read_preempt_enable(&c->usage_lock);
- return;
- }
-
- verify_not_on_freelist(c, ca, bucket);
- BUG_ON(!fifo_push(&ca->free_inc, bucket));
-
- spin_unlock(&c->freelist_lock);
- percpu_up_read_preempt_enable(&c->usage_lock);
-
- /* gc lock held: */
- bucket_io_clock_reset(c, ca, bucket, READ);
- bucket_io_clock_reset(c, ca, bucket, WRITE);
-
- if (m.cached_sectors) {
- ca->allocator_invalidating_data = true;
- } else if (m.journal_seq_valid) {
- u64 journal_seq = atomic64_read(&c->journal.seq);
- u64 bucket_seq = journal_seq;
-
- bucket_seq &= ~((u64) U16_MAX);
- bucket_seq |= m.journal_seq;
-
- if (bucket_seq > journal_seq)
- bucket_seq -= 1 << 16;
-
- ca->allocator_journal_seq_flush =
- max(ca->allocator_journal_seq_flush, bucket_seq);
- }
-}
-
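
The journal_seq_valid branch above recovers a full 64-bit journal sequence number from only the low 16 bits stored in the bucket mark: splice the stored bits into the current sequence and subtract 2^16 if that lands in the future, which recovers the right value as long as the stored sequence is no more than 65535 behind the current one. A standalone worked version of that reconstruction, with hypothetical names:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Recover a full sequence number from its low 16 bits, assuming the true
 * value is no more than 65535 behind 'current'.
 */
static uint64_t expand_seq16(uint64_t current, uint16_t low_bits)
{
	uint64_t seq = (current & ~(uint64_t) UINT16_MAX) | low_bits;

	if (seq > current)	/* stored value is from the previous "lap" */
		seq -= 1 << 16;
	return seq;
}

int main(void)
{
	/* current = 0x20005; a bucket stamped at 0x1fffe stored 0xfffe: */
	printf("0x%" PRIx64 "\n", expand_seq16(0x20005, 0xfffe));	/* 0x1fffe */
	/* and one stamped at 0x20003 stored 0x0003: */
	printf("0x%" PRIx64 "\n", expand_seq16(0x20005, 0x0003));	/* 0x20003 */
	return 0;
}
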
-/*
- * Determines what order we're going to reuse buckets, smallest bucket_key()
- * first.
- *
- *
- * - We take into account the read prio of the bucket, which gives us an
- * indication of how hot the data is -- we scale the prio so that the prio
- * farthest from the clock is worth 1/8th of the closest.
- *
- * - The number of sectors of cached data in the bucket, which gives us an
- * indication of the cost in cache misses this eviction will cause.
- *
- * - If hotness * sectors used compares equal, we pick the bucket with the
- * smallest bucket_gc_gen() - since incrementing the same bucket's generation
- * number repeatedly forces us to run mark and sweep gc to avoid generation
- * number wraparound.
- */
-
-static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
- size_t b, struct bucket_mark m)
-{
- unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
- unsigned max_last_io = ca->max_last_bucket_io[READ];
-
- /*
- * Time since last read, scaled to [0, 8) where larger value indicates
- * more recently read data:
- */
- unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;
-
- /* How much we want to keep the data in this bucket: */
- unsigned long data_wantness =
- (hotness + 1) * bucket_sectors_used(m);
-
- unsigned long needs_journal_commit =
- bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);
-
- return (data_wantness << 9) |
- (needs_journal_commit << 8) |
- bucket_gc_gen(ca, b);
-}
-
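
The comment before bucket_sort_key() describes a three-level preference; the function realises it by packing all three criteria into a single unsigned long, so one ordinary integer comparison orders buckets by data wantness first, by whether a journal commit is still needed second, and by gc generation age last (the shifts above leave 8 bits for gc_gen and 1 bit for the journal flag). A short worked illustration of that field-packing trick, with made-up inputs:

#include <stdio.h>

/*
 * Pack three criteria into one key so that comparing keys compares the
 * criteria lexicographically, most significant field first. Widths are
 * illustrative: gc_gen in bits 0-7, needs_journal_commit in bit 8, and
 * everything above that is data_wantness.
 */
static unsigned long pack_key(unsigned long data_wantness,
			      unsigned long needs_journal_commit,
			      unsigned long gc_gen)
{
	return (data_wantness << 9) |
	       (needs_journal_commit << 8) |
	       gc_gen;
}

int main(void)
{
	/* Lower key = reclaimed first: a cold empty bucket beats a hot full one. */
	unsigned long cold = pack_key(0, 0, 10);
	unsigned long hot  = pack_key(500, 0, 3);

	printf("cold %lu < hot %lu: %d\n", cold, hot, cold < hot);
	return 0;
}
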
-static inline int bucket_alloc_cmp(alloc_heap *h,
- struct alloc_heap_entry l,
- struct alloc_heap_entry r)
-{
- return (l.key > r.key) - (l.key < r.key) ?:
- (l.nr < r.nr) - (l.nr > r.nr) ?:
- (l.bucket > r.bucket) - (l.bucket < r.bucket);
-}
-
-static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bucket_array *buckets;
- struct alloc_heap_entry e = { 0 };
- size_t b;
-
- ca->alloc_heap.used = 0;
-
- mutex_lock(&c->bucket_clock[READ].lock);
- down_read(&ca->bucket_lock);
-
- buckets = bucket_array(ca);
-
- bch2_recalc_oldest_io(c, ca, READ);
-
- /*
- * Find buckets with lowest read priority, by building a maxheap sorted
- * by read priority and repeatedly replacing the maximum element until
- * all buckets have been visited.
- */
- for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
- struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
- unsigned long key = bucket_sort_key(c, ca, b, m);
-
- if (!bch2_can_invalidate_bucket(ca, b, m))
- continue;
-
- if (e.nr && e.bucket + e.nr == b && e.key == key) {
- e.nr++;
- } else {
- if (e.nr)
- heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
-
- e = (struct alloc_heap_entry) {
- .bucket = b,
- .nr = 1,
- .key = key,
- };
- }
-
- cond_resched();
- }
-
- if (e.nr)
- heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
-
- up_read(&ca->bucket_lock);
- mutex_unlock(&c->bucket_clock[READ].lock);
-
- heap_resort(&ca->alloc_heap, bucket_alloc_cmp);
-
- while (heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp)) {
- for (b = e.bucket;
- b < e.bucket + e.nr;
- b++) {
- if (fifo_full(&ca->free_inc))
- return;
-
- bch2_invalidate_one_bucket(c, ca, b);
- }
- }
-}
-
-static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bucket_array *buckets = bucket_array(ca);
- struct bucket_mark m;
- size_t b, checked;
-
- for (checked = 0;
- checked < ca->mi.nbuckets && !fifo_full(&ca->free_inc);
- checked++) {
- if (ca->fifo_last_bucket < ca->mi.first_bucket ||
- ca->fifo_last_bucket >= ca->mi.nbuckets)
- ca->fifo_last_bucket = ca->mi.first_bucket;
-
- b = ca->fifo_last_bucket++;
-
- m = READ_ONCE(buckets->b[b].mark);
-
- if (bch2_can_invalidate_bucket(ca, b, m))
- bch2_invalidate_one_bucket(c, ca, b);
-
- cond_resched();
- }
-}
-
-static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bucket_array *buckets = bucket_array(ca);
- struct bucket_mark m;
- size_t checked;
-
- for (checked = 0;
- checked < ca->mi.nbuckets / 2 && !fifo_full(&ca->free_inc);
- checked++) {
- size_t b = bch2_rand_range(ca->mi.nbuckets -
- ca->mi.first_bucket) +
- ca->mi.first_bucket;
-
- m = READ_ONCE(buckets->b[b].mark);
-
- if (bch2_can_invalidate_bucket(ca, b, m))
- bch2_invalidate_one_bucket(c, ca, b);
-
- cond_resched();
- }
-}
-
-static void find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
-{
- ca->inc_gen_needs_gc = 0;
- ca->inc_gen_really_needs_gc = 0;
-
- switch (ca->mi.replacement) {
- case CACHE_REPLACEMENT_LRU:
- find_reclaimable_buckets_lru(c, ca);
- break;
- case CACHE_REPLACEMENT_FIFO:
- find_reclaimable_buckets_fifo(c, ca);
- break;
- case CACHE_REPLACEMENT_RANDOM:
- find_reclaimable_buckets_random(c, ca);
- break;
- }
-}
-
-static int size_t_cmp(const void *_l, const void *_r)
-{
- const size_t *l = _l, *r = _r;
-
- return (*l > *r) - (*l < *r);
-}
-
-static void sort_free_inc(struct bch_fs *c, struct bch_dev *ca)
-{
- BUG_ON(ca->free_inc.front);
-
- spin_lock(&c->freelist_lock);
- sort(ca->free_inc.data,
- ca->free_inc.back,
- sizeof(ca->free_inc.data[0]),
- size_t_cmp, NULL);
- spin_unlock(&c->freelist_lock);
-}
-
-static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca,
- u64 *journal_seq, size_t nr,
- bool nowait)
-{
- struct btree_iter iter;
- int ret = 0;
-
- bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
- /* Only use nowait if we've already invalidated at least one bucket: */
- while (ca->nr_invalidated < min(nr, fifo_used(&ca->free_inc))) {
- size_t b = fifo_idx_entry(&ca->free_inc, ca->nr_invalidated);
-
- ret = __bch2_alloc_write_key(c, ca, b, &iter, journal_seq,
- nowait && ca->nr_invalidated);
- if (ret)
- break;
-
- ca->nr_invalidated++;
- }
-
- bch2_btree_iter_unlock(&iter);
-
- /* If we used NOWAIT, don't return the error: */
- return ca->nr_invalidated ? 0 : ret;
-}
-
-static bool __push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
-{
- unsigned i;
-
- /*
- * Don't remove from free_inc until after it's added to
- * freelist, so gc can find it:
- */
- spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++)
- if (fifo_push(&ca->free[i], bucket)) {
- fifo_pop(&ca->free_inc, bucket);
- --ca->nr_invalidated;
- closure_wake_up(&c->freelist_wait);
- spin_unlock(&c->freelist_lock);
- return true;
- }
- spin_unlock(&c->freelist_lock);
-
- return false;
-}
-
-static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
-{
- int ret = 0;
-
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
-
- if (__push_invalidated_bucket(c, ca, bucket))
- break;
-
- if ((current->flags & PF_KTHREAD) &&
- kthread_should_stop()) {
- ret = 1;
- break;
- }
-
- schedule();
- try_to_freeze();
- }
-
- __set_current_state(TASK_RUNNING);
- return ret;
-}
-
-/*
- * Given an invalidated, ready to use bucket: issue a discard to it if enabled,
- * then add it to the freelist, waiting until there's room if necessary:
- */
-static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
-{
- while (ca->nr_invalidated) {
- size_t bucket = fifo_peek(&ca->free_inc);
-
- BUG_ON(fifo_empty(&ca->free_inc) || !ca->nr_invalidated);
-
- if (ca->mi.discard &&
- blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
- blkdev_issue_discard(ca->disk_sb.bdev,
- bucket_to_sector(ca, bucket),
- ca->mi.bucket_size, GFP_NOIO, 0);
-
- if (push_invalidated_bucket(c, ca, bucket))
- return 1;
- }
-
- return 0;
-}
-
-/**
- * bch_allocator_thread - move buckets from free_inc to reserves
- *
- * The free_inc FIFO is populated by find_reclaimable_buckets(), and
- * the reserves are depleted by bucket allocation. When we run out
- * of free_inc, try to invalidate some buckets and write out
- * prios and gens.
- */
-static int bch2_allocator_thread(void *arg)
-{
- struct bch_dev *ca = arg;
- struct bch_fs *c = ca->fs;
- u64 journal_seq;
- int ret;
-
- set_freezable();
-
- while (1) {
- while (1) {
- cond_resched();
-
- pr_debug("discarding %zu invalidated buckets",
- ca->nr_invalidated);
-
- ret = discard_invalidated_buckets(c, ca);
- if (ret)
- goto stop;
-
- if (fifo_empty(&ca->free_inc))
- break;
-
- pr_debug("invalidating %zu buckets",
- fifo_used(&ca->free_inc));
-
- journal_seq = 0;
- ret = bch2_invalidate_free_inc(c, ca, &journal_seq,
- SIZE_MAX, true);
- if (ret) {
- bch_err(ca, "error invalidating buckets: %i", ret);
- goto stop;
- }
-
- if (!ca->nr_invalidated) {
- bch_err(ca, "allocator thread unable to make forward progress!");
- goto stop;
- }
-
- if (ca->allocator_invalidating_data)
- ret = bch2_journal_flush_seq(&c->journal, journal_seq);
- else if (ca->allocator_journal_seq_flush)
- ret = bch2_journal_flush_seq(&c->journal,
- ca->allocator_journal_seq_flush);
-
- /*
- * journal error - buckets haven't actually been
- * invalidated, can't discard them:
- */
- if (ret) {
- bch_err(ca, "journal error: %i", ret);
- goto stop;
- }
- }
-
- pr_debug("free_inc now empty");
-
- /* Reset front/back so we can easily sort fifo entries later: */
- ca->free_inc.front = ca->free_inc.back = 0;
- ca->allocator_journal_seq_flush = 0;
- ca->allocator_invalidating_data = false;
-
- down_read(&c->gc_lock);
- while (1) {
- size_t prev = fifo_used(&ca->free_inc);
-
- if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
- up_read(&c->gc_lock);
- bch_err(ca, "gc failure");
- goto stop;
- }
-
- /*
- * Find some buckets that we can invalidate, either
- * they're completely unused, or only contain clean data
- * that's been written back to the backing device or
- * another cache tier
- */
-
- pr_debug("scanning for reclaimable buckets");
-
- find_reclaimable_buckets(c, ca);
-
- pr_debug("found %zu buckets (free_inc %zu/%zu)",
- fifo_used(&ca->free_inc) - prev,
- fifo_used(&ca->free_inc), ca->free_inc.size);
-
- trace_alloc_batch(ca, fifo_used(&ca->free_inc),
- ca->free_inc.size);
-
- if ((ca->inc_gen_needs_gc >= ca->free_inc.size ||
- (!fifo_full(&ca->free_inc) &&
- ca->inc_gen_really_needs_gc >=
- fifo_free(&ca->free_inc))) &&
- c->gc_thread) {
- atomic_inc(&c->kick_gc);
- wake_up_process(c->gc_thread);
- }
-
- if (fifo_full(&ca->free_inc))
- break;
-
- if (!fifo_empty(&ca->free_inc) &&
- !fifo_full(&ca->free[RESERVE_MOVINGGC]))
- break;
-
- /*
- * copygc may be waiting until either its reserve fills
- * up, or we can't make forward progress:
- */
- ca->allocator_blocked = true;
- closure_wake_up(&c->freelist_wait);
-
- ret = wait_buckets_available(c, ca);
- if (ret) {
- up_read(&c->gc_lock);
- goto stop;
- }
- }
-
- ca->allocator_blocked = false;
- up_read(&c->gc_lock);
-
- pr_debug("free_inc now %zu/%zu",
- fifo_used(&ca->free_inc),
- ca->free_inc.size);
-
- sort_free_inc(c, ca);
-
- /*
- * free_inc is now full of newly-invalidated buckets: next,
- * write out the new bucket gens:
- */
- }
-
-stop:
- pr_debug("alloc thread stopping (ret %i)", ret);
- return 0;
-}
-
-/* Allocation */
-
-/*
- * Open buckets represent a bucket that's currently being allocated from. They
- * serve two purposes:
- *
- * - They track buckets that have been partially allocated, allowing for
- * sub-bucket sized allocations - they're used by the sector allocator below
- *
- * - They provide a reference to the buckets they own that mark and sweep GC
- * can find, until the new allocation has a pointer to it inserted into the
- * btree
- *
- * When allocating some space with the sector allocator, the allocation comes
- * with a reference to an open bucket - the caller is required to put that
- * reference _after_ doing the index update that makes its allocation reachable.
- */
-
-void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
-{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
-
- percpu_down_read_preempt_disable(&c->usage_lock);
- spin_lock(&ob->lock);
-
- bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr),
- false, gc_pos_alloc(c, ob), 0);
- ob->valid = false;
-
- spin_unlock(&ob->lock);
- percpu_up_read_preempt_enable(&c->usage_lock);
-
- spin_lock(&c->freelist_lock);
- ob->freelist = c->open_buckets_freelist;
- c->open_buckets_freelist = ob - c->open_buckets;
- c->open_buckets_nr_free++;
- spin_unlock(&c->freelist_lock);
-
- closure_wake_up(&c->open_buckets_wait);
-}
-
-static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
-{
- struct open_bucket *ob;
-
- BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
-
- ob = c->open_buckets + c->open_buckets_freelist;
- c->open_buckets_freelist = ob->freelist;
- atomic_set(&ob->pin, 1);
-
- c->open_buckets_nr_free--;
- return ob;
-}
-
-/* _only_ for allocating the journal on a new device: */
-long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
-{
- struct bucket_array *buckets;
- ssize_t b;
-
- rcu_read_lock();
- buckets = bucket_array(ca);
-
- for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++)
- if (is_available_bucket(buckets->b[b].mark))
- goto success;
- b = -1;
-success:
- rcu_read_unlock();
- return b;
-}
-
-static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
-{
- switch (reserve) {
- case RESERVE_ALLOC:
- return 0;
- case RESERVE_BTREE:
- return BTREE_NODE_RESERVE / 2;
- default:
- return BTREE_NODE_RESERVE;
- }
-}
-
-/**
- * bch2_bucket_alloc - allocate a single bucket from a specific device
- *
- * Returns index of the allocated open_bucket on success, or a negative error
- * (OPEN_BUCKETS_EMPTY, FREELIST_EMPTY) on failure
- */
-int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
- enum alloc_reserve reserve,
- bool may_alloc_partial,
- struct closure *cl)
-{
- struct bucket_array *buckets;
- struct open_bucket *ob;
- long bucket;
-
- spin_lock(&c->freelist_lock);
-
- if (may_alloc_partial &&
- ca->open_buckets_partial_nr) {
- int ret = ca->open_buckets_partial[--ca->open_buckets_partial_nr];
- c->open_buckets[ret].on_partial_list = false;
- spin_unlock(&c->freelist_lock);
- return ret;
- }
-
- if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) {
- if (cl)
- closure_wait(&c->open_buckets_wait, cl);
- spin_unlock(&c->freelist_lock);
- trace_open_bucket_alloc_fail(ca, reserve);
- return OPEN_BUCKETS_EMPTY;
- }
-
- if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket)))
- goto out;
-
- switch (reserve) {
- case RESERVE_ALLOC:
- if (fifo_pop(&ca->free[RESERVE_BTREE], bucket))
- goto out;
- break;
- case RESERVE_BTREE:
- if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >=
- ca->free[RESERVE_BTREE].size &&
- fifo_pop(&ca->free[RESERVE_BTREE], bucket))
- goto out;
- break;
- case RESERVE_MOVINGGC:
- if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket))
- goto out;
- break;
- default:
- break;
- }
-
- if (cl)
- closure_wait(&c->freelist_wait, cl);
-
- spin_unlock(&c->freelist_lock);
-
- trace_bucket_alloc_fail(ca, reserve);
- return FREELIST_EMPTY;
-out:
- verify_not_on_freelist(c, ca, bucket);
-
- ob = bch2_open_bucket_alloc(c);
-
- spin_lock(&ob->lock);
- buckets = bucket_array(ca);
-
- ob->valid = true;
- ob->sectors_free = ca->mi.bucket_size;
- ob->ptr = (struct bch_extent_ptr) {
- .gen = buckets->b[bucket].mark.gen,
- .offset = bucket_to_sector(ca, bucket),
- .dev = ca->dev_idx,
- };
-
- bucket_io_clock_reset(c, ca, bucket, READ);
- bucket_io_clock_reset(c, ca, bucket, WRITE);
- spin_unlock(&ob->lock);
-
- spin_unlock(&c->freelist_lock);
-
- bch2_wake_allocator(ca);
-
- trace_bucket_alloc(ca, reserve);
- return ob - c->open_buckets;
-}
-
-static int __dev_alloc_cmp(struct write_point *wp,
- unsigned l, unsigned r)
-{
- return ((wp->next_alloc[l] > wp->next_alloc[r]) -
- (wp->next_alloc[l] < wp->next_alloc[r]));
-}
-
-#define dev_alloc_cmp(l, r) __dev_alloc_cmp(wp, l, r)
-
-struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
- struct write_point *wp,
- struct bch_devs_mask *devs)
-{
- struct dev_alloc_list ret = { .nr = 0 };
- struct bch_dev *ca;
- unsigned i;
-
- for_each_member_device_rcu(ca, c, i, devs)
- ret.devs[ret.nr++] = i;
-
- bubble_sort(ret.devs, ret.nr, dev_alloc_cmp);
- return ret;
-}
-
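-/*
- * Note added here for explanation (not in the original source): this rescales
- * the write point's per-device counters so that, over time, devices with more
- * free space sort first in bch2_wp_alloc_list(). The chosen device's counter
- * is bumped by a term proportional to 1/free_space (scaled by 2^48,
- * saturating at U64_MAX), and every counter is then decayed by a quarter of
- * the chosen device's old value so the counters stay bounded.
- */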
-void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
- struct write_point *wp)
-{
- u64 *v = wp->next_alloc + ca->dev_idx;
- u64 free_space = dev_buckets_free(c, ca);
- u64 free_space_inv = free_space
- ? div64_u64(1ULL << 48, free_space)
- : 1ULL << 48;
- u64 scale = *v / 4;
-
- if (*v + free_space_inv >= *v)
- *v += free_space_inv;
- else
- *v = U64_MAX;
-
- for (v = wp->next_alloc;
- v < wp->next_alloc + ARRAY_SIZE(wp->next_alloc); v++)
- *v = *v < scale ? 0 : *v - scale;
-}
-
-static enum bucket_alloc_ret bch2_bucket_alloc_set(struct bch_fs *c,
- struct write_point *wp,
- unsigned nr_replicas,
- enum alloc_reserve reserve,
- struct bch_devs_mask *devs,
- struct closure *cl)
-{
- enum bucket_alloc_ret ret = NO_DEVICES;
- struct dev_alloc_list devs_sorted;
- struct bch_dev *ca;
- unsigned i, nr_ptrs_effective = 0;
- bool have_cache_dev = false;
-
- BUG_ON(nr_replicas > ARRAY_SIZE(wp->ptrs));
-
- for (i = wp->first_ptr; i < wp->nr_ptrs; i++) {
- ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev);
-
- nr_ptrs_effective += ca->mi.durability;
- have_cache_dev |= !ca->mi.durability;
- }
-
- if (nr_ptrs_effective >= nr_replicas)
- return ALLOC_SUCCESS;
-
- devs_sorted = bch2_wp_alloc_list(c, wp, devs);
-
- for (i = 0; i < devs_sorted.nr; i++) {
- int ob;
-
- ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
- if (!ca)
- continue;
-
- if (!ca->mi.durability &&
- (have_cache_dev ||
- wp->type != BCH_DATA_USER))
- continue;
-
- ob = bch2_bucket_alloc(c, ca, reserve,
- wp->type == BCH_DATA_USER, cl);
- if (ob < 0) {
- ret = ob;
- if (ret == OPEN_BUCKETS_EMPTY)
- break;
- continue;
- }
-
- BUG_ON(ob <= 0 || ob > U8_MAX);
- BUG_ON(wp->nr_ptrs >= ARRAY_SIZE(wp->ptrs));
-
- wp->ptrs[wp->nr_ptrs++] = c->open_buckets + ob;
-
- bch2_wp_rescale(c, ca, wp);
-
- nr_ptrs_effective += ca->mi.durability;
- have_cache_dev |= !ca->mi.durability;
-
- __clear_bit(ca->dev_idx, devs->d);
-
- if (nr_ptrs_effective >= nr_replicas) {
- ret = ALLOC_SUCCESS;
- break;
- }
- }
-
- EBUG_ON(reserve == RESERVE_MOVINGGC &&
- ret != ALLOC_SUCCESS &&
- ret != OPEN_BUCKETS_EMPTY);
-
- switch (ret) {
- case ALLOC_SUCCESS:
- return 0;
- case NO_DEVICES:
- return -EROFS;
- case FREELIST_EMPTY:
- case OPEN_BUCKETS_EMPTY:
- return cl ? -EAGAIN : -ENOSPC;
- default:
- BUG();
- }
-}
-
-/* Sector allocator */
-
-static void writepoint_drop_ptr(struct bch_fs *c,
- struct write_point *wp,
- unsigned i)
-{
- struct open_bucket *ob = wp->ptrs[i];
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
-
- BUG_ON(ca->open_buckets_partial_nr >=
- ARRAY_SIZE(ca->open_buckets_partial));
-
- if (wp->type == BCH_DATA_USER) {
- spin_lock(&c->freelist_lock);
- ob->on_partial_list = true;
- ca->open_buckets_partial[ca->open_buckets_partial_nr++] =
- ob - c->open_buckets;
- spin_unlock(&c->freelist_lock);
-
- closure_wake_up(&c->open_buckets_wait);
- closure_wake_up(&c->freelist_wait);
- } else {
- bch2_open_bucket_put(c, ob);
- }
-
- array_remove_item(wp->ptrs, wp->nr_ptrs, i);
-
- if (i < wp->first_ptr)
- wp->first_ptr--;
-}
-
-static void writepoint_drop_ptrs(struct bch_fs *c,
- struct write_point *wp,
- u16 target, bool in_target)
-{
- int i;
-
- for (i = wp->first_ptr - 1; i >= 0; --i)
- if (bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev,
- target) == in_target)
- writepoint_drop_ptr(c, wp, i);
-}
-
-static void verify_not_stale(struct bch_fs *c, const struct write_point *wp)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct open_bucket *ob;
- unsigned i;
-
- writepoint_for_each_ptr_all(wp, ob, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
-
- BUG_ON(ptr_stale(ca, &ob->ptr));
- }
-#endif
-}
-
-static int open_bucket_add_buckets(struct bch_fs *c,
- u16 target,
- struct write_point *wp,
- struct bch_devs_list *devs_have,
- unsigned nr_replicas,
- enum alloc_reserve reserve,
- struct closure *cl)
-{
- struct bch_devs_mask devs = c->rw_devs[wp->type];
- const struct bch_devs_mask *t;
- struct open_bucket *ob;
- unsigned i;
- int ret;
-
- percpu_down_read_preempt_disable(&c->usage_lock);
- rcu_read_lock();
-
- /* Don't allocate from devices we already have pointers to: */
- for (i = 0; i < devs_have->nr; i++)
- __clear_bit(devs_have->devs[i], devs.d);
-
- writepoint_for_each_ptr_all(wp, ob, i)
- __clear_bit(ob->ptr.dev, devs.d);
-
- t = bch2_target_to_mask(c, target);
- if (t)
- bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
-
- ret = bch2_bucket_alloc_set(c, wp, nr_replicas, reserve, &devs, cl);
-
- rcu_read_unlock();
- percpu_up_read_preempt_enable(&c->usage_lock);
-
- return ret;
-}
-
-static struct write_point *__writepoint_find(struct hlist_head *head,
- unsigned long write_point)
-{
- struct write_point *wp;
-
- hlist_for_each_entry_rcu(wp, head, node)
- if (wp->write_point == write_point)
- return wp;
-
- return NULL;
-}
-
-static struct hlist_head *writepoint_hash(struct bch_fs *c,
- unsigned long write_point)
-{
- unsigned hash =
- hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
-
- return &c->write_points_hash[hash];
-}
-
-static struct write_point *writepoint_find(struct bch_fs *c,
- unsigned long write_point)
-{
- struct write_point *wp, *oldest;
- struct hlist_head *head;
-
- if (!(write_point & 1UL)) {
- wp = (struct write_point *) write_point;
- mutex_lock(&wp->lock);
- return wp;
- }
-
- head = writepoint_hash(c, write_point);
-restart_find:
- wp = __writepoint_find(head, write_point);
- if (wp) {
-lock_wp:
- mutex_lock(&wp->lock);
- if (wp->write_point == write_point)
- goto out;
- mutex_unlock(&wp->lock);
- goto restart_find;
- }
-
- oldest = NULL;
- for (wp = c->write_points;
- wp < c->write_points + ARRAY_SIZE(c->write_points);
- wp++)
- if (!oldest || time_before64(wp->last_used, oldest->last_used))
- oldest = wp;
-
- mutex_lock(&oldest->lock);
- mutex_lock(&c->write_points_hash_lock);
- wp = __writepoint_find(head, write_point);
- if (wp && wp != oldest) {
- mutex_unlock(&c->write_points_hash_lock);
- mutex_unlock(&oldest->lock);
- goto lock_wp;
- }
-
- wp = oldest;
- hlist_del_rcu(&wp->node);
- wp->write_point = write_point;
- hlist_add_head_rcu(&wp->node, head);
- mutex_unlock(&c->write_points_hash_lock);
-out:
- wp->last_used = sched_clock();
- return wp;
-}
-
-/*
- * Get us a write point we can allocate from, return with it locked:
- */
-struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
- unsigned target,
- struct write_point_specifier write_point,
- struct bch_devs_list *devs_have,
- unsigned nr_replicas,
- unsigned nr_replicas_required,
- enum alloc_reserve reserve,
- unsigned flags,
- struct closure *cl)
-{
- struct write_point *wp;
- struct open_bucket *ob;
- struct bch_dev *ca;
- unsigned nr_ptrs_have, nr_ptrs_effective;
- int ret, i, cache_idx = -1;
-
- BUG_ON(!nr_replicas || !nr_replicas_required);
-
- wp = writepoint_find(c, write_point.v);
-
- wp->first_ptr = 0;
-
- /* does writepoint have ptrs we can't use? */
- writepoint_for_each_ptr(wp, ob, i)
- if (bch2_dev_list_has_dev(*devs_have, ob->ptr.dev)) {
- swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
- wp->first_ptr++;
- }
-
- nr_ptrs_have = wp->first_ptr;
-
- /* does writepoint have ptrs we don't want to use? */
- if (target)
- writepoint_for_each_ptr(wp, ob, i)
- if (!bch2_dev_in_target(c, ob->ptr.dev, target)) {
- swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
- wp->first_ptr++;
- }
-
- if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) {
- ret = open_bucket_add_buckets(c, target, wp, devs_have,
- nr_replicas, reserve, cl);
- } else {
- ret = open_bucket_add_buckets(c, target, wp, devs_have,
- nr_replicas, reserve, NULL);
- if (!ret)
- goto alloc_done;
-
- wp->first_ptr = nr_ptrs_have;
-
- ret = open_bucket_add_buckets(c, 0, wp, devs_have,
- nr_replicas, reserve, cl);
- }
-
- if (ret && ret != -EROFS)
- goto err;
-alloc_done:
- /* check for more than one cache: */
- for (i = wp->nr_ptrs - 1; i >= wp->first_ptr; --i) {
- ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev);
-
- if (ca->mi.durability)
- continue;
-
- /*
- * if we ended up with more than one cache device, prefer the
- * one in the target we want:
- */
- if (cache_idx >= 0) {
- if (!bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev,
- target)) {
- writepoint_drop_ptr(c, wp, i);
- } else {
- writepoint_drop_ptr(c, wp, cache_idx);
- cache_idx = i;
- }
- } else {
- cache_idx = i;
- }
- }
-
- /* we might have more effective replicas than required: */
- nr_ptrs_effective = 0;
- writepoint_for_each_ptr(wp, ob, i) {
- ca = bch_dev_bkey_exists(c, ob->ptr.dev);
- nr_ptrs_effective += ca->mi.durability;
- }
-
- if (ret == -EROFS &&
- nr_ptrs_effective >= nr_replicas_required)
- ret = 0;
-
- if (ret)
- goto err;
-
- if (nr_ptrs_effective > nr_replicas) {
- writepoint_for_each_ptr(wp, ob, i) {
- ca = bch_dev_bkey_exists(c, ob->ptr.dev);
-
- if (ca->mi.durability &&
- ca->mi.durability <= nr_ptrs_effective - nr_replicas &&
- !bch2_dev_in_target(c, ob->ptr.dev, target)) {
- swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
- wp->first_ptr++;
- nr_ptrs_effective -= ca->mi.durability;
- }
- }
- }
-
- if (nr_ptrs_effective > nr_replicas) {
- writepoint_for_each_ptr(wp, ob, i) {
- ca = bch_dev_bkey_exists(c, ob->ptr.dev);
-
- if (ca->mi.durability &&
- ca->mi.durability <= nr_ptrs_effective - nr_replicas) {
- swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
- wp->first_ptr++;
- nr_ptrs_effective -= ca->mi.durability;
- }
- }
- }
-
- /* Remove pointers we don't want to use: */
- if (target)
- writepoint_drop_ptrs(c, wp, target, false);
-
- BUG_ON(wp->first_ptr >= wp->nr_ptrs);
- BUG_ON(nr_ptrs_effective < nr_replicas_required);
-
- wp->sectors_free = UINT_MAX;
-
- writepoint_for_each_ptr(wp, ob, i)
- wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
-
- BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
-
- verify_not_stale(c, wp);
-
- return wp;
-err:
- mutex_unlock(&wp->lock);
- return ERR_PTR(ret);
-}
-
-/*
- * Append pointers to the space we just allocated to @e, and mark @sectors space
- * as allocated out of the open buckets in @wp
- */
-void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
- struct bkey_i_extent *e, unsigned sectors)
-{
- struct open_bucket *ob;
- unsigned i;
-
- BUG_ON(sectors > wp->sectors_free);
- wp->sectors_free -= sectors;
-
- writepoint_for_each_ptr(wp, ob, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
- struct bch_extent_ptr tmp = ob->ptr;
-
- EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptr.dev));
-
- tmp.cached = bkey_extent_is_cached(&e->k) ||
- (!ca->mi.durability && wp->type == BCH_DATA_USER);
-
- tmp.offset += ca->mi.bucket_size - ob->sectors_free;
- extent_ptr_append(e, tmp);
-
- BUG_ON(sectors > ob->sectors_free);
- ob->sectors_free -= sectors;
- }
-}
-
-/*
- * Finished allocating from @wp: put any open buckets that have been written
- * full, and unlock the write point
- */
-void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
-{
- int i;
-
- for (i = wp->nr_ptrs - 1; i >= 0; --i) {
- struct open_bucket *ob = wp->ptrs[i];
-
- if (!ob->sectors_free) {
- array_remove_item(wp->ptrs, wp->nr_ptrs, i);
- bch2_open_bucket_put(c, ob);
- }
- }
-
- mutex_unlock(&wp->lock);
-}
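-
-/*
- * Illustrative sketch, not part of the original source: the expected calling
- * sequence for the sector allocator, with hypothetical caller-side variables
- * and minimal error handling:
- *
- *	wp = bch2_alloc_sectors_start(c, target, writepoint_hashed(inum),
- *				      &devs_have, nr_replicas,
- *				      nr_replicas_required,
- *				      RESERVE_NONE, flags, &cl);
- *	if (IS_ERR(wp))
- *		return PTR_ERR(wp);
- *
- *	sectors = min(sectors, wp->sectors_free);
- *	bch2_alloc_sectors_append_ptrs(c, wp, e, sectors);
- *	bch2_alloc_sectors_done(c, wp);
- */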
-
-/* Startup/shutdown (ro/rw): */
-
-void bch2_recalc_capacity(struct bch_fs *c)
-{
- struct bch_dev *ca;
- u64 total_capacity, capacity = 0, reserved_sectors = 0;
- unsigned long ra_pages = 0;
- unsigned i, j;
-
- lockdep_assert_held(&c->state_lock);
-
- for_each_online_member(ca, c, i) {
- struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi;
-
- ra_pages += bdi->ra_pages;
- }
-
- bch2_set_ra_pages(c, ra_pages);
-
- for_each_rw_member(ca, c, i) {
- size_t reserve = 0;
-
- /*
- * We need to reserve buckets (from the number
- * of currently available buckets) against
- * foreground writes so that mainly copygc can
- * make forward progress.
- *
- * We need enough to refill the various reserves
- * from scratch - copygc will use its entire
-	 * reserve all at once, then run again when
- * its reserve is refilled (from the formerly
- * available buckets).
- *
- * This reserve is just used when considering if
- * allocations for foreground writes must wait -
- * not -ENOSPC calculations.
- */
- for (j = 0; j < RESERVE_NONE; j++)
- reserve += ca->free[j].size;
-
- reserve += ca->free_inc.size;
-
- reserve += ARRAY_SIZE(c->write_points);
-
- reserve += 1; /* btree write point */
-
- reserved_sectors += bucket_to_sector(ca, reserve);
-
- capacity += bucket_to_sector(ca, ca->mi.nbuckets -
- ca->mi.first_bucket);
- }
-
- total_capacity = capacity;
-
- capacity *= (100 - c->opts.gc_reserve_percent);
- capacity = div64_u64(capacity, 100);
-
- BUG_ON(reserved_sectors > total_capacity);
-
- capacity = min(capacity, total_capacity - reserved_sectors);
-
- c->capacity = capacity;
-
- if (c->capacity) {
- bch2_io_timer_add(&c->io_clock[READ],
- &c->bucket_clock[READ].rescale);
- bch2_io_timer_add(&c->io_clock[WRITE],
- &c->bucket_clock[WRITE].rescale);
- } else {
- bch2_io_timer_del(&c->io_clock[READ],
- &c->bucket_clock[READ].rescale);
- bch2_io_timer_del(&c->io_clock[WRITE],
- &c->bucket_clock[WRITE].rescale);
- }
-
-	/* Wake up in case someone was waiting for buckets */
- closure_wake_up(&c->freelist_wait);
-}
-
-static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca,
- struct write_point *wp)
-{
- struct bch_devs_mask not_self;
-
- bitmap_complement(not_self.d, ca->self.d, BCH_SB_MEMBERS_MAX);
-
- mutex_lock(&wp->lock);
- wp->first_ptr = wp->nr_ptrs;
- writepoint_drop_ptrs(c, wp, dev_to_target(ca->dev_idx), true);
- mutex_unlock(&wp->lock);
-}
-
-static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
-{
- struct open_bucket *ob;
- bool ret = false;
-
- for (ob = c->open_buckets;
- ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
- ob++) {
- spin_lock(&ob->lock);
- if (ob->valid && !ob->on_partial_list &&
- ob->ptr.dev == ca->dev_idx)
- ret = true;
- spin_unlock(&ob->lock);
- }
-
- return ret;
-}
-
-/* device goes ro: */
-void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
-{
- unsigned i;
-
- BUG_ON(ca->alloc_thread);
-
- /* First, remove device from allocation groups: */
-
- for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
- clear_bit(ca->dev_idx, c->rw_devs[i].d);
-
- /*
-	 * Capacity is calculated based on the devices in allocation groups:
- */
- bch2_recalc_capacity(c);
-
- /* Next, close write points that point to this device... */
- for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
- bch2_stop_write_point(c, ca, &c->write_points[i]);
-
- bch2_stop_write_point(c, ca, &ca->copygc_write_point);
- bch2_stop_write_point(c, ca, &c->rebalance_write_point);
- bch2_stop_write_point(c, ca, &c->btree_write_point);
-
- mutex_lock(&c->btree_reserve_cache_lock);
- while (c->btree_reserve_cache_nr) {
- struct btree_alloc *a =
- &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
-
- bch2_open_bucket_put_refs(c, &a->ob.nr, a->ob.refs);
- }
- mutex_unlock(&c->btree_reserve_cache_lock);
-
- /*
- * Wake up threads that were blocked on allocation, so they can notice
- * the device can no longer be removed and the capacity has changed:
- */
- closure_wake_up(&c->freelist_wait);
-
- /*
- * journal_res_get() can block waiting for free space in the journal -
- * it needs to notice there may not be devices to allocate from anymore:
- */
- wake_up(&c->journal.wait);
-
- /* Now wait for any in flight writes: */
-
- closure_wait_event(&c->open_buckets_wait,
- !bch2_dev_has_open_write_point(c, ca));
-}
-
-/* device goes rw: */
-void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
-{
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
- if (ca->mi.data_allowed & (1 << i))
- set_bit(ca->dev_idx, c->rw_devs[i].d);
-}
-
-/* stop allocator thread: */
-void bch2_dev_allocator_stop(struct bch_dev *ca)
-{
- struct task_struct *p;
-
- p = rcu_dereference_protected(ca->alloc_thread, 1);
- ca->alloc_thread = NULL;
-
- /*
- * We need an rcu barrier between setting ca->alloc_thread = NULL and
- * the thread shutting down to avoid bch2_wake_allocator() racing:
- *
- * XXX: it would be better to have the rcu barrier be asynchronous
- * instead of blocking us here
- */
- synchronize_rcu();
-
- if (p) {
- kthread_stop(p);
- put_task_struct(p);
- }
-}
-
-/* start allocator thread: */
-int bch2_dev_allocator_start(struct bch_dev *ca)
-{
- struct task_struct *p;
-
- /*
- * allocator thread already started?
- */
- if (ca->alloc_thread)
- return 0;
-
- p = kthread_create(bch2_allocator_thread, ca,
- "bch_alloc[%s]", ca->name);
- if (IS_ERR(p))
- return PTR_ERR(p);
-
- get_task_struct(p);
- rcu_assign_pointer(ca->alloc_thread, p);
- wake_up_process(p);
- return 0;
-}
-
-static void allocator_start_issue_discards(struct bch_fs *c)
-{
- struct bch_dev *ca;
- unsigned dev_iter;
- size_t i, bu;
-
- for_each_rw_member(ca, c, dev_iter) {
- unsigned done = 0;
-
- fifo_for_each_entry(bu, &ca->free_inc, i) {
- if (done == ca->nr_invalidated)
- break;
-
- blkdev_issue_discard(ca->disk_sb.bdev,
- bucket_to_sector(ca, bu),
- ca->mi.bucket_size, GFP_NOIO, 0);
- done++;
- }
- }
-}
-
-static int __bch2_fs_allocator_start(struct bch_fs *c)
-{
- struct bch_dev *ca;
- size_t bu, i;
- unsigned dev_iter;
- u64 journal_seq = 0;
- bool invalidating_data = false;
- int ret = 0;
-
- if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
- return -1;
-
- /* Scan for buckets that are already invalidated: */
- for_each_rw_member(ca, c, dev_iter) {
- struct btree_iter iter;
- struct bucket_mark m;
- struct bkey_s_c k;
-
- for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), 0, k) {
- if (k.k->type != BCH_ALLOC)
- continue;
-
- bu = k.k->p.offset;
- m = READ_ONCE(bucket(ca, bu)->mark);
-
- if (!is_available_bucket(m) || m.cached_sectors)
- continue;
-
- percpu_down_read_preempt_disable(&c->usage_lock);
- bch2_mark_alloc_bucket(c, ca, bu, true,
- gc_pos_alloc(c, NULL),
- BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
- BCH_BUCKET_MARK_GC_LOCK_HELD);
- percpu_up_read_preempt_enable(&c->usage_lock);
-
- fifo_push(&ca->free_inc, bu);
- ca->nr_invalidated++;
-
- if (fifo_full(&ca->free_inc))
- break;
- }
- bch2_btree_iter_unlock(&iter);
- }
-
- /* did we find enough buckets? */
- for_each_rw_member(ca, c, dev_iter)
- if (fifo_used(&ca->free_inc) < ca->free[RESERVE_BTREE].size) {
- percpu_ref_put(&ca->io_ref);
- goto not_enough;
- }
-
- return 0;
-not_enough:
- pr_debug("did not find enough empty buckets; issuing discards");
-
- /* clear out free_inc - find_reclaimable_buckets() assumes it's empty */
- for_each_rw_member(ca, c, dev_iter)
- discard_invalidated_buckets(c, ca);
-
- pr_debug("scanning for reclaimable buckets");
-
- for_each_rw_member(ca, c, dev_iter) {
- BUG_ON(!fifo_empty(&ca->free_inc));
- ca->free_inc.front = ca->free_inc.back = 0;
-
- find_reclaimable_buckets(c, ca);
- sort_free_inc(c, ca);
-
- invalidating_data |= ca->allocator_invalidating_data;
-
- fifo_for_each_entry(bu, &ca->free_inc, i)
- if (!fifo_push(&ca->free[RESERVE_BTREE], bu))
- break;
- }
-
- pr_debug("done scanning for reclaimable buckets");
-
- /*
- * We're moving buckets to freelists _before_ they've been marked as
- * invalidated on disk - we have to so that we can allocate new btree
- * nodes to mark them as invalidated on disk.
- *
- * However, we can't _write_ to any of these buckets yet - they might
- * have cached data in them, which is live until they're marked as
- * invalidated on disk:
- */
- if (invalidating_data) {
- pr_debug("invalidating existing data");
- set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
- } else {
- pr_debug("issuing discards");
- allocator_start_issue_discards(c);
- }
-
- /*
- * XXX: it's possible for this to deadlock waiting on journal reclaim,
- * since we're holding btree writes. What then?
- */
-
- for_each_rw_member(ca, c, dev_iter) {
- ret = bch2_invalidate_free_inc(c, ca, &journal_seq,
- ca->free[RESERVE_BTREE].size,
- false);
- if (ret) {
- percpu_ref_put(&ca->io_ref);
- return ret;
- }
- }
-
- if (invalidating_data) {
- pr_debug("flushing journal");
-
- ret = bch2_journal_flush_seq(&c->journal, journal_seq);
- if (ret)
- return ret;
-
- pr_debug("issuing discards");
- allocator_start_issue_discards(c);
- }
-
- for_each_rw_member(ca, c, dev_iter)
- while (ca->nr_invalidated) {
- BUG_ON(!fifo_pop(&ca->free_inc, bu));
- ca->nr_invalidated--;
- }
-
- set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags);
-
- /* now flush dirty btree nodes: */
- if (invalidating_data) {
- struct bucket_table *tbl;
- struct rhash_head *pos;
- struct btree *b;
- bool flush_updates;
- size_t nr_pending_updates;
-
- clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
-again:
- pr_debug("flushing dirty btree nodes");
- cond_resched();
-
- flush_updates = false;
- nr_pending_updates = bch2_btree_interior_updates_nr_pending(c);
-
- rcu_read_lock();
- for_each_cached_btree(b, c, tbl, i, pos)
- if (btree_node_dirty(b) && (!b->written || b->level)) {
- if (btree_node_may_write(b)) {
- rcu_read_unlock();
- btree_node_lock_type(c, b, SIX_LOCK_read);
- bch2_btree_node_write(c, b, SIX_LOCK_read);
- six_unlock_read(&b->lock);
- goto again;
- } else {
- flush_updates = true;
- }
- }
- rcu_read_unlock();
-
- /*
- * This is ugly, but it's needed to flush btree node writes
- * without spinning...
- */
- if (flush_updates) {
- closure_wait_event(&c->btree_interior_update_wait,
- bch2_btree_interior_updates_nr_pending(c) <
- nr_pending_updates);
- goto again;
- }
- }
-
- return 0;
-}
-
-int bch2_fs_allocator_start(struct bch_fs *c)
-{
- struct bch_dev *ca;
- unsigned i;
- int ret;
-
- down_read(&c->gc_lock);
- ret = __bch2_fs_allocator_start(c);
- up_read(&c->gc_lock);
-
- if (ret)
- return ret;
-
- for_each_rw_member(ca, c, i) {
- ret = bch2_dev_allocator_start(ca);
- if (ret) {
- percpu_ref_put(&ca->io_ref);
- return ret;
- }
- }
-
- return bch2_alloc_write(c);
-}
-
-void bch2_fs_allocator_init(struct bch_fs *c)
-{
- struct open_bucket *ob;
- struct write_point *wp;
-
- mutex_init(&c->write_points_hash_lock);
- spin_lock_init(&c->freelist_lock);
- bch2_bucket_clock_init(c, READ);
- bch2_bucket_clock_init(c, WRITE);
-
-	/* open bucket 0 is a sentinel NULL: */
- spin_lock_init(&c->open_buckets[0].lock);
-
- for (ob = c->open_buckets + 1;
- ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
- spin_lock_init(&ob->lock);
- c->open_buckets_nr_free++;
-
- ob->freelist = c->open_buckets_freelist;
- c->open_buckets_freelist = ob - c->open_buckets;
- }
-
- writepoint_init(&c->btree_write_point, BCH_DATA_BTREE);
- writepoint_init(&c->rebalance_write_point, BCH_DATA_USER);
-
- for (wp = c->write_points;
- wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) {
- writepoint_init(wp, BCH_DATA_USER);
-
- wp->last_used = sched_clock();
- wp->write_point = (unsigned long) wp;
- hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
- }
-
- c->pd_controllers_update_seconds = 5;
- INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
-}
diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h
deleted file mode 100644
index 00d01f46..00000000
--- a/libbcachefs/alloc.h
+++ /dev/null
@@ -1,141 +0,0 @@
-#ifndef _BCACHEFS_ALLOC_H
-#define _BCACHEFS_ALLOC_H
-
-#include "bcachefs.h"
-#include "alloc_types.h"
-
-struct bkey;
-struct bch_dev;
-struct bch_fs;
-struct bch_devs_list;
-
-const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
-
-#define bch2_bkey_alloc_ops (struct bkey_ops) { \
- .key_invalid = bch2_alloc_invalid, \
- .val_to_text = bch2_alloc_to_text, \
-}
-
-struct dev_alloc_list {
- unsigned nr;
- u8 devs[BCH_SB_MEMBERS_MAX];
-};
-
-struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *,
- struct write_point *,
- struct bch_devs_mask *);
-void bch2_wp_rescale(struct bch_fs *, struct bch_dev *,
- struct write_point *);
-
-int bch2_alloc_read(struct bch_fs *, struct list_head *);
-int bch2_alloc_replay_key(struct bch_fs *, struct bpos);
-
-enum bucket_alloc_ret {
- ALLOC_SUCCESS = 0,
- OPEN_BUCKETS_EMPTY = -1,
- FREELIST_EMPTY = -2, /* Allocator thread not keeping up */
- NO_DEVICES = -3, /* -EROFS */
-};
-
-long bch2_bucket_alloc_new_fs(struct bch_dev *);
-
-int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool,
- struct closure *);
-
-#define __writepoint_for_each_ptr(_wp, _ob, _i, _start) \
- for ((_i) = (_start); \
- (_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true); \
- (_i)++)
-
-#define writepoint_for_each_ptr_all(_wp, _ob, _i) \
- __writepoint_for_each_ptr(_wp, _ob, _i, 0)
-
-#define writepoint_for_each_ptr(_wp, _ob, _i) \
- __writepoint_for_each_ptr(_wp, _ob, _i, wp->first_ptr)
-
-void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
-
-static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
-{
- if (atomic_dec_and_test(&ob->pin))
- __bch2_open_bucket_put(c, ob);
-}
-
-static inline void bch2_open_bucket_put_refs(struct bch_fs *c, u8 *nr, u8 *refs)
-{
- unsigned i;
-
- for (i = 0; i < *nr; i++)
- bch2_open_bucket_put(c, c->open_buckets + refs[i]);
-
- *nr = 0;
-}
-
-static inline void bch2_open_bucket_get(struct bch_fs *c,
- struct write_point *wp,
- u8 *nr, u8 *refs)
-{
- struct open_bucket *ob;
- unsigned i;
-
- writepoint_for_each_ptr(wp, ob, i) {
- atomic_inc(&ob->pin);
- refs[(*nr)++] = ob - c->open_buckets;
- }
-}
-
-struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
- unsigned,
- struct write_point_specifier,
- struct bch_devs_list *,
- unsigned, unsigned,
- enum alloc_reserve,
- unsigned,
- struct closure *);
-
-void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
- struct bkey_i_extent *, unsigned);
-void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
-
-static inline void bch2_wake_allocator(struct bch_dev *ca)
-{
- struct task_struct *p;
-
- rcu_read_lock();
- p = rcu_dereference(ca->alloc_thread);
- if (p)
- wake_up_process(p);
- rcu_read_unlock();
-}
-
-static inline struct write_point_specifier writepoint_hashed(unsigned long v)
-{
- return (struct write_point_specifier) { .v = v | 1 };
-}
-
-static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
-{
- return (struct write_point_specifier) { .v = (unsigned long) wp };
-}
-
-void bch2_recalc_capacity(struct bch_fs *);
-
-void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
-void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
-
-void bch2_dev_allocator_stop(struct bch_dev *);
-int bch2_dev_allocator_start(struct bch_dev *);
-
-static inline void writepoint_init(struct write_point *wp,
- enum bch_data_type type)
-{
- mutex_init(&wp->lock);
- wp->type = type;
-}
-
-int bch2_alloc_write(struct bch_fs *);
-int bch2_fs_allocator_start(struct bch_fs *);
-void bch2_fs_allocator_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_ALLOC_H */
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
new file mode 100644
index 00000000..b2d57045
--- /dev/null
+++ b/libbcachefs/alloc_background.c
@@ -0,0 +1,2567 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "backpointers.h"
+#include "bkey_buf.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_gc.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
+#include "clock.h"
+#include "debug.h"
+#include "disk_accounting.h"
+#include "ec.h"
+#include "error.h"
+#include "lru.h"
+#include "recovery.h"
+#include "trace.h"
+#include "varint.h"
+
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/sched/task.h>
+#include <linux/sort.h>
+#include <linux/jiffies.h>
+
+static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);
+
+/* Persistent alloc info: */
+
+static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
+#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
+ BCH_ALLOC_FIELDS_V1()
+#undef x
+};
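+
+/*
+ * Example added for illustration (not in the original source): each x() entry
+ * in BCH_ALLOC_FIELDS_V1() becomes one table slot giving that field's on-disk
+ * width in bytes - e.g. assuming the field list declares x(read_time, 16),
+ * the expansion is
+ *
+ *	[BCH_ALLOC_FIELD_V1_read_time] = 16 / 8,
+ *
+ * i.e. a 2 byte field, which is what alloc_field_v1_get() below uses to step
+ * through the packed value.
+ */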
+
+struct bkey_alloc_unpacked {
+ u64 journal_seq;
+ u8 gen;
+ u8 oldest_gen;
+ u8 data_type;
+ bool need_discard:1;
+ bool need_inc_gen:1;
+#define x(_name, _bits) u##_bits _name;
+ BCH_ALLOC_FIELDS_V2()
+#undef x
+};
+
+static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
+ const void **p, unsigned field)
+{
+ unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
+ u64 v;
+
+ if (!(a->fields & (1 << field)))
+ return 0;
+
+ switch (bytes) {
+ case 1:
+ v = *((const u8 *) *p);
+ break;
+ case 2:
+ v = le16_to_cpup(*p);
+ break;
+ case 4:
+ v = le32_to_cpup(*p);
+ break;
+ case 8:
+ v = le64_to_cpup(*p);
+ break;
+ default:
+ BUG();
+ }
+
+ *p += bytes;
+ return v;
+}
+
+static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
+ struct bkey_s_c k)
+{
+ const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
+ const void *d = in->data;
+ unsigned idx = 0;
+
+ out->gen = in->gen;
+
+#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
+ BCH_ALLOC_FIELDS_V1()
+#undef x
+}
+
+static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
+ const u8 *in = a.v->data;
+ const u8 *end = bkey_val_end(a);
+ unsigned fieldnr = 0;
+ int ret;
+ u64 v;
+
+ out->gen = a.v->gen;
+ out->oldest_gen = a.v->oldest_gen;
+ out->data_type = a.v->data_type;
+
+#define x(_name, _bits) \
+ if (fieldnr < a.v->nr_fields) { \
+ ret = bch2_varint_decode_fast(in, end, &v); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ } else { \
+ v = 0; \
+ } \
+ out->_name = v; \
+ if (v != out->_name) \
+ return -1; \
+ fieldnr++;
+
+ BCH_ALLOC_FIELDS_V2()
+#undef x
+ return 0;
+}
+
+static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
+ const u8 *in = a.v->data;
+ const u8 *end = bkey_val_end(a);
+ unsigned fieldnr = 0;
+ int ret;
+ u64 v;
+
+ out->gen = a.v->gen;
+ out->oldest_gen = a.v->oldest_gen;
+ out->data_type = a.v->data_type;
+ out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
+ out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
+ out->journal_seq = le64_to_cpu(a.v->journal_seq);
+
+#define x(_name, _bits) \
+ if (fieldnr < a.v->nr_fields) { \
+ ret = bch2_varint_decode_fast(in, end, &v); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ } else { \
+ v = 0; \
+ } \
+ out->_name = v; \
+ if (v != out->_name) \
+ return -1; \
+ fieldnr++;
+
+ BCH_ALLOC_FIELDS_V2()
+#undef x
+ return 0;
+}
+
+static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
+{
+ struct bkey_alloc_unpacked ret = { .gen = 0 };
+
+ switch (k.k->type) {
+ case KEY_TYPE_alloc:
+ bch2_alloc_unpack_v1(&ret, k);
+ break;
+ case KEY_TYPE_alloc_v2:
+ bch2_alloc_unpack_v2(&ret, k);
+ break;
+ case KEY_TYPE_alloc_v3:
+ bch2_alloc_unpack_v3(&ret, k);
+ break;
+ }
+
+ return ret;
+}
+
+static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
+{
+ unsigned i, bytes = offsetof(struct bch_alloc, data);
+
+ for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
+ if (a->fields & (1 << i))
+ bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
+
+ return DIV_ROUND_UP(bytes, sizeof(u64));
+}
+
+int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
+ int ret = 0;
+
+ /* allow for unknown fields */
+ bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v),
+ c, alloc_v1_val_size_bad,
+ "incorrect value size (%zu < %u)",
+ bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
+fsck_err:
+ return ret;
+}
+
+int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ struct bkey_alloc_unpacked u;
+ int ret = 0;
+
+ bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k),
+ c, alloc_v2_unpack_error,
+ "unpack error");
+fsck_err:
+ return ret;
+}
+
+int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ struct bkey_alloc_unpacked u;
+ int ret = 0;
+
+ bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k),
+ c, alloc_v2_unpack_error,
+ "unpack error");
+fsck_err:
+ return ret;
+}
+
+int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ struct bch_alloc_v4 a;
+ int ret = 0;
+
+ bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k));
+
+ bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k),
+ c, alloc_v4_val_size_bad,
+ "bad val size (%u > %zu)",
+ alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k));
+
+ bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) &&
+ BCH_ALLOC_V4_NR_BACKPOINTERS(&a),
+ c, alloc_v4_backpointers_start_bad,
+ "invalid backpointers_start");
+
+ bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type,
+ c, alloc_key_data_type_bad,
+ "invalid data type (got %u should be %u)",
+ a.data_type, alloc_data_type(a, a.data_type));
+
+ for (unsigned i = 0; i < 2; i++)
+ bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX,
+ c, alloc_key_io_time_bad,
+ "invalid io_time[%s]: %llu, max %llu",
+ i == READ ? "read" : "write",
+ a.io_time[i], LRU_TIME_MAX);
+
+ unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) >
+ offsetof(struct bch_alloc_v4, stripe_sectors)
+ ? a.stripe_sectors
+ : 0;
+
+ switch (a.data_type) {
+ case BCH_DATA_free:
+ case BCH_DATA_need_gc_gens:
+ case BCH_DATA_need_discard:
+ bkey_fsck_err_on(stripe_sectors ||
+ a.dirty_sectors ||
+ a.cached_sectors ||
+ a.stripe,
+ c, alloc_key_empty_but_have_data,
+ "empty data type free but have data %u.%u.%u %u",
+ stripe_sectors,
+ a.dirty_sectors,
+ a.cached_sectors,
+ a.stripe);
+ break;
+ case BCH_DATA_sb:
+ case BCH_DATA_journal:
+ case BCH_DATA_btree:
+ case BCH_DATA_user:
+ case BCH_DATA_parity:
+ bkey_fsck_err_on(!a.dirty_sectors &&
+ !stripe_sectors,
+ c, alloc_key_dirty_sectors_0,
+ "data_type %s but dirty_sectors==0",
+ bch2_data_type_str(a.data_type));
+ break;
+ case BCH_DATA_cached:
+ bkey_fsck_err_on(!a.cached_sectors ||
+ a.dirty_sectors ||
+ stripe_sectors ||
+ a.stripe,
+ c, alloc_key_cached_inconsistency,
+ "data type inconsistency");
+
+ bkey_fsck_err_on(!a.io_time[READ] &&
+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
+ c, alloc_key_cached_but_read_time_zero,
+ "cached bucket with read_time == 0");
+ break;
+ case BCH_DATA_stripe:
+ break;
+ }
+fsck_err:
+ return ret;
+}
+
+void bch2_alloc_v4_swab(struct bkey_s k)
+{
+ struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
+
+ a->journal_seq = swab64(a->journal_seq);
+ a->flags = swab32(a->flags);
+ a->dirty_sectors = swab32(a->dirty_sectors);
+ a->cached_sectors = swab32(a->cached_sectors);
+ a->io_time[0] = swab64(a->io_time[0]);
+ a->io_time[1] = swab64(a->io_time[1]);
+ a->stripe = swab32(a->stripe);
+ a->nr_external_backpointers = swab32(a->nr_external_backpointers);
+ a->stripe_sectors = swab32(a->stripe_sectors);
+}
+
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bch_alloc_v4 _a;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
+ struct bch_dev *ca = c ? bch2_dev_bucket_tryget_noerror(c, k.k->p) : NULL;
+
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
+ bch2_prt_data_type(out, a->data_type);
+ prt_newline(out);
+ prt_printf(out, "journal_seq %llu\n", a->journal_seq);
+ prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a));
+ prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a));
+ prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors);
+ prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors);
+ prt_printf(out, "cached_sectors %u\n", a->cached_sectors);
+ prt_printf(out, "stripe %u\n", a->stripe);
+ prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy);
+ prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]);
+ prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]);
+
+ if (ca)
+ prt_printf(out, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a, ca));
+ prt_printf(out, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a));
+ printbuf_indent_sub(out, 2);
+
+ bch2_dev_put(ca);
+}
+
+void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
+{
+ if (k.k->type == KEY_TYPE_alloc_v4) {
+ void *src, *dst;
+
+ *out = *bkey_s_c_to_alloc_v4(k).v;
+
+ src = alloc_v4_backpointers(out);
+ SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
+ dst = alloc_v4_backpointers(out);
+
+ if (src < dst)
+ memset(src, 0, dst - src);
+
+ SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0);
+ } else {
+ struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
+
+ *out = (struct bch_alloc_v4) {
+ .journal_seq = u.journal_seq,
+ .flags = u.need_discard,
+ .gen = u.gen,
+ .oldest_gen = u.oldest_gen,
+ .data_type = u.data_type,
+ .stripe_redundancy = u.stripe_redundancy,
+ .dirty_sectors = u.dirty_sectors,
+ .cached_sectors = u.cached_sectors,
+ .io_time[READ] = u.read_time,
+ .io_time[WRITE] = u.write_time,
+ .stripe = u.stripe,
+ };
+
+ SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
+ }
+}
+
+static noinline struct bkey_i_alloc_v4 *
+__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bkey_i_alloc_v4 *ret;
+
+ ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4)));
+ if (IS_ERR(ret))
+ return ret;
+
+ if (k.k->type == KEY_TYPE_alloc_v4) {
+ void *src, *dst;
+
+ bkey_reassemble(&ret->k_i, k);
+
+ src = alloc_v4_backpointers(&ret->v);
+ SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
+ dst = alloc_v4_backpointers(&ret->v);
+
+ if (src < dst)
+ memset(src, 0, dst - src);
+
+ SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0);
+ set_alloc_v4_u64s(ret);
+ } else {
+ bkey_alloc_v4_init(&ret->k_i);
+ ret->k.p = k.k->p;
+ bch2_alloc_to_v4(k, &ret->v);
+ }
+ return ret;
+}
+
+static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bkey_s_c_alloc_v4 a;
+
+ if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
+ ((a = bkey_s_c_to_alloc_v4(k), true) &&
+ BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0))
+ return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4);
+
+ return __bch2_alloc_to_v4_mut(trans, k);
+}
+
+struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
+{
+ return bch2_alloc_to_v4_mut_inlined(trans, k);
+}
+
+struct bkey_i_alloc_v4 *
+bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos pos)
+{
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
+ BTREE_ITER_with_updates|
+ BTREE_ITER_cached|
+ BTREE_ITER_intent);
+ int ret = bkey_err(k);
+ if (unlikely(ret))
+ return ERR_PTR(ret);
+
+ struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (unlikely(ret))
+ goto err;
+ return a;
+err:
+ bch2_trans_iter_exit(trans, iter);
+ return ERR_PTR(ret);
+}
+
+__flatten
+struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos);
+ int ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ return ERR_PTR(ret);
+
+ ret = bch2_trans_update(trans, &iter, &a->k_i, flags);
+ bch2_trans_iter_exit(trans, &iter);
+ return unlikely(ret) ? ERR_PTR(ret) : a;
+}
+
+static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
+{
+ *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;
+
+ pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS;
+ return pos;
+}
+
+static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset)
+{
+ pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS;
+ pos.offset += offset;
+ return pos;
+}
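+
+/*
+ * Worked example added for illustration (not in the original source),
+ * assuming KEY_TYPE_BUCKET_GENS_BITS == 8, i.e. one bucket_gens key covering
+ * 256 buckets: the alloc position POS(1, 1027) maps via alloc_gens_pos() to
+ * the bucket_gens key at POS(1, 4) with offset 3, and
+ * bucket_gens_pos_to_alloc(POS(1, 4), 3) maps back to POS(1, 1027).
+ */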
+
+static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
+{
+ return k.k->type == KEY_TYPE_bucket_gens
+ ? bkey_s_c_to_bucket_gens(k).v->gens[offset]
+ : 0;
+}
+
+int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens),
+ c, bucket_gens_val_size_bad,
+ "bad val size (%zu != %zu)",
+ bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
+fsck_err:
+ return ret;
+}
+
+void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k);
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) {
+ if (i)
+ prt_char(out, ' ');
+ prt_printf(out, "%u", g.v->gens[i]);
+ }
+}
+
+int bch2_bucket_gens_init(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct bkey_i_bucket_gens g;
+ bool have_bucket_gens_key = false;
+ int ret;
+
+ ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_prefetch, k, ({
+ /*
+ * Not a fsck error because this is checked/repaired by
+ * bch2_check_alloc_key() which runs later:
+ */
+ if (!bch2_dev_bucket_exists(c, k.k->p))
+ continue;
+
+ struct bch_alloc_v4 a;
+ u8 gen = bch2_alloc_to_v4(k, &a)->gen;
+ unsigned offset;
+ struct bpos pos = alloc_gens_pos(iter.pos, &offset);
+ int ret2 = 0;
+
+ if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) {
+ ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret2)
+ goto iter_err;
+ have_bucket_gens_key = false;
+ }
+
+ if (!have_bucket_gens_key) {
+ bkey_bucket_gens_init(&g.k_i);
+ g.k.p = pos;
+ have_bucket_gens_key = true;
+ }
+
+ g.v.gens[offset] = gen;
+iter_err:
+ ret2;
+ }));
+
+ if (have_bucket_gens_key && !ret)
+ ret = commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc,
+ bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
+
+ bch2_trans_put(trans);
+
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_alloc_read(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct bch_dev *ca = NULL;
+ int ret;
+
+ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
+ ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
+ BTREE_ITER_prefetch, k, ({
+ u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
+ u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
+
+ if (k.k->type != KEY_TYPE_bucket_gens)
+ continue;
+
+ ca = bch2_dev_iterate(c, ca, k.k->p.inode);
+ /*
+ * Not a fsck error because this is checked/repaired by
+ * bch2_check_alloc_key() which runs later:
+ */
+ if (!ca) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ continue;
+ }
+
+ const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;
+
+ for (u64 b = max_t(u64, ca->mi.first_bucket, start);
+ b < min_t(u64, ca->mi.nbuckets, end);
+ b++)
+ *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
+ 0;
+ }));
+ } else {
+ ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_prefetch, k, ({
+ ca = bch2_dev_iterate(c, ca, k.k->p.inode);
+ /*
+ * Not a fsck error because this is checked/repaired by
+ * bch2_check_alloc_key() which runs later:
+ */
+ if (!ca) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ continue;
+ }
+
+ if (k.k->p.offset < ca->mi.first_bucket) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket));
+ continue;
+ }
+
+ if (k.k->p.offset >= ca->mi.nbuckets) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ continue;
+ }
+
+ struct bch_alloc_v4 a;
+ *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
+ 0;
+ }));
+ }
+
+ bch2_dev_put(ca);
+ bch2_trans_put(trans);
+
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+/* Free space/discard btree: */
+
+static int __need_discard_or_freespace_err(struct btree_trans *trans,
+ struct bkey_s_c alloc_k,
+ bool set, bool discard, bool repair)
+{
+ struct bch_fs *c = trans->c;
+ enum bch_fsck_flags flags = FSCK_CAN_IGNORE|(repair ? FSCK_CAN_FIX : 0);
+ enum bch_sb_error_id err_id = discard
+ ? BCH_FSCK_ERR_need_discard_key_wrong
+ : BCH_FSCK_ERR_freespace_key_wrong;
+ enum btree_id btree = discard ? BTREE_ID_need_discard : BTREE_ID_freespace;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, alloc_k);
+
+ int ret = __bch2_fsck_err(NULL, trans, flags, err_id,
+ "bucket incorrectly %sset in %s btree\n"
+ " %s",
+ set ? "" : "un",
+ bch2_btree_id_str(btree),
+ buf.buf);
+ if (ret == -BCH_ERR_fsck_ignore ||
+ ret == -BCH_ERR_fsck_errors_not_fixed)
+ ret = 0;
+
+ printbuf_exit(&buf);
+ return ret;
+}
+
+#define need_discard_or_freespace_err(...) \
+ fsck_err_wrap(__need_discard_or_freespace_err(__VA_ARGS__))
+
+#define need_discard_or_freespace_err_on(cond, ...) \
+ (unlikely(cond) ? need_discard_or_freespace_err(__VA_ARGS__) : false)
+
+static int bch2_bucket_do_index(struct btree_trans *trans,
+ struct bch_dev *ca,
+ struct bkey_s_c alloc_k,
+ const struct bch_alloc_v4 *a,
+ bool set)
+{
+ enum btree_id btree;
+ struct bpos pos;
+
+ if (a->data_type != BCH_DATA_free &&
+ a->data_type != BCH_DATA_need_discard)
+ return 0;
+
+ switch (a->data_type) {
+ case BCH_DATA_free:
+ btree = BTREE_ID_freespace;
+ pos = alloc_freespace_pos(alloc_k.k->p, *a);
+ break;
+ case BCH_DATA_need_discard:
+ btree = BTREE_ID_need_discard;
+ pos = alloc_k.k->p;
+ break;
+ default:
+ return 0;
+ }
+
+ struct btree_iter iter;
+ struct bkey_s_c old = bch2_bkey_get_iter(trans, &iter, btree, pos, BTREE_ITER_intent);
+ int ret = bkey_err(old);
+ if (ret)
+ return ret;
+
+ need_discard_or_freespace_err_on(ca->mi.freespace_initialized &&
+ !old.k->type != set,
+ trans, alloc_k, set,
+ btree == BTREE_ID_need_discard, false);
+
+ ret = bch2_btree_bit_mod_iter(trans, &iter, set);
+fsck_err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
+ struct bpos bucket, u8 gen)
+{
+ struct btree_iter iter;
+ unsigned offset;
+ struct bpos pos = alloc_gens_pos(bucket, &offset);
+ struct bkey_i_bucket_gens *g;
+ struct bkey_s_c k;
+ int ret;
+
+ g = bch2_trans_kmalloc(trans, sizeof(*g));
+ ret = PTR_ERR_OR_ZERO(g);
+ if (ret)
+ return ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos,
+ BTREE_ITER_intent|
+ BTREE_ITER_with_updates);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (k.k->type != KEY_TYPE_bucket_gens) {
+ bkey_bucket_gens_init(&g->k_i);
+ g->k.p = iter.pos;
+ } else {
+ bkey_reassemble(&g->k_i, k);
+ }
+
+ g->v.gens[offset] = gen;
+
+ ret = bch2_trans_update(trans, &iter, &g->k_i, 0);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca,
+ enum bch_data_type data_type,
+ s64 delta_buckets,
+ s64 delta_sectors,
+ s64 delta_fragmented, unsigned flags)
+{
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_dev_data_type,
+ .dev_data_type.dev = ca->dev_idx,
+ .dev_data_type.data_type = data_type,
+ };
+ s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented };
+
+ return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc);
+}
+
+int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
+ const struct bch_alloc_v4 *old,
+ const struct bch_alloc_v4 *new,
+ unsigned flags)
+{
+ s64 old_sectors = bch2_bucket_sectors(*old);
+ s64 new_sectors = bch2_bucket_sectors(*new);
+ if (old->data_type != new->data_type) {
+ int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
+ 1, new_sectors, bch2_bucket_sectors_fragmented(ca, *new), flags) ?:
+ bch2_dev_data_type_accounting_mod(trans, ca, old->data_type,
+ -1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags);
+ if (ret)
+ return ret;
+ } else if (old_sectors != new_sectors) {
+ int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
+ 0,
+ new_sectors - old_sectors,
+ bch2_bucket_sectors_fragmented(ca, *new) -
+ bch2_bucket_sectors_fragmented(ca, *old), flags);
+ if (ret)
+ return ret;
+ }
+
+ s64 old_unstriped = bch2_bucket_sectors_unstriped(*old);
+ s64 new_unstriped = bch2_bucket_sectors_unstriped(*new);
+ if (old_unstriped != new_unstriped) {
+ int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped,
+ !!new_unstriped - !!old_unstriped,
+ new_unstriped - old_unstriped,
+ 0,
+ flags);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int bch2_trigger_alloc(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
+ if (!ca)
+ return -EIO;
+
+ struct bch_alloc_v4 old_a_convert;
+ const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
+
+ struct bch_alloc_v4 *new_a;
+ if (likely(new.k->type == KEY_TYPE_alloc_v4)) {
+ new_a = bkey_s_to_alloc_v4(new).v;
+ } else {
+ BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair)));
+
+ struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c);
+ ret = PTR_ERR_OR_ZERO(new_ka);
+ if (unlikely(ret))
+ goto err;
+ new_a = &new_ka->v;
+ }
+
+ if (flags & BTREE_TRIGGER_transactional) {
+ alloc_data_type_set(new_a, new_a->data_type);
+
+ if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) {
+ new_a->io_time[READ] = bch2_current_io_time(c, READ);
+ new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE);
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
+ SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
+ }
+
+ if (data_type_is_empty(new_a->data_type) &&
+ BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
+ !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
+ new_a->gen++;
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
+ alloc_data_type_set(new_a, new_a->data_type);
+ }
+
+ if (old_a->data_type != new_a->data_type ||
+ (new_a->data_type == BCH_DATA_free &&
+ alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
+ ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?:
+ bch2_bucket_do_index(trans, ca, new.s_c, new_a, true);
+ if (ret)
+ goto err;
+ }
+
+ if (new_a->data_type == BCH_DATA_cached &&
+ !new_a->io_time[READ])
+ new_a->io_time[READ] = bch2_current_io_time(c, READ);
+
+ u64 old_lru = alloc_lru_idx_read(*old_a);
+ u64 new_lru = alloc_lru_idx_read(*new_a);
+ if (old_lru != new_lru) {
+ ret = bch2_lru_change(trans, new.k->p.inode,
+ bucket_to_u64(new.k->p),
+ old_lru, new_lru);
+ if (ret)
+ goto err;
+ }
+
+ old_lru = alloc_lru_idx_fragmentation(*old_a, ca);
+ new_lru = alloc_lru_idx_fragmentation(*new_a, ca);
+ if (old_lru != new_lru) {
+ ret = bch2_lru_change(trans,
+ BCH_LRU_FRAGMENTATION_START,
+ bucket_to_u64(new.k->p),
+ old_lru, new_lru);
+ if (ret)
+ goto err;
+ }
+
+ if (old_a->gen != new_a->gen) {
+ ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
+ if (ret)
+ goto err;
+ }
+
+ if ((flags & BTREE_TRIGGER_bucket_invalidate) &&
+ old_a->cached_sectors) {
+ ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx,
+ -((s64) old_a->cached_sectors),
+ flags & BTREE_TRIGGER_gc);
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags);
+ if (ret)
+ goto err;
+ }
+
+ if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
+ u64 transaction_seq = trans->journal_res.seq;
+
+ if (log_fsck_err_on(transaction_seq && new_a->journal_seq > transaction_seq,
+ trans, alloc_key_journal_seq_in_future,
+ "bucket journal seq in future (currently at %llu)\n%s",
+ journal_cur_seq(&c->journal),
+ (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)))
+ new_a->journal_seq = transaction_seq;
+
+ int is_empty_delta = (int) data_type_is_empty(new_a->data_type) -
+ (int) data_type_is_empty(old_a->data_type);
+
+ /* Record journal sequence number of empty -> nonempty transition: */
+ if (is_empty_delta < 0)
+ new_a->journal_seq = max(new_a->journal_seq, transaction_seq);
+
+ /*
+ * Bucket becomes empty: mark it as waiting for a journal flush,
+ * unless updates since empty -> nonempty transition were never
+ * flushed - we may need to ask the journal not to flush
+ * intermediate sequence numbers:
+ */
+ if (is_empty_delta > 0) {
+ if (new_a->journal_seq == transaction_seq ||
+ bch2_journal_noflush_seq(&c->journal, new_a->journal_seq))
+ new_a->journal_seq = 0;
+ else {
+ new_a->journal_seq = transaction_seq;
+
+ ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk,
+ new.k->p.inode, new.k->p.offset,
+ transaction_seq);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "setting bucket_needs_journal_commit: %s", bch2_err_str(ret)))
+ goto err;
+ }
+ }
+
+ if (new_a->gen != old_a->gen) {
+ rcu_read_lock();
+ u8 *gen = bucket_gen(ca, new.k->p.offset);
+ if (unlikely(!gen)) {
+ rcu_read_unlock();
+ goto invalid_bucket;
+ }
+ *gen = new_a->gen;
+ rcu_read_unlock();
+ }
+
+#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; })
+#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr)
+#define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk)
+
+ if (statechange(a->data_type == BCH_DATA_free) &&
+ bucket_flushed(new_a))
+ closure_wake_up(&c->freelist_wait);
+
+ if (statechange(a->data_type == BCH_DATA_need_discard) &&
+ !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) &&
+ bucket_flushed(new_a))
+ bch2_discard_one_bucket_fast(ca, new.k->p.offset);
+
+ if (statechange(a->data_type == BCH_DATA_cached) &&
+ !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
+ bch2_dev_do_invalidates(ca);
+
+ if (statechange(a->data_type == BCH_DATA_need_gc_gens))
+ bch2_gc_gens_async(c);
+ }
+
+ if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) {
+ rcu_read_lock();
+ struct bucket *g = gc_bucket(ca, new.k->p.offset);
+ if (unlikely(!g)) {
+ rcu_read_unlock();
+ goto invalid_bucket;
+ }
+ g->gen_valid = 1;
+ g->gen = new_a->gen;
+ rcu_read_unlock();
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ bch2_dev_put(ca);
+ return ret;
+invalid_bucket:
+ bch2_fs_inconsistent(c, "reference to invalid bucket\n %s",
+ (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
+ ret = -EIO;
+ goto err;
+}
+
+/*
+ * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
+ * extents style btrees, but works on non-extents btrees:
+ */
+static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
+{
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+
+ if (bkey_err(k))
+ return k;
+
+ if (k.k->type) {
+ return k;
+ } else {
+ struct btree_iter iter2;
+ struct bpos next;
+
+ bch2_trans_copy_iter(&iter2, iter);
+
+ struct btree_path *path = btree_iter_path(iter->trans, iter);
+ if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
+ end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));
+
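+ /* cap the synthesized hole so its size fits in the 32 bit bkey size field: */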
+ end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));
+
+ /*
+ * btree node min/max is a closed interval, while peek_max() takes a
+ * half open interval:
+ */
+ k = bch2_btree_iter_peek_max(&iter2, end);
+ next = iter2.pos;
+ bch2_trans_iter_exit(iter->trans, &iter2);
+
+ BUG_ON(next.offset >= iter->pos.offset + U32_MAX);
+
+ if (bkey_err(k))
+ return k;
+
+ bkey_init(hole);
+ hole->p = iter->pos;
+
+ bch2_key_resize(hole, next.offset - iter->pos.offset);
+ return (struct bkey_s_c) { hole, NULL };
+ }
+}
+
+static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket)
+{
+ if (*ca) {
+ if (bucket->offset < (*ca)->mi.first_bucket)
+ bucket->offset = (*ca)->mi.first_bucket;
+
+ if (bucket->offset < (*ca)->mi.nbuckets)
+ return true;
+
+ bch2_dev_put(*ca);
+ *ca = NULL;
+ bucket->inode++;
+ bucket->offset = 0;
+ }
+
+ rcu_read_lock();
+ *ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
+ if (*ca) {
+ *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket);
+ bch2_dev_get(*ca);
+ }
+ rcu_read_unlock();
+
+ return *ca != NULL;
+}
+
+static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter,
+ struct bch_dev **ca, struct bkey *hole)
+{
+ struct bch_fs *c = iter->trans->c;
+ struct bkey_s_c k;
+again:
+ k = bch2_get_key_or_hole(iter, POS_MAX, hole);
+ if (bkey_err(k))
+ return k;
+
+ *ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode);
+
+ if (!k.k->type) {
+ struct bpos hole_start = bkey_start_pos(k.k);
+
+ if (!*ca || !bucket_valid(*ca, hole_start.offset)) {
+ if (!next_bucket(c, ca, &hole_start))
+ return bkey_s_c_null;
+
+ bch2_btree_iter_set_pos(iter, hole_start);
+ goto again;
+ }
+
+ if (k.k->p.offset > (*ca)->mi.nbuckets)
+ bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset);
+ }
+
+ return k;
+}
+
+static noinline_for_stack
+int bch2_check_alloc_key(struct btree_trans *trans,
+ struct bkey_s_c alloc_k,
+ struct btree_iter *alloc_iter,
+ struct btree_iter *discard_iter,
+ struct btree_iter *freespace_iter,
+ struct btree_iter *bucket_gens_iter)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ unsigned gens_offset;
+ struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p);
+ if (fsck_err_on(!ca,
+ trans, alloc_key_to_missing_dev_bucket,
+ "alloc key for invalid device:bucket %llu:%llu",
+ alloc_k.k->p.inode, alloc_k.k->p.offset))
+ ret = bch2_btree_delete_at(trans, alloc_iter, 0);
+ if (!ca)
+ return ret;
+
+ if (!ca->mi.freespace_initialized)
+ goto out;
+
+ a = bch2_alloc_to_v4(alloc_k, &a_convert);
+
+ bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
+ k = bch2_btree_iter_peek_slot(discard_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ bool is_discarded = a->data_type == BCH_DATA_need_discard;
+ if (need_discard_or_freespace_err_on(!!k.k->type != is_discarded,
+ trans, alloc_k, !is_discarded, true, true)) {
+ ret = bch2_btree_bit_mod_iter(trans, discard_iter, is_discarded);
+ if (ret)
+ goto err;
+ }
+
+ bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
+ k = bch2_btree_iter_peek_slot(freespace_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ bool is_free = a->data_type == BCH_DATA_free;
+ if (need_discard_or_freespace_err_on(!!k.k->type != is_free,
+ trans, alloc_k, !is_free, false, true)) {
+ ret = bch2_btree_bit_mod_iter(trans, freespace_iter, is_free);
+ if (ret)
+ goto err;
+ }
+
+ bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
+ k = bch2_btree_iter_peek_slot(bucket_gens_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
+ trans, bucket_gens_key_wrong,
+ "incorrect gen in bucket_gens btree (got %u should be %u)\n"
+ " %s",
+ alloc_gen(k, gens_offset), a->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ struct bkey_i_bucket_gens *g =
+ bch2_trans_kmalloc(trans, sizeof(*g));
+
+ ret = PTR_ERR_OR_ZERO(g);
+ if (ret)
+ goto err;
+
+ if (k.k->type == KEY_TYPE_bucket_gens) {
+ bkey_reassemble(&g->k_i, k);
+ } else {
+ bkey_bucket_gens_init(&g->k_i);
+ g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset);
+ }
+
+ g->v.gens[gens_offset] = a->gen;
+
+ ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0);
+ if (ret)
+ goto err;
+ }
+out:
+err:
+fsck_err:
+ bch2_dev_put(ca);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static noinline_for_stack
+int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
+ struct bch_dev *ca,
+ struct bpos start,
+ struct bpos *end,
+ struct btree_iter *freespace_iter)
+{
+ struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ if (!ca->mi.freespace_initialized)
+ return 0;
+
+ bch2_btree_iter_set_pos(freespace_iter, start);
+
+ k = bch2_btree_iter_peek_slot(freespace_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ *end = bkey_min(k.k->p, *end);
+
+ if (fsck_err_on(k.k->type != KEY_TYPE_set,
+ trans, freespace_hole_missing,
+ "hole in alloc btree missing in freespace btree\n"
+ " device %llu buckets %llu-%llu",
+ freespace_iter->pos.inode,
+ freespace_iter->pos.offset,
+ end->offset)) {
+ struct bkey_i *update =
+ bch2_trans_kmalloc(trans, sizeof(*update));
+
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
+
+ bkey_init(&update->k);
+ update->k.type = KEY_TYPE_set;
+ update->k.p = freespace_iter->pos;
+ bch2_key_resize(&update->k,
+ min_t(u64, U32_MAX, end->offset -
+ freespace_iter->pos.offset));
+
+ ret = bch2_trans_update(trans, freespace_iter, update, 0);
+ if (ret)
+ goto err;
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static noinline_for_stack
+int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
+ struct bpos start,
+ struct bpos *end,
+ struct btree_iter *bucket_gens_iter)
+{
+ struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+ unsigned i, gens_offset, gens_end_offset;
+ int ret;
+
+ bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
+
+ k = bch2_btree_iter_peek_slot(bucket_gens_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (bkey_cmp(alloc_gens_pos(start, &gens_offset),
+ alloc_gens_pos(*end, &gens_end_offset)))
+ gens_end_offset = KEY_TYPE_BUCKET_GENS_NR;
+
+ if (k.k->type == KEY_TYPE_bucket_gens) {
+ struct bkey_i_bucket_gens g;
+ bool need_update = false;
+
+ bkey_reassemble(&g.k_i, k);
+
+ for (i = gens_offset; i < gens_end_offset; i++) {
+ if (fsck_err_on(g.v.gens[i], trans,
+ bucket_gens_hole_wrong,
+ "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
+ bucket_gens_pos_to_alloc(k.k->p, i).inode,
+ bucket_gens_pos_to_alloc(k.k->p, i).offset,
+ g.v.gens[i])) {
+ g.v.gens[i] = 0;
+ need_update = true;
+ }
+ }
+
+ if (need_update) {
+ struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
+
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto err;
+
+ memcpy(u, &g, sizeof(g));
+
+ ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
+ if (ret)
+ goto err;
+ }
+ }
+
+ *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0));
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+struct check_discard_freespace_key_async {
+ struct work_struct work;
+ struct bch_fs *c;
+ struct bbpos pos;
+};
+
+static int bch2_recheck_discard_freespace_key(struct btree_trans *trans, struct bbpos pos)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, pos.btree, pos.pos, 0);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ u8 gen;
+ ret = k.k->type != KEY_TYPE_set
+ ? bch2_check_discard_freespace_key(trans, &iter, &gen, false)
+ : 0;
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static void check_discard_freespace_key_work(struct work_struct *work)
+{
+ struct check_discard_freespace_key_async *w =
+ container_of(work, struct check_discard_freespace_key_async, work);
+
+ bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos));
+ bch2_write_ref_put(w->c, BCH_WRITE_REF_check_discard_freespace_key);
+ kfree(w);
+}
+
+int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen,
+ bool async_repair)
+{
+ struct bch_fs *c = trans->c;
+ enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
+ ? BCH_DATA_need_discard
+ : BCH_DATA_free;
+ struct printbuf buf = PRINTBUF;
+
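+ /*
+ * need_discard/freespace keys encode the bucket in the low 56 bits of
+ * the offset; freespace keys additionally stash generation bits in the
+ * top 8 (see alloc_freespace_pos()):
+ */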
+ struct bpos bucket = iter->pos;
+ bucket.offset &= ~(~0ULL << 56);
+ u64 genbits = iter->pos.offset & (~0ULL << 56);
+
+ struct btree_iter alloc_iter;
+ struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter,
+ BTREE_ID_alloc, bucket, BTREE_ITER_cached);
+ int ret = bkey_err(alloc_k);
+ if (ret)
+ return ret;
+
+ if (!bch2_dev_bucket_exists(c, bucket)) {
+ if (fsck_err(trans, need_discard_freespace_key_to_invalid_dev_bucket,
+ "entry in %s btree for nonexistant dev:bucket %llu:%llu",
+ bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset))
+ goto delete;
+ ret = 1;
+ goto out;
+ }
+
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
+
+ if (a->data_type != state ||
+ (state == BCH_DATA_free &&
+ genbits != alloc_freespace_genbits(*a))) {
+ if (fsck_err(trans, need_discard_freespace_key_bad,
+ "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
+ bch2_btree_id_str(iter->btree_id),
+ iter->pos.inode,
+ iter->pos.offset,
+ a->data_type == state,
+ genbits >> 56, alloc_freespace_genbits(*a) >> 56))
+ goto delete;
+ ret = 1;
+ goto out;
+ }
+
+ *gen = a->gen;
+out:
+fsck_err:
+ bch2_set_btree_iter_dontneed(&alloc_iter);
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ printbuf_exit(&buf);
+ return ret;
+delete:
+ if (!async_repair) {
+ ret = bch2_btree_bit_mod_iter(trans, iter, false) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_commit;
+ goto out;
+ } else {
+ /*
+ * We can't repair here when called from the allocator path: the
+ * commit will recurse back into the allocator
+ */
+ struct check_discard_freespace_key_async *w =
+ kzalloc(sizeof(*w), GFP_KERNEL);
+ if (!w)
+ goto out;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_check_discard_freespace_key)) {
+ kfree(w);
+ goto out;
+ }
+
+ INIT_WORK(&w->work, check_discard_freespace_key_work);
+ w->c = c;
+ w->pos = BBPOS(iter->btree_id, iter->pos);
+ queue_work(c->write_ref_wq, &w->work);
+ goto out;
+ }
+}
+
+static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter)
+{
+ u8 gen;
+ int ret = bch2_check_discard_freespace_key(trans, iter, &gen, false);
+ return ret < 0 ? ret : 0;
+}
+
+/*
+ * We've already checked that generation numbers in the bucket_gens btree are
+ * valid for buckets that exist; this just checks for keys for nonexistent
+ * buckets.
+ */
+static noinline_for_stack
+int bch2_check_bucket_gens_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i_bucket_gens g;
+ u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
+ u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
+ u64 b;
+ bool need_update = false;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
+ bkey_reassemble(&g.k_i, k);
+
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
+ if (!ca) {
+ if (fsck_err(trans, bucket_gens_to_invalid_dev,
+ "bucket_gens key for invalid device:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = bch2_btree_delete_at(trans, iter, 0);
+ goto out;
+ }
+
+ if (fsck_err_on(end <= ca->mi.first_bucket ||
+ start >= ca->mi.nbuckets,
+ trans, bucket_gens_to_invalid_buckets,
+ "bucket_gens key for invalid buckets:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter, 0);
+ goto out;
+ }
+
+ for (b = start; b < ca->mi.first_bucket; b++)
+ if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
+ trans, bucket_gens_nonzero_for_invalid_buckets,
+ "bucket_gens key has nonzero gen for invalid bucket")) {
+ g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
+ need_update = true;
+ }
+
+ for (b = ca->mi.nbuckets; b < end; b++)
+ if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
+ trans, bucket_gens_nonzero_for_invalid_buckets,
+ "bucket_gens key has nonzero gen for invalid bucket")) {
+ g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
+ need_update = true;
+ }
+
+ if (need_update) {
+ struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
+
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto out;
+
+ memcpy(u, &g, sizeof(g));
+ ret = bch2_trans_update(trans, iter, u, 0);
+ }
+out:
+fsck_err:
+ bch2_dev_put(ca);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_check_alloc_info(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
+ struct bch_dev *ca = NULL;
+ struct bkey hole;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_prefetch);
+ bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
+ BTREE_ITER_prefetch);
+ bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
+ BTREE_ITER_prefetch);
+ bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
+ BTREE_ITER_prefetch);
+
+ while (1) {
+ struct bpos next;
+
+ bch2_trans_begin(trans);
+
+ k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole);
+ ret = bkey_err(k);
+ if (ret)
+ goto bkey_err;
+
+ if (!k.k)
+ break;
+
+ if (k.k->type) {
+ next = bpos_nosnap_successor(k.k->p);
+
+ ret = bch2_check_alloc_key(trans,
+ k, &iter,
+ &discard_iter,
+ &freespace_iter,
+ &bucket_gens_iter);
+ if (ret)
+ goto bkey_err;
+ } else {
+ next = k.k->p;
+
+ ret = bch2_check_alloc_hole_freespace(trans, ca,
+ bkey_start_pos(k.k),
+ &next,
+ &freespace_iter) ?:
+ bch2_check_alloc_hole_bucket_gens(trans,
+ bkey_start_pos(k.k),
+ &next,
+ &bucket_gens_iter);
+ if (ret)
+ goto bkey_err;
+ }
+
+ ret = bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto bkey_err;
+
+ bch2_btree_iter_set_pos(&iter, next);
+bkey_err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &bucket_gens_iter);
+ bch2_trans_iter_exit(trans, &freespace_iter);
+ bch2_trans_iter_exit(trans, &discard_iter);
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_dev_put(ca);
+ ca = NULL;
+
+ if (ret < 0)
+ goto err;
+
+ ret = for_each_btree_key(trans, iter,
+ BTREE_ID_need_discard, POS_MIN,
+ BTREE_ITER_prefetch, k,
+ bch2_check_discard_freespace_key_fsck(trans, &iter));
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
+ BTREE_ITER_prefetch);
+ while (1) {
+ bch2_trans_begin(trans);
+ k = bch2_btree_iter_peek(&iter);
+ if (!k.k)
+ break;
+
+ ret = bkey_err(k) ?:
+ bch2_check_discard_freespace_key_fsck(trans, &iter);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ ret = 0;
+ continue;
+ }
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bch_err(c, "while checking %s", buf.buf);
+ printbuf_exit(&buf);
+ break;
+ }
+
+ bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ if (ret)
+ goto err;
+
+ ret = for_each_btree_key_commit(trans, iter,
+ BTREE_ID_bucket_gens, POS_MIN,
+ BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_check_bucket_gens_key(trans, &iter, k));
+err:
+ bch2_trans_put(trans);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
+ struct btree_iter *alloc_iter,
+ struct bkey_buf *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ struct bkey_s_c alloc_k;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ alloc_k = bch2_btree_iter_peek(alloc_iter);
+ if (!alloc_k.k)
+ return 0;
+
+ ret = bkey_err(alloc_k);
+ if (ret)
+ return ret;
+
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, alloc_k.k->p.inode);
+ if (!ca)
+ return 0;
+
+ a = bch2_alloc_to_v4(alloc_k, &a_convert);
+
+ u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
+ if (lru_idx) {
+ ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START,
+ lru_idx, alloc_k, last_flushed);
+ if (ret)
+ goto err;
+ }
+
+ if (a->data_type != BCH_DATA_cached)
+ goto err;
+
+ if (fsck_err_on(!a->io_time[READ],
+ trans, alloc_key_cached_but_read_time_zero,
+ "cached bucket with read_time 0\n"
+ " %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ struct bkey_i_alloc_v4 *a_mut =
+ bch2_alloc_to_v4_mut(trans, alloc_k);
+ ret = PTR_ERR_OR_ZERO(a_mut);
+ if (ret)
+ goto err;
+
+ a_mut->v.io_time[READ] = bch2_current_io_time(c, READ);
+ ret = bch2_trans_update(trans, alloc_iter,
+ &a_mut->k_i, BTREE_TRIGGER_norun);
+ if (ret)
+ goto err;
+
+ a = &a_mut->v;
+ }
+
+ ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ],
+ alloc_k, last_flushed);
+ if (ret)
+ goto err;
+err:
+fsck_err:
+ bch2_dev_put(ca);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
+{
+ struct bkey_buf last_flushed;
+
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
+ POS_MIN, BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed)));
+
+ bch2_bkey_buf_exit(&last_flushed, c);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress)
+{
+ int ret;
+
+ mutex_lock(&ca->discard_buckets_in_flight_lock);
+ darray_for_each(ca->discard_buckets_in_flight, i)
+ if (i->bucket == bucket) {
+ ret = -BCH_ERR_EEXIST_discard_in_flight_add;
+ goto out;
+ }
+
+ ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) {
+ .in_progress = in_progress,
+ .bucket = bucket,
+ }));
+out:
+ mutex_unlock(&ca->discard_buckets_in_flight_lock);
+ return ret;
+}
+
+static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket)
+{
+ mutex_lock(&ca->discard_buckets_in_flight_lock);
+ darray_for_each(ca->discard_buckets_in_flight, i)
+ if (i->bucket == bucket) {
+ BUG_ON(!i->in_progress);
+ darray_remove_item(&ca->discard_buckets_in_flight, i);
+ goto found;
+ }
+ BUG();
+found:
+ mutex_unlock(&ca->discard_buckets_in_flight_lock);
+}
+
+struct discard_buckets_state {
+ u64 seen;
+ u64 open;
+ u64 need_journal_commit;
+ u64 discarded;
+ u64 need_journal_commit_this_dev;
+};
+
+static int bch2_discard_one_bucket(struct btree_trans *trans,
+ struct bch_dev *ca,
+ struct btree_iter *need_discard_iter,
+ struct bpos *discard_pos_done,
+ struct discard_buckets_state *s,
+ bool fastpath)
+{
+ struct bch_fs *c = trans->c;
+ struct bpos pos = need_discard_iter->pos;
+ struct btree_iter iter = { NULL };
+ struct bkey_s_c k;
+ struct bkey_i_alloc_v4 *a;
+ struct printbuf buf = PRINTBUF;
+ bool discard_locked = false;
+ int ret = 0;
+
+ if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
+ s->open++;
+ goto out;
+ }
+
+ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk,
+ pos.inode, pos.offset)) {
+ s->need_journal_commit++;
+ s->need_journal_commit_this_dev++;
+ goto out;
+ }
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
+ need_discard_iter->pos,
+ BTREE_ITER_cached);
+ ret = bkey_err(k);
+ if (ret)
+ goto out;
+
+ a = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ goto out;
+
+ if (a->v.data_type != BCH_DATA_need_discard) {
+ if (need_discard_or_freespace_err(trans, k, true, true, true)) {
+ ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false);
+ if (ret)
+ goto out;
+ goto commit;
+ }
+
+ goto out;
+ }
+
+ if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
+ if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
+ trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s",
+ a->v.journal_seq,
+ c->journal.flushed_seq_ondisk,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = -EIO;
+ goto out;
+ }
+
+ if (!fastpath) {
+ if (discard_in_flight_add(ca, iter.pos.offset, true))
+ goto out;
+
+ discard_locked = true;
+ }
+
+ if (!bkey_eq(*discard_pos_done, iter.pos) &&
+ ca->mi.discard && !c->opts.nochanges) {
+ /*
+ * This works without any other locks because this is the only
+ * thread that removes items from the need_discard tree
+ */
+ bch2_trans_unlock_long(trans);
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ k.k->p.offset * ca->mi.bucket_size,
+ ca->mi.bucket_size,
+ GFP_KERNEL);
+ *discard_pos_done = iter.pos;
+ s->discarded++;
+
+ ret = bch2_trans_relock_notrace(trans);
+ if (ret)
+ goto out;
+ }
+
+ SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
+ alloc_data_type_set(&a->v, a->v.data_type);
+
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+ if (ret)
+ goto out;
+commit:
+ ret = bch2_trans_commit(trans, NULL, NULL,
+ BCH_WATERMARK_btree|
+ BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto out;
+
+ count_event(c, bucket_discard);
+out:
+fsck_err:
+ if (discard_locked)
+ discard_in_flight_remove(ca, iter.pos.offset);
+ if (!ret)
+ s->seen++;
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static void bch2_do_discards_work(struct work_struct *work)
+{
+ struct bch_dev *ca = container_of(work, struct bch_dev, discard_work);
+ struct bch_fs *c = ca->fs;
+ struct discard_buckets_state s = {};
+ struct bpos discard_pos_done = POS_MAX;
+ int ret;
+
+ /*
+ * We're doing the commit in bch2_discard_one_bucket instead of using
+ * for_each_btree_key_commit() so that we can increment counters after
+ * successful commit:
+ */
+ ret = bch2_trans_run(c,
+ for_each_btree_key_max(trans, iter,
+ BTREE_ID_need_discard,
+ POS(ca->dev_idx, 0),
+ POS(ca->dev_idx, U64_MAX), 0, k,
+ bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false)));
+
+ trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
+ bch2_err_str(ret));
+
+ percpu_ref_put(&ca->io_ref);
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+}
+
+void bch2_dev_do_discards(struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
+ return;
+
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ goto put_write_ref;
+
+ if (queue_work(c->write_ref_wq, &ca->discard_work))
+ return;
+
+ percpu_ref_put(&ca->io_ref);
+put_write_ref:
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+}
+
+void bch2_do_discards(struct bch_fs *c)
+{
+ for_each_member_device(c, ca)
+ bch2_dev_do_discards(ca);
+}
+
+static int bch2_do_discards_fast_one(struct btree_trans *trans,
+ struct bch_dev *ca,
+ u64 bucket,
+ struct bpos *discard_pos_done,
+ struct discard_buckets_state *s)
+{
+ struct btree_iter need_discard_iter;
+ struct bkey_s_c discard_k = bch2_bkey_get_iter(trans, &need_discard_iter,
+ BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0);
+ int ret = bkey_err(discard_k);
+ if (ret)
+ return ret;
+
+ if (log_fsck_err_on(discard_k.k->type != KEY_TYPE_set,
+ trans, discarding_bucket_not_in_need_discard_btree,
+ "attempting to discard bucket %u:%llu not in need_discard btree",
+ ca->dev_idx, bucket))
+ goto out;
+
+ ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true);
+out:
+fsck_err:
+ bch2_trans_iter_exit(trans, &need_discard_iter);
+ return ret;
+}
+
+static void bch2_do_discards_fast_work(struct work_struct *work)
+{
+ struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work);
+ struct bch_fs *c = ca->fs;
+ struct discard_buckets_state s = {};
+ struct bpos discard_pos_done = POS_MAX;
+ struct btree_trans *trans = bch2_trans_get(c);
+ int ret = 0;
+
+ while (1) {
+ bool got_bucket = false;
+ u64 bucket;
+
+ mutex_lock(&ca->discard_buckets_in_flight_lock);
+ darray_for_each(ca->discard_buckets_in_flight, i) {
+ if (i->in_progress)
+ continue;
+
+ got_bucket = true;
+ bucket = i->bucket;
+ i->in_progress = true;
+ break;
+ }
+ mutex_unlock(&ca->discard_buckets_in_flight_lock);
+
+ if (!got_bucket)
+ break;
+
+ ret = lockrestart_do(trans,
+ bch2_do_discards_fast_one(trans, ca, bucket, &discard_pos_done, &s));
+ bch_err_fn(c, ret);
+
+ discard_in_flight_remove(ca, bucket);
+
+ if (ret)
+ break;
+ }
+
+ trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));
+
+ bch2_trans_put(trans);
+ percpu_ref_put(&ca->io_ref);
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+}
+
+static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
+{
+ struct bch_fs *c = ca->fs;
+
+ if (discard_in_flight_add(ca, bucket, false))
+ return;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
+ return;
+
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ goto put_ref;
+
+ if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
+ return;
+
+ percpu_ref_put(&ca->io_ref);
+put_ref:
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+}
+
+static int invalidate_one_bucket(struct btree_trans *trans,
+ struct btree_iter *lru_iter,
+ struct bkey_s_c lru_k,
+ s64 *nr_to_invalidate)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i_alloc_v4 *a = NULL;
+ struct printbuf buf = PRINTBUF;
+ struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
+ unsigned cached_sectors;
+ int ret = 0;
+
+ if (*nr_to_invalidate <= 0)
+ return 1;
+
+ if (!bch2_dev_bucket_exists(c, bucket)) {
+ if (fsck_err(trans, lru_entry_to_invalid_bucket,
+ "lru key points to nonexistent device:bucket %llu:%llu",
+ bucket.inode, bucket.offset))
+ return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
+ goto out;
+ }
+
+ if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
+ return 0;
+
+ a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ goto out;
+
+ /* We expect harmless races here due to the btree write buffer: */
+ if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
+ goto out;
+
+ BUG_ON(a->v.data_type != BCH_DATA_cached);
+ BUG_ON(a->v.dirty_sectors);
+
+ if (!a->v.cached_sectors)
+ bch_err(c, "invalidating empty bucket, confused");
+
+ cached_sectors = a->v.cached_sectors;
+
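+ /*
+ * Invalidate the bucket: bumping the gen makes any remaining cached
+ * pointers stale, then clear the bucket's sector counts:
+ */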
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
+ a->v.gen++;
+ a->v.data_type = 0;
+ a->v.dirty_sectors = 0;
+ a->v.stripe_sectors = 0;
+ a->v.cached_sectors = 0;
+ a->v.io_time[READ] = bch2_current_io_time(c, READ);
+ a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE);
+
+ ret = bch2_trans_commit(trans, NULL, NULL,
+ BCH_WATERMARK_btree|
+ BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto out;
+
+ trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
+ --*nr_to_invalidate;
+out:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
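+ /*
+ * Walk this device's LRU from the iterator's current position to the
+ * end, then wrap around to the start once:
+ */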
+static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter,
+ struct bch_dev *ca, bool *wrapped)
+{
+ struct bkey_s_c k;
+again:
+ k = bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
+ if (!k.k && !*wrapped) {
+ bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0));
+ *wrapped = true;
+ goto again;
+ }
+
+ return k;
+}
+
+static void bch2_do_invalidates_work(struct work_struct *work)
+{
+ struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work);
+ struct bch_fs *c = ca->fs;
+ struct btree_trans *trans = bch2_trans_get(c);
+ int ret = 0;
+
+ ret = bch2_btree_write_buffer_tryflush(trans);
+ if (ret)
+ goto err;
+
+ s64 nr_to_invalidate =
+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
+ struct btree_iter iter;
+ bool wrapped = false;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
+ lru_pos(ca->dev_idx, 0,
+ ((bch2_current_io_time(c, READ) + U32_MAX) &
+ LRU_TIME_MAX)), 0);
+
+ while (true) {
+ bch2_trans_begin(trans);
+
+ struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
+ ret = bkey_err(k);
+ if (ret)
+ goto restart_err;
+ if (!k.k)
+ break;
+
+ ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
+restart_err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+
+ bch2_btree_iter_advance(&iter);
+ }
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ bch2_trans_put(trans);
+ percpu_ref_put(&ca->io_ref);
+ bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+}
+
+void bch2_dev_do_invalidates(struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
+ return;
+
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ goto put_ref;
+
+ if (queue_work(c->write_ref_wq, &ca->invalidate_work))
+ return;
+
+ percpu_ref_put(&ca->io_ref);
+put_ref:
+ bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+}
+
+void bch2_do_invalidates(struct bch_fs *c)
+{
+ for_each_member_device(c, ca)
+ bch2_dev_do_invalidates(ca);
+}
+
+int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
+ u64 bucket_start, u64 bucket_end)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey hole;
+ struct bpos end = POS(ca->dev_idx, bucket_end);
+ struct bch_member *m;
+ unsigned long last_updated = jiffies;
+ int ret;
+
+ BUG_ON(bucket_start > bucket_end);
+ BUG_ON(bucket_end > ca->mi.nbuckets);
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
+ BTREE_ITER_prefetch);
+ /*
+ * Scan the alloc btree for every bucket on @ca, and add buckets to the
+ * freespace/need_discard/need_gc_gens btrees as needed:
+ */
+ while (1) {
+ if (time_after(jiffies, last_updated + HZ * 10)) {
+ bch_info(ca, "%s: currently at %llu/%llu",
+ __func__, iter.pos.offset, ca->mi.nbuckets);
+ last_updated = jiffies;
+ }
+
+ bch2_trans_begin(trans);
+
+ if (bkey_ge(iter.pos, end)) {
+ ret = 0;
+ break;
+ }
+
+ k = bch2_get_key_or_hole(&iter, end, &hole);
+ ret = bkey_err(k);
+ if (ret)
+ goto bkey_err;
+
+ if (k.k->type) {
+ /*
+ * We process live keys in the alloc btree one at a
+ * time:
+ */
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
+
+ ret = bch2_bucket_do_index(trans, ca, k, a, true) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto bkey_err;
+
+ bch2_btree_iter_advance(&iter);
+ } else {
+ struct bkey_i *freespace;
+
+ freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
+ ret = PTR_ERR_OR_ZERO(freespace);
+ if (ret)
+ goto bkey_err;
+
+ bkey_init(&freespace->k);
+ freespace->k.type = KEY_TYPE_set;
+ freespace->k.p = k.k->p;
+ freespace->k.size = k.k->size;
+
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto bkey_err;
+
+ bch2_btree_iter_set_pos(&iter, k.k->p);
+ }
+bkey_err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+
+ if (ret < 0) {
+ bch_err_msg(ca, ret, "initializing free space");
+ return ret;
+ }
+
+ mutex_lock(&c->sb_lock);
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
+ mutex_unlock(&c->sb_lock);
+
+ return 0;
+}
+
+int bch2_fs_freespace_init(struct bch_fs *c)
+{
+ int ret = 0;
+ bool doing_init = false;
+
+ /*
+ * We can crash during the device add path, so we need to check this on
+ * every mount:
+ */
+
+ for_each_member_device(c, ca) {
+ if (ca->mi.freespace_initialized)
+ continue;
+
+ if (!doing_init) {
+ bch_info(c, "initializing freespace");
+ doing_init = true;
+ }
+
+ ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
+ if (ret) {
+ bch2_dev_put(ca);
+ bch_err_fn(c, ret);
+ return ret;
+ }
+ }
+
+ if (doing_init) {
+ mutex_lock(&c->sb_lock);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ bch_verbose(c, "done initializing freespace");
+ }
+
+ return 0;
+}
+
+/* device removal */
+
+int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct bpos start = POS(ca->dev_idx, 0);
+ struct bpos end = POS(ca->dev_idx, U64_MAX);
+ int ret;
+
+ /*
+ * We clear the LRU and need_discard btrees first so that we don't race
+ * with bch2_do_invalidates() and bch2_do_discards()
+ */
+ ret = bch2_dev_remove_stripes(c, ca->dev_idx) ?:
+ bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_dev_usage_remove(c, ca->dev_idx);
+ bch_err_msg(ca, ret, "removing dev alloc info");
+ return ret;
+}
+
+/* Bucket IO clocks: */
+
+static int __bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
+ size_t bucket_nr, int rw)
+{
+ struct bch_fs *c = trans->c;
+
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a =
+ bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
+ int ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ return ret;
+
+ u64 now = bch2_current_io_time(c, rw);
+ if (a->v.io_time[rw] == now)
+ goto out;
+
+ a->v.io_time[rw] = now;
+
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
+ size_t bucket_nr, int rw)
+{
+ if (bch2_trans_relock(trans))
+ bch2_trans_begin(trans);
+
+ return nested_lockrestart_do(trans, __bch2_bucket_io_time_reset(trans, dev, bucket_nr, rw));
+}
+
+/* Startup/shutdown (ro/rw): */
+
+void bch2_recalc_capacity(struct bch_fs *c)
+{
+ u64 capacity = 0, reserved_sectors = 0, gc_reserve;
+ unsigned bucket_size_max = 0;
+ unsigned long ra_pages = 0;
+
+ lockdep_assert_held(&c->state_lock);
+
+ for_each_online_member(c, ca) {
+ struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
+
+ ra_pages += bdi->ra_pages;
+ }
+
+ bch2_set_ra_pages(c, ra_pages);
+
+ for_each_rw_member(c, ca) {
+ u64 dev_reserve = 0;
+
+ /*
+ * We need to reserve buckets (from the number
+ * of currently available buckets) against
+ * foreground writes, mainly so that copygc can
+ * make forward progress.
+ *
+ * We need enough to refill the various reserves
+ * from scratch - copygc will use its entire
+ * reserve all at once, then run again when
+ * its reserve is refilled (from the formerly
+ * available buckets).
+ *
+ * This reserve is just used when considering if
+ * allocations for foreground writes must wait -
+ * not -ENOSPC calculations.
+ */
+
+ dev_reserve += ca->nr_btree_reserve * 2;
+ dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
+
+ dev_reserve += 1; /* btree write point */
+ dev_reserve += 1; /* copygc write point */
+ dev_reserve += 1; /* rebalance write point */
+
+ dev_reserve *= ca->mi.bucket_size;
+
+ capacity += bucket_to_sector(ca, ca->mi.nbuckets -
+ ca->mi.first_bucket);
+
+ reserved_sectors += dev_reserve * 2;
+
+ bucket_size_max = max_t(unsigned, bucket_size_max,
+ ca->mi.bucket_size);
+ }
+
+ gc_reserve = c->opts.gc_reserve_bytes
+ ? c->opts.gc_reserve_bytes >> 9
+ : div64_u64(capacity * c->opts.gc_reserve_percent, 100);
+
+ reserved_sectors = max(gc_reserve, reserved_sectors);
+
+ reserved_sectors = min(reserved_sectors, capacity);
+
+ c->reserved = reserved_sectors;
+ c->capacity = capacity - reserved_sectors;
+
+ c->bucket_size_max = bucket_size_max;
+
+ /* Wake up in case someone was waiting for buckets */
+ closure_wake_up(&c->freelist_wait);
+}
+
+u64 bch2_min_rw_member_capacity(struct bch_fs *c)
+{
+ u64 ret = U64_MAX;
+
+ for_each_rw_member(c, ca)
+ ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
+ return ret;
+}
+
+static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct open_bucket *ob;
+ bool ret = false;
+
+ for (ob = c->open_buckets;
+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
+ ob++) {
+ spin_lock(&ob->lock);
+ if (ob->valid && !ob->on_partial_list &&
+ ob->dev == ca->dev_idx)
+ ret = true;
+ spin_unlock(&ob->lock);
+ }
+
+ return ret;
+}
+
+/* device goes ro: */
+void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
+{
+ lockdep_assert_held(&c->state_lock);
+
+ /* First, remove device from allocation groups: */
+
+ for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+ clear_bit(ca->dev_idx, c->rw_devs[i].d);
+
+ c->rw_devs_change_count++;
+
+ /*
+ * Capacity is calculated based on devices in allocation groups:
+ */
+ bch2_recalc_capacity(c);
+
+ bch2_open_buckets_stop(c, ca, false);
+
+ /*
+ * Wake up threads that were blocked on allocation, so they can notice
+ * the device can no longer be removed and the capacity has changed:
+ */
+ closure_wake_up(&c->freelist_wait);
+
+ /*
+ * journal_res_get() can block waiting for free space in the journal -
+ * it needs to notice there may not be devices to allocate from anymore:
+ */
+ wake_up(&c->journal.wait);
+
+ /* Now wait for any in flight writes: */
+
+ closure_wait_event(&c->open_buckets_wait,
+ !bch2_dev_has_open_write_point(c, ca));
+}
+
+/* device goes rw: */
+void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
+{
+ lockdep_assert_held(&c->state_lock);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+ if (ca->mi.data_allowed & (1 << i))
+ set_bit(ca->dev_idx, c->rw_devs[i].d);
+
+ c->rw_devs_change_count++;
+}
+
+void bch2_dev_allocator_background_exit(struct bch_dev *ca)
+{
+ darray_exit(&ca->discard_buckets_in_flight);
+}
+
+void bch2_dev_allocator_background_init(struct bch_dev *ca)
+{
+ mutex_init(&ca->discard_buckets_in_flight_lock);
+ INIT_WORK(&ca->discard_work, bch2_do_discards_work);
+ INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work);
+ INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work);
+}
+
+void bch2_fs_allocator_background_init(struct bch_fs *c)
+{
+ spin_lock_init(&c->freelist_lock);
+}
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
new file mode 100644
index 00000000..de25ba4e
--- /dev/null
+++ b/libbcachefs/alloc_background.h
@@ -0,0 +1,361 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_BACKGROUND_H
+#define _BCACHEFS_ALLOC_BACKGROUND_H
+
+#include "bcachefs.h"
+#include "alloc_types.h"
+#include "buckets.h"
+#include "debug.h"
+#include "super.h"
+
+/* How out of date a pointer gen is allowed to be: */
+#define BUCKET_GC_GEN_MAX 96U
+
+static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
+{
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode);
+ bool ret = ca && bucket_valid(ca, pos.offset);
+ rcu_read_unlock();
+ return ret;
+}
+
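+ /*
+ * LRU entries refer back to buckets as a packed u64: device index in
+ * the top 16 bits, bucket offset in the low 48:
+ */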
+static inline u64 bucket_to_u64(struct bpos bucket)
+{
+ return (bucket.inode << 48) | bucket.offset;
+}
+
+static inline struct bpos u64_to_bucket(u64 bucket)
+{
+ return POS(bucket >> 48, bucket & ~(~0ULL << 48));
+}
+
+static inline u8 alloc_gc_gen(struct bch_alloc_v4 a)
+{
+ return a.gen - a.oldest_gen;
+}
+
+static inline void alloc_to_bucket(struct bucket *dst, struct bch_alloc_v4 src)
+{
+ dst->gen = src.gen;
+ dst->data_type = src.data_type;
+ dst->stripe_sectors = src.stripe_sectors;
+ dst->dirty_sectors = src.dirty_sectors;
+ dst->cached_sectors = src.cached_sectors;
+ dst->stripe = src.stripe;
+}
+
+static inline void __bucket_m_to_alloc(struct bch_alloc_v4 *dst, struct bucket src)
+{
+ dst->gen = src.gen;
+ dst->data_type = src.data_type;
+ dst->stripe_sectors = src.stripe_sectors;
+ dst->dirty_sectors = src.dirty_sectors;
+ dst->cached_sectors = src.cached_sectors;
+ dst->stripe = src.stripe;
+}
+
+static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b)
+{
+ struct bch_alloc_v4 ret = {};
+ __bucket_m_to_alloc(&ret, b);
+ return ret;
+}
+
+static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type)
+{
+ switch (data_type) {
+ case BCH_DATA_cached:
+ case BCH_DATA_stripe:
+ return BCH_DATA_user;
+ default:
+ return data_type;
+ }
+}
+
+static inline bool bucket_data_type_mismatch(enum bch_data_type bucket,
+ enum bch_data_type ptr)
+{
+ return !data_type_is_empty(bucket) &&
+ bucket_data_type(bucket) != bucket_data_type(ptr);
+}
+
+/*
+ * It is my general preference to use unsigned types for unsigned quantities -
+ * however, these helpers are used in disk accounting calculations run by
+ * triggers where the output will be negated and added to an s64. unsigned is
+ * right out even though all these quantities will fit in 32 bits, since it
+ * won't be sign extended correctly; u64 will negate "correctly", but s64 is the
+ * simpler option here.
+ */
+static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a)
+{
+ return a.stripe_sectors + a.dirty_sectors + a.cached_sectors;
+}
+
+static inline s64 bch2_bucket_sectors_dirty(struct bch_alloc_v4 a)
+{
+ return a.stripe_sectors + a.dirty_sectors;
+}
+
+static inline s64 bch2_bucket_sectors(struct bch_alloc_v4 a)
+{
+ return a.data_type == BCH_DATA_cached
+ ? a.cached_sectors
+ : bch2_bucket_sectors_dirty(a);
+}
+
+static inline s64 bch2_bucket_sectors_fragmented(struct bch_dev *ca,
+ struct bch_alloc_v4 a)
+{
+ int d = bch2_bucket_sectors(a);
+
+ return d ? max(0, ca->mi.bucket_size - d) : 0;
+}
+
+static inline s64 bch2_gc_bucket_sectors_fragmented(struct bch_dev *ca, struct bucket a)
+{
+ int d = a.stripe_sectors + a.dirty_sectors;
+
+ return d ? max(0, ca->mi.bucket_size - d) : 0;
+}
+
+static inline s64 bch2_bucket_sectors_unstriped(struct bch_alloc_v4 a)
+{
+ return a.data_type == BCH_DATA_stripe ? a.dirty_sectors : 0;
+}
+
+static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
+ enum bch_data_type data_type)
+{
+ if (a.stripe)
+ return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
+ if (bch2_bucket_sectors_dirty(a))
+ return data_type;
+ if (a.cached_sectors)
+ return BCH_DATA_cached;
+ if (BCH_ALLOC_V4_NEED_DISCARD(&a))
+ return BCH_DATA_need_discard;
+ if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX)
+ return BCH_DATA_need_gc_gens;
+ return BCH_DATA_free;
+}
+
+static inline void alloc_data_type_set(struct bch_alloc_v4 *a, enum bch_data_type data_type)
+{
+ a->data_type = alloc_data_type(*a, data_type);
+}
+
+static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a)
+{
+ return a.data_type == BCH_DATA_cached
+ ? a.io_time[READ] & LRU_TIME_MAX
+ : 0;
+}
+
+#define DATA_TYPES_MOVABLE \
+ ((1U << BCH_DATA_btree)| \
+ (1U << BCH_DATA_user)| \
+ (1U << BCH_DATA_stripe))
+
+static inline bool data_type_movable(enum bch_data_type type)
+{
+ return (1U << type) & DATA_TYPES_MOVABLE;
+}
+
+static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
+ struct bch_dev *ca)
+{
+ if (a.data_type >= BCH_DATA_NR)
+ return 0;
+
+ if (!data_type_movable(a.data_type) ||
+ !bch2_bucket_sectors_fragmented(ca, a))
+ return 0;
+
+ /*
+ * avoid overflowing LRU_TIME_BITS on a corrupted fs, when
+ * bucket_sectors_dirty is (much) bigger than bucket_size
+ */
+ u64 d = min_t(s64, bch2_bucket_sectors_dirty(a),
+ ca->mi.bucket_size);
+
+ return div_u64(d * (1ULL << 31), ca->mi.bucket_size);
+}
+
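+ /*
+ * The high bits of a freespace btree key encode (gen - oldest_gen) / 16,
+ * so stale freespace entries can be detected after a bucket's gen changes:
+ */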
+static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a)
+{
+ return ((u64) alloc_gc_gen(a) >> 4) << 56;
+}
+
+static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a)
+{
+ pos.offset |= alloc_freespace_genbits(a);
+ return pos;
+}
+
+static inline unsigned alloc_v4_u64s_noerror(const struct bch_alloc_v4 *a)
+{
+ return (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
+ BCH_ALLOC_V4_U64s_V0) +
+ BCH_ALLOC_V4_NR_BACKPOINTERS(a) *
+ (sizeof(struct bch_backpointer) / sizeof(u64));
+}
+
+static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a)
+{
+ unsigned ret = alloc_v4_u64s_noerror(a);
+ BUG_ON(ret > U8_MAX - BKEY_U64s);
+ return ret;
+}
+
+static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a)
+{
+ set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v));
+}
+
+struct bkey_i_alloc_v4 *
+bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos);
+struct bkey_i_alloc_v4 *
+bch2_trans_start_alloc_update(struct btree_trans *, struct bpos,
+ enum btree_iter_update_trigger_flags);
+
+void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *);
+
+static inline const struct bch_alloc_v4 *bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *convert)
+{
+ const struct bch_alloc_v4 *ret;
+
+ if (unlikely(k.k->type != KEY_TYPE_alloc_v4))
+ goto slowpath;
+
+ ret = bkey_s_c_to_alloc_v4(k).v;
+ if (BCH_ALLOC_V4_BACKPOINTERS_START(ret) != BCH_ALLOC_V4_U64s)
+ goto slowpath;
+
+ return ret;
+slowpath:
+ __bch2_alloc_to_v4(k, convert);
+ return convert;
+}
+
+struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c);
+
+int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
+
+int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+void bch2_alloc_v4_swab(struct bkey_s);
+void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_alloc ((struct bkey_ops) { \
+ .key_validate = bch2_alloc_v1_validate, \
+ .val_to_text = bch2_alloc_to_text, \
+ .trigger = bch2_trigger_alloc, \
+ .min_val_size = 8, \
+})
+
+#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \
+ .key_validate = bch2_alloc_v2_validate, \
+ .val_to_text = bch2_alloc_to_text, \
+ .trigger = bch2_trigger_alloc, \
+ .min_val_size = 8, \
+})
+
+#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \
+ .key_validate = bch2_alloc_v3_validate, \
+ .val_to_text = bch2_alloc_to_text, \
+ .trigger = bch2_trigger_alloc, \
+ .min_val_size = 16, \
+})
+
+#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \
+ .key_validate = bch2_alloc_v4_validate, \
+ .val_to_text = bch2_alloc_to_text, \
+ .swab = bch2_alloc_v4_swab, \
+ .trigger = bch2_trigger_alloc, \
+ .min_val_size = 48, \
+})
+
+int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \
+ .key_validate = bch2_bucket_gens_validate, \
+ .val_to_text = bch2_bucket_gens_to_text, \
+})
+
+int bch2_bucket_gens_init(struct bch_fs *);
+
+static inline bool bkey_is_alloc(const struct bkey *k)
+{
+ return k->type == KEY_TYPE_alloc ||
+ k->type == KEY_TYPE_alloc_v2 ||
+ k->type == KEY_TYPE_alloc_v3;
+}
+
+int bch2_alloc_read(struct bch_fs *);
+
+int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *,
+ const struct bch_alloc_v4 *,
+ const struct bch_alloc_v4 *, unsigned);
+int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
+
+int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, bool);
+int bch2_check_alloc_info(struct bch_fs *);
+int bch2_check_alloc_to_lru_refs(struct bch_fs *);
+void bch2_dev_do_discards(struct bch_dev *);
+void bch2_do_discards(struct bch_fs *);
+
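+ /*
+ * Target roughly 1/128th of the device's buckets free or awaiting
+ * discard; returns how many cached buckets should be invalidated to get
+ * there, clamped to the number of cached buckets:
+ */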
+static inline u64 should_invalidate_buckets(struct bch_dev *ca,
+ struct bch_dev_usage u)
+{
+ u64 want_free = ca->mi.nbuckets >> 7;
+ u64 free = max_t(s64, 0,
+ u.d[BCH_DATA_free].buckets
+ + u.d[BCH_DATA_need_discard].buckets
+ - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe));
+
+ return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets);
+}
+
+void bch2_dev_do_invalidates(struct bch_dev *);
+void bch2_do_invalidates(struct bch_fs *);
+
+static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a)
+{
+ return (void *) ((u64 *) &a->v +
+ (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
+ BCH_ALLOC_V4_U64s_V0));
+}
+
+static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a)
+{
+ return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a));
+}
+
+int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64);
+int bch2_fs_freespace_init(struct bch_fs *);
+int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *);
+
+void bch2_recalc_capacity(struct bch_fs *);
+u64 bch2_min_rw_member_capacity(struct bch_fs *);
+
+void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
+void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
+
+void bch2_dev_allocator_background_exit(struct bch_dev *);
+void bch2_dev_allocator_background_init(struct bch_dev *);
+
+void bch2_fs_allocator_background_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/libbcachefs/alloc_background_format.h b/libbcachefs/alloc_background_format.h
new file mode 100644
index 00000000..befdaa95
--- /dev/null
+++ b/libbcachefs/alloc_background_format.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
+#define _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
+
+struct bch_alloc {
+ struct bch_val v;
+ __u8 fields;
+ __u8 gen;
+ __u8 data[];
+} __packed __aligned(8);
+
+#define BCH_ALLOC_FIELDS_V1() \
+ x(read_time, 16) \
+ x(write_time, 16) \
+ x(data_type, 8) \
+ x(dirty_sectors, 16) \
+ x(cached_sectors, 16) \
+ x(oldest_gen, 8) \
+ x(stripe, 32) \
+ x(stripe_redundancy, 8)
+
+enum {
+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
+ BCH_ALLOC_FIELDS_V1()
+#undef x
+};
+
+struct bch_alloc_v2 {
+ struct bch_val v;
+ __u8 nr_fields;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 data[];
+} __packed __aligned(8);
+
+#define BCH_ALLOC_FIELDS_V2() \
+ x(read_time, 64) \
+ x(write_time, 64) \
+ x(dirty_sectors, 32) \
+ x(cached_sectors, 32) \
+ x(stripe, 32) \
+ x(stripe_redundancy, 8)
+
+struct bch_alloc_v3 {
+ struct bch_val v;
+ __le64 journal_seq;
+ __le32 flags;
+ __u8 nr_fields;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 data[];
+} __packed __aligned(8);
+
+LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
+LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
+
+struct bch_alloc_v4 {
+ struct bch_val v;
+ __u64 journal_seq;
+ __u32 flags;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 stripe_redundancy;
+ __u32 dirty_sectors;
+ __u32 cached_sectors;
+ __u64 io_time[2];
+ __u32 stripe;
+ __u32 nr_external_backpointers;
+ /* end of fields in original version of alloc_v4 */
+ __u64 _fragmentation_lru; /* obsolete */
+ __u32 stripe_sectors;
+ __u32 pad;
+} __packed __aligned(8);
+
+#define BCH_ALLOC_V4_U64s_V0 6
+#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64))
+
+BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1)
+BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2)
+BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8)
+BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14)
+
+#define KEY_TYPE_BUCKET_GENS_BITS 8
+#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS)
+#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1)
+
+struct bch_bucket_gens {
+ struct bch_val v;
+ u8 gens[KEY_TYPE_BUCKET_GENS_NR];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H */
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
new file mode 100644
index 00000000..095bfe7c
--- /dev/null
+++ b/libbcachefs/alloc_foreground.c
@@ -0,0 +1,1733 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2012 Google, Inc.
+ *
+ * Foreground allocator code: allocate buckets from freelist, and allocate at
+ * sector granularity from writepoints.
+ *
+ * bch2_bucket_alloc() allocates a single bucket from a specific device.
+ *
+ * bch2_bucket_alloc_set() allocates one or more buckets from different devices
+ * in a given filesystem.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "backpointers.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
+#include "clock.h"
+#include "debug.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "io_write.h"
+#include "journal.h"
+#include "movinggc.h"
+#include "nocow_locking.h"
+#include "trace.h"
+
+#include <linux/math64.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+
+static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
+ struct mutex *lock)
+{
+ if (!mutex_trylock(lock)) {
+ bch2_trans_unlock(trans);
+ mutex_lock(lock);
+ }
+}
+
+const char * const bch2_watermarks[] = {
+#define x(t) #t,
+ BCH_WATERMARKS()
+#undef x
+ NULL
+};
+
+/*
+ * Open buckets represent a bucket that's currently being allocated from. They
+ * serve two purposes:
+ *
+ * - They track buckets that have been partially allocated, allowing for
+ * sub-bucket sized allocations - they're used by the sector allocator below
+ *
+ * - They provide a reference to the buckets they own that mark and sweep GC
+ * can find, until the new allocation has a pointer to it inserted into the
+ * btree
+ *
+ * When allocating some space with the sector allocator, the allocation comes
+ * with a reference to an open bucket - the caller is required to put that
+ * reference _after_ doing the index update that makes its allocation reachable.
+ */
+
+void bch2_reset_alloc_cursors(struct bch_fs *c)
+{
+ rcu_read_lock();
+ for_each_member_device_rcu(c, ca, NULL)
+ memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor));
+ rcu_read_unlock();
+}
+
+static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
+{
+ open_bucket_idx_t idx = ob - c->open_buckets;
+ open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
+
+ ob->hash = *slot;
+ *slot = idx;
+}
+
+static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
+{
+ open_bucket_idx_t idx = ob - c->open_buckets;
+ open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
+
+ while (*slot != idx) {
+ BUG_ON(!*slot);
+ slot = &c->open_buckets[*slot].hash;
+ }
+
+ *slot = ob->hash;
+ ob->hash = 0;
+}
+
+void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
+{
+ struct bch_dev *ca = ob_dev(c, ob);
+
+ if (ob->ec) {
+ ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
+ return;
+ }
+
+ percpu_down_read(&c->mark_lock);
+ spin_lock(&ob->lock);
+
+ ob->valid = false;
+ ob->data_type = 0;
+
+ spin_unlock(&ob->lock);
+ percpu_up_read(&c->mark_lock);
+
+ spin_lock(&c->freelist_lock);
+ bch2_open_bucket_hash_remove(c, ob);
+
+ ob->freelist = c->open_buckets_freelist;
+ c->open_buckets_freelist = ob - c->open_buckets;
+
+ c->open_buckets_nr_free++;
+ ca->nr_open_buckets--;
+ spin_unlock(&c->freelist_lock);
+
+ closure_wake_up(&c->open_buckets_wait);
+}
+
+void bch2_open_bucket_write_error(struct bch_fs *c,
+ struct open_buckets *obs,
+ unsigned dev)
+{
+ struct open_bucket *ob;
+ unsigned i;
+
+ open_bucket_for_each(c, obs, ob, i)
+ if (ob->dev == dev && ob->ec)
+ bch2_ec_bucket_cancel(c, ob);
+}
+
+static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
+{
+ struct open_bucket *ob;
+
+ BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
+
+ ob = c->open_buckets + c->open_buckets_freelist;
+ c->open_buckets_freelist = ob->freelist;
+ atomic_set(&ob->pin, 1);
+ ob->data_type = 0;
+
+ c->open_buckets_nr_free--;
+ return ob;
+}
+
+static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
+{
+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs)
+ return false;
+
+ return bch2_is_superblock_bucket(ca, b);
+}
+
+static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
+{
+ BUG_ON(c->open_buckets_partial_nr >=
+ ARRAY_SIZE(c->open_buckets_partial));
+
+ spin_lock(&c->freelist_lock);
+ rcu_read_lock();
+ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++;
+ rcu_read_unlock();
+
+ ob->on_partial_list = true;
+ c->open_buckets_partial[c->open_buckets_partial_nr++] =
+ ob - c->open_buckets;
+ spin_unlock(&c->freelist_lock);
+
+ closure_wake_up(&c->open_buckets_wait);
+ closure_wake_up(&c->freelist_wait);
+}
+
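+/*
+ * With OPEN_BUCKETS_COUNT == 1024, the reserves below work out to: allocations
+ * at the normal watermark fail with -BCH_ERR_open_buckets_empty once 512 or
+ * fewer open buckets remain free, copygc at 341, btree at 256, reclaim at 170,
+ * and interior updates may drain the pool completely.
+ */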
+static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
+{
+ switch (watermark) {
+ case BCH_WATERMARK_interior_updates:
+ return 0;
+ case BCH_WATERMARK_reclaim:
+ return OPEN_BUCKETS_COUNT / 6;
+ case BCH_WATERMARK_btree:
+ case BCH_WATERMARK_btree_copygc:
+ return OPEN_BUCKETS_COUNT / 4;
+ case BCH_WATERMARK_copygc:
+ return OPEN_BUCKETS_COUNT / 3;
+ default:
+ return OPEN_BUCKETS_COUNT / 2;
+ }
+}
+
+static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+ u64 bucket, u8 gen,
+ enum bch_watermark watermark,
+ struct bucket_alloc_state *s,
+ struct closure *cl)
+{
+ struct open_bucket *ob;
+
+ if (unlikely(is_superblock_bucket(c, ca, bucket)))
+ return NULL;
+
+ if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
+ s->skipped_nouse++;
+ return NULL;
+ }
+
+ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+ s->skipped_open++;
+ return NULL;
+ }
+
+ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
+ s->skipped_need_journal_commit++;
+ return NULL;
+ }
+
+ if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) {
+ s->skipped_nocow++;
+ return NULL;
+ }
+
+ spin_lock(&c->freelist_lock);
+
+ if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {
+ if (cl)
+ closure_wait(&c->open_buckets_wait, cl);
+
+ track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
+ spin_unlock(&c->freelist_lock);
+ return ERR_PTR(-BCH_ERR_open_buckets_empty);
+ }
+
+ /* Recheck under lock: */
+ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+ spin_unlock(&c->freelist_lock);
+ s->skipped_open++;
+ return NULL;
+ }
+
+ ob = bch2_open_bucket_alloc(c);
+
+ spin_lock(&ob->lock);
+
+ ob->valid = true;
+ ob->sectors_free = ca->mi.bucket_size;
+ ob->dev = ca->dev_idx;
+ ob->gen = gen;
+ ob->bucket = bucket;
+ spin_unlock(&ob->lock);
+
+ ca->nr_open_buckets++;
+ bch2_open_bucket_hash_add(c, ob);
+
+ track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
+ track_event_change(&c->times[BCH_TIME_blocked_allocate], false);
+
+ spin_unlock(&c->freelist_lock);
+ return ob;
+}
+
+static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
+ enum bch_watermark watermark,
+ struct bucket_alloc_state *s,
+ struct btree_iter *freespace_iter,
+ struct closure *cl)
+{
+ struct bch_fs *c = trans->c;
+ u64 b = freespace_iter->pos.offset & ~(~0ULL << 56);
+ u8 gen;
+
+ int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret)
+ return NULL;
+
+ return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl);
+}
+
+/*
+ * This path is used before the freespace btree is initialized:
+ */
+static noinline struct open_bucket *
+bch2_bucket_alloc_early(struct btree_trans *trans,
+ struct bch_dev *ca,
+ enum bch_watermark watermark,
+ struct bucket_alloc_state *s,
+ struct closure *cl)
+{
+ struct btree_iter iter, citer;
+ struct bkey_s_c k, ck;
+ struct open_bucket *ob = NULL;
+ u64 first_bucket = ca->mi.first_bucket;
+ u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
+ u64 alloc_start = max(first_bucket, *dev_alloc_cursor);
+ u64 alloc_cursor = alloc_start;
+ int ret;
+
+ /*
+ * Scan with an uncached iterator to avoid polluting the key cache. An
+ * uncached iter will return a cached key if one exists, but if not
+ * there is no other underlying protection for the associated key cache
+ * slot. To avoid racing bucket allocations, look up the cached key slot
+ * of any likely allocation candidate before attempting to proceed with
+ * the allocation. This provides proper exclusion on the associated
+ * bucket.
+ */
+again:
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
+ BTREE_ITER_slots, k, ret) {
+ u64 bucket = k.k->p.offset;
+
+ if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
+ break;
+
+ if (s->btree_bitmap != BTREE_BITMAP_ANY &&
+ s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
+ bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
+ if (s->btree_bitmap == BTREE_BITMAP_YES &&
+ bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
+ break;
+
+ bucket = sector_to_bucket(ca,
+ round_up(bucket_to_sector(ca, bucket) + 1,
+ 1ULL << ca->mi.btree_bitmap_shift));
+ bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket));
+ s->buckets_seen++;
+ s->skipped_mi_btree_bitmap++;
+ continue;
+ }
+
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
+ if (a->data_type != BCH_DATA_free)
+ continue;
+
+ /* now check the cached key to serialize concurrent allocs of the bucket */
+ ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached);
+ ret = bkey_err(ck);
+ if (ret)
+ break;
+
+ a = bch2_alloc_to_v4(ck, &a_convert);
+ if (a->data_type != BCH_DATA_free)
+ goto next;
+
+ s->buckets_seen++;
+
+ ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, a->gen, watermark, s, cl);
+next:
+ bch2_set_btree_iter_dontneed(&citer);
+ bch2_trans_iter_exit(trans, &citer);
+ if (ob)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ alloc_cursor = iter.pos.offset;
+
+ if (!ob && ret)
+ ob = ERR_PTR(ret);
+
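+ /* nothing free past the cursor - wrap around and scan once from the start: */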
+ if (!ob && alloc_start > first_bucket) {
+ alloc_cursor = alloc_start = first_bucket;
+ goto again;
+ }
+
+ *dev_alloc_cursor = alloc_cursor;
+
+ return ob;
+}
+
+static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
+ struct bch_dev *ca,
+ enum bch_watermark watermark,
+ struct bucket_alloc_state *s,
+ struct closure *cl)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct open_bucket *ob = NULL;
+ u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
+ u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor));
+ u64 alloc_cursor = alloc_start;
+ int ret;
+again:
+ for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace,
+ POS(ca->dev_idx, alloc_cursor),
+ POS(ca->dev_idx, U64_MAX),
+ 0, k, ret) {
+ /*
+ * peek normally doesn't trim extents - they can span iter.pos,
+ * which is not what we want here:
+ */
+ iter.k.size = iter.k.p.offset - iter.pos.offset;
+
+ while (iter.k.size) {
+ s->buckets_seen++;
+
+ u64 bucket = iter.pos.offset & ~(~0ULL << 56);
+ if (s->btree_bitmap != BTREE_BITMAP_ANY &&
+ s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
+ bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
+ if (s->btree_bitmap == BTREE_BITMAP_YES &&
+ bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
+ goto fail;
+
+ bucket = sector_to_bucket(ca,
+ round_up(bucket_to_sector(ca, bucket + 1),
+ 1ULL << ca->mi.btree_bitmap_shift));
+ alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56));
+
+ bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor));
+ s->skipped_mi_btree_bitmap++;
+ goto next;
+ }
+
+ ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl);
+ if (ob) {
+ if (!IS_ERR(ob))
+ *dev_alloc_cursor = iter.pos.offset;
+ bch2_set_btree_iter_dontneed(&iter);
+ break;
+ }
+
+ iter.k.size--;
+ iter.pos.offset++;
+ }
+next:
+ if (ob || ret)
+ break;
+ }
+fail:
+ bch2_trans_iter_exit(trans, &iter);
+
+ BUG_ON(ob && ret);
+
+ if (ret)
+ ob = ERR_PTR(ret);
+
+ if (!ob && alloc_start > ca->mi.first_bucket) {
+ alloc_cursor = alloc_start = ca->mi.first_bucket;
+ goto again;
+ }
+
+ return ob;
+}
+
+static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca,
+ enum bch_watermark watermark,
+ enum bch_data_type data_type,
+ struct closure *cl,
+ struct bch_dev_usage *usage,
+ struct bucket_alloc_state *s,
+ struct open_bucket *ob)
+{
+ struct printbuf buf = PRINTBUF;
+
+ printbuf_tabstop_push(&buf, 24);
+
+ prt_printf(&buf, "dev\t%s (%u)\n", ca->name, ca->dev_idx);
+ prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]);
+ prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]);
+ prt_printf(&buf, "blocking\t%u\n", cl != NULL);
+ prt_printf(&buf, "free\t%llu\n", usage->d[BCH_DATA_free].buckets);
+ prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark));
+ prt_printf(&buf, "copygc_wait\t%lu/%lli\n",
+ bch2_copygc_wait_amount(c),
+ c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now));
+ prt_printf(&buf, "seen\t%llu\n", s->buckets_seen);
+ prt_printf(&buf, "open\t%llu\n", s->skipped_open);
+ prt_printf(&buf, "need journal commit\t%llu\n", s->skipped_need_journal_commit);
+ prt_printf(&buf, "nocow\t%llu\n", s->skipped_nocow);
+ prt_printf(&buf, "nouse\t%llu\n", s->skipped_nouse);
+ prt_printf(&buf, "mi_btree_bitmap\t%llu\n", s->skipped_mi_btree_bitmap);
+
+ if (!IS_ERR(ob)) {
+ prt_printf(&buf, "allocated\t%llu\n", ob->bucket);
+ trace_bucket_alloc(c, buf.buf);
+ } else {
+ prt_printf(&buf, "err\t%s\n", bch2_err_str(PTR_ERR(ob)));
+ trace_bucket_alloc_fail(c, buf.buf);
+ }
+
+ printbuf_exit(&buf);
+}
+
+/**
+ * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
+ * @trans: transaction object
+ * @ca: device to allocate from
+ * @watermark: how important is this allocation?
+ * @data_type: BCH_DATA_journal, btree, user...
+ * @cl: if not NULL, closure to be used to wait if buckets not available
+ * @nowait: if true, do not wait for buckets to become available
+ * @usage: the current device usage, also returned to the caller
+ *
+ * Returns: an open_bucket on success, or an ERR_PTR() on failure.
+ */
+static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
+ struct bch_dev *ca,
+ enum bch_watermark watermark,
+ enum bch_data_type data_type,
+ struct closure *cl,
+ bool nowait,
+ struct bch_dev_usage *usage)
+{
+ struct bch_fs *c = trans->c;
+ struct open_bucket *ob = NULL;
+ bool freespace = READ_ONCE(ca->mi.freespace_initialized);
+ u64 avail;
+ struct bucket_alloc_state s = {
+ .btree_bitmap = data_type == BCH_DATA_btree,
+ };
+ bool waiting = nowait;
+again:
+ bch2_dev_usage_read_fast(ca, usage);
+ avail = dev_buckets_free(ca, *usage, watermark);
+
+ if (usage->d[BCH_DATA_need_discard].buckets > avail)
+ bch2_dev_do_discards(ca);
+
+ if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
+ bch2_gc_gens_async(c);
+
+ if (should_invalidate_buckets(ca, *usage))
+ bch2_dev_do_invalidates(ca);
+
+ if (!avail) {
+ if (watermark > BCH_WATERMARK_normal &&
+ c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
+ goto alloc;
+
+ if (cl && !waiting) {
+ closure_wait(&c->freelist_wait, cl);
+ waiting = true;
+ goto again;
+ }
+
+ track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
+
+ ob = ERR_PTR(-BCH_ERR_freelist_empty);
+ goto err;
+ }
+
+ if (waiting)
+ closure_wake_up(&c->freelist_wait);
+alloc:
+ ob = likely(freespace)
+ ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
+ : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);
+
+ if (s.skipped_need_journal_commit * 2 > avail)
+ bch2_journal_flush_async(&c->journal, NULL);
+
+ if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) {
+ s.btree_bitmap = BTREE_BITMAP_ANY;
+ goto alloc;
+ }
+
+ if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
+ freespace = false;
+ goto alloc;
+ }
+err:
+ if (!ob)
+ ob = ERR_PTR(-BCH_ERR_no_buckets_found);
+
+ if (!IS_ERR(ob))
+ ob->data_type = data_type;
+
+ if (!IS_ERR(ob))
+ count_event(c, bucket_alloc);
+ else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
+ count_event(c, bucket_alloc_fail);
+
+ if (!IS_ERR(ob)
+ ? trace_bucket_alloc_enabled()
+ : trace_bucket_alloc_fail_enabled())
+ trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob);
+
+ return ob;
+}
+
+struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
+ enum bch_watermark watermark,
+ enum bch_data_type data_type,
+ struct closure *cl)
+{
+ struct bch_dev_usage usage;
+ struct open_bucket *ob;
+
+ bch2_trans_do(c,
+ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
+ data_type, cl, false, &usage)));
+ return ob;
+}
+
+static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
+ unsigned l, unsigned r)
+{
+ return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
+ (stripe->next_alloc[l] < stripe->next_alloc[r]));
+}
+
+#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
+
+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
+ struct dev_stripe_state *stripe,
+ struct bch_devs_mask *devs)
+{
+ struct dev_alloc_list ret = { .nr = 0 };
+ unsigned i;
+
+ for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
+ ret.devs[ret.nr++] = i;
+
+ bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
+ return ret;
+}
+
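+/*
+ * Stride-scheduling-style device balancing: each allocation advances the
+ * chosen device's next_alloc counter by an amount inversely proportional to
+ * its free space, and bch2_dev_alloc_list() above sorts devices by next_alloc
+ * ascending - so over time devices receive new allocations roughly in
+ * proportion to their free space.
+ */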
+static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
+ struct dev_stripe_state *stripe,
+ struct bch_dev_usage *usage)
+{
+ u64 *v = stripe->next_alloc + ca->dev_idx;
+ u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal);
+ u64 free_space_inv = free_space
+ ? div64_u64(1ULL << 48, free_space)
+ : 1ULL << 48;
+ u64 scale = *v / 4;
+
+ if (*v + free_space_inv >= *v)
+ *v += free_space_inv;
+ else
+ *v = U64_MAX;
+
+ for (v = stripe->next_alloc;
+ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
+ *v = *v < scale ? 0 : *v - scale;
+}
+
+void bch2_dev_stripe_increment(struct bch_dev *ca,
+ struct dev_stripe_state *stripe)
+{
+ struct bch_dev_usage usage;
+
+ bch2_dev_usage_read_fast(ca, &usage);
+ bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
+}
+
+static int add_new_bucket(struct bch_fs *c,
+ struct open_buckets *ptrs,
+ struct bch_devs_mask *devs_may_alloc,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ struct open_bucket *ob)
+{
+ unsigned durability = ob_dev(c, ob)->mi.durability;
+
+ BUG_ON(*nr_effective >= nr_replicas);
+
+ __clear_bit(ob->dev, devs_may_alloc->d);
+ *nr_effective += durability;
+ *have_cache |= !durability;
+
+ ob_push(c, ptrs, ob);
+
+ if (*nr_effective >= nr_replicas)
+ return 1;
+ if (ob->ec)
+ return 1;
+ return 0;
+}
+
+int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
+ struct open_buckets *ptrs,
+ struct dev_stripe_state *stripe,
+ struct bch_devs_mask *devs_may_alloc,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ enum bch_write_flags flags,
+ enum bch_data_type data_type,
+ enum bch_watermark watermark,
+ struct closure *cl)
+{
+ struct bch_fs *c = trans->c;
+ struct dev_alloc_list devs_sorted =
+ bch2_dev_alloc_list(c, stripe, devs_may_alloc);
+ int ret = -BCH_ERR_insufficient_devices;
+
+ BUG_ON(*nr_effective >= nr_replicas);
+
+ for (unsigned i = 0; i < devs_sorted.nr; i++) {
+ struct bch_dev_usage usage;
+ struct open_bucket *ob;
+
+ unsigned dev = devs_sorted.devs[i];
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev);
+ if (!ca)
+ continue;
+
+ if (!ca->mi.durability && *have_cache) {
+ bch2_dev_put(ca);
+ continue;
+ }
+
+ ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type,
+ cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage);
+ if (!IS_ERR(ob))
+ bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
+ bch2_dev_put(ca);
+
+ if (IS_ERR(ob)) {
+ ret = PTR_ERR(ob);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
+ break;
+ continue;
+ }
+
+ if (add_new_bucket(c, ptrs, devs_may_alloc,
+ nr_replicas, nr_effective,
+ have_cache, ob)) {
+ ret = 0;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/* Allocate from stripes: */
+
+/*
+ * if we can't allocate a new stripe because there are already too many
+ * partially filled stripes, force allocating from an existing stripe even when
+ * it's to a device we don't want:
+ */
+
+static int bucket_alloc_from_stripe(struct btree_trans *trans,
+ struct open_buckets *ptrs,
+ struct write_point *wp,
+ struct bch_devs_mask *devs_may_alloc,
+ u16 target,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ enum bch_watermark watermark,
+ enum bch_write_flags flags,
+ struct closure *cl)
+{
+ struct bch_fs *c = trans->c;
+ struct dev_alloc_list devs_sorted;
+ struct ec_stripe_head *h;
+ struct open_bucket *ob;
+ unsigned i, ec_idx;
+ int ret = 0;
+
+ if (nr_replicas < 2)
+ return 0;
+
+ if (ec_open_bucket(c, ptrs))
+ return 0;
+
+ h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
+ if (IS_ERR(h))
+ return PTR_ERR(h);
+ if (!h)
+ return 0;
+
+ devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
+
+ for (i = 0; i < devs_sorted.nr; i++)
+ for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
+ if (!h->s->blocks[ec_idx])
+ continue;
+
+ ob = c->open_buckets + h->s->blocks[ec_idx];
+ if (ob->dev == devs_sorted.devs[i] &&
+ !test_and_set_bit(ec_idx, h->s->blocks_allocated))
+ goto got_bucket;
+ }
+ goto out_put_head;
+got_bucket:
+ ob->ec_idx = ec_idx;
+ ob->ec = h->s;
+ ec_stripe_new_get(h->s, STRIPE_REF_io);
+
+ ret = add_new_bucket(c, ptrs, devs_may_alloc,
+ nr_replicas, nr_effective,
+ have_cache, ob);
+out_put_head:
+ bch2_ec_stripe_head_put(c, h);
+ return ret;
+}
+
+/* Sector allocator */
+
+static bool want_bucket(struct bch_fs *c,
+ struct write_point *wp,
+ struct bch_devs_mask *devs_may_alloc,
+ bool *have_cache, bool ec,
+ struct open_bucket *ob)
+{
+ struct bch_dev *ca = ob_dev(c, ob);
+
+ if (!test_bit(ob->dev, devs_may_alloc->d))
+ return false;
+
+ if (ob->data_type != wp->data_type)
+ return false;
+
+ if (!ca->mi.durability &&
+ (wp->data_type == BCH_DATA_btree || ec || *have_cache))
+ return false;
+
+ if (ec != (ob->ec != NULL))
+ return false;
+
+ return true;
+}
+
+static int bucket_alloc_set_writepoint(struct bch_fs *c,
+ struct open_buckets *ptrs,
+ struct write_point *wp,
+ struct bch_devs_mask *devs_may_alloc,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ bool ec)
+{
+ struct open_buckets ptrs_skip = { .nr = 0 };
+ struct open_bucket *ob;
+ unsigned i;
+ int ret = 0;
+
+ open_bucket_for_each(c, &wp->ptrs, ob, i) {
+ if (!ret && want_bucket(c, wp, devs_may_alloc,
+ have_cache, ec, ob))
+ ret = add_new_bucket(c, ptrs, devs_may_alloc,
+ nr_replicas, nr_effective,
+ have_cache, ob);
+ else
+ ob_push(c, &ptrs_skip, ob);
+ }
+ wp->ptrs = ptrs_skip;
+
+ return ret;
+}
+
+static int bucket_alloc_set_partial(struct bch_fs *c,
+ struct open_buckets *ptrs,
+ struct write_point *wp,
+ struct bch_devs_mask *devs_may_alloc,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache, bool ec,
+ enum bch_watermark watermark)
+{
+ int i, ret = 0;
+
+ if (!c->open_buckets_partial_nr)
+ return 0;
+
+ spin_lock(&c->freelist_lock);
+
+ if (!c->open_buckets_partial_nr)
+ goto unlock;
+
+ for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
+ struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
+
+ if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
+ struct bch_dev *ca = ob_dev(c, ob);
+ struct bch_dev_usage usage;
+ u64 avail;
+
+ bch2_dev_usage_read_fast(ca, &usage);
+ avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets;
+ if (!avail)
+ continue;
+
+ array_remove_item(c->open_buckets_partial,
+ c->open_buckets_partial_nr,
+ i);
+ ob->on_partial_list = false;
+
+ rcu_read_lock();
+ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
+ rcu_read_unlock();
+
+ ret = add_new_bucket(c, ptrs, devs_may_alloc,
+ nr_replicas, nr_effective,
+ have_cache, ob);
+ if (ret)
+ break;
+ }
+ }
+unlock:
+ spin_unlock(&c->freelist_lock);
+ return ret;
+}
+
+static int __open_bucket_add_buckets(struct btree_trans *trans,
+ struct open_buckets *ptrs,
+ struct write_point *wp,
+ struct bch_devs_list *devs_have,
+ u16 target,
+ bool erasure_code,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ enum bch_watermark watermark,
+ enum bch_write_flags flags,
+ struct closure *_cl)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_devs_mask devs;
+ struct open_bucket *ob;
+ struct closure *cl = NULL;
+ unsigned i;
+ int ret;
+
+ devs = target_rw_devs(c, wp->data_type, target);
+
+ /* Don't allocate from devices we already have pointers to: */
+ darray_for_each(*devs_have, i)
+ __clear_bit(*i, devs.d);
+
+ open_bucket_for_each(c, ptrs, ob, i)
+ __clear_bit(ob->dev, devs.d);
+
+ ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
+ nr_replicas, nr_effective,
+ have_cache, erasure_code);
+ if (ret)
+ return ret;
+
+ ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
+ nr_replicas, nr_effective,
+ have_cache, erasure_code, watermark);
+ if (ret)
+ return ret;
+
+ if (erasure_code) {
+ ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
+ target,
+ nr_replicas, nr_effective,
+ have_cache,
+ watermark, flags, _cl);
+ } else {
+retry_blocking:
+ /*
+ * Try nonblocking first, so that if one device is full we'll try from
+ * other devices:
+ */
+ ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
+ nr_replicas, nr_effective, have_cache,
+ flags, wp->data_type, watermark, cl);
+ if (ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
+ !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
+ !cl && _cl) {
+ cl = _cl;
+ goto retry_blocking;
+ }
+ }
+
+ return ret;
+}
+
+static int open_bucket_add_buckets(struct btree_trans *trans,
+ struct open_buckets *ptrs,
+ struct write_point *wp,
+ struct bch_devs_list *devs_have,
+ u16 target,
+ unsigned erasure_code,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ enum bch_watermark watermark,
+ enum bch_write_flags flags,
+ struct closure *cl)
+{
+ int ret;
+
+ if (erasure_code && !ec_open_bucket(trans->c, ptrs)) {
+ ret = __open_bucket_add_buckets(trans, ptrs, wp,
+ devs_have, target, erasure_code,
+ nr_replicas, nr_effective, have_cache,
+ watermark, flags, cl);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
+ bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
+ bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+ return ret;
+ if (*nr_effective >= nr_replicas)
+ return 0;
+ }
+
+ ret = __open_bucket_add_buckets(trans, ptrs, wp,
+ devs_have, target, false,
+ nr_replicas, nr_effective, have_cache,
+ watermark, flags, cl);
+ return ret < 0 ? ret : 0;
+}
+
+/**
+ * should_drop_bucket - check if this open_bucket should go away
+ * @ob: open_bucket to predicate on
+ * @c: filesystem handle
+ * @ca: if set, we're killing buckets for a particular device
+ * @ec: if true, we're shutting down erasure coding and killing all ec
+ * open_buckets; if neither @ca nor @ec is set, every open_bucket is dropped
+ * Returns: true if we should kill this open_bucket
+ *
+ * We're killing open_buckets because we're shutting down a device, erasure
+ * coding, or the entire filesystem - check if this open_bucket matches:
+ */
+static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
+ struct bch_dev *ca, bool ec)
+{
+ if (ec) {
+ return ob->ec != NULL;
+ } else if (ca) {
+ bool drop = ob->dev == ca->dev_idx;
+ struct open_bucket *ob2;
+ unsigned i;
+
+ if (!drop && ob->ec) {
+ unsigned nr_blocks;
+
+ mutex_lock(&ob->ec->lock);
+ nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;
+
+ for (i = 0; i < nr_blocks; i++) {
+ if (!ob->ec->blocks[i])
+ continue;
+
+ ob2 = c->open_buckets + ob->ec->blocks[i];
+ drop |= ob2->dev == ca->dev_idx;
+ }
+ mutex_unlock(&ob->ec->lock);
+ }
+
+ return drop;
+ } else {
+ return true;
+ }
+}
+
+static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
+ bool ec, struct write_point *wp)
+{
+ struct open_buckets ptrs = { .nr = 0 };
+ struct open_bucket *ob;
+ unsigned i;
+
+ mutex_lock(&wp->lock);
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ if (should_drop_bucket(ob, c, ca, ec))
+ bch2_open_bucket_put(c, ob);
+ else
+ ob_push(c, &ptrs, ob);
+ wp->ptrs = ptrs;
+ mutex_unlock(&wp->lock);
+}
+
+void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
+ bool ec)
+{
+ unsigned i;
+
+ /* Next, close write points that point to this device... */
+ for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
+ bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
+
+ bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
+ bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
+ bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
+
+ mutex_lock(&c->btree_reserve_cache_lock);
+ while (c->btree_reserve_cache_nr) {
+ struct btree_alloc *a =
+ &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
+
+ bch2_open_buckets_put(c, &a->ob);
+ }
+ mutex_unlock(&c->btree_reserve_cache_lock);
+
+ spin_lock(&c->freelist_lock);
+ i = 0;
+ while (i < c->open_buckets_partial_nr) {
+ struct open_bucket *ob =
+ c->open_buckets + c->open_buckets_partial[i];
+
+ if (should_drop_bucket(ob, c, ca, ec)) {
+ --c->open_buckets_partial_nr;
+ swap(c->open_buckets_partial[i],
+ c->open_buckets_partial[c->open_buckets_partial_nr]);
+
+ ob->on_partial_list = false;
+
+ rcu_read_lock();
+ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
+ rcu_read_unlock();
+
+ spin_unlock(&c->freelist_lock);
+ bch2_open_bucket_put(c, ob);
+ spin_lock(&c->freelist_lock);
+ } else {
+ i++;
+ }
+ }
+ spin_unlock(&c->freelist_lock);
+
+ bch2_ec_stop_dev(c, ca);
+}
+
+static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
+ unsigned long write_point)
+{
+ unsigned hash =
+ hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
+
+ return &c->write_points_hash[hash];
+}
+
+static struct write_point *__writepoint_find(struct hlist_head *head,
+ unsigned long write_point)
+{
+ struct write_point *wp;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(wp, head, node)
+ if (wp->write_point == write_point)
+ goto out;
+ wp = NULL;
+out:
+ rcu_read_unlock();
+ return wp;
+}
+
+static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
+{
+ u64 stranded = c->write_points_nr * c->bucket_size_max;
+ u64 free = bch2_fs_usage_read_short(c).free;
+
+ return stranded * factor > free;
+}
+
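+/*
+ * too_many_writepoints() is called with factor 32 here - new write points are
+ * only added while the space potentially stranded in their partially written
+ * buckets (write_points_nr * bucket_size_max) stays under 1/32 of free space -
+ * and with factor 8 in try_decrease_writepoints(), which starts reclaiming
+ * write points once stranded space exceeds 1/8 of free space.
+ */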
+static bool try_increase_writepoints(struct bch_fs *c)
+{
+ struct write_point *wp;
+
+ if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
+ too_many_writepoints(c, 32))
+ return false;
+
+ wp = c->write_points + c->write_points_nr++;
+ hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
+ return true;
+}
+
+static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
+{
+ struct bch_fs *c = trans->c;
+ struct write_point *wp;
+ struct open_bucket *ob;
+ unsigned i;
+
+ mutex_lock(&c->write_points_hash_lock);
+ if (c->write_points_nr < old_nr) {
+ mutex_unlock(&c->write_points_hash_lock);
+ return true;
+ }
+
+ if (c->write_points_nr == 1 ||
+ !too_many_writepoints(c, 8)) {
+ mutex_unlock(&c->write_points_hash_lock);
+ return false;
+ }
+
+ wp = c->write_points + --c->write_points_nr;
+
+ hlist_del_rcu(&wp->node);
+ mutex_unlock(&c->write_points_hash_lock);
+
+ bch2_trans_mutex_lock_norelock(trans, &wp->lock);
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ open_bucket_free_unused(c, ob);
+ wp->ptrs.nr = 0;
+ mutex_unlock(&wp->lock);
+ return true;
+}
+
+static struct write_point *writepoint_find(struct btree_trans *trans,
+ unsigned long write_point)
+{
+ struct bch_fs *c = trans->c;
+ struct write_point *wp, *oldest;
+ struct hlist_head *head;
+
+ if (!(write_point & 1UL)) {
+ wp = (struct write_point *) write_point;
+ bch2_trans_mutex_lock_norelock(trans, &wp->lock);
+ return wp;
+ }
+
+ head = writepoint_hash(c, write_point);
+restart_find:
+ wp = __writepoint_find(head, write_point);
+ if (wp) {
+lock_wp:
+ bch2_trans_mutex_lock_norelock(trans, &wp->lock);
+ if (wp->write_point == write_point)
+ goto out;
+ mutex_unlock(&wp->lock);
+ goto restart_find;
+ }
+restart_find_oldest:
+ oldest = NULL;
+ for (wp = c->write_points;
+ wp < c->write_points + c->write_points_nr; wp++)
+ if (!oldest || time_before64(wp->last_used, oldest->last_used))
+ oldest = wp;
+
+ bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
+ bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
+ if (oldest >= c->write_points + c->write_points_nr ||
+ try_increase_writepoints(c)) {
+ mutex_unlock(&c->write_points_hash_lock);
+ mutex_unlock(&oldest->lock);
+ goto restart_find_oldest;
+ }
+
+ wp = __writepoint_find(head, write_point);
+ if (wp && wp != oldest) {
+ mutex_unlock(&c->write_points_hash_lock);
+ mutex_unlock(&oldest->lock);
+ goto lock_wp;
+ }
+
+ wp = oldest;
+ hlist_del_rcu(&wp->node);
+ wp->write_point = write_point;
+ hlist_add_head_rcu(&wp->node, head);
+ mutex_unlock(&c->write_points_hash_lock);
+out:
+ wp->last_used = local_clock();
+ return wp;
+}
+
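+/*
+ * If device durability pushed us past the requested replication count, move
+ * enough buckets out of @ptrs that we don't write more copies than asked for;
+ * the caller frees them as unused:
+ */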
+static noinline void
+deallocate_extra_replicas(struct bch_fs *c,
+ struct open_buckets *ptrs,
+ struct open_buckets *ptrs_no_use,
+ unsigned extra_replicas)
+{
+ struct open_buckets ptrs2 = { 0 };
+ struct open_bucket *ob;
+ unsigned i;
+
+ open_bucket_for_each(c, ptrs, ob, i) {
+ unsigned d = ob_dev(c, ob)->mi.durability;
+
+ if (d && d <= extra_replicas) {
+ extra_replicas -= d;
+ ob_push(c, ptrs_no_use, ob);
+ } else {
+ ob_push(c, &ptrs2, ob);
+ }
+ }
+
+ *ptrs = ptrs2;
+}
+
+/*
+ * Get us a write point with open buckets we can allocate from, and return
+ * with it locked:
+ */
+int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
+ unsigned target,
+ unsigned erasure_code,
+ struct write_point_specifier write_point,
+ struct bch_devs_list *devs_have,
+ unsigned nr_replicas,
+ unsigned nr_replicas_required,
+ enum bch_watermark watermark,
+ enum bch_write_flags flags,
+ struct closure *cl,
+ struct write_point **wp_ret)
+{
+ struct bch_fs *c = trans->c;
+ struct write_point *wp;
+ struct open_bucket *ob;
+ struct open_buckets ptrs;
+ unsigned nr_effective, write_points_nr;
+ bool have_cache;
+ int ret;
+ int i;
+
+ if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
+ erasure_code = false;
+
+ BUG_ON(!nr_replicas || !nr_replicas_required);
+retry:
+ ptrs.nr = 0;
+ nr_effective = 0;
+ write_points_nr = c->write_points_nr;
+ have_cache = false;
+
+ *wp_ret = wp = writepoint_find(trans, write_point.v);
+
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ goto err;
+
+ /* metadata may not allocate on cache devices: */
+ if (wp->data_type != BCH_DATA_user)
+ have_cache = true;
+
+ if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
+ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+ target, erasure_code,
+ nr_replicas, &nr_effective,
+ &have_cache, watermark,
+ flags, NULL);
+ if (!ret ||
+ bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto alloc_done;
+
+ /* Don't retry from all devices if we're out of open buckets: */
+ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
+ int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+ target, erasure_code,
+ nr_replicas, &nr_effective,
+ &have_cache, watermark,
+ flags, cl);
+ if (!ret2 ||
+ bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
+ ret = ret2;
+ goto alloc_done;
+ }
+ }
+
+ /*
+ * Only try to allocate cache (durability = 0 devices) from the
+ * specified target:
+ */
+ have_cache = true;
+
+ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+ 0, erasure_code,
+ nr_replicas, &nr_effective,
+ &have_cache, watermark,
+ flags, cl);
+ } else {
+ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+ target, erasure_code,
+ nr_replicas, &nr_effective,
+ &have_cache, watermark,
+ flags, cl);
+ }
+alloc_done:
+ BUG_ON(!ret && nr_effective < nr_replicas);
+
+ if (erasure_code && !ec_open_bucket(c, &ptrs))
+ pr_debug("failed to get ec bucket: ret %i", ret);
+
+ if (ret == -BCH_ERR_insufficient_devices &&
+ nr_effective >= nr_replicas_required)
+ ret = 0;
+
+ if (ret)
+ goto err;
+
+ if (nr_effective > nr_replicas)
+ deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
+
+ /* Free buckets we didn't use: */
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ open_bucket_free_unused(c, ob);
+
+ wp->ptrs = ptrs;
+
+ wp->sectors_free = UINT_MAX;
+
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
+
+ BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
+
+ return 0;
+err:
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ if (ptrs.nr < ARRAY_SIZE(ptrs.v))
+ ob_push(c, &ptrs, ob);
+ else
+ open_bucket_free_unused(c, ob);
+ wp->ptrs = ptrs;
+
+ mutex_unlock(&wp->lock);
+
+ if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
+ try_decrease_writepoints(trans, write_points_nr))
+ goto retry;
+
+ if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+ ret = -BCH_ERR_bucket_alloc_blocked;
+
+ if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) &&
+ bch2_err_matches(ret, BCH_ERR_freelist_empty))
+ ret = -BCH_ERR_bucket_alloc_blocked;
+
+ return ret;
+}
+
+struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
+{
+ struct bch_dev *ca = ob_dev(c, ob);
+
+ return (struct bch_extent_ptr) {
+ .type = 1 << BCH_EXTENT_ENTRY_ptr,
+ .gen = ob->gen,
+ .dev = ob->dev,
+ .offset = bucket_to_sector(ca, ob->bucket) +
+ ca->mi.bucket_size -
+ ob->sectors_free,
+ };
+}
+
+/*
+ * Append pointers to the space we just allocated to @k, and mark @sectors space
+ * as allocated out of @ob
+ */
+void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
+ struct bkey_i *k, unsigned sectors,
+ bool cached)
+{
+ bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
+}
+
+void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
+{
+ bch2_alloc_sectors_done_inlined(c, wp);
+}
+
+static inline void writepoint_init(struct write_point *wp,
+ enum bch_data_type type)
+{
+ mutex_init(&wp->lock);
+ wp->data_type = type;
+
+ INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
+ INIT_LIST_HEAD(&wp->writes);
+ spin_lock_init(&wp->writes_lock);
+}
+
+void bch2_fs_allocator_foreground_init(struct bch_fs *c)
+{
+ struct open_bucket *ob;
+ struct write_point *wp;
+
+ mutex_init(&c->write_points_hash_lock);
+ c->write_points_nr = ARRAY_SIZE(c->write_points);
+
+ /* open bucket 0 is a sentinel NULL: */
+ spin_lock_init(&c->open_buckets[0].lock);
+
+ for (ob = c->open_buckets + 1;
+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
+ spin_lock_init(&ob->lock);
+ c->open_buckets_nr_free++;
+
+ ob->freelist = c->open_buckets_freelist;
+ c->open_buckets_freelist = ob - c->open_buckets;
+ }
+
+ writepoint_init(&c->btree_write_point, BCH_DATA_btree);
+ writepoint_init(&c->rebalance_write_point, BCH_DATA_user);
+ writepoint_init(&c->copygc_write_point, BCH_DATA_user);
+
+ for (wp = c->write_points;
+ wp < c->write_points + c->write_points_nr; wp++) {
+ writepoint_init(wp, BCH_DATA_user);
+
+ wp->last_used = local_clock();
+ wp->write_point = (unsigned long) wp;
+ hlist_add_head_rcu(&wp->node,
+ writepoint_hash(c, wp->write_point));
+ }
+}
+
+void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
+{
+ struct bch_dev *ca = ob_dev(c, ob);
+ unsigned data_type = ob->data_type;
+ barrier(); /* READ_ONCE() doesn't work on bitfields */
+
+ prt_printf(out, "%zu ref %u ",
+ ob - c->open_buckets,
+ atomic_read(&ob->pin));
+ bch2_prt_data_type(out, data_type);
+ prt_printf(out, " %u:%llu gen %u allocated %u/%u",
+ ob->dev, ob->bucket, ob->gen,
+ ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
+ if (ob->ec)
+ prt_printf(out, " ec idx %llu", ob->ec->idx);
+ if (ob->on_partial_list)
+ prt_str(out, " partial");
+ prt_newline(out);
+}
+
+void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bch_dev *ca)
+{
+ struct open_bucket *ob;
+
+ out->atomic++;
+
+ for (ob = c->open_buckets;
+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
+ ob++) {
+ spin_lock(&ob->lock);
+ if (ob->valid && (!ca || ob->dev == ca->dev_idx))
+ bch2_open_bucket_to_text(out, c, ob);
+ spin_unlock(&ob->lock);
+ }
+
+ --out->atomic;
+}
+
+void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ unsigned i;
+
+ out->atomic++;
+ spin_lock(&c->freelist_lock);
+
+ for (i = 0; i < c->open_buckets_partial_nr; i++)
+ bch2_open_bucket_to_text(out, c,
+ c->open_buckets + c->open_buckets_partial[i]);
+
+ spin_unlock(&c->freelist_lock);
+ --out->atomic;
+}
+
+static const char * const bch2_write_point_states[] = {
+#define x(n) #n,
+ WRITE_POINT_STATES()
+#undef x
+ NULL
+};
+
+static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
+ struct write_point *wp)
+{
+ struct open_bucket *ob;
+ unsigned i;
+
+ prt_printf(out, "%lu: ", wp->write_point);
+ prt_human_readable_u64(out, wp->sectors_allocated);
+
+ prt_printf(out, " last wrote: ");
+ bch2_pr_time_units(out, sched_clock() - wp->last_used);
+
+ for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
+ prt_printf(out, " %s: ", bch2_write_point_states[i]);
+ bch2_pr_time_units(out, wp->time[i]);
+ }
+
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ bch2_open_bucket_to_text(out, c, ob);
+ printbuf_indent_sub(out, 2);
+}
+
+void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct write_point *wp;
+
+ prt_str(out, "Foreground write points\n");
+ for (wp = c->write_points;
+ wp < c->write_points + ARRAY_SIZE(c->write_points);
+ wp++)
+ bch2_write_point_to_text(out, c, wp);
+
+ prt_str(out, "Copygc write point\n");
+ bch2_write_point_to_text(out, c, &c->copygc_write_point);
+
+ prt_str(out, "Rebalance write point\n");
+ bch2_write_point_to_text(out, c, &c->rebalance_write_point);
+
+ prt_str(out, "Btree write point\n");
+ bch2_write_point_to_text(out, c, &c->btree_write_point);
+}
+
+void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ unsigned nr[BCH_DATA_NR];
+
+ memset(nr, 0, sizeof(nr));
+
+ for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
+ nr[c->open_buckets[i].data_type]++;
+
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 24);
+
+ prt_printf(out, "capacity\t%llu\n", c->capacity);
+ prt_printf(out, "reserved\t%llu\n", c->reserved);
+ prt_printf(out, "hidden\t%llu\n", percpu_u64_get(&c->usage->hidden));
+ prt_printf(out, "btree\t%llu\n", percpu_u64_get(&c->usage->btree));
+ prt_printf(out, "data\t%llu\n", percpu_u64_get(&c->usage->data));
+ prt_printf(out, "cached\t%llu\n", percpu_u64_get(&c->usage->cached));
+ prt_printf(out, "reserved\t%llu\n", percpu_u64_get(&c->usage->reserved));
+ prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved));
+ prt_printf(out, "nr_inodes\t%llu\n", percpu_u64_get(&c->usage->nr_inodes));
+
+ prt_newline(out);
+ prt_printf(out, "freelist_wait\t%s\n", c->freelist_wait.list.first ? "waiting" : "empty");
+ prt_printf(out, "open buckets allocated\t%i\n", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free);
+ prt_printf(out, "open buckets total\t%u\n", OPEN_BUCKETS_COUNT);
+ prt_printf(out, "open_buckets_wait\t%s\n", c->open_buckets_wait.list.first ? "waiting" : "empty");
+ prt_printf(out, "open_buckets_btree\t%u\n", nr[BCH_DATA_btree]);
+ prt_printf(out, "open_buckets_user\t%u\n", nr[BCH_DATA_user]);
+ prt_printf(out, "btree reserve cache\t%u\n", c->btree_reserve_cache_nr);
+}
+
+void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+ struct bch_dev_usage stats = bch2_dev_usage_read(ca);
+ unsigned nr[BCH_DATA_NR];
+
+ memset(nr, 0, sizeof(nr));
+
+ for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
+ nr[c->open_buckets[i].data_type]++;
+
+ bch2_dev_usage_to_text(out, ca, &stats);
+
+ prt_newline(out);
+
+ prt_printf(out, "reserves:\n");
+ for (unsigned i = 0; i < BCH_WATERMARK_NR; i++)
+ prt_printf(out, "%s\t%llu\r\n", bch2_watermarks[i], bch2_dev_buckets_reserved(ca, i));
+
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 12);
+ printbuf_tabstop_push(out, 16);
+
+ prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets);
+ prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats));
+}
+
+static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
+{
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "Allocator stuck? Waited for %u seconds\n",
+ c->opts.allocator_stuck_timeout);
+
+ prt_printf(&buf, "Allocator debug:\n");
+ printbuf_indent_add(&buf, 2);
+ bch2_fs_alloc_debug_to_text(&buf, c);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+
+ for_each_online_member(c, ca) {
+ prt_printf(&buf, "Dev %u:\n", ca->dev_idx);
+ printbuf_indent_add(&buf, 2);
+ bch2_dev_alloc_debug_to_text(&buf, ca);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+ }
+
+ prt_printf(&buf, "Copygc debug:\n");
+ printbuf_indent_add(&buf, 2);
+ bch2_copygc_wait_to_text(&buf, c);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "Journal debug:\n");
+ printbuf_indent_add(&buf, 2);
+ bch2_journal_debug_to_text(&buf, &c->journal);
+ printbuf_indent_sub(&buf, 2);
+
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+}
+
+static inline unsigned allocator_wait_timeout(struct bch_fs *c)
+{
+ if (c->allocator_last_stuck &&
+ time_after(c->allocator_last_stuck + HZ * 60 * 2, jiffies))
+ return 0;
+
+ return c->opts.allocator_stuck_timeout * HZ;
+}
+
+void __bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl)
+{
+ unsigned t = allocator_wait_timeout(c);
+
+ if (t && closure_sync_timeout(cl, t)) {
+ c->allocator_last_stuck = jiffies;
+ bch2_print_allocator_stuck(c);
+ }
+
+ closure_sync(cl);
+}
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
new file mode 100644
index 00000000..4f87745d
--- /dev/null
+++ b/libbcachefs/alloc_foreground.h
@@ -0,0 +1,240 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_FOREGROUND_H
+#define _BCACHEFS_ALLOC_FOREGROUND_H
+
+#include "bcachefs.h"
+#include "alloc_types.h"
+#include "extents.h"
+#include "sb-members.h"
+
+#include <linux/hash.h>
+
+struct bkey;
+struct bch_dev;
+struct bch_fs;
+struct bch_devs_list;
+
+extern const char * const bch2_watermarks[];
+
+void bch2_reset_alloc_cursors(struct bch_fs *);
+
+struct dev_alloc_list {
+ unsigned nr;
+ u8 devs[BCH_SB_MEMBERS_MAX];
+};
+
+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
+ struct dev_stripe_state *,
+ struct bch_devs_mask *);
+void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);
+
+static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob)
+{
+ return bch2_dev_have_ref(c, ob->dev);
+}
+
+struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *,
+ enum bch_watermark, enum bch_data_type,
+ struct closure *);
+
+static inline void ob_push(struct bch_fs *c, struct open_buckets *obs,
+ struct open_bucket *ob)
+{
+ BUG_ON(obs->nr >= ARRAY_SIZE(obs->v));
+
+ obs->v[obs->nr++] = ob - c->open_buckets;
+}
+
+#define open_bucket_for_each(_c, _obs, _ob, _i) \
+ for ((_i) = 0; \
+ (_i) < (_obs)->nr && \
+ ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \
+ (_i)++)
+
+static inline struct open_bucket *ec_open_bucket(struct bch_fs *c,
+ struct open_buckets *obs)
+{
+ struct open_bucket *ob;
+ unsigned i;
+
+ open_bucket_for_each(c, obs, ob, i)
+ if (ob->ec)
+ return ob;
+
+ return NULL;
+}
+
+void bch2_open_bucket_write_error(struct bch_fs *,
+ struct open_buckets *, unsigned);
+
+void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
+
+static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
+{
+ if (atomic_dec_and_test(&ob->pin))
+ __bch2_open_bucket_put(c, ob);
+}
+
+static inline void bch2_open_buckets_put(struct bch_fs *c,
+ struct open_buckets *ptrs)
+{
+ struct open_bucket *ob;
+ unsigned i;
+
+ open_bucket_for_each(c, ptrs, ob, i)
+ bch2_open_bucket_put(c, ob);
+ ptrs->nr = 0;
+}
+
+static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct write_point *wp)
+{
+ struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 };
+ struct open_bucket *ob;
+ unsigned i;
+
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob);
+ wp->ptrs = keep;
+
+ mutex_unlock(&wp->lock);
+
+ bch2_open_buckets_put(c, &ptrs);
+}
+
+static inline void bch2_open_bucket_get(struct bch_fs *c,
+ struct write_point *wp,
+ struct open_buckets *ptrs)
+{
+ struct open_bucket *ob;
+ unsigned i;
+
+ open_bucket_for_each(c, &wp->ptrs, ob, i) {
+ ob->data_type = wp->data_type;
+ atomic_inc(&ob->pin);
+ ob_push(c, ptrs, ob);
+ }
+}
+
+static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c,
+ unsigned dev, u64 bucket)
+{
+ return c->open_buckets_hash +
+ (jhash_3words(dev, bucket, bucket >> 32, 0) &
+ (OPEN_BUCKETS_COUNT - 1));
+}
+
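+/*
+ * The hash table chains open buckets through their ->hash index; 0 is the
+ * end-of-chain sentinel (open bucket 0 is never handed out).
+ */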
+static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket)
+{
+ open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket);
+
+ while (slot) {
+ struct open_bucket *ob = &c->open_buckets[slot];
+
+ if (ob->dev == dev && ob->bucket == bucket)
+ return true;
+
+ slot = ob->hash;
+ }
+
+ return false;
+}
+
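+/*
+ * The hash table is modified under freelist_lock (see
+ * bch2_open_bucket_hash_add()), so the lockless walk in bch2_bucket_is_open()
+ * can race with concurrent updates; this variant rechecks under the lock
+ * before returning false.
+ */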
+static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket)
+{
+ bool ret;
+
+ if (bch2_bucket_is_open(c, dev, bucket))
+ return true;
+
+ spin_lock(&c->freelist_lock);
+ ret = bch2_bucket_is_open(c, dev, bucket);
+ spin_unlock(&c->freelist_lock);
+
+ return ret;
+}
+
+enum bch_write_flags;
+int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *,
+ struct dev_stripe_state *, struct bch_devs_mask *,
+ unsigned, unsigned *, bool *, enum bch_write_flags,
+ enum bch_data_type, enum bch_watermark,
+ struct closure *);
+
+int bch2_alloc_sectors_start_trans(struct btree_trans *,
+ unsigned, unsigned,
+ struct write_point_specifier,
+ struct bch_devs_list *,
+ unsigned, unsigned,
+ enum bch_watermark,
+ enum bch_write_flags,
+ struct closure *,
+ struct write_point **);
+
+struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *);
+
+/*
+ * Append pointers to the space we just allocated to @k, and mark @sectors space
+ * as allocated out of @ob
+ */
+static inline void
+bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp,
+ struct bkey_i *k, unsigned sectors,
+ bool cached)
+{
+ struct open_bucket *ob;
+ unsigned i;
+
+ BUG_ON(sectors > wp->sectors_free);
+ wp->sectors_free -= sectors;
+ wp->sectors_allocated += sectors;
+
+ open_bucket_for_each(c, &wp->ptrs, ob, i) {
+ struct bch_dev *ca = ob_dev(c, ob);
+ struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob);
+
+ ptr.cached = cached ||
+ (!ca->mi.durability &&
+ wp->data_type == BCH_DATA_user);
+
+ bch2_bkey_append_ptr(k, ptr);
+
+ BUG_ON(sectors > ob->sectors_free);
+ ob->sectors_free -= sectors;
+ }
+}
+
+void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
+ struct bkey_i *, unsigned, bool);
+void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
+
+void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool);
+
+static inline struct write_point_specifier writepoint_hashed(unsigned long v)
+{
+ return (struct write_point_specifier) { .v = v | 1 };
+}
+
+static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
+{
+ return (struct write_point_specifier) { .v = (unsigned long) wp };
+}
+
+void bch2_fs_allocator_foreground_init(struct bch_fs *);
+
+void bch2_open_bucket_to_text(struct printbuf *, struct bch_fs *, struct open_bucket *);
+void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *, struct bch_dev *);
+void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *);
+
+void bch2_write_points_to_text(struct printbuf *, struct bch_fs *);
+
+void bch2_fs_alloc_debug_to_text(struct printbuf *, struct bch_fs *);
+void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *);
+
+void __bch2_wait_on_allocator(struct bch_fs *, struct closure *);
+static inline void bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl)
+{
+ if (cl->closure_get_happened)
+ __bch2_wait_on_allocator(c, cl);
+}
+
+#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h
index 8a71a376..9bbb28e9 100644
--- a/libbcachefs/alloc_types.h
+++ b/libbcachefs/alloc_types.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_ALLOC_TYPES_H
#define _BCACHEFS_ALLOC_TYPES_H
@@ -7,83 +8,127 @@
#include "clock_types.h"
#include "fifo.h"
-/* There's two of these clocks, one for reads and one for writes: */
-struct bucket_clock {
- /*
- * "now" in (read/write) IO time - incremented whenever we do X amount
- * of reads or writes.
- *
- * Goes with the bucket read/write prios: when we read or write to a
- * bucket we reset the bucket's prio to the current hand; thus hand -
- * prio = time since bucket was last read/written.
- *
- * The units are some amount (bytes/sectors) of data read/written, and
- * the units can change on the fly if we need to rescale to fit
- * everything in a u16 - your only guarantee is that the units are
- * consistent.
- */
- u16 hand;
- u16 max_last_io;
-
- int rw;
-
- struct io_timer rescale;
- struct mutex lock;
+struct bucket_alloc_state {
+ enum {
+ BTREE_BITMAP_NO,
+ BTREE_BITMAP_YES,
+ BTREE_BITMAP_ANY,
+ } btree_bitmap;
+
+ u64 buckets_seen;
+ u64 skipped_open;
+ u64 skipped_need_journal_commit;
+ u64 skipped_nocow;
+ u64 skipped_nouse;
+ u64 skipped_mi_btree_bitmap;
};
-/* There is one reserve for each type of btree, one for prios and gens
- * and one for moving GC */
-enum alloc_reserve {
- RESERVE_ALLOC = -1,
- RESERVE_BTREE = 0,
- RESERVE_MOVINGGC = 1,
- RESERVE_NONE = 2,
- RESERVE_NR = 3,
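+/* Watermarks, in order of increasing priority - higher ones may dig deeper into reserves: */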
+#define BCH_WATERMARKS() \
+ x(stripe) \
+ x(normal) \
+ x(copygc) \
+ x(btree) \
+ x(btree_copygc) \
+ x(reclaim) \
+ x(interior_updates)
+
+enum bch_watermark {
+#define x(name) BCH_WATERMARK_##name,
+ BCH_WATERMARKS()
+#undef x
+ BCH_WATERMARK_NR,
};
-typedef FIFO(long) alloc_fifo;
+#define BCH_WATERMARK_BITS 3
+#define BCH_WATERMARK_MASK ~(~0U << BCH_WATERMARK_BITS)
+
+#define OPEN_BUCKETS_COUNT 1024
+
+#define WRITE_POINT_HASH_NR 32
+#define WRITE_POINT_MAX 32
-/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
-#define OPEN_BUCKETS_COUNT 256
-#define WRITE_POINT_COUNT 32
+/*
+ * 0 is never a valid open_bucket_idx_t:
+ */
+typedef u16 open_bucket_idx_t;
struct open_bucket {
spinlock_t lock;
atomic_t pin;
- u8 freelist;
- bool valid;
- bool on_partial_list;
- unsigned sectors_free;
- struct bch_extent_ptr ptr;
-};
+ open_bucket_idx_t freelist;
+ open_bucket_idx_t hash;
-struct write_point {
- struct hlist_node node;
- struct mutex lock;
- u64 last_used;
- unsigned long write_point;
- enum bch_data_type type;
+ /*
+ * When an open bucket has an ec_stripe attached, this is the index of
+ * the block in the stripe this open_bucket corresponds to:
+ */
+ u8 ec_idx;
+ enum bch_data_type data_type:6;
+ unsigned valid:1;
+ unsigned on_partial_list:1;
+
+ u8 dev;
+ u8 gen;
+ u32 sectors_free;
+ u64 bucket;
+ struct ec_stripe_new *ec;
+};
- u8 nr_ptrs;
- u8 first_ptr;
+#define OPEN_BUCKET_LIST_MAX 15
- /* calculated based on how many pointers we're actually going to use: */
- unsigned sectors_free;
+struct open_buckets {
+ open_bucket_idx_t nr;
+ open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX];
+};
- struct open_bucket *ptrs[BCH_REPLICAS_MAX * 2];
+struct dev_stripe_state {
u64 next_alloc[BCH_SB_MEMBERS_MAX];
};
-struct write_point_specifier {
- unsigned long v;
+#define WRITE_POINT_STATES() \
+ x(stopped) \
+ x(waiting_io) \
+ x(waiting_work) \
+ x(running)
+
+enum write_point_state {
+#define x(n) WRITE_POINT_##n,
+ WRITE_POINT_STATES()
+#undef x
+ WRITE_POINT_STATE_NR
};
-struct alloc_heap_entry {
- size_t bucket;
- size_t nr;
- unsigned long key;
+struct write_point {
+ struct {
+ struct hlist_node node;
+ struct mutex lock;
+ u64 last_used;
+ unsigned long write_point;
+ enum bch_data_type data_type;
+
+ /* calculated based on how many pointers we're actually going to use: */
+ unsigned sectors_free;
+
+ struct open_buckets ptrs;
+ struct dev_stripe_state stripe;
+
+ u64 sectors_allocated;
+ } __aligned(SMP_CACHE_BYTES);
+
+ struct {
+ struct work_struct index_update_work;
+
+ struct list_head writes;
+ spinlock_t writes_lock;
+
+ enum write_point_state state;
+ u64 last_state_change;
+ u64 time[WRITE_POINT_STATE_NR];
+ } __aligned(SMP_CACHE_BYTES);
};
-typedef HEAP(struct alloc_heap_entry) alloc_heap;
+struct write_point_specifier {
+ unsigned long v;
+};
#endif /* _BCACHEFS_ALLOC_TYPES_H */
diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c
new file mode 100644
index 00000000..b53d98c7
--- /dev/null
+++ b/libbcachefs/backpointers.c
@@ -0,0 +1,981 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "bbpos.h"
+#include "alloc_background.h"
+#include "backpointers.h"
+#include "bkey_buf.h"
+#include "btree_cache.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
+#include "checksum.h"
+#include "disk_accounting.h"
+#include "error.h"
+
+#include <linux/mm.h>
+
+int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+ int ret = 0;
+
+ bkey_fsck_err_on(bp.v->level > BTREE_MAX_DEPTH,
+ c, backpointer_level_bad,
+ "backpointer level bad: %u >= %u",
+ bp.v->level, BTREE_MAX_DEPTH);
+
+ bkey_fsck_err_on(bp.k->p.inode == BCH_SB_MEMBER_INVALID,
+ c, backpointer_dev_bad,
+ "backpointer for BCH_SB_MEMBER_INVALID");
+fsck_err:
+ return ret;
+}
+
+void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode);
+ if (ca) {
+ u32 bucket_offset;
+ struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset);
+ rcu_read_unlock();
+ prt_printf(out, "bucket=%llu:%llu:%u", bucket.inode, bucket.offset, bucket_offset);
+ } else {
+ rcu_read_unlock();
+ prt_printf(out, "sector=%llu:%llu", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT);
+ }
+
+ bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level);
+ prt_printf(out, " suboffset=%u len=%u pos=",
+ (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
+ bp.v->bucket_len);
+ bch2_bpos_to_text(out, bp.v->pos);
+}
+
+void bch2_backpointer_swab(struct bkey_s k)
+{
+ struct bkey_s_backpointer bp = bkey_s_to_backpointer(k);
+
+ bp.v->bucket_len = swab32(bp.v->bucket_len);
+ bch2_bpos_swab(&bp.v->pos);
+}
+
+static bool extent_matches_bp(struct bch_fs *c,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k,
+ struct bkey_s_c_backpointer bp)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ rcu_read_lock();
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bpos bucket2;
+ struct bkey_i_backpointer bp2;
+
+ if (p.ptr.cached)
+ continue;
+
+ struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
+ if (!ca)
+ continue;
+
+ bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, &bucket2, &bp2);
+ if (bpos_eq(bp.k->p, bp2.k.p) &&
+ !memcmp(bp.v, &bp2.v, sizeof(bp2.v))) {
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+
+ return false;
+}
+
+static noinline int backpointer_mod_err(struct btree_trans *trans,
+ struct bkey_s_c orig_k,
+ struct bkey_i_backpointer *new_bp,
+ struct bkey_s_c found_bp,
+ bool insert)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ if (insert) {
+ prt_printf(&buf, "existing backpointer found when inserting ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i));
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+
+ prt_printf(&buf, "found ");
+ bch2_bkey_val_to_text(&buf, c, found_bp);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "for ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+
+ bch_err(c, "%s", buf.buf);
+ } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
+ prt_printf(&buf, "backpointer not found when deleting\n");
+ printbuf_indent_add(&buf, 2);
+
+ prt_printf(&buf, "searching for ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i));
+ prt_newline(&buf);
+
+ prt_printf(&buf, "got ");
+ bch2_bkey_val_to_text(&buf, c, found_bp);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "for ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+
+ bch_err(c, "%s", buf.buf);
+ }
+
+ printbuf_exit(&buf);
+
+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
+ return bch2_inconsistent_error(c) ? BCH_ERR_erofs_unfixed_errors : 0;
+ } else {
+ return 0;
+ }
+}
+
+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
+ struct bkey_s_c orig_k,
+ struct bkey_i_backpointer *bp,
+ bool insert)
+{
+ struct btree_iter bp_iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
+ bp->k.p,
+ BTREE_ITER_intent|
+ BTREE_ITER_slots|
+ BTREE_ITER_with_updates);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (insert
+ ? k.k->type
+ : (k.k->type != KEY_TYPE_backpointer ||
+ memcmp(bkey_s_c_to_backpointer(k).v, &bp->v, sizeof(bp->v)))) {
+ ret = backpointer_mod_err(trans, orig_k, bp, k, insert);
+ if (ret)
+ goto err;
+ }
+
+ if (!insert) {
+ bp->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&bp->k, 0);
+ }
+
+ ret = bch2_trans_update(trans, &bp_iter, &bp->k_i, 0);
+err:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ return ret;
+}
+
+static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos)
+{
+ return likely(!bch2_backpointers_no_use_write_buffer)
+ ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos)
+ : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0);
+}
+
+static int bch2_backpointers_maybe_flush(struct btree_trans *trans,
+ struct bkey_s_c visiting_k,
+ struct bkey_buf *last_flushed)
+{
+ return likely(!bch2_backpointers_no_use_write_buffer)
+ ? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed)
+ : 0;
+}
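+
+/*
+ * Editor's note: backpointer updates normally go through the btree write
+ * buffer, so a backpointer that fsck fails to find may simply not have been
+ * flushed yet.  The two helpers above pick the buffered or unbuffered path
+ * based on the bch2_backpointers_no_use_write_buffer debug knob, and
+ * callers flush via bch2_backpointers_maybe_flush() before treating a miss
+ * as real corruption.
+ */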
+
+static void backpointer_target_not_found(struct btree_trans *trans,
+ struct bkey_s_c_backpointer bp,
+ struct bkey_s_c target_k)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ /*
+ * If we're using the btree write buffer, the backpointer we were
+ * looking at may have already been deleted - failure to find what it
+ * pointed to is not an error:
+ */
+ if (likely(!bch2_backpointers_no_use_write_buffer))
+ return;
+
+ prt_printf(&buf, "backpointer doesn't match %s it points to:\n ",
+ bp.v->level ? "btree node" : "extent");
+ bch2_bkey_val_to_text(&buf, c, bp.s_c);
+ prt_printf(&buf, "\n ");
+
+ bch2_bkey_val_to_text(&buf, c, target_k);
+ if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers)
+ bch_err_ratelimited(c, "%s", buf.buf);
+ else
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+
+ printbuf_exit(&buf);
+}
+
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
+ struct bkey_s_c_backpointer bp,
+ struct btree_iter *iter,
+ unsigned iter_flags)
+{
+ struct bch_fs *c = trans->c;
+
+ if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c)))
+ return bkey_s_c_null;
+
+ if (likely(!bp.v->level)) {
+ bch2_trans_node_iter_init(trans, iter,
+ bp.v->btree_id,
+ bp.v->pos,
+ 0, 0,
+ iter_flags);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+ if (bkey_err(k)) {
+ bch2_trans_iter_exit(trans, iter);
+ return k;
+ }
+
+ if (k.k &&
+ extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp))
+ return k;
+
+ bch2_trans_iter_exit(trans, iter);
+ backpointer_target_not_found(trans, bp, k);
+ return bkey_s_c_null;
+ } else {
+ struct btree *b = bch2_backpointer_get_node(trans, bp, iter);
+
+ if (IS_ERR_OR_NULL(b)) {
+ bch2_trans_iter_exit(trans, iter);
+ return IS_ERR(b) ? bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null;
+ }
+ return bkey_i_to_s_c(&b->key);
+ }
+}
+
+struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
+ struct bkey_s_c_backpointer bp,
+ struct btree_iter *iter)
+{
+ struct bch_fs *c = trans->c;
+
+ BUG_ON(!bp.v->level);
+
+ bch2_trans_node_iter_init(trans, iter,
+ bp.v->btree_id,
+ bp.v->pos,
+ 0,
+ bp.v->level - 1,
+ 0);
+ struct btree *b = bch2_btree_iter_peek_node(iter);
+ if (IS_ERR_OR_NULL(b))
+ goto err;
+
+ BUG_ON(b->c.level != bp.v->level - 1);
+
+ if (extent_matches_bp(c, bp.v->btree_id, bp.v->level,
+ bkey_i_to_s_c(&b->key), bp))
+ return b;
+
+ if (btree_node_will_make_reachable(b)) {
+ b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
+ } else {
+ backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key));
+ b = NULL;
+ }
+err:
+ bch2_trans_iter_exit(trans, iter);
+ return b;
+}
+
+static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k,
+ struct bkey_buf *last_flushed)
+{
+ if (k.k->type != KEY_TYPE_backpointer)
+ return 0;
+
+ struct bch_fs *c = trans->c;
+ struct btree_iter alloc_iter = { NULL };
+ struct bkey_s_c alloc_k;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ struct bpos bucket;
+ if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) {
+ ret = bch2_backpointers_maybe_flush(trans, k, last_flushed);
+ if (ret)
+ goto out;
+
+ if (fsck_err(trans, backpointer_to_missing_device,
+ "backpointer for missing device:\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = bch2_backpointer_del(trans, k.k->p);
+ goto out;
+ }
+
+ alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0);
+ ret = bkey_err(alloc_k);
+ if (ret)
+ goto out;
+
+ if (alloc_k.k->type != KEY_TYPE_alloc_v4) {
+ ret = bch2_backpointers_maybe_flush(trans, k, last_flushed);
+ if (ret)
+ goto out;
+
+ if (fsck_err(trans, backpointer_to_missing_alloc,
+ "backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
+ alloc_iter.pos.inode, alloc_iter.pos.offset,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = bch2_backpointer_del(trans, k.k->p);
+ }
+out:
+fsck_err:
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/* verify that every backpointer has a corresponding alloc key */
+int bch2_check_btree_backpointers(struct bch_fs *c)
+{
+ struct bkey_buf last_flushed;
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_backpointers, POS_MIN, 0, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed)));
+
+ bch2_bkey_buf_exit(&last_flushed, c);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+struct extents_to_bp_state {
+ struct bpos bp_start;
+ struct bpos bp_end;
+ struct bkey_buf last_flushed;
+};
+
+static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree,
+ struct bkey_s_c extent, unsigned dev)
+{
+ struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent);
+ int ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bch2_bkey_drop_device(bkey_i_to_s(n), dev);
+ return bch2_btree_insert_trans(trans, btree, n, 0);
+}
+
+static int check_extent_checksum(struct btree_trans *trans,
+ enum btree_id btree, struct bkey_s_c extent,
+ enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct printbuf buf = PRINTBUF;
+ void *data_buf = NULL;
+ struct bio *bio = NULL;
+ size_t bytes;
+ int ret = 0;
+
+ if (bkey_is_btree_ptr(extent.k))
+ return false;
+
+ bkey_for_each_ptr_decode(extent.k, ptrs, p, entry)
+ if (p.ptr.dev == dev)
+ goto found;
+ BUG();
+found:
+ if (!p.crc.csum_type)
+ return false;
+
+ bytes = p.crc.compressed_size << 9;
+
+ struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ);
+ if (!ca)
+ return false;
+
+ data_buf = kvmalloc(bytes, GFP_KERNEL);
+ if (!data_buf) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ bio = bio_alloc(ca->disk_sb.bdev, buf_pages(data_buf, bytes), REQ_OP_READ, GFP_KERNEL);
+ bio->bi_iter.bi_sector = p.ptr.offset;
+ bch2_bio_map(bio, data_buf, bytes);
+ ret = submit_bio_wait(bio);
+ if (ret)
+ goto err;
+
+ prt_str(&buf, "extents pointing to same space, but first extent checksum bad:");
+ prt_printf(&buf, "\n ");
+ bch2_btree_id_to_text(&buf, btree);
+ prt_str(&buf, " ");
+ bch2_bkey_val_to_text(&buf, c, extent);
+ prt_printf(&buf, "\n ");
+ bch2_btree_id_to_text(&buf, o_btree);
+ prt_str(&buf, " ");
+ bch2_bkey_val_to_text(&buf, c, extent2);
+
+ struct nonce nonce = extent_nonce(extent.k->bversion, p.crc);
+ struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
+ if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
+ trans, dup_backpointer_to_bad_csum_extent,
+ "%s", buf.buf))
+ ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1;
+fsck_err:
+err:
+ if (bio)
+ bio_put(bio);
+ kvfree(data_buf);
+ percpu_ref_put(&ca->io_ref);
+ printbuf_exit(&buf);
+ return ret;
+}
+
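+/*
+ * Editor's summary of the logic below: check_bp_exists() looks up the
+ * backpointer slot computed for an extent pointer.  A matching backpointer
+ * means we're done; a backpointer to a different extent means two extents
+ * claim the same space, resolved by comparing sizes and then checksums; an
+ * empty slot means the backpointer is missing and is recreated via
+ * bch2_bucket_backpointer_mod().
+ */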
+static int check_bp_exists(struct btree_trans *trans,
+ struct extents_to_bp_state *s,
+ struct bkey_i_backpointer *bp,
+ struct bkey_s_c orig_k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter other_extent_iter = {};
+ struct printbuf buf = PRINTBUF;
+
+ if (bpos_lt(bp->k.p, s->bp_start) ||
+ bpos_gt(bp->k.p, s->bp_end))
+ return 0;
+
+ struct btree_iter bp_iter;
+ struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0);
+ int ret = bkey_err(bp_k);
+ if (ret)
+ goto err;
+
+ if (bp_k.k->type != KEY_TYPE_backpointer ||
+ memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp->v, sizeof(bp->v))) {
+ ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed);
+ if (ret)
+ goto err;
+
+ goto check_existing_bp;
+ }
+out:
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &other_extent_iter);
+ bch2_trans_iter_exit(trans, &bp_iter);
+ printbuf_exit(&buf);
+ return ret;
+check_existing_bp:
+ /* Do we have a backpointer for a different extent? */
+ if (bp_k.k->type != KEY_TYPE_backpointer)
+ goto missing;
+
+ struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k);
+
+ struct bkey_s_c other_extent =
+ bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0);
+ ret = bkey_err(other_extent);
+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+ ret = 0;
+ if (ret)
+ goto err;
+
+ if (!other_extent.k)
+ goto missing;
+
+ if (bch2_extents_match(orig_k, other_extent)) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, other_extent);
+ bch_err(c, "%s", buf.buf);
+
+ if (other_extent.k->size <= orig_k.k->size) {
+ ret = drop_dev_and_update(trans, other_bp.v->btree_id,
+ other_extent, bp->k.p.inode);
+ if (ret)
+ goto err;
+ goto out;
+ } else {
+ ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bp->k.p.inode);
+ if (ret)
+ goto err;
+ goto missing;
+ }
+ }
+
+ ret = check_extent_checksum(trans,
+ other_bp.v->btree_id, other_extent,
+ bp->v.btree_id, orig_k,
+ bp->k.p.inode);
+ if (ret < 0)
+ goto err;
+ if (ret) {
+ ret = 0;
+ goto missing;
+ }
+
+ ret = check_extent_checksum(trans, bp->v.btree_id, orig_k,
+ other_bp.v->btree_id, other_extent, bp->k.p.inode);
+ if (ret < 0)
+ goto err;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+
+ printbuf_reset(&buf);
+ prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bp->k.p.inode);
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, other_extent);
+ bch_err(c, "%s", buf.buf);
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto err;
+missing:
+ printbuf_reset(&buf);
+ prt_str(&buf, "missing backpointer ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i));
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ prt_printf(&buf, "\n got: ");
+ bch2_bkey_val_to_text(&buf, c, bp_k);
+
+ if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf))
+ ret = bch2_bucket_backpointer_mod(trans, orig_k, bp, true);
+
+ goto out;
+}
+
+static int check_extent_to_backpointers(struct btree_trans *trans,
+ struct extents_to_bp_state *s,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs;
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ int ret;
+
+ ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bpos bucket_pos;
+ struct bkey_i_backpointer bp;
+
+ if (p.ptr.cached)
+ continue;
+
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
+ if (ca)
+ bch2_extent_ptr_to_bp(c, ca, btree, level, k, p, entry, &bucket_pos, &bp);
+ rcu_read_unlock();
+
+ if (!ca)
+ continue;
+
+ ret = check_bp_exists(trans, s, &bp, k);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int check_btree_root_to_backpointers(struct btree_trans *trans,
+ struct extents_to_bp_state *s,
+ enum btree_id btree_id,
+ int *level)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct btree *b;
+ struct bkey_s_c k;
+ int ret;
+retry:
+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN,
+ 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0);
+ b = bch2_btree_iter_peek_node(&iter);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto err;
+
+ if (b != btree_node_root(c, b)) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto retry;
+ }
+
+ *level = b->c.level;
+
+ k = bkey_i_to_s_c(&b->key);
+ ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
+{
+ return (struct bbpos) {
+ .btree = bp.btree_id,
+ .pos = bp.pos,
+ };
+}
+
+static u64 mem_may_pin_bytes(struct bch_fs *c)
+{
+ struct sysinfo i;
+ si_meminfo(&i);
+
+ u64 mem_bytes = i.totalram * i.mem_unit;
+ return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100);
+}
+
+static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
+{
+ return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size);
+}
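+
+/*
+ * Worked example (editor's note, hypothetical numbers): with 8 GiB of RAM,
+ * fsck_memory_usage_percent = 50 and a 256 KiB btree_node_size,
+ * mem_may_pin_bytes() allows 4 GiB of pinned nodes, so
+ * btree_nodes_fit_in_ram() returns 4 GiB / 256 KiB = 16384 nodes per pass.
+ */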
+
+static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
+ u64 btree_leaf_mask,
+ u64 btree_interior_mask,
+ struct bbpos start, struct bbpos *end)
+{
+ struct bch_fs *c = trans->c;
+ s64 mem_may_pin = mem_may_pin_bytes(c);
+ int ret = 0;
+
+ bch2_btree_cache_unpin(c);
+
+ btree_interior_mask |= btree_leaf_mask;
+
+ c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask;
+ c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask;
+ c->btree_cache.pinned_nodes_start = start;
+ c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX;
+
+ for (enum btree_id btree = start.btree;
+ btree < BTREE_ID_NR && !ret;
+ btree++) {
+ unsigned depth = (BIT_ULL(btree) & btree_leaf_mask) ? 0 : 1;
+
+ if (!(BIT_ULL(btree) & btree_leaf_mask) &&
+ !(BIT_ULL(btree) & btree_interior_mask))
+ continue;
+
+ ret = __for_each_btree_node(trans, iter, btree,
+ btree == start.btree ? start.pos : POS_MIN,
+ 0, depth, BTREE_ITER_prefetch, b, ({
+ mem_may_pin -= btree_buf_bytes(b);
+ if (mem_may_pin <= 0) {
+ c->btree_cache.pinned_nodes_end = *end =
+ BBPOS(btree, b->key.k.p);
+ break;
+ }
+ bch2_node_pin(c, b);
+ 0;
+ }));
+ }
+
+ return ret;
+}
+
+struct progress_indicator_state {
+ unsigned long next_print;
+ u64 nodes_seen;
+ u64 nodes_total;
+ struct btree *last_node;
+};
+
+static inline void progress_init(struct progress_indicator_state *s,
+ struct bch_fs *c,
+ u64 btree_id_mask)
+{
+ memset(s, 0, sizeof(*s));
+
+ s->next_print = jiffies + HZ * 10;
+
+ for (unsigned i = 0; i < BTREE_ID_NR; i++) {
+ if (!(btree_id_mask & BIT_ULL(i)))
+ continue;
+
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_btree,
+ .btree.id = i,
+ };
+
+ u64 v;
+ bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
+ s->nodes_total += div64_ul(v, btree_sectors(c));
+ }
+}
+
+static inline bool progress_update_p(struct progress_indicator_state *s)
+{
+ bool ret = time_after_eq(jiffies, s->next_print);
+
+ if (ret)
+ s->next_print = jiffies + HZ * 10;
+ return ret;
+}
+
+static void progress_update_iter(struct btree_trans *trans,
+ struct progress_indicator_state *s,
+ struct btree_iter *iter,
+ const char *msg)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b = path_l(btree_iter_path(trans, iter))->b;
+
+ s->nodes_seen += b != s->last_node;
+ s->last_node = b;
+
+ if (progress_update_p(s)) {
+ struct printbuf buf = PRINTBUF;
+ unsigned percent = s->nodes_total
+ ? div64_u64(s->nodes_seen * 100, s->nodes_total)
+ : 0;
+
+ prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
+ msg, percent, s->nodes_seen, s->nodes_total);
+ bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
+
+ bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+}
+
+static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
+ struct extents_to_bp_state *s)
+{
+ struct bch_fs *c = trans->c;
+ struct progress_indicator_state progress;
+ int ret = 0;
+
+ progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
+
+ for (enum btree_id btree_id = 0;
+ btree_id < btree_id_nr_alive(c);
+ btree_id++) {
+ int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
+
+ ret = commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc,
+ check_btree_root_to_backpointers(trans, s, btree_id, &level));
+ if (ret)
+ return ret;
+
+ while (level >= depth) {
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level,
+ BTREE_ITER_prefetch);
+
+ ret = for_each_btree_key_continue(trans, iter, 0, k, ({
+ progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
+ check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ }));
+ if (ret)
+ return ret;
+
+ --level;
+ }
+ }
+
+ return 0;
+}
+
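+
+/*
+ * Editor's note on the pass structure below: each iteration pins as much of
+ * the backpointers btree as the fsck memory budget allows
+ * (bch2_get_btree_in_memory_pos()), scans every extent and btree node
+ * pointer against that pinned range [bp_start, bp_end] only, then advances
+ * bp_start and repeats until bp_end reaches SPOS_MAX.
+ */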
+int bch2_check_extents_to_backpointers(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct extents_to_bp_state s = { .bp_start = POS_MIN };
+ int ret;
+
+ bch2_bkey_buf_init(&s.last_flushed);
+ bkey_init(&s.last_flushed.k->k);
+
+ while (1) {
+ struct bbpos end;
+ ret = bch2_get_btree_in_memory_pos(trans,
+ BIT_ULL(BTREE_ID_backpointers),
+ BIT_ULL(BTREE_ID_backpointers),
+ BBPOS(BTREE_ID_backpointers, s.bp_start), &end);
+ if (ret)
+ break;
+
+ s.bp_end = end.pos;
+
+ if ( bpos_eq(s.bp_start, POS_MIN) &&
+ !bpos_eq(s.bp_end, SPOS_MAX))
+ bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
+ __func__, btree_nodes_fit_in_ram(c));
+
+ if (!bpos_eq(s.bp_start, POS_MIN) ||
+ !bpos_eq(s.bp_end, SPOS_MAX)) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "check_extents_to_backpointers(): ");
+ bch2_bpos_to_text(&buf, s.bp_start);
+ prt_str(&buf, "-");
+ bch2_bpos_to_text(&buf, s.bp_end);
+
+ bch_verbose(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ ret = bch2_check_extents_to_backpointers_pass(trans, &s);
+ if (ret || bpos_eq(s.bp_end, SPOS_MAX))
+ break;
+
+ s.bp_start = bpos_successor(s.bp_end);
+ }
+ bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&s.last_flushed, c);
+
+ bch2_btree_cache_unpin(c);
+
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int check_one_backpointer(struct btree_trans *trans,
+ struct bbpos start,
+ struct bbpos end,
+ struct bkey_s_c bp_k,
+ struct bkey_buf *last_flushed)
+{
+ if (bp_k.k->type != KEY_TYPE_backpointer)
+ return 0;
+
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
+ struct bch_fs *c = trans->c;
+ struct bbpos pos = bp_to_bbpos(*bp.v);
+ struct printbuf buf = PRINTBUF;
+
+ if (bbpos_cmp(pos, start) < 0 ||
+ bbpos_cmp(pos, end) > 0)
+ return 0;
+
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0);
+ int ret = bkey_err(k);
+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+ return 0;
+ if (ret)
+ return ret;
+
+ if (!k.k) {
+ ret = bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed);
+ if (ret)
+ goto out;
+
+ if (fsck_err(trans, backpointer_to_missing_ptr,
+ "backpointer for missing %s\n %s",
+ bp.v->level ? "btree node" : "extent",
+ (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) {
+ ret = bch2_backpointer_del(trans, bp.k->p);
+ goto out;
+ }
+ }
+out:
+fsck_err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
+ struct bbpos start,
+ struct bbpos end)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_buf last_flushed;
+ struct progress_indicator_state progress;
+
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+ progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
+
+ int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
+ POS_MIN, BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
+ check_one_backpointer(trans, start, end, k, &last_flushed);
+ }));
+
+ bch2_bkey_buf_exit(&last_flushed, c);
+ return ret;
+}
+
+int bch2_check_backpointers_to_extents(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end;
+ int ret;
+
+ while (1) {
+ ret = bch2_get_btree_in_memory_pos(trans,
+ BIT_ULL(BTREE_ID_extents)|
+ BIT_ULL(BTREE_ID_reflink),
+ ~0,
+ start, &end);
+ if (ret)
+ break;
+
+ if (!bbpos_cmp(start, BBPOS_MIN) &&
+ bbpos_cmp(end, BBPOS_MAX))
+ bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass",
+ __func__, btree_nodes_fit_in_ram(c));
+
+ if (bbpos_cmp(start, BBPOS_MIN) ||
+ bbpos_cmp(end, BBPOS_MAX)) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "check_backpointers_to_extents(): ");
+ bch2_bbpos_to_text(&buf, start);
+ prt_str(&buf, "-");
+ bch2_bbpos_to_text(&buf, end);
+
+ bch_verbose(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ ret = bch2_check_backpointers_to_extents_pass(trans, start, end);
+ if (ret || !bbpos_cmp(end, BBPOS_MAX))
+ break;
+
+ start = bbpos_successor(end);
+ }
+ bch2_trans_put(trans);
+
+ bch2_btree_cache_unpin(c);
+
+ bch_err_fn(c, ret);
+ return ret;
+}
diff --git a/libbcachefs/backpointers.h b/libbcachefs/backpointers.h
new file mode 100644
index 00000000..caffc684
--- /dev/null
+++ b/libbcachefs/backpointers.h
@@ -0,0 +1,187 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "error.h"
+#include "super.h"
+
+static inline u64 swab40(u64 x)
+{
+ return (((x & 0x00000000ffULL) << 32)|
+ ((x & 0x000000ff00ULL) << 16)|
+ ((x & 0x0000ff0000ULL) >> 0)|
+ ((x & 0x00ff000000ULL) >> 16)|
+ ((x & 0xff00000000ULL) >> 32));
+}
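+
+/*
+ * Editor's note: reverses the byte order of the low five bytes of a 40-bit
+ * value, e.g. swab40(0x1122334455) == 0x5544332211.
+ */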
+
+int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k,
+ struct bkey_validate_context);
+void bch2_backpointer_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+void bch2_backpointer_swab(struct bkey_s);
+
+#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \
+ .key_validate = bch2_backpointer_validate, \
+ .val_to_text = bch2_backpointer_to_text, \
+ .swab = bch2_backpointer_swab, \
+ .min_val_size = 32, \
+})
+
+#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10
+
+/*
+ * Convert from pos in backpointer btree to pos of corresponding bucket in alloc
+ * btree:
+ */
+static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos bp_pos)
+{
+ u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
+
+ return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector));
+}
+
+static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, struct bpos bp_pos,
+ u32 *bucket_offset)
+{
+ u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
+
+ return POS(bp_pos.inode, sector_to_bucket_and_offset(ca, bucket_sector, bucket_offset));
+}
+
+static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket)
+{
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode);
+ if (ca)
+ *bucket = bp_pos_to_bucket(ca, bp_pos);
+ rcu_read_unlock();
+ return ca != NULL;
+}
+
+static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca,
+ struct bpos bucket,
+ u64 bucket_offset)
+{
+ return POS(bucket.inode,
+ (bucket_to_sector(ca, bucket.offset) <<
+ MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
+}
+
+/*
+ * Convert from pos in alloc btree + bucket offset to pos in backpointer btree:
+ */
+static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca,
+ struct bpos bucket,
+ u64 bucket_offset)
+{
+ struct bpos ret = bucket_pos_to_bp_noerror(ca, bucket, bucket_offset);
+ EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(ca, ret)));
+ return ret;
+}
+
+static inline struct bpos bucket_pos_to_bp_start(const struct bch_dev *ca, struct bpos bucket)
+{
+ return bucket_pos_to_bp(ca, bucket, 0);
+}
+
+static inline struct bpos bucket_pos_to_bp_end(const struct bch_dev *ca, struct bpos bucket)
+{
+ return bpos_nosnap_predecessor(bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0));
+}
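+
+/*
+ * Worked example for the two conversions above (editor's note, assuming a
+ * hypothetical 128-sector bucket):
+ *
+ *   bucket_pos_to_bp(ca, POS(2, 7), 10)
+ *     = POS(2, (7 * 128 << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + 10)
+ *     = POS(2, 917514)
+ *
+ *   bp_pos_to_bucket(ca, POS(2, 917514))
+ *     = POS(2, (917514 >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) / 128)
+ *     = POS(2, 7)
+ *
+ * The low MAX_EXTENT_COMPRESS_RATIO_SHIFT bits carry the offset within the
+ * (possibly compressed) extent, so backpointers sort by bucket, then sector
+ * offset within the bucket, then offset within the extent.
+ */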
+
+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *,
+ struct bkey_s_c,
+ struct bkey_i_backpointer *,
+ bool);
+
+static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
+ struct bkey_s_c orig_k,
+ struct bkey_i_backpointer *bp,
+ bool insert)
+{
+ if (unlikely(bch2_backpointers_no_use_write_buffer))
+ return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert);
+
+ if (!insert) {
+ bp->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&bp->k, 0);
+ }
+
+ return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp->k_i);
+}
+
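+/*
+ * Editor's note: maps an extent pointer back to the data type it accounts
+ * for.  For stripe keys, the first nr_blocks - nr_redundant pointers are
+ * data blocks and the rest are parity, which is why the pointer's index
+ * within s.v->ptrs decides between BCH_DATA_user and BCH_DATA_parity.
+ */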
+static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
+ struct extent_ptr_decoded p,
+ const union bch_extent_entry *entry)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
+ return BCH_DATA_btree;
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v:
+ return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user;
+ case KEY_TYPE_stripe: {
+ const struct bch_extent_ptr *ptr = &entry->ptr;
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+
+ BUG_ON(ptr < s.v->ptrs ||
+ ptr >= s.v->ptrs + s.v->nr_blocks);
+
+ return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant
+ ? BCH_DATA_parity
+ : BCH_DATA_user;
+ }
+ default:
+ BUG();
+ }
+}
+
+static inline void __bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, struct extent_ptr_decoded p,
+ const union bch_extent_entry *entry,
+ struct bpos *bucket, struct bkey_i_backpointer *bp,
+ u64 sectors)
+{
+ u32 bucket_offset;
+ *bucket = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset);
+
+ u64 bp_bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset;
+
+ bkey_backpointer_init(&bp->k_i);
+ bp->k.p = bucket_pos_to_bp(ca, *bucket, bp_bucket_offset);
+ bp->v = (struct bch_backpointer) {
+ .btree_id = btree_id,
+ .level = level,
+ .data_type = bch2_bkey_ptr_data_type(k, p, entry),
+ .bucket_gen = p.ptr.gen,
+ .bucket_len = sectors,
+ .pos = k.k->p,
+ };
+}
+
+static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, struct extent_ptr_decoded p,
+ const union bch_extent_entry *entry,
+ struct bpos *bucket_pos, struct bkey_i_backpointer *bp)
+{
+ u64 sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p);
+
+ __bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, bucket_pos, bp, sectors);
+}
+
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer,
+ struct btree_iter *, unsigned);
+struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer,
+ struct btree_iter *);
+
+int bch2_check_btree_backpointers(struct bch_fs *);
+int bch2_check_extents_to_backpointers(struct bch_fs *);
+int bch2_check_backpointers_to_extents(struct bch_fs *);
+
+#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */
diff --git a/libbcachefs/bbpos.h b/libbcachefs/bbpos.h
new file mode 100644
index 00000000..63abe17f
--- /dev/null
+++ b/libbcachefs/bbpos.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BBPOS_H
+#define _BCACHEFS_BBPOS_H
+
+#include "bbpos_types.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+
+static inline int bbpos_cmp(struct bbpos l, struct bbpos r)
+{
+ return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos);
+}
+
+static inline struct bbpos bbpos_successor(struct bbpos pos)
+{
+ if (bpos_cmp(pos.pos, SPOS_MAX)) {
+ pos.pos = bpos_successor(pos.pos);
+ return pos;
+ }
+
+ if (pos.btree != BTREE_ID_NR) {
+ pos.btree++;
+ pos.pos = POS_MIN;
+ return pos;
+ }
+
+ BUG();
+}
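+
+/*
+ * Editor's example: iterating with bbpos_successor() walks every position
+ * of btree 0 up to SPOS_MAX, then steps to (btree 1, POS_MIN), and so on;
+ * bch2_check_backpointers_to_extents() uses it to resume the next pass
+ * where the previous one left off.
+ */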
+
+static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos)
+{
+ bch2_btree_id_to_text(out, pos.btree);
+ prt_char(out, ':');
+ bch2_bpos_to_text(out, pos.pos);
+}
+
+#endif /* _BCACHEFS_BBPOS_H */
diff --git a/libbcachefs/bbpos_types.h b/libbcachefs/bbpos_types.h
new file mode 100644
index 00000000..f6389334
--- /dev/null
+++ b/libbcachefs/bbpos_types.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BBPOS_TYPES_H
+#define _BCACHEFS_BBPOS_TYPES_H
+
+struct bbpos {
+ enum btree_id btree;
+ struct bpos pos;
+};
+
+static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
+{
+ return (struct bbpos) { btree, pos };
+}
+
+#define BBPOS_MIN BBPOS(0, POS_MIN)
+#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, SPOS_MAX)
+
+#endif /* _BCACHEFS_BBPOS_TYPES_H */
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index bd5ea6fc..b12c9c78 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_H
#define _BCACHEFS_H
@@ -106,7 +107,7 @@
*
* BTREE NODES:
*
- * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and
+ * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
* free smaller than a bucket - so, that's how big our btree nodes are.
*
* (If buckets are really big we'll only use part of the bucket for a btree node
@@ -176,32 +177,65 @@
*/
#undef pr_fmt
+#ifdef __KERNEL__
#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__
+#else
+#define pr_fmt(fmt) "%s() " fmt "\n", __func__
+#endif
+#include <linux/backing-dev-defs.h>
#include <linux/bug.h>
#include <linux/bio.h>
#include <linux/closure.h>
#include <linux/kobject.h>
#include <linux/list.h>
+#include <linux/math64.h>
#include <linux/mutex.h>
#include <linux/percpu-refcount.h>
#include <linux/percpu-rwsem.h>
+#include <linux/refcount.h>
#include <linux/rhashtable.h>
#include <linux/rwsem.h>
+#include <linux/semaphore.h>
#include <linux/seqlock.h>
#include <linux/shrinker.h>
+#include <linux/srcu.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/zstd.h>
#include "bcachefs_format.h"
+#include "btree_journal_iter_types.h"
+#include "disk_accounting_types.h"
+#include "errcode.h"
#include "fifo.h"
+#include "nocow_locking_types.h"
#include "opts.h"
+#include "recovery_passes_types.h"
+#include "sb-errors_types.h"
+#include "seqmutex.h"
+#include "time_stats.h"
#include "util.h"
-#include <linux/dynamic_fault.h>
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define BCH_WRITE_REF_DEBUG
+#endif
+
+#ifndef dynamic_fault
+#define dynamic_fault(...) 0
+#endif
+
+#define race_fault(...) dynamic_fault("bcachefs:race")
-#define bch2_fs_init_fault(name) \
+#define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name])
+
+#define trace_and_count(_c, _name, ...) \
+do { \
+ count_event(_c, _name); \
+ trace_##_name(__VA_ARGS__); \
+} while (0)
+
+#define bch2_fs_init_fault(name) \
dynamic_fault("bcachefs:bch_fs_init:" name)
#define bch2_meta_read_fault(name) \
dynamic_fault("bcachefs:meta:read:" name)
@@ -209,29 +243,127 @@
dynamic_fault("bcachefs:meta:write:" name)
#ifdef __KERNEL__
-#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name)
+#define BCACHEFS_LOG_PREFIX
+#endif
+
+#ifdef BCACHEFS_LOG_PREFIX
+
+#define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name)
+#define bch2_fmt_dev(_ca, fmt) "bcachefs (%s): " fmt "\n", ((_ca)->name)
+#define bch2_fmt_dev_offset(_ca, _offset, fmt) "bcachefs (%s sector %llu): " fmt "\n", ((_ca)->name), (_offset)
+#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
+#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \
+ "bcachefs (%s inum %llu offset %llu): " fmt "\n", ((_c)->name), (_inum), (_offset)
+
#else
-#define bch2_fmt(_c, fmt) fmt "\n"
+
+#define bch2_log_msg(_c, fmt) fmt
+#define bch2_fmt_dev(_ca, fmt) "%s: " fmt "\n", ((_ca)->name)
+#define bch2_fmt_dev_offset(_ca, _offset, fmt) "%s sector %llu: " fmt "\n", ((_ca)->name), (_offset)
+#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum)
+#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \
+ "inum %llu offset %llu: " fmt "\n", (_inum), (_offset)
+
#endif
+#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
+
+void bch2_print_str(struct bch_fs *, const char *);
+
+__printf(2, 3)
+void bch2_print_opts(struct bch_opts *, const char *, ...);
+
+__printf(2, 3)
+void __bch2_print(struct bch_fs *c, const char *fmt, ...);
+
+#define maybe_dev_to_fs(_c) _Generic((_c), \
+ struct bch_dev *: ((struct bch_dev *) (_c))->fs, \
+ struct bch_fs *: (_c))
+
+#define bch2_print(_c, ...) __bch2_print(maybe_dev_to_fs(_c), __VA_ARGS__)
+
+#define bch2_print_ratelimited(_c, ...) \
+do { \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ \
+ if (__ratelimit(&_rs)) \
+ bch2_print(_c, __VA_ARGS__); \
+} while (0)
+
#define bch_info(c, fmt, ...) \
- printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_info_ratelimited(c, fmt, ...) \
+ bch2_print_ratelimited(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_notice(c, fmt, ...) \
- printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_warn(c, fmt, ...) \
- printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_warn_ratelimited(c, fmt, ...) \
+ bch2_print_ratelimited(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+
#define bch_err(c, fmt, ...) \
- printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err_dev(ca, fmt, ...) \
+ bch2_print(c, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+#define bch_err_dev_offset(ca, _offset, fmt, ...) \
+ bch2_print(c, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
+#define bch_err_inum(c, _inum, fmt, ...) \
+ bch2_print(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
+#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \
+ bch2_print(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+
+#define bch_err_ratelimited(c, fmt, ...) \
+ bch2_print_ratelimited(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err_dev_ratelimited(ca, fmt, ...) \
+ bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \
+ bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
+#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
+ bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
+#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \
+ bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+
+static inline bool should_print_err(int err)
+{
+ return err && !bch2_err_matches(err, BCH_ERR_transaction_restart);
+}
+
+#define bch_err_fn(_c, _ret) \
+do { \
+ if (should_print_err(_ret)) \
+ bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
+} while (0)
+
+#define bch_err_fn_ratelimited(_c, _ret) \
+do { \
+ if (should_print_err(_ret)) \
+ bch_err_ratelimited(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
+} while (0)
+
+#define bch_err_msg(_c, _ret, _msg, ...) \
+do { \
+ if (should_print_err(_ret)) \
+ bch_err(_c, "%s(): error " _msg " %s", __func__, \
+ ##__VA_ARGS__, bch2_err_str(_ret)); \
+} while (0)
#define bch_verbose(c, fmt, ...) \
do { \
- if ((c)->opts.verbose_recovery) \
+ if ((c)->opts.verbose) \
bch_info(c, fmt, ##__VA_ARGS__); \
} while (0)
+#define bch_verbose_ratelimited(c, fmt, ...) \
+do { \
+ if ((c)->opts.verbose) \
+ bch_info_ratelimited(c, fmt, ##__VA_ARGS__); \
+} while (0)
+
#define pr_verbose_init(opts, fmt, ...) \
do { \
- if (opt_get(opts, verbose_init)) \
+ if (opt_get(opts, verbose)) \
pr_info(fmt, ##__VA_ARGS__); \
} while (0)
@@ -239,26 +371,35 @@ do { \
#define BCH_DEBUG_PARAMS_ALWAYS() \
BCH_DEBUG_PARAM(key_merging_disabled, \
"Disables merging of extents") \
+ BCH_DEBUG_PARAM(btree_node_merging_disabled, \
+ "Disables merging of btree nodes") \
BCH_DEBUG_PARAM(btree_gc_always_rewrite, \
"Causes mark and sweep to compact and rewrite every " \
"btree node it traverses") \
BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \
"Disables rewriting of btree nodes during mark and sweep")\
BCH_DEBUG_PARAM(btree_shrinker_disabled, \
- "Disables the shrinker callback for the btree node cache")
-
-/* Parameters that should only be compiled in in debug mode: */
-#define BCH_DEBUG_PARAMS_DEBUG() \
- BCH_DEBUG_PARAM(expensive_debug_checks, \
- "Enables various runtime debugging checks that " \
- "significantly affect performance") \
- BCH_DEBUG_PARAM(debug_check_bkeys, \
- "Run bkey_debugcheck (primarily checking GC/allocation "\
- "information) when iterating over keys") \
+ "Disables the shrinker callback for the btree node cache")\
BCH_DEBUG_PARAM(verify_btree_ondisk, \
"Reread btree nodes at various points to verify the " \
"mergesort in the read path against modifications " \
"done in memory") \
+ BCH_DEBUG_PARAM(verify_all_btree_replicas, \
+ "When reading btree nodes, read all replicas and " \
+ "compare them") \
+ BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \
+ "Don't use the write buffer for backpointers, enabling "\
+ "extra runtime checks")
+
+/* Parameters that should only be compiled in debug mode: */
+#define BCH_DEBUG_PARAMS_DEBUG() \
+ BCH_DEBUG_PARAM(expensive_debug_checks, \
+ "Enables various runtime debugging checks that " \
+ "significantly affect performance") \
+ BCH_DEBUG_PARAM(debug_check_iterators, \
+ "Enables extra verification for btree iterators") \
+ BCH_DEBUG_PARAM(debug_check_btree_accounting, \
+ "Verify btree accounting for keys within a node") \
BCH_DEBUG_PARAM(journal_seq_verify, \
"Store the journal sequence number in the version " \
"number of every btree key, and verify that btree " \
@@ -267,6 +408,15 @@ do { \
"Store the journal sequence number in the version " \
"number of every btree key, and verify that btree " \
"update ordering is preserved during recovery") \
+ BCH_DEBUG_PARAM(test_alloc_startup, \
+ "Force allocator startup to use the slowpath where it" \
+ "can't find enough free buckets without invalidating" \
+ "cached data") \
+ BCH_DEBUG_PARAM(force_reconstruct_read, \
+ "Force reads to use the reconstruct path, when reading" \
+ "from erasure coded extents") \
+ BCH_DEBUG_PARAM(test_restart_gc, \
+ "Test restarting mark and sweep gc when bucket gens change")
#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
@@ -276,22 +426,41 @@ do { \
#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
#endif
+#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
+BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+#ifndef CONFIG_BCACHEFS_DEBUG
+#define BCH_DEBUG_PARAM(name, description) static const __maybe_unused bool bch2_##name;
+BCH_DEBUG_PARAMS_DEBUG()
+#undef BCH_DEBUG_PARAM
+#endif
+
#define BCH_TIME_STATS() \
x(btree_node_mem_alloc) \
+ x(btree_node_split) \
+ x(btree_node_compact) \
+ x(btree_node_merge) \
+ x(btree_node_sort) \
+ x(btree_node_read) \
+ x(btree_node_read_done) \
+ x(btree_interior_update_foreground) \
+ x(btree_interior_update_total) \
x(btree_gc) \
- x(btree_split) \
- x(btree_sort) \
- x(btree_read) \
- x(btree_lock_contended_read) \
- x(btree_lock_contended_intent) \
- x(btree_lock_contended_write) \
x(data_write) \
x(data_read) \
x(data_promote) \
- x(journal_write) \
- x(journal_delay) \
- x(journal_blocked) \
- x(journal_flush_seq)
+ x(journal_flush_write) \
+ x(journal_noflush_write) \
+ x(journal_flush_seq) \
+ x(blocked_journal_low_on_space) \
+ x(blocked_journal_low_on_pin) \
+ x(blocked_journal_max_in_flight) \
+ x(blocked_key_cache_flush) \
+ x(blocked_allocate) \
+ x(blocked_allocate_open_bucket) \
+ x(blocked_write_buffer_full) \
+ x(nocow_lock_contended)
enum bch_time_stats {
#define x(name) BCH_TIME_##name,
@@ -301,14 +470,24 @@ enum bch_time_stats {
};
#include "alloc_types.h"
+#include "btree_gc_types.h"
#include "btree_types.h"
+#include "btree_node_scan_types.h"
+#include "btree_write_buffer_types.h"
#include "buckets_types.h"
+#include "buckets_waiting_for_journal_types.h"
#include "clock_types.h"
+#include "disk_groups_types.h"
+#include "ec_types.h"
#include "journal_types.h"
#include "keylist_types.h"
#include "quota_types.h"
#include "rebalance_types.h"
+#include "replicas_types.h"
+#include "sb-members_types.h"
+#include "subvolume_types.h"
#include "super_types.h"
+#include "thread_with_file_types.h"
/* Number of nodes btree coalesce will try to coalesce at once */
#define GC_MERGE_NODES 4U
@@ -319,34 +498,28 @@ enum bch_time_stats {
/* Size of the freelist we allocate btree nodes from: */
#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4)
-struct btree;
+#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX)
-enum gc_phase {
- GC_PHASE_START,
- GC_PHASE_SB,
-
-#define DEF_BTREE_ID(kwd, val, name) GC_PHASE_BTREE_##kwd,
- DEFINE_BCH_BTREE_IDS()
-#undef DEF_BTREE_ID
-
- GC_PHASE_PENDING_DELETE,
- GC_PHASE_ALLOC,
- GC_PHASE_DONE
-};
-
-struct gc_pos {
- enum gc_phase phase;
- struct bpos pos;
- unsigned level;
-};
+struct btree;
struct io_count {
u64 sectors[2][BCH_DATA_NR];
};
+struct discard_in_flight {
+ bool in_progress:1;
+ u64 bucket:63;
+};
+
struct bch_dev {
struct kobject kobj;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ atomic_long_t ref;
+ bool dying;
+ unsigned long last_put;
+#else
struct percpu_ref ref;
+#endif
struct completion ref_completion;
struct percpu_ref io_ref;
struct completion io_ref_completion;
@@ -359,81 +532,60 @@ struct bch_dev {
* Committed by bch2_write_super() -> bch_fs_mi_update()
*/
struct bch_member_cpu mi;
- uuid_le uuid;
+ atomic64_t errors[BCH_MEMBER_ERROR_NR];
+
+ __uuid_t uuid;
char name[BDEVNAME_SIZE];
struct bch_sb_handle disk_sb;
+ struct bch_sb *sb_read_scratch;
int sb_write_error;
+ dev_t dev;
+ atomic_t flush_seq;
struct bch_devs_mask self;
- /* biosets used in cloned bios for writing multiple replicas */
- struct bio_set replica_set;
-
/*
* Buckets:
- * Per-bucket arrays are protected by c->usage_lock, bucket_lock and
- * gc_lock, for device resize - holding any is sufficient for access:
- * Or rcu_read_lock(), but only for ptr_stale():
+ * Per-bucket arrays are protected by c->mark_lock, bucket_lock and
+ * gc_gens_lock, for device resize - holding any is sufficient for
+ * access: Or rcu_read_lock(), but only for dev_ptr_stale():
*/
- struct bucket_array __rcu *buckets;
- unsigned long *buckets_dirty;
- /* most out of date gen in the btree */
- u8 *oldest_gens;
+ GENRADIX(struct bucket) buckets_gc;
+ struct bucket_gens __rcu *bucket_gens;
+ u8 *oldest_gen;
+ unsigned long *buckets_nouse;
struct rw_semaphore bucket_lock;
- struct bch_dev_usage __percpu *usage_percpu;
- struct bch_dev_usage usage_cached;
+ struct bch_dev_usage __percpu *usage;
/* Allocator: */
- struct task_struct __rcu *alloc_thread;
+ u64 alloc_cursor[3];
- /*
- * free: Buckets that are ready to be used
- *
- * free_inc: Incoming buckets - these are buckets that currently have
- * cached data in them, and we can't reuse them until after we write
- * their new gen to disk. After prio_write() finishes writing the new
- * gens/prios, they'll be moved to the free list (and possibly discarded
- * in the process)
- */
- alloc_fifo free[RESERVE_NR];
- alloc_fifo free_inc;
- spinlock_t freelist_lock;
- size_t nr_invalidated;
-
- u8 open_buckets_partial[OPEN_BUCKETS_COUNT];
- unsigned open_buckets_partial_nr;
-
- size_t fifo_last_bucket;
-
- /* last calculated minimum prio */
- u16 max_last_bucket_io[2];
+ unsigned nr_open_buckets;
+ unsigned nr_partial_buckets;
+ unsigned nr_btree_reserve;
- atomic_long_t saturated_count;
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
- u64 allocator_journal_seq_flush;
- bool allocator_invalidating_data;
- bool allocator_blocked;
+ size_t buckets_waiting_on_journal;
- alloc_heap alloc_heap;
-
- /* Copying GC: */
- struct task_struct *copygc_thread;
- copygc_heap copygc_heap;
- struct bch_pd_controller copygc_pd;
- struct write_point copygc_write_point;
+ struct work_struct invalidate_work;
+ struct work_struct discard_work;
+ struct mutex discard_buckets_in_flight_lock;
+ DARRAY(struct discard_in_flight) discard_buckets_in_flight;
+ struct work_struct discard_fast_work;
atomic64_t rebalance_work;
struct journal_device journal;
+ u64 prev_journal_sector;
struct work_struct io_error_work;
/* The rest of this all shows up in sysfs */
atomic64_t cur_latency[2];
- struct time_stats io_latency[2];
+ struct bch2_time_stats_quantiles io_latency[2];
#define CONGESTED_MAX 1024
atomic_t congested;
@@ -443,50 +595,104 @@ struct bch_dev {
};
/*
- * Flag bits for what phase of startup/shutdown the cache set is at, how we're
- * shutting down, etc.:
- *
- * BCH_FS_UNREGISTERING means we're not just shutting down, we're detaching
- * all the backing devices first (their cached data gets invalidated, and they
- * won't automatically reattach).
+ * initial_gc_unfixed
+ * error
+ * topology error
*/
-enum {
- /* startup: */
- BCH_FS_ALLOC_READ_DONE,
- BCH_FS_ALLOCATOR_STARTED,
- BCH_FS_INITIAL_GC_DONE,
- BCH_FS_FSCK_DONE,
- BCH_FS_STARTED,
-
- /* shutdown: */
- BCH_FS_EMERGENCY_RO,
- BCH_FS_WRITE_DISABLE_COMPLETE,
-
- /* errors: */
- BCH_FS_ERROR,
- BCH_FS_GC_FAILURE,
-
- /* misc: */
- BCH_FS_BDEV_MOUNTED,
- BCH_FS_FSCK_FIXED_ERRORS,
- BCH_FS_FSCK_UNFIXED_ERRORS,
- BCH_FS_FIXED_GENS,
- BCH_FS_REBUILD_REPLICAS,
- BCH_FS_HOLD_BTREE_WRITES,
+
+#define BCH_FS_FLAGS() \
+ x(new_fs) \
+ x(started) \
+ x(clean_recovery) \
+ x(btree_running) \
+ x(accounting_replay_done) \
+ x(may_go_rw) \
+ x(rw) \
+ x(was_rw) \
+ x(stopping) \
+ x(emergency_ro) \
+ x(going_ro) \
+ x(write_disable_complete) \
+ x(clean_shutdown) \
+ x(recovery_running) \
+ x(fsck_running) \
+ x(initial_gc_unfixed) \
+ x(need_delete_dead_snapshots) \
+ x(error) \
+ x(topology_error) \
+ x(errors_fixed) \
+ x(errors_not_fixed) \
+ x(no_invalid_checks)
+
+enum bch_fs_flags {
+#define x(n) BCH_FS_##n,
+ BCH_FS_FLAGS()
+#undef x
};
struct btree_debug {
unsigned id;
- struct dentry *btree;
- struct dentry *btree_format;
- struct dentry *failed;
};
-enum bch_fs_state {
- BCH_FS_STARTING = 0,
- BCH_FS_STOPPING,
- BCH_FS_RO,
- BCH_FS_RW,
+#define BCH_TRANSACTIONS_NR 128
+
+struct btree_transaction_stats {
+ struct bch2_time_stats duration;
+ struct bch2_time_stats lock_hold_times;
+ struct mutex lock;
+ unsigned nr_max_paths;
+ unsigned journal_entries_size;
+ unsigned max_mem;
+ char *max_paths_text;
+};
+
+struct bch_fs_pcpu {
+ u64 sectors_available;
+};
+
+struct journal_seq_blacklist_table {
+ size_t nr;
+ struct journal_seq_blacklist_table_entry {
+ u64 start;
+ u64 end;
+ bool dirty;
+ } entries[];
+};
+
+struct btree_trans_buf {
+ struct btree_trans *trans;
+};
+
+#define BCACHEFS_ROOT_SUBVOL_INUM \
+ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
+
+#define BCH_WRITE_REFS() \
+ x(journal) \
+ x(trans) \
+ x(write) \
+ x(promote) \
+ x(node_rewrite) \
+ x(stripe_create) \
+ x(stripe_delete) \
+ x(reflink) \
+ x(fallocate) \
+ x(fsync) \
+ x(dio_write) \
+ x(discard) \
+ x(discard_fast) \
+ x(check_discard_freespace_key) \
+ x(invalidate) \
+ x(delete_dead_snapshots) \
+ x(gc_gens) \
+ x(snapshot_delete_pagecache) \
+ x(sysfs) \
+ x(btree_write_buffer)
+
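+/*
+ * Editor's note: each x(name) above expands to a BCH_WRITE_REF_##name
+ * enumerator below; BCH_WRITE_REF_NR is the number of write ref classes
+ * tracked separately when BCH_WRITE_REF_DEBUG is enabled.
+ */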
+enum bch_write_ref {
+#define x(n) BCH_WRITE_REF_##n,
+ BCH_WRITE_REFS()
+#undef x
+ BCH_WRITE_REF_NR,
};
struct bch_fs {
@@ -494,6 +700,7 @@ struct bch_fs {
struct list_head list;
struct kobject kobj;
+ struct kobject counters_kobj;
struct kobject internal;
struct kobject opts_dir;
struct kobject time_stats;
@@ -502,32 +709,58 @@ struct bch_fs {
int minor;
struct device *chardev;
struct super_block *vfs_sb;
+ dev_t dev;
char name[40];
+ struct stdio_redirect *stdio;
+ struct task_struct *stdio_filter;
- /* ro/rw, add/remove devices: */
- struct mutex state_lock;
- enum bch_fs_state state;
+ /* ro/rw, add/remove/resize devices: */
+ struct rw_semaphore state_lock;
/* Counts outstanding writes, for clean transition to read-only */
+#ifdef BCH_WRITE_REF_DEBUG
+ atomic_long_t writes[BCH_WRITE_REF_NR];
+#else
struct percpu_ref writes;
+#endif
+ /*
+ * Certain operations are only allowed in single threaded mode, during
+ * recovery, and we want to assert that this is the case:
+ */
+ struct task_struct *recovery_task;
+
+ /*
+ * Analogous to c->writes, for asynchronous ops that don't necessarily
+ * need fs to be read-write
+ */
+ refcount_t ro_ref;
+ wait_queue_head_t ro_ref_wait;
+
struct work_struct read_only_work;
struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
- struct bch_replicas_cpu __rcu *replicas;
- struct bch_replicas_cpu __rcu *replicas_gc;
+ struct bch_accounting_mem accounting;
+
+ struct bch_replicas_cpu replicas;
+ struct bch_replicas_cpu replicas_gc;
struct mutex replicas_gc_lock;
+ struct journal_entry_res btree_root_journal_res;
+ struct journal_entry_res clock_journal_res;
+
struct bch_disk_groups_cpu __rcu *disk_groups;
struct bch_opts opts;
/* Updated by bch2_sb_update():*/
struct {
- uuid_le uuid;
- uuid_le user_uuid;
+ __uuid_t uuid;
+ __uuid_t user_uuid;
- u16 encoded_extent_max;
+ u16 version;
+ u16 version_min;
+ u16 version_upgrade_complete;
u8 nr_devices;
u8 clean;
@@ -536,10 +769,15 @@ struct bch_fs {
u64 time_base_lo;
u32 time_base_hi;
- u32 time_precision;
+ unsigned time_units_per_sec;
+ unsigned nsec_per_time_unit;
u64 features;
+ u64 compat;
+ unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)];
+ u64 btrees_lost_data;
} sb;
+
struct bch_sb_handle disk_sb;
unsigned short block_bits; /* ilog2(block_size) */
@@ -549,17 +787,27 @@ struct bch_fs {
struct closure sb_write;
struct mutex sb_lock;
+ /* snapshot.c: */
+ struct snapshot_table __rcu *snapshots;
+ struct mutex snapshot_table_lock;
+ struct rw_semaphore snapshot_create_lock;
+
+ struct work_struct snapshot_delete_work;
+ struct work_struct snapshot_wait_for_pagecache_and_delete_work;
+ snapshot_id_list snapshots_unlinked;
+ struct mutex snapshots_unlinked_lock;
+
/* BTREE CACHE */
struct bio_set btree_bio;
+ struct workqueue_struct *btree_read_complete_wq;
+ struct workqueue_struct *btree_write_submit_wq;
- struct btree_root btree_roots[BTREE_ID_NR];
- bool btree_roots_dirty;
+ struct btree_root btree_roots_known[BTREE_ID_NR];
+ DARRAY(struct btree_root) btree_roots_extra;
struct mutex btree_root_lock;
struct btree_cache btree_cache;
- mempool_t btree_reserve_pool;
-
/*
* Cache of allocated btree nodes - if we allocate a btree node and
* don't use it, if we free it that space can't be reused until going
@@ -573,20 +821,58 @@ struct bch_fs {
mempool_t btree_interior_update_pool;
struct list_head btree_interior_update_list;
+ struct list_head btree_interior_updates_unwritten;
struct mutex btree_interior_update_lock;
struct closure_waitlist btree_interior_update_wait;
- struct workqueue_struct *wq;
+ struct workqueue_struct *btree_interior_update_worker;
+ struct work_struct btree_interior_update_work;
+
+ struct workqueue_struct *btree_node_rewrite_worker;
+ struct list_head btree_node_rewrites;
+ struct list_head btree_node_rewrites_pending;
+ spinlock_t btree_node_rewrites_lock;
+ struct closure_waitlist btree_node_rewrites_wait;
+
+ /* btree_io.c: */
+ spinlock_t btree_write_error_lock;
+ struct btree_write_stats {
+ atomic64_t nr;
+ atomic64_t bytes;
+ } btree_write_stats[BTREE_WRITE_TYPE_NR];
+
+ /* btree_iter.c: */
+ struct seqmutex btree_trans_lock;
+ struct list_head btree_trans_list;
+ mempool_t btree_trans_pool;
+ mempool_t btree_trans_mem_pool;
+ struct btree_trans_buf __percpu *btree_trans_bufs;
+
+ struct srcu_struct btree_trans_barrier;
+ bool btree_trans_barrier_initialized;
+
+ struct btree_key_cache btree_key_cache;
+ unsigned btree_key_cache_btrees;
+
+ struct btree_write_buffer btree_write_buffer;
+
+ struct workqueue_struct *btree_update_wq;
+ struct workqueue_struct *btree_io_complete_wq;
/* copygc needs its own workqueue for index updates.. */
struct workqueue_struct *copygc_wq;
+ /*
+ * Use a dedicated wq for write ref holder tasks. Required to avoid
+ * dependency problems with other wq tasks that can block on ref
+ * draining, such as read-only transition.
+ */
+ struct workqueue_struct *write_ref_wq;
/* ALLOCATION */
- struct delayed_work pd_controllers_update;
- unsigned pd_controllers_update_seconds;
-
struct bch_devs_mask rw_devs[BCH_DATA_NR];
+ unsigned long rw_devs_change_count;
u64 capacity; /* sectors */
+ u64 reserved; /* sectors */
/*
* When capacity _decreases_ (due to a disk being removed), we
@@ -594,52 +880,62 @@ struct bch_fs {
* and forces them to be revalidated
*/
u32 capacity_gen;
+ unsigned bucket_size_max;
atomic64_t sectors_available;
+ struct mutex sectors_available_lock;
- struct bch_fs_usage __percpu *usage_percpu;
- struct bch_fs_usage usage_cached;
- struct percpu_rw_semaphore usage_lock;
+ struct bch_fs_pcpu __percpu *pcpu;
- struct closure_waitlist freelist_wait;
+ struct percpu_rw_semaphore mark_lock;
- /*
- * When we invalidate buckets, we use both the priority and the amount
- * of good data to determine which buckets to reuse first - to weight
- * those together consistently we keep track of the smallest nonzero
- * priority of any bucket.
- */
- struct bucket_clock bucket_clock[2];
+ seqcount_t usage_lock;
+ struct bch_fs_usage_base __percpu *usage;
+ u64 __percpu *online_reserved;
+
+ unsigned long allocator_last_stuck;
struct io_clock io_clock[2];
+ /* JOURNAL SEQ BLACKLIST */
+ struct journal_seq_blacklist_table *
+ journal_seq_blacklist_table;
+
/* ALLOCATOR */
spinlock_t freelist_lock;
- u8 open_buckets_freelist;
- u8 open_buckets_nr_free;
+ struct closure_waitlist freelist_wait;
+
+ open_bucket_idx_t open_buckets_freelist;
+ open_bucket_idx_t open_buckets_nr_free;
struct closure_waitlist open_buckets_wait;
struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
+ open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT];
+
+ open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT];
+ open_bucket_idx_t open_buckets_partial_nr;
struct write_point btree_write_point;
struct write_point rebalance_write_point;
- struct write_point write_points[WRITE_POINT_COUNT];
- struct hlist_head write_points_hash[WRITE_POINT_COUNT];
+ struct write_point write_points[WRITE_POINT_MAX];
+ struct hlist_head write_points_hash[WRITE_POINT_HASH_NR];
struct mutex write_points_hash_lock;
+ unsigned write_points_nr;
+
+ struct buckets_waiting_for_journal buckets_waiting_for_journal;
/* GARBAGE COLLECTION */
- struct task_struct *gc_thread;
- atomic_t kick_gc;
+ struct work_struct gc_gens_work;
unsigned long gc_count;
+ enum btree_id gc_gens_btree;
+ struct bpos gc_gens_pos;
+
/*
* Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
* has been marked by GC.
*
- * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.)
- *
- * gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not
- * currently running, and gc marks are currently valid
+ * gc_cur_phase is a superset of btree_ids (BTREE_ID_extents etc.)
*
* Protected by gc_pos_lock. Only written to by GC thread, so GC thread
* can read without a lock.
@@ -652,59 +948,119 @@ struct bch_fs {
* it's not while a gc is in progress.
*/
struct rw_semaphore gc_lock;
+ struct mutex gc_gens_lock;
/* IO PATH */
+ struct semaphore io_in_flight;
struct bio_set bio_read;
struct bio_set bio_read_split;
struct bio_set bio_write;
+ struct bio_set replica_set;
struct mutex bio_bounce_pages_lock;
mempool_t bio_bounce_pages;
+ struct bucket_nocow_lock_table
+ nocow_locks;
struct rhashtable promote_table;
mempool_t compression_bounce[2];
- mempool_t compress_workspace[BCH_COMPRESSION_NR];
- mempool_t decompress_workspace;
- ZSTD_parameters zstd_params;
+ mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR];
+ size_t zstd_workspace_size;
struct crypto_shash *sha256;
- struct crypto_skcipher *chacha20;
+ struct crypto_sync_skcipher *chacha20;
struct crypto_shash *poly1305;
atomic64_t key_version;
+ mempool_t large_bkey_pool;
+
+ /* MOVE.C */
+ struct list_head moving_context_list;
+ struct mutex moving_context_lock;
+
/* REBALANCE */
struct bch_fs_rebalance rebalance;
+ /* COPYGC */
+ struct task_struct *copygc_thread;
+ struct write_point copygc_write_point;
+ s64 copygc_wait_at;
+ s64 copygc_wait;
+ bool copygc_running;
+ wait_queue_head_t copygc_running_wq;
+
+ /* STRIPES: */
+ GENRADIX(struct stripe) stripes;
+ GENRADIX(struct gc_stripe) gc_stripes;
+
+ struct hlist_head ec_stripes_new[32];
+ spinlock_t ec_stripes_new_lock;
+
+ ec_stripes_heap ec_stripes_heap;
+ struct mutex ec_stripes_heap_lock;
+
+ /* ERASURE CODING */
+ struct list_head ec_stripe_head_list;
+ struct mutex ec_stripe_head_lock;
+
+ struct list_head ec_stripe_new_list;
+ struct mutex ec_stripe_new_lock;
+ wait_queue_head_t ec_stripe_new_wait;
+
+ struct work_struct ec_stripe_create_work;
+ u64 ec_stripe_hint;
+
+ struct work_struct ec_stripe_delete_work;
+
+ struct bio_set ec_bioset;
+
+ /* REFLINK */
+ reflink_gc_table reflink_gc_table;
+ size_t reflink_gc_nr;
+
+ /* fs.c */
+ struct list_head vfs_inodes_list;
+ struct mutex vfs_inodes_lock;
+ struct rhashtable vfs_inodes_table;
+ struct rhltable vfs_inodes_by_inum_table;
+
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
struct bio_set dio_write_bioset;
struct bio_set dio_read_bioset;
-
- struct bio_list btree_write_error_list;
- struct work_struct btree_write_error_work;
- spinlock_t btree_write_error_lock;
-
- /* ERRORS */
- struct list_head fsck_errors;
- struct mutex fsck_error_lock;
- bool fsck_alloc_err;
-
- /* FILESYSTEM */
- atomic_long_t nr_inodes;
+ struct bio_set nocow_flush_bioset;
/* QUOTAS */
struct bch_memquota_type quotas[QTYP_NR];
+ /* RECOVERY */
+ u64 journal_replay_seq_start;
+ u64 journal_replay_seq_end;
+ /*
+ * Two different uses:
+ * "Has this fsck pass?" - i.e. should this type of error be an
+ * emergency read-only
+ * And, in certain situations fsck will rewind to an earlier pass: used
+ * for signaling to the toplevel code which pass we want to run now.
+ */
+ enum bch_recovery_pass curr_recovery_pass;
+ /* bitmask of recovery passes that we actually ran */
+ u64 recovery_passes_complete;
+ /* never rewinds version of curr_recovery_pass */
+ enum bch_recovery_pass recovery_pass_done;
+ spinlock_t recovery_pass_lock;
+ struct semaphore online_fsck_mutex;
+
/* DEBUG JUNK */
- struct dentry *debug;
+ struct dentry *fs_debug_dir;
+ struct dentry *btree_debug_dir;
struct btree_debug btree_debug[BTREE_ID_NR];
-#ifdef CONFIG_BCACHEFS_DEBUG
struct btree *verify_data;
struct btree_node *verify_ondisk;
struct mutex verify_lock;
-#endif
- u64 unused_inode_hint;
+ u64 *unused_inode_hints;
+ unsigned inode_shard_bits;
/*
* A btree node on disk could have too many bsets for an iterator to fit
@@ -715,25 +1071,95 @@ struct bch_fs {
mempool_t btree_bounce_pool;
struct journal journal;
+ GENRADIX(struct journal_replay *) journal_entries;
+ u64 journal_entries_base_seq;
+ struct journal_keys journal_keys;
+ struct list_head journal_iters;
- unsigned bucket_journal_seq;
+ struct find_btree_nodes found_btree_nodes;
- /* The rest of this all shows up in sysfs */
- atomic_long_t read_realloc_races;
- atomic_long_t extent_migrate_done;
- atomic_long_t extent_migrate_raced;
+ u64 last_bucket_seq_cleanup;
- unsigned btree_gc_periodic:1;
- unsigned copy_gc_enabled:1;
- bool promote_whole_extents;
+ u64 counters_on_mount[BCH_COUNTER_NR];
+ u64 __percpu *counters;
-#define BCH_DEBUG_PARAM(name, description) bool name;
- BCH_DEBUG_PARAMS_ALL()
-#undef BCH_DEBUG_PARAM
+ struct bch2_time_stats times[BCH_TIME_STAT_NR];
- struct time_stats times[BCH_TIME_STAT_NR];
+ struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
+
+ /* ERRORS */
+ struct list_head fsck_error_msgs;
+ struct mutex fsck_error_msgs_lock;
+ bool fsck_alloc_msgs_err;
+
+ bch_sb_errors_cpu fsck_error_counts;
+ struct mutex fsck_error_counts_lock;
};
+extern struct wait_queue_head bch2_read_only_wait;
+
+static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+ atomic_long_inc(&c->writes[ref]);
+#else
+ percpu_ref_get(&c->writes);
+#endif
+}
+
+static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+ return !test_bit(BCH_FS_going_ro, &c->flags) &&
+ atomic_long_inc_not_zero(&c->writes[ref]);
+#else
+ return percpu_ref_tryget(&c->writes);
+#endif
+}
+
+static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+ return !test_bit(BCH_FS_going_ro, &c->flags) &&
+ atomic_long_inc_not_zero(&c->writes[ref]);
+#else
+ return percpu_ref_tryget_live(&c->writes);
+#endif
+}
+
+static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+ long v = atomic_long_dec_return(&c->writes[ref]);
+
+ BUG_ON(v < 0);
+ if (v)
+ return;
+ for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
+ if (atomic_long_read(&c->writes[i]))
+ return;
+
+ set_bit(BCH_FS_write_disable_complete, &c->flags);
+ wake_up(&bch2_read_only_wait);
+#else
+ percpu_ref_put(&c->writes);
+#endif
+}
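
A minimal sketch of the intended calling pattern for these helpers - the ref name and error code here are illustrative, not taken from this patch:

static int example_write_path(struct bch_fs *c)
{
	/* take a named write ref; fails once the filesystem is going read-only */
	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer))
		return -EROFS;

	/* ... work that requires the filesystem to remain writeable ... */

	/* drop the ref; the read-only path waits for all refs to drain */
	bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
	return 0;
}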
+
+static inline bool bch2_ro_ref_tryget(struct bch_fs *c)
+{
+ if (test_bit(BCH_FS_stopping, &c->flags))
+ return false;
+
+ return refcount_inc_not_zero(&c->ro_ref);
+}
+
+static inline void bch2_ro_ref_put(struct bch_fs *c)
+{
+ if (refcount_dec_and_test(&c->ro_ref))
+ wake_up(&c->ro_ref_wait);
+}
+
static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
{
#ifndef NO_BCACHEFS_FS
@@ -742,11 +1168,6 @@ static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
#endif
}
-static inline bool bch2_fs_running(struct bch_fs *c)
-{
- return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
-}
-
static inline unsigned bucket_bytes(const struct bch_dev *ca)
{
return ca->mi.bucket_size << 9;
@@ -754,7 +1175,75 @@ static inline unsigned bucket_bytes(const struct bch_dev *ca)
static inline unsigned block_bytes(const struct bch_fs *c)
{
- return c->opts.block_size << 9;
+ return c->opts.block_size;
+}
+
+static inline unsigned block_sectors(const struct bch_fs *c)
+{
+ return c->opts.block_size >> 9;
+}
+
+static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
+{
+ return c->btree_key_cache_btrees & (1U << btree);
}
+static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
+{
+ struct timespec64 t;
+ s64 sec;
+ s32 rem;
+
+ time += c->sb.time_base_lo;
+
+ sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
+
+ set_normalized_timespec64(&t, sec, rem * (s64)c->sb.nsec_per_time_unit);
+
+ return t;
+}
+
+static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts)
+{
+ return (ts.tv_sec * c->sb.time_units_per_sec +
+ (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo;
+}
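
As a worked example, for a filesystem using nanosecond granularity (time_units_per_sec == NSEC_PER_SEC, nsec_per_time_unit == 1) with time_base_lo == 0 - an assumed configuration - the encoded value is simply nanoseconds since the Unix epoch:

static void example_time_roundtrip(struct bch_fs *c)
{
	struct timespec64 ts = { .tv_sec = 1, .tv_nsec = 500 };
	s64 t = timespec_to_bch2_time(c, ts);	/* 1 * NSEC_PER_SEC + 500 = 1000000500 */

	ts = bch2_time_to_timespec(c, t);	/* recovers tv_sec = 1, tv_nsec = 500 */
}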
+
+static inline s64 bch2_current_time(const struct bch_fs *c)
+{
+ struct timespec64 now;
+
+ ktime_get_coarse_real_ts64(&now);
+ return timespec_to_bch2_time(c, now);
+}
+
+static inline u64 bch2_current_io_time(const struct bch_fs *c, int rw)
+{
+ return max(1ULL, (u64) atomic64_read(&c->io_clock[rw].now) & LRU_TIME_MAX);
+}
+
+static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c)
+{
+ struct stdio_redirect *stdio = c->stdio;
+
+ if (c->stdio_filter && c->stdio_filter != current)
+ stdio = NULL;
+ return stdio;
+}
+
+static inline unsigned metadata_replicas_required(struct bch_fs *c)
+{
+ return min(c->opts.metadata_replicas,
+ c->opts.metadata_replicas_required);
+}
+
+static inline unsigned data_replicas_required(struct bch_fs *c)
+{
+ return min(c->opts.data_replicas,
+ c->opts.data_replicas_required);
+}
+
+#define BKEY_PADDED_ONSTACK(key, pad) \
+ struct { struct bkey_i key; __u64 key ## _pad[pad]; }
+
#endif /* _BCACHEFS_H */
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index e300738d..cef22c15 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FORMAT_H
#define _BCACHEFS_FORMAT_H
@@ -73,12 +74,34 @@
#include <asm/types.h>
#include <asm/byteorder.h>
+#include <linux/kernel.h>
#include <linux/uuid.h>
+#include <uapi/linux/magic.h>
+#include "vstructs.h"
+
+#ifdef __KERNEL__
+typedef uuid_t __uuid_t;
+#endif
+
+#define BITMASK(name, type, field, offset, end) \
+static const __maybe_unused unsigned name##_OFFSET = offset; \
+static const __maybe_unused unsigned name##_BITS = (end - offset); \
+ \
+static inline __u64 name(const type *k) \
+{ \
+ return (k->field >> offset) & ~(~0ULL << (end - offset)); \
+} \
+ \
+static inline void SET_##name(type *k, __u64 v) \
+{ \
+ k->field &= ~(~(~0ULL << (end - offset)) << offset); \
+ k->field |= (v & ~(~0ULL << (end - offset))) << offset; \
+}
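
For illustration, an invocation on a hypothetical structure generates a getter/setter pair over bits [4, 8) of the named field:

struct example_obj {
	__u64		flags;
};

/* defines EXAMPLE_STATE_OFFSET, EXAMPLE_STATE_BITS, EXAMPLE_STATE() and SET_EXAMPLE_STATE() */
BITMASK(EXAMPLE_STATE, struct example_obj, flags, 4, 8)

/*
 * EXAMPLE_STATE(&obj) returns (obj.flags >> 4) & 0xf;
 * SET_EXAMPLE_STATE(&obj, v) clears those four bits, then ORs in (v & 0xf) << 4.
 */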
#define LE_BITMASK(_bits, name, type, field, offset, end) \
-static const unsigned name##_OFFSET = offset; \
-static const unsigned name##_BITS = (end - offset); \
-static const __u##_bits name##_MAX = (1ULL << (end - offset)) - 1; \
+static const __maybe_unused unsigned name##_OFFSET = offset; \
+static const __maybe_unused unsigned name##_BITS = (end - offset); \
+static const __maybe_unused __u##_bits name##_MAX = (1ULL << (end - offset)) - 1;\
\
static inline __u64 name(const type *k) \
{ \
@@ -129,26 +152,30 @@ struct bpos {
#else
#error edit for your odd byteorder.
#endif
-} __attribute__((packed, aligned(4)));
+} __packed
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+__aligned(4)
+#endif
+;
#define KEY_INODE_MAX ((__u64)~0ULL)
#define KEY_OFFSET_MAX ((__u64)~0ULL)
#define KEY_SNAPSHOT_MAX ((__u32)~0U)
#define KEY_SIZE_MAX ((__u32)~0U)
-static inline struct bpos POS(__u64 inode, __u64 offset)
+static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
{
- struct bpos ret;
-
- ret.inode = inode;
- ret.offset = offset;
- ret.snapshot = 0;
-
- return ret;
+ return (struct bpos) {
+ .inode = inode,
+ .offset = offset,
+ .snapshot = snapshot,
+ };
}
-#define POS_MIN POS(0, 0)
-#define POS_MAX POS(KEY_INODE_MAX, KEY_OFFSET_MAX)
+#define POS_MIN SPOS(0, 0, 0)
+#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0)
+#define SPOS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
+#define POS(_inode, _offset) SPOS(_inode, _offset, 0)
/* Empty placeholder struct, for container_of() */
struct bch_val {
@@ -163,7 +190,11 @@ struct bversion {
__u32 hi;
__u64 lo;
#endif
-} __attribute__((packed, aligned(4)));
+} __packed
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+__aligned(4)
+#endif
+;
struct bkey {
/* Size of combined key and value, in u64s */
@@ -186,17 +217,46 @@ struct bkey {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__u8 pad[1];
- struct bversion version;
+ struct bversion bversion;
__u32 size; /* extent size, in sectors */
struct bpos p;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
struct bpos p;
__u32 size; /* extent size, in sectors */
- struct bversion version;
+ struct bversion bversion;
__u8 pad[1];
#endif
-} __attribute__((packed, aligned(8)));
+} __packed
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+/*
+ * The big-endian version of bkey can't be compiled by rustc with the "aligned"
+ * attr since it doesn't allow types to have both "packed" and "aligned" attrs.
+ * So for Rust compatibility, don't include this. It can be included in the LE
+ * version because the "packed" attr is redundant in that case.
+ *
+ * History: (quoting Kent)
+ *
+ * Specifically, when I was designing bkey, I wanted the header to be no
+ * bigger than necessary so that bkey_packed could use the rest. That means
+ * that, decently often, extent keys will fit into only 8 bytes, instead of
+ * spilling over to 16.
+ *
+ * But packed_bkey treats the part after the header - the packed section -
+ * as a single multi word, variable length integer. And bkey, the unpacked
+ * version, is just a special case version of a bkey_packed; all the packed
+ * bkey code will work on keys in any packed format, the in-memory
+ * representation of an unpacked key also is just one type of packed key...
+ *
+ * So that constrains the key part of a big-endian bkey to start right
+ * after the header.
+ *
+ * If we ever do a bkey_v2 and need to expand the header by another byte for
+ * some reason - that will clean up this wart.
+ */
+__aligned(8)
+#endif
+;
struct bkey_packed {
__u64 _data[0];
@@ -230,9 +290,17 @@ struct bkey_packed {
* to the same size as struct bkey should hopefully be safest.
*/
__u8 pad[sizeof(struct bkey) - 3];
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
+
+typedef struct {
+ __le64 lo;
+ __le64 hi;
+} bch_le128;
#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64))
+#define BKEY_U64s_MAX U8_MAX
+#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s)
+
#define KEY_PACKED_BITS_START 24
#define KEY_FORMAT_LOCAL_BTREE 0
@@ -260,8 +328,8 @@ enum bch_bkey_fields {
bkey_format_field(OFFSET, p.offset), \
bkey_format_field(SNAPSHOT, p.snapshot), \
bkey_format_field(SIZE, size), \
- bkey_format_field(VERSION_HI, version.hi), \
- bkey_format_field(VERSION_LO, version.lo), \
+ bkey_format_field(VERSION_HI, bversion.hi), \
+ bkey_format_field(VERSION_LO, bversion.lo), \
}, \
})
@@ -269,18 +337,17 @@ enum bch_bkey_fields {
struct bkey_i {
__u64 _data[0];
- union {
- struct {
- /* Size of combined key and value, in u64s */
- __u8 u64s;
- };
- struct {
- struct bkey k;
- struct bch_val v;
- };
- };
+ struct bkey k;
+ struct bch_val v;
};
+#define POS_KEY(_pos) \
+((struct bkey) { \
+ .u64s = BKEY_U64s, \
+ .format = KEY_FORMAT_CURRENT, \
+ .p = _pos, \
+})
+
#define KEY(_inode, _offset, _size) \
((struct bkey) { \
.u64s = BKEY_U64s, \
@@ -297,16 +364,7 @@ static inline void bkey_init(struct bkey *k)
#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64))
#define __BKEY_PADDED(key, pad) \
- struct { struct bkey_i key; __u64 key ## _pad[pad]; }
-
-#define BKEY_VAL_TYPE(name, nr) \
-struct bkey_i_##name { \
- union { \
- struct bkey k; \
- struct bkey_i k_i; \
- }; \
- struct bch_##name v; \
-}
+ struct bkey_i key; __u64 key ## _pad[pad]
/*
* - DELETED keys are used internally to mark keys that should be ignored but
@@ -322,550 +380,94 @@ struct bkey_i_##name { \
* by new writes or cluster-wide GC. Node repair can also overwrite them with
* the same or a more recent version number, but not with an older version
* number.
-*/
-#define KEY_TYPE_DELETED 0
-#define KEY_TYPE_DISCARD 1
-#define KEY_TYPE_ERROR 2
-#define KEY_TYPE_COOKIE 3
-#define KEY_TYPE_PERSISTENT_DISCARD 4
-#define KEY_TYPE_GENERIC_NR 128
-
-struct bch_cookie {
- struct bch_val v;
- __le64 cookie;
-};
-BKEY_VAL_TYPE(cookie, KEY_TYPE_COOKIE);
-
-/* Extents */
-
-/*
- * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
- * preceded by checksum/compression information (bch_extent_crc32 or
- * bch_extent_crc64).
- *
- * One major determining factor in the format of extents is how we handle and
- * represent extents that have been partially overwritten and thus trimmed:
- *
- * If an extent is not checksummed or compressed, when the extent is trimmed we
- * don't have to remember the extent we originally allocated and wrote: we can
- * merely adjust ptr->offset to point to the start of the data that is currently
- * live. The size field in struct bkey records the current (live) size of the
- * extent, and is also used to mean "size of region on disk that we point to" in
- * this case.
- *
- * Thus an extent that is not checksummed or compressed will consist only of a
- * list of bch_extent_ptrs, with none of the fields in
- * bch_extent_crc32/bch_extent_crc64.
- *
- * When an extent is checksummed or compressed, it's not possible to read only
- * the data that is currently live: we have to read the entire extent that was
- * originally written, and then return only the part of the extent that is
- * currently live.
- *
- * Thus, in addition to the current size of the extent in struct bkey, we need
- * to store the size of the originally allocated space - this is the
- * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
- * when the extent is trimmed, instead of modifying the offset field of the
- * pointer, we keep a second smaller offset field - "offset into the original
- * extent of the currently live region".
- *
- * The other major determining factor is replication and data migration:
- *
- * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
- * write, we will initially write all the replicas in the same format, with the
- * same checksum type and compression format - however, when copygc runs later (or
- * tiering/cache promotion, anything that moves data), it is not in general
- * going to rewrite all the pointers at once - one of the replicas may be in a
- * bucket on one device that has very little fragmentation while another lives
- * in a bucket that has become heavily fragmented, and thus is being rewritten
- * sooner than the rest.
*
- * Thus it will only move a subset of the pointers (or in the case of
- * tiering/cache promotion perhaps add a single pointer without dropping any
- * current pointers), and if the extent has been partially overwritten it must
- * write only the currently live portion (or copygc would not be able to reduce
- * fragmentation!) - which necessitates a different bch_extent_crc format for
- * the new pointer.
- *
- * But in the interests of space efficiency, we don't want to store one
- * bch_extent_crc for each pointer if we don't have to.
- *
- * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
- * bch_extent_ptrs appended arbitrarily one after the other. We determine the
- * type of a given entry with a scheme similar to utf8 (except we're encoding a
- * type, not a size), encoding the type in the position of the first set bit:
- *
- * bch_extent_crc32 - 0b1
- * bch_extent_ptr - 0b10
- * bch_extent_crc64 - 0b100
- *
- * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
- * bch_extent_crc64 is the least constrained).
- *
- * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
- * until the next bch_extent_crc32/64.
- *
- * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
- * is neither checksummed nor compressed.
+ * - WHITEOUT: for hash table btrees
*/
-
-/* 128 bits, sufficient for cryptographic MACs: */
-struct bch_csum {
- __le64 lo;
- __le64 hi;
-} __attribute__((packed, aligned(8)));
-
-enum bch_csum_type {
- BCH_CSUM_NONE = 0,
- BCH_CSUM_CRC32C_NONZERO = 1,
- BCH_CSUM_CRC64_NONZERO = 2,
- BCH_CSUM_CHACHA20_POLY1305_80 = 3,
- BCH_CSUM_CHACHA20_POLY1305_128 = 4,
- BCH_CSUM_CRC32C = 5,
- BCH_CSUM_CRC64 = 6,
- BCH_CSUM_NR = 7,
-};
-
-static const unsigned bch_crc_bytes[] = {
- [BCH_CSUM_NONE] = 0,
- [BCH_CSUM_CRC32C_NONZERO] = 4,
- [BCH_CSUM_CRC32C] = 4,
- [BCH_CSUM_CRC64_NONZERO] = 8,
- [BCH_CSUM_CRC64] = 8,
- [BCH_CSUM_CHACHA20_POLY1305_80] = 10,
- [BCH_CSUM_CHACHA20_POLY1305_128] = 16,
-};
-
-static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
-{
- switch (type) {
- case BCH_CSUM_CHACHA20_POLY1305_80:
- case BCH_CSUM_CHACHA20_POLY1305_128:
- return true;
- default:
- return false;
- }
-}
-
-enum bch_compression_type {
- BCH_COMPRESSION_NONE = 0,
- BCH_COMPRESSION_LZ4_OLD = 1,
- BCH_COMPRESSION_GZIP = 2,
- BCH_COMPRESSION_LZ4 = 3,
- BCH_COMPRESSION_ZSTD = 4,
- BCH_COMPRESSION_NR = 5,
-};
-
-enum bch_extent_entry_type {
- BCH_EXTENT_ENTRY_ptr = 0,
- BCH_EXTENT_ENTRY_crc32 = 1,
- BCH_EXTENT_ENTRY_crc64 = 2,
- BCH_EXTENT_ENTRY_crc128 = 3,
-};
-
-#define BCH_EXTENT_ENTRY_MAX 4
-
-/* Compressed/uncompressed size are stored biased by 1: */
-struct bch_extent_crc32 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u32 type:2,
- _compressed_size:7,
- _uncompressed_size:7,
- offset:7,
- _unused:1,
- csum_type:4,
- compression_type:4;
- __u32 csum;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u32 csum;
- __u32 compression_type:4,
- csum_type:4,
- _unused:1,
- offset:7,
- _uncompressed_size:7,
- _compressed_size:7,
- type:2;
-#endif
-} __attribute__((packed, aligned(8)));
-
-#define CRC32_SIZE_MAX (1U << 7)
-#define CRC32_NONCE_MAX 0
-
-struct bch_extent_crc64 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:3,
- _compressed_size:9,
- _uncompressed_size:9,
- offset:9,
- nonce:10,
- csum_type:4,
- compression_type:4,
- csum_hi:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 csum_hi:16,
- compression_type:4,
- csum_type:4,
- nonce:10,
- offset:9,
- _uncompressed_size:9,
- _compressed_size:9,
- type:3;
-#endif
- __u64 csum_lo;
-} __attribute__((packed, aligned(8)));
-
-#define CRC64_SIZE_MAX (1U << 9)
-#define CRC64_NONCE_MAX ((1U << 10) - 1)
-
-struct bch_extent_crc128 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:4,
- _compressed_size:13,
- _uncompressed_size:13,
- offset:13,
- nonce:13,
- csum_type:4,
- compression_type:4;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 compression_type:4,
- csum_type:4,
- nonce:13,
- offset:13,
- _uncompressed_size:13,
- _compressed_size:13,
- type:4;
-#endif
- struct bch_csum csum;
-} __attribute__((packed, aligned(8)));
-
-#define CRC128_SIZE_MAX (1U << 13)
-#define CRC128_NONCE_MAX ((1U << 13) - 1)
-
-/*
- * @reservation - pointer hasn't been written to, just reserved
- */
-struct bch_extent_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:1,
- cached:1,
- erasure_coded:1,
- reservation:1,
- offset:44, /* 8 petabytes */
- dev:8,
- gen:8;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 gen:8,
- dev:8,
- offset:44,
- reservation:1,
- erasure_coded:1,
- cached:1,
- type:1;
-#endif
-} __attribute__((packed, aligned(8)));
-
-struct bch_extent_reservation {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:5,
- unused:23,
- replicas:4,
- generation:32;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 generation:32,
- replicas:4,
- unused:23,
- type:5;
-#endif
-};
-
-union bch_extent_entry {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
- unsigned long type;
-#elif __BITS_PER_LONG == 32
- struct {
- unsigned long pad;
- unsigned long type;
- };
-#else
-#error edit for your odd byteorder.
-#endif
- struct bch_extent_crc32 crc32;
- struct bch_extent_crc64 crc64;
- struct bch_extent_crc128 crc128;
- struct bch_extent_ptr ptr;
-};
-
-enum {
- BCH_EXTENT = 128,
-
- /*
- * This is kind of a hack, we're overloading the type for a boolean that
- * really should be part of the value - BCH_EXTENT and BCH_EXTENT_CACHED
- * have the same value type:
- */
- BCH_EXTENT_CACHED = 129,
-
- /*
- * Persistent reservation:
- */
- BCH_RESERVATION = 130,
+#define BCH_BKEY_TYPES() \
+ x(deleted, 0) \
+ x(whiteout, 1) \
+ x(error, 2) \
+ x(cookie, 3) \
+ x(hash_whiteout, 4) \
+ x(btree_ptr, 5) \
+ x(extent, 6) \
+ x(reservation, 7) \
+ x(inode, 8) \
+ x(inode_generation, 9) \
+ x(dirent, 10) \
+ x(xattr, 11) \
+ x(alloc, 12) \
+ x(quota, 13) \
+ x(stripe, 14) \
+ x(reflink_p, 15) \
+ x(reflink_v, 16) \
+ x(inline_data, 17) \
+ x(btree_ptr_v2, 18) \
+ x(indirect_inline_data, 19) \
+ x(alloc_v2, 20) \
+ x(subvolume, 21) \
+ x(snapshot, 22) \
+ x(inode_v2, 23) \
+ x(alloc_v3, 24) \
+ x(set, 25) \
+ x(lru, 26) \
+ x(alloc_v4, 27) \
+ x(backpointer, 28) \
+ x(inode_v3, 29) \
+ x(bucket_gens, 30) \
+ x(snapshot_tree, 31) \
+ x(logged_op_truncate, 32) \
+ x(logged_op_finsert, 33) \
+ x(accounting, 34)
+
+enum bch_bkey_type {
+#define x(name, nr) KEY_TYPE_##name = nr,
+ BCH_BKEY_TYPES()
+#undef x
+ KEY_TYPE_MAX,
};
-struct bch_extent {
- struct bch_val v;
-
- union bch_extent_entry start[0];
- __u64 _data[0];
-} __attribute__((packed, aligned(8)));
-BKEY_VAL_TYPE(extent, BCH_EXTENT);
-
-struct bch_reservation {
+struct bch_deleted {
struct bch_val v;
-
- __le32 generation;
- __u8 nr_replicas;
- __u8 pad[3];
-} __attribute__((packed, aligned(8)));
-BKEY_VAL_TYPE(reservation, BCH_RESERVATION);
-
-/* Maximum size (in u64s) a single pointer could be: */
-#define BKEY_EXTENT_PTR_U64s_MAX\
- ((sizeof(struct bch_extent_crc128) + \
- sizeof(struct bch_extent_ptr)) / sizeof(u64))
-
-/* Maximum possible size of an entire extent value: */
-#define BKEY_EXTENT_VAL_U64s_MAX \
- (BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
-
-#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
-
-/* * Maximum possible size of an entire extent, key + value: */
-#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
-
-/* Btree pointers don't carry around checksums: */
-#define BKEY_BTREE_PTR_VAL_U64s_MAX \
- ((sizeof(struct bch_extent_ptr)) / sizeof(u64) * BCH_REPLICAS_MAX)
-#define BKEY_BTREE_PTR_U64s_MAX \
- (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
-
-/* Inodes */
-
-#define BLOCKDEV_INODE_MAX 4096
-
-#define BCACHEFS_ROOT_INO 4096
-
-enum bch_inode_types {
- BCH_INODE_FS = 128,
- BCH_INODE_BLOCKDEV = 129,
- BCH_INODE_GENERATION = 130,
};
-struct bch_inode {
+struct bch_whiteout {
struct bch_val v;
-
- __le64 bi_hash_seed;
- __le32 bi_flags;
- __le16 bi_mode;
- __u8 fields[0];
-} __attribute__((packed, aligned(8)));
-BKEY_VAL_TYPE(inode, BCH_INODE_FS);
-
-struct bch_inode_generation {
- struct bch_val v;
-
- __le32 bi_generation;
- __le32 pad;
-} __attribute__((packed, aligned(8)));
-BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION);
-
-#define BCH_INODE_FIELDS() \
- BCH_INODE_FIELD(bi_atime, 64) \
- BCH_INODE_FIELD(bi_ctime, 64) \
- BCH_INODE_FIELD(bi_mtime, 64) \
- BCH_INODE_FIELD(bi_otime, 64) \
- BCH_INODE_FIELD(bi_size, 64) \
- BCH_INODE_FIELD(bi_sectors, 64) \
- BCH_INODE_FIELD(bi_uid, 32) \
- BCH_INODE_FIELD(bi_gid, 32) \
- BCH_INODE_FIELD(bi_nlink, 32) \
- BCH_INODE_FIELD(bi_generation, 32) \
- BCH_INODE_FIELD(bi_dev, 32) \
- BCH_INODE_FIELD(bi_data_checksum, 8) \
- BCH_INODE_FIELD(bi_compression, 8) \
- BCH_INODE_FIELD(bi_project, 32) \
- BCH_INODE_FIELD(bi_background_compression, 8) \
- BCH_INODE_FIELD(bi_data_replicas, 8) \
- BCH_INODE_FIELD(bi_promote_target, 16) \
- BCH_INODE_FIELD(bi_foreground_target, 16) \
- BCH_INODE_FIELD(bi_background_target, 16)
-
-#define BCH_INODE_FIELDS_INHERIT() \
- BCH_INODE_FIELD(bi_data_checksum) \
- BCH_INODE_FIELD(bi_compression) \
- BCH_INODE_FIELD(bi_project) \
- BCH_INODE_FIELD(bi_background_compression) \
- BCH_INODE_FIELD(bi_data_replicas) \
- BCH_INODE_FIELD(bi_promote_target) \
- BCH_INODE_FIELD(bi_foreground_target) \
- BCH_INODE_FIELD(bi_background_target)
-
-enum {
- /*
- * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL
- * flags)
- */
- __BCH_INODE_SYNC = 0,
- __BCH_INODE_IMMUTABLE = 1,
- __BCH_INODE_APPEND = 2,
- __BCH_INODE_NODUMP = 3,
- __BCH_INODE_NOATIME = 4,
-
- __BCH_INODE_I_SIZE_DIRTY= 5,
- __BCH_INODE_I_SECTORS_DIRTY= 6,
- __BCH_INODE_UNLINKED = 7,
-
- /* bits 20+ reserved for packed fields below: */
};
-#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC)
-#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE)
-#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND)
-#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP)
-#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME)
-#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY)
-#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY)
-#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED)
-
-LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
-LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 32);
-
-struct bch_inode_blockdev {
+struct bch_error {
struct bch_val v;
-
- __le64 i_size;
- __le64 i_flags;
-
- /* Seconds: */
- __le64 i_ctime;
- __le64 i_mtime;
-
- uuid_le i_uuid;
- __u8 i_label[32];
-} __attribute__((packed, aligned(8)));
-BKEY_VAL_TYPE(inode_blockdev, BCH_INODE_BLOCKDEV);
-
-/* Thin provisioned volume, or cache for another block device? */
-LE64_BITMASK(CACHED_DEV, struct bch_inode_blockdev, i_flags, 0, 1)
-
-/* Dirents */
-
-/*
- * Dirents (and xattrs) have to implement string lookups; since our b-tree
- * doesn't support arbitrary length strings for the key, we instead index by a
- * 64 bit hash (currently truncated sha1) of the string, stored in the offset
- * field of the key - using linear probing to resolve hash collisions. This also
- * provides us with the readdir cookie posix requires.
- *
- * Linear probing requires us to use whiteouts for deletions, in the event of a
- * collision:
- */
-
-enum {
- BCH_DIRENT = 128,
- BCH_DIRENT_WHITEOUT = 129,
};
-struct bch_dirent {
+struct bch_cookie {
struct bch_val v;
-
- /* Target inode number: */
- __le64 d_inum;
-
- /*
- * Copy of mode bits 12-15 from the target inode - so userspace can get
- * the filetype without having to do a stat()
- */
- __u8 d_type;
-
- __u8 d_name[];
-} __attribute__((packed, aligned(8)));
-BKEY_VAL_TYPE(dirent, BCH_DIRENT);
-
-#define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \
- sizeof(struct bkey) - \
- offsetof(struct bch_dirent, d_name))
-
-
-/* Xattrs */
-
-enum {
- BCH_XATTR = 128,
- BCH_XATTR_WHITEOUT = 129,
+ __le64 cookie;
};
-#define BCH_XATTR_INDEX_USER 0
-#define BCH_XATTR_INDEX_POSIX_ACL_ACCESS 1
-#define BCH_XATTR_INDEX_POSIX_ACL_DEFAULT 2
-#define BCH_XATTR_INDEX_TRUSTED 3
-#define BCH_XATTR_INDEX_SECURITY 4
-
-struct bch_xattr {
+struct bch_hash_whiteout {
struct bch_val v;
- __u8 x_type;
- __u8 x_name_len;
- __le16 x_val_len;
- __u8 x_name[];
-} __attribute__((packed, aligned(8)));
-BKEY_VAL_TYPE(xattr, BCH_XATTR);
-
-/* Bucket/allocation information: */
-
-enum {
- BCH_ALLOC = 128,
};
-enum {
- BCH_ALLOC_FIELD_READ_TIME = 0,
- BCH_ALLOC_FIELD_WRITE_TIME = 1,
-};
-
-struct bch_alloc {
+struct bch_set {
struct bch_val v;
- __u8 fields;
- __u8 gen;
- __u8 data[];
-} __attribute__((packed, aligned(8)));
-BKEY_VAL_TYPE(alloc, BCH_ALLOC);
-
-/* Quotas: */
-
-enum {
- BCH_QUOTA = 128,
-};
-
-enum quota_types {
- QTYP_USR = 0,
- QTYP_GRP = 1,
- QTYP_PRJ = 2,
- QTYP_NR = 3,
};
-enum quota_counters {
- Q_SPC = 0,
- Q_INO = 1,
- Q_COUNTERS = 2,
-};
-
-struct bch_quota_counter {
- __le64 hardlimit;
- __le64 softlimit;
-};
+/* 128 bits, sufficient for cryptographic MACs: */
+struct bch_csum {
+ __le64 lo;
+ __le64 hi;
+} __packed __aligned(8);
-struct bch_quota {
+struct bch_backpointer {
struct bch_val v;
- struct bch_quota_counter c[Q_COUNTERS];
-} __attribute__((packed, aligned(8)));
-BKEY_VAL_TYPE(quota, BCH_QUOTA);
+ __u8 btree_id;
+ __u8 level;
+ __u8 data_type;
+ __u8 bucket_gen;
+ __u32 pad;
+ __u32 bucket_len;
+ struct bpos pos;
+} __packed __aligned(8);
/* Optional/variable size superblock sections: */
@@ -875,14 +477,43 @@ struct bch_sb_field {
__le32 type;
};
-#define BCH_SB_FIELDS() \
- x(journal, 0) \
- x(members, 1) \
- x(crypt, 2) \
- x(replicas, 3) \
- x(quota, 4) \
- x(disk_groups, 5) \
- x(clean, 6)
+#define BCH_SB_FIELDS() \
+ x(journal, 0) \
+ x(members_v1, 1) \
+ x(crypt, 2) \
+ x(replicas_v0, 3) \
+ x(quota, 4) \
+ x(disk_groups, 5) \
+ x(clean, 6) \
+ x(replicas, 7) \
+ x(journal_seq_blacklist, 8) \
+ x(journal_v2, 9) \
+ x(counters, 10) \
+ x(members_v2, 11) \
+ x(errors, 12) \
+ x(ext, 13) \
+ x(downgrade, 14)
+
+#include "alloc_background_format.h"
+#include "dirent_format.h"
+#include "disk_accounting_format.h"
+#include "disk_groups_format.h"
+#include "extents_format.h"
+#include "ec_format.h"
+#include "inode_format.h"
+#include "journal_seq_blacklist_format.h"
+#include "logged_ops_format.h"
+#include "lru_format.h"
+#include "quota_format.h"
+#include "reflink_format.h"
+#include "replicas_format.h"
+#include "snapshot_format.h"
+#include "subvolume_format.h"
+#include "sb-counters_format.h"
+#include "sb-downgrade_format.h"
+#include "sb-errors_format.h"
+#include "sb-members_format.h"
+#include "xattr_format.h"
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
@@ -891,59 +522,28 @@ enum bch_sb_field_type {
BCH_SB_FIELD_NR
};
+/*
+ * Most superblock fields are replicated in all devices' superblocks - a few are
+ * not:
+ */
+#define BCH_SINGLE_DEVICE_SB_FIELDS \
+ ((1U << BCH_SB_FIELD_journal)| \
+ (1U << BCH_SB_FIELD_journal_v2))
+
/* BCH_SB_FIELD_journal: */
struct bch_sb_field_journal {
struct bch_sb_field field;
- __le64 buckets[0];
-};
-
-/* BCH_SB_FIELD_members: */
-
-struct bch_member {
- uuid_le uuid;
- __le64 nbuckets; /* device size */
- __le16 first_bucket; /* index of first bucket used */
- __le16 bucket_size; /* sectors */
- __le32 pad;
- __le64 last_mount; /* time_t */
-
- __le64 flags[2];
-};
-
-LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4)
-/* 4-10 unused, was TIER, HAS_(META)DATA */
-LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14)
-LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15)
-LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20)
-LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28)
-LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30)
-
-#define BCH_TIER_MAX 4U
-
-#if 0
-LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
-LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
-#endif
-
-enum bch_member_state {
- BCH_MEMBER_STATE_RW = 0,
- BCH_MEMBER_STATE_RO = 1,
- BCH_MEMBER_STATE_FAILED = 2,
- BCH_MEMBER_STATE_SPARE = 3,
- BCH_MEMBER_STATE_NR = 4,
-};
-
-enum cache_replacement {
- CACHE_REPLACEMENT_LRU = 0,
- CACHE_REPLACEMENT_FIFO = 1,
- CACHE_REPLACEMENT_RANDOM = 2,
- CACHE_REPLACEMENT_NR = 3,
+ __le64 buckets[];
};
-struct bch_sb_field_members {
+struct bch_sb_field_journal_v2 {
struct bch_sb_field field;
- struct bch_member members[0];
+
+ struct bch_sb_field_journal_v2_entry {
+ __le64 start;
+ __le64 nr;
+ } d[];
};
/* BCH_SB_FIELD_crypt: */
@@ -957,10 +557,10 @@ struct bch_key {
};
#define BCH_KEY_MAGIC \
- (((u64) 'b' << 0)|((u64) 'c' << 8)| \
- ((u64) 'h' << 16)|((u64) '*' << 24)| \
- ((u64) '*' << 32)|((u64) 'k' << 40)| \
- ((u64) 'e' << 48)|((u64) 'y' << 56))
+ (((__u64) 'b' << 0)|((__u64) 'c' << 8)| \
+ ((__u64) 'h' << 16)|((__u64) '*' << 24)| \
+ ((__u64) '*' << 32)|((__u64) 'k' << 40)| \
+ ((__u64) 'e' << 48)|((__u64) 'y' << 56))
struct bch_encrypted_key {
__le64 magic;
@@ -994,64 +594,6 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16);
LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
-/* BCH_SB_FIELD_replicas: */
-
-enum bch_data_type {
- BCH_DATA_NONE = 0,
- BCH_DATA_SB = 1,
- BCH_DATA_JOURNAL = 2,
- BCH_DATA_BTREE = 3,
- BCH_DATA_USER = 4,
- BCH_DATA_CACHED = 5,
- BCH_DATA_NR = 6,
-};
-
-struct bch_replicas_entry {
- u8 data_type;
- u8 nr;
- u8 devs[0];
-};
-
-struct bch_sb_field_replicas {
- struct bch_sb_field field;
- struct bch_replicas_entry entries[0];
-};
-
-/* BCH_SB_FIELD_quota: */
-
-struct bch_sb_quota_counter {
- __le32 timelimit;
- __le32 warnlimit;
-};
-
-struct bch_sb_quota_type {
- __le64 flags;
- struct bch_sb_quota_counter c[Q_COUNTERS];
-};
-
-struct bch_sb_field_quota {
- struct bch_sb_field field;
- struct bch_sb_quota_type q[QTYP_NR];
-} __attribute__((packed, aligned(8)));
-
-/* BCH_SB_FIELD_disk_groups: */
-
-#define BCH_SB_LABEL_SIZE 32
-
-struct bch_disk_group {
- __u8 label[BCH_SB_LABEL_SIZE];
- __le64 flags[2];
-};
-
-LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
-LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
-LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24)
-
-struct bch_sb_field_disk_groups {
- struct bch_sb_field field;
- struct bch_disk_group entries[0];
-};
-
/*
* On clean shutdown, store btree roots and current journal sequence number in
* the superblock:
@@ -1063,57 +605,118 @@ struct jset_entry {
__u8 type; /* designates what this jset holds */
__u8 pad[3];
- union {
- struct bkey_i start[0];
- __u64 _data[0];
- };
+ struct bkey_i start[0];
+ __u64 _data[];
};
struct bch_sb_field_clean {
struct bch_sb_field field;
__le32 flags;
- __le16 read_clock;
- __le16 write_clock;
+ __le16 _read_clock; /* no longer used */
+ __le16 _write_clock;
__le64 journal_seq;
- union {
- struct jset_entry start[0];
- __u64 _data[0];
- };
+ struct jset_entry start[0];
+ __u64 _data[];
+};
+
+struct bch_sb_field_ext {
+ struct bch_sb_field field;
+ __le64 recovery_passes_required[2];
+ __le64 errors_silent[8];
+ __le64 btrees_lost_data;
};
/* Superblock: */
/*
- * Version 8: BCH_SB_ENCODED_EXTENT_MAX_BITS
- * BCH_MEMBER_DATA_ALLOWED
- * Version 9: incompatible extent nonce change
+ * New versioning scheme:
+ * One common version number for all on-disk data structures - superblock, btree
+ * nodes, journal entries
*/
+#define BCH_VERSION_MAJOR(_v) ((__u16) ((_v) >> 10))
+#define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10)))
+#define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0)
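
A worked example of the packing, using mi_btree_bitmap (BCH_VERSION(1, 7)) from the table below: the major number lives above bit 10 and the minor in the low 10 bits, so the helpers can be sanity-checked along these lines:

_Static_assert(BCH_VERSION(1, 7) == ((1 << 10) | 7), "");	/* == 1031 */
_Static_assert(BCH_VERSION_MAJOR(BCH_VERSION(1, 7)) == 1, "");
_Static_assert(BCH_VERSION_MINOR(BCH_VERSION(1, 7)) == 7, "");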
-#define BCH_SB_VERSION_MIN 7
-#define BCH_SB_VERSION_EXTENT_MAX 8
-#define BCH_SB_VERSION_EXTENT_NONCE_V1 9
-#define BCH_SB_VERSION_MAX 9
+/*
+ * field 1: version name
+ * field 2: BCH_VERSION(major, minor)
+ * field 3: recovery passes required on upgrade
+ */
+#define BCH_METADATA_VERSIONS() \
+ x(bkey_renumber, BCH_VERSION(0, 10)) \
+ x(inode_btree_change, BCH_VERSION(0, 11)) \
+ x(snapshot, BCH_VERSION(0, 12)) \
+ x(inode_backpointers, BCH_VERSION(0, 13)) \
+ x(btree_ptr_sectors_written, BCH_VERSION(0, 14)) \
+ x(snapshot_2, BCH_VERSION(0, 15)) \
+ x(reflink_p_fix, BCH_VERSION(0, 16)) \
+ x(subvol_dirent, BCH_VERSION(0, 17)) \
+ x(inode_v2, BCH_VERSION(0, 18)) \
+ x(freespace, BCH_VERSION(0, 19)) \
+ x(alloc_v4, BCH_VERSION(0, 20)) \
+ x(new_data_types, BCH_VERSION(0, 21)) \
+ x(backpointers, BCH_VERSION(0, 22)) \
+ x(inode_v3, BCH_VERSION(0, 23)) \
+ x(unwritten_extents, BCH_VERSION(0, 24)) \
+ x(bucket_gens, BCH_VERSION(0, 25)) \
+ x(lru_v2, BCH_VERSION(0, 26)) \
+ x(fragmentation_lru, BCH_VERSION(0, 27)) \
+ x(no_bps_in_alloc_keys, BCH_VERSION(0, 28)) \
+ x(snapshot_trees, BCH_VERSION(0, 29)) \
+ x(major_minor, BCH_VERSION(1, 0)) \
+ x(snapshot_skiplists, BCH_VERSION(1, 1)) \
+ x(deleted_inodes, BCH_VERSION(1, 2)) \
+ x(rebalance_work, BCH_VERSION(1, 3)) \
+ x(member_seq, BCH_VERSION(1, 4)) \
+ x(subvolume_fs_parent, BCH_VERSION(1, 5)) \
+ x(btree_subvolume_children, BCH_VERSION(1, 6)) \
+ x(mi_btree_bitmap, BCH_VERSION(1, 7)) \
+ x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \
+ x(disk_accounting_v2, BCH_VERSION(1, 9)) \
+ x(disk_accounting_v3, BCH_VERSION(1, 10)) \
+ x(disk_accounting_inum, BCH_VERSION(1, 11)) \
+ x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \
+ x(inode_has_child_snapshots, BCH_VERSION(1, 13)) \
+ x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \
+ x(disk_accounting_big_endian, BCH_VERSION(1, 15))
+
+enum bcachefs_metadata_version {
+ bcachefs_metadata_version_min = 9,
+#define x(t, n) bcachefs_metadata_version_##t = n,
+ BCH_METADATA_VERSIONS()
+#undef x
+ bcachefs_metadata_version_max
+};
+
+static const __maybe_unused
+unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;
+
+#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
#define BCH_SB_SECTOR 8
-#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */
+
+#define BCH_SB_LAYOUT_SIZE_BITS_MAX 16 /* 32 MB */
struct bch_sb_layout {
- uuid_le magic; /* bcachefs superblock UUID */
+ __uuid_t magic; /* bcachefs superblock UUID */
__u8 layout_type;
__u8 sb_max_size_bits; /* base 2 of 512 byte sectors */
__u8 nr_superblocks;
__u8 pad[5];
__le64 sb_offset[61];
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
#define BCH_SB_LAYOUT_SECTOR 7
/*
* @offset - sector where this sb was written
* @version - on disk format version
- * @magic - identifies as a bcachefs superblock (BCACHE_MAGIC)
+ * @version_min - Oldest metadata version this filesystem contains; so we can
+ * safely drop compatibility code and refuse to mount filesystems
+ * we'd need it for
+ * @magic - identifies as a bcachefs superblock (BCHFS_MAGIC)
* @seq - incremented each time superblock is written
* @uuid - used for generating various magic numbers and identifying
* member devices, never changes
@@ -1125,10 +728,12 @@ struct bch_sb_layout {
*/
struct bch_sb {
struct bch_csum csum;
- __le64 version;
- uuid_le magic;
- uuid_le uuid;
- uuid_le user_uuid;
+ __le16 version;
+ __le16 version_min;
+ __le16 pad[2];
+ __uuid_t magic;
+ __uuid_t uuid;
+ __uuid_t user_uuid;
__u8 label[BCH_SB_LABEL_SIZE];
__le64 offset;
__le64 seq;
@@ -1142,17 +747,16 @@ struct bch_sb {
__le32 time_base_hi;
__le32 time_precision;
- __le64 flags[8];
+ __le64 flags[7];
+ __le64 write_time;
__le64 features[2];
__le64 compat[2];
struct bch_sb_layout layout;
- union {
- struct bch_sb_field start[0];
- __le64 _data[0];
- };
-} __attribute__((packed, aligned(8)));
+ struct bch_sb_field start[0];
+ __le64 _data[];
+} __packed __aligned(8);
/*
* Flags:
@@ -1189,10 +793,15 @@ LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58);
LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59);
LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60);
-/* 60-64 unused */
+LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61);
+LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);
+
+LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63);
+LE64_BITMASK(BCH_SB_PROMOTE_WHOLE_EXTENTS,
+ struct bch_sb, flags[0], 63, 64);
LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4);
-LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8);
+LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8);
LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9);
LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10);
@@ -1212,52 +821,250 @@ LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40);
LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52);
LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64);
-LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
+LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO,
struct bch_sb, flags[2], 0, 4);
+LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64);
+
+LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
+LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28);
+LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29);
+LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
+LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
+LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
+LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
+LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
+LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34);
+LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54);
+LE64_BITMASK(BCH_SB_VERSION_UPGRADE, struct bch_sb, flags[4], 54, 56);
+
+LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60);
+LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI,
+ struct bch_sb, flags[4], 60, 64);
+
+LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE,
+ struct bch_sb, flags[5], 0, 16);
+LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT,
+ struct bch_sb, flags[5], 16, 32);
+
+static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
+{
+ return BCH_SB_COMPRESSION_TYPE_LO(sb) | (BCH_SB_COMPRESSION_TYPE_HI(sb) << 4);
+}
-/* Features: */
-enum bch_sb_features {
- BCH_FEATURE_LZ4 = 0,
- BCH_FEATURE_GZIP = 1,
- BCH_FEATURE_ZSTD = 2,
- BCH_FEATURE_ATOMIC_NLINK = 3,
+static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
+{
+ SET_BCH_SB_COMPRESSION_TYPE_LO(sb, v);
+ SET_BCH_SB_COMPRESSION_TYPE_HI(sb, v >> 4);
+}
+
+static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb)
+{
+ return BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb) |
+ (BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb) << 4);
+}
+
+static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
+{
+ SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb, v);
+ SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb, v >> 4);
+}
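
The LO/HI split exists because the original 4-bit compression fields in flags[1]/flags[2] had no adjacent room to grow, so the upper four bits live in flags[4] and these helpers reassemble the full 8-bit value. A minimal round-trip sketch, with an assumed option value:

static void example_compression_roundtrip(struct bch_sb *sb)
{
	__u64 v = 0x2a;				/* assumed 8-bit option value */

	SET_BCH_SB_COMPRESSION_TYPE(sb, v);	/* LO gets 0xa, HI gets 0x2 */
	BUG_ON(BCH_SB_COMPRESSION_TYPE(sb) != v);
}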
+
+/*
+ * Features:
+ *
+ * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist
+ * reflink: gates KEY_TYPE_reflink
+ * inline_data: gates KEY_TYPE_inline_data
+ * new_siphash: gates BCH_STR_HASH_siphash
+ * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE
+ */
+#define BCH_SB_FEATURES() \
+ x(lz4, 0) \
+ x(gzip, 1) \
+ x(zstd, 2) \
+ x(atomic_nlink, 3) \
+ x(ec, 4) \
+ x(journal_seq_blacklist_v3, 5) \
+ x(reflink, 6) \
+ x(new_siphash, 7) \
+ x(inline_data, 8) \
+ x(new_extent_overwrite, 9) \
+ x(incompressible, 10) \
+ x(btree_ptr_v2, 11) \
+ x(extents_above_btree_updates, 12) \
+ x(btree_updates_journalled, 13) \
+ x(reflink_inline_data, 14) \
+ x(new_varint, 15) \
+ x(journal_no_flush, 16) \
+ x(alloc_v2, 17) \
+ x(extents_across_btree_nodes, 18)
+
+#define BCH_SB_FEATURES_ALWAYS \
+ ((1ULL << BCH_FEATURE_new_extent_overwrite)| \
+ (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
+ (1ULL << BCH_FEATURE_btree_updates_journalled)|\
+ (1ULL << BCH_FEATURE_alloc_v2)|\
+ (1ULL << BCH_FEATURE_extents_across_btree_nodes))
+
+#define BCH_SB_FEATURES_ALL \
+ (BCH_SB_FEATURES_ALWAYS| \
+ (1ULL << BCH_FEATURE_new_siphash)| \
+ (1ULL << BCH_FEATURE_btree_ptr_v2)| \
+ (1ULL << BCH_FEATURE_new_varint)| \
+ (1ULL << BCH_FEATURE_journal_no_flush))
+
+enum bch_sb_feature {
+#define x(f, n) BCH_FEATURE_##f,
+ BCH_SB_FEATURES()
+#undef x
+ BCH_FEATURE_NR,
+};
+
+#define BCH_SB_COMPAT() \
+ x(alloc_info, 0) \
+ x(alloc_metadata, 1) \
+ x(extents_above_btree_updates_done, 2) \
+ x(bformat_overflow_done, 3)
+
+enum bch_sb_compat {
+#define x(f, n) BCH_COMPAT_##f,
+ BCH_SB_COMPAT()
+#undef x
+ BCH_COMPAT_NR,
};
/* options: */
+#define BCH_VERSION_UPGRADE_OPTS() \
+ x(compatible, 0) \
+ x(incompatible, 1) \
+ x(none, 2)
+
+enum bch_version_upgrade_opts {
+#define x(t, n) BCH_VERSION_UPGRADE_##t = n,
+ BCH_VERSION_UPGRADE_OPTS()
+#undef x
+};
+
#define BCH_REPLICAS_MAX 4U
+#define BCH_BKEY_PTRS_MAX 16U
+
+#define BCH_ERROR_ACTIONS() \
+ x(continue, 0) \
+ x(fix_safe, 1) \
+ x(panic, 2) \
+ x(ro, 3)
+
enum bch_error_actions {
- BCH_ON_ERROR_CONTINUE = 0,
- BCH_ON_ERROR_RO = 1,
- BCH_ON_ERROR_PANIC = 2,
- BCH_NR_ERROR_ACTIONS = 3,
+#define x(t, n) BCH_ON_ERROR_##t = n,
+ BCH_ERROR_ACTIONS()
+#undef x
+ BCH_ON_ERROR_NR
};
-enum bch_csum_opts {
- BCH_CSUM_OPT_NONE = 0,
- BCH_CSUM_OPT_CRC32C = 1,
- BCH_CSUM_OPT_CRC64 = 2,
- BCH_CSUM_OPT_NR = 3,
+#define BCH_STR_HASH_TYPES() \
+ x(crc32c, 0) \
+ x(crc64, 1) \
+ x(siphash_old, 2) \
+ x(siphash, 3)
+
+enum bch_str_hash_type {
+#define x(t, n) BCH_STR_HASH_##t = n,
+ BCH_STR_HASH_TYPES()
+#undef x
+ BCH_STR_HASH_NR
};
+#define BCH_STR_HASH_OPTS() \
+ x(crc32c, 0) \
+ x(crc64, 1) \
+ x(siphash, 2)
+
enum bch_str_hash_opts {
- BCH_STR_HASH_CRC32C = 0,
- BCH_STR_HASH_CRC64 = 1,
- BCH_STR_HASH_SIPHASH = 2,
- BCH_STR_HASH_NR = 3,
+#define x(t, n) BCH_STR_HASH_OPT_##t = n,
+ BCH_STR_HASH_OPTS()
+#undef x
+ BCH_STR_HASH_OPT_NR
+};
+
+#define BCH_CSUM_TYPES() \
+ x(none, 0) \
+ x(crc32c_nonzero, 1) \
+ x(crc64_nonzero, 2) \
+ x(chacha20_poly1305_80, 3) \
+ x(chacha20_poly1305_128, 4) \
+ x(crc32c, 5) \
+ x(crc64, 6) \
+ x(xxhash, 7)
+
+enum bch_csum_type {
+#define x(t, n) BCH_CSUM_##t = n,
+ BCH_CSUM_TYPES()
+#undef x
+ BCH_CSUM_NR
+};
+
+static const __maybe_unused unsigned bch_crc_bytes[] = {
+ [BCH_CSUM_none] = 0,
+ [BCH_CSUM_crc32c_nonzero] = 4,
+ [BCH_CSUM_crc32c] = 4,
+ [BCH_CSUM_crc64_nonzero] = 8,
+ [BCH_CSUM_crc64] = 8,
+ [BCH_CSUM_xxhash] = 8,
+ [BCH_CSUM_chacha20_poly1305_80] = 10,
+ [BCH_CSUM_chacha20_poly1305_128] = 16,
+};
+
+static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
+{
+ switch (type) {
+ case BCH_CSUM_chacha20_poly1305_80:
+ case BCH_CSUM_chacha20_poly1305_128:
+ return true;
+ default:
+ return false;
+ }
+}
+
+#define BCH_CSUM_OPTS() \
+ x(none, 0) \
+ x(crc32c, 1) \
+ x(crc64, 2) \
+ x(xxhash, 3)
+
+enum bch_csum_opt {
+#define x(t, n) BCH_CSUM_OPT_##t = n,
+ BCH_CSUM_OPTS()
+#undef x
+ BCH_CSUM_OPT_NR
};
#define BCH_COMPRESSION_TYPES() \
- x(NONE) \
- x(LZ4) \
- x(GZIP) \
- x(ZSTD)
+ x(none, 0) \
+ x(lz4_old, 1) \
+ x(gzip, 2) \
+ x(lz4, 3) \
+ x(zstd, 4) \
+ x(incompressible, 5)
-enum bch_compression_opts {
-#define x(t) BCH_COMPRESSION_OPT_##t,
+enum bch_compression_type {
+#define x(t, n) BCH_COMPRESSION_TYPE_##t = n,
BCH_COMPRESSION_TYPES()
#undef x
+ BCH_COMPRESSION_TYPE_NR
+};
+
+#define BCH_COMPRESSION_OPTS() \
+ x(none, 0) \
+ x(lz4, 1) \
+ x(gzip, 2) \
+ x(zstd, 3)
+
+enum bch_compression_opts {
+#define x(t, n) BCH_COMPRESSION_OPT_##t = n,
+ BCH_COMPRESSION_OPTS()
+#undef x
BCH_COMPRESSION_OPT_NR
};
@@ -1269,10 +1076,13 @@ enum bch_compression_opts {
*/
#define BCACHE_MAGIC \
- UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \
- 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)
+ UUID_INIT(0xc68573f6, 0x4e1a, 0x45ca, \
+ 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)
+#define BCHFS_MAGIC \
+ UUID_INIT(0xc68573f6, 0x66ce, 0x90a9, \
+ 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
-#define BCACHEFS_STATFS_MAGIC 0xca451a4e
+#define BCACHEFS_STATFS_MAGIC BCACHEFS_SUPER_MAGIC
#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL)
#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL)
@@ -1280,6 +1090,7 @@ enum bch_compression_opts {
static inline __le64 __bch2_sb_magic(struct bch_sb *sb)
{
__le64 ret;
+
memcpy(&ret, &sb->uuid, sizeof(ret));
return ret;
}
@@ -1296,11 +1107,6 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
/* Journal */
-#define BCACHE_JSET_VERSION_UUIDv1 1
-#define BCACHE_JSET_VERSION_UUID 1 /* Always latest UUID format */
-#define BCACHE_JSET_VERSION_JKEYS 2
-#define BCACHE_JSET_VERSION 2
-
#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
#define BCH_JSET_ENTRY_TYPES() \
@@ -1308,15 +1114,35 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(btree_root, 1) \
x(prio_ptrs, 2) \
x(blacklist, 3) \
- x(blacklist_v2, 4)
-
-enum {
+ x(blacklist_v2, 4) \
+ x(usage, 5) \
+ x(data_usage, 6) \
+ x(clock, 7) \
+ x(dev_usage, 8) \
+ x(log, 9) \
+ x(overwrite, 10) \
+ x(write_buffer_keys, 11) \
+ x(datetime, 12)
+
+enum bch_jset_entry_type {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
BCH_JSET_ENTRY_TYPES()
#undef x
BCH_JSET_ENTRY_NR
};
+static inline bool jset_entry_is_key(struct jset_entry *e)
+{
+ switch (e->type) {
+ case BCH_JSET_ENTRY_btree_keys:
+ case BCH_JSET_ENTRY_btree_root:
+ case BCH_JSET_ENTRY_write_buffer_keys:
+ return true;
+ }
+
+ return false;
+}
+
/*
* Journal sequence numbers can be blacklisted: bsets record the max sequence
* number of all the journal entries they contain updates for, so that on
@@ -1338,6 +1164,78 @@ struct jset_entry_blacklist_v2 {
__le64 end;
};
+#define BCH_FS_USAGE_TYPES() \
+ x(reserved, 0) \
+ x(inodes, 1) \
+ x(key_version, 2)
+
+enum bch_fs_usage_type {
+#define x(f, nr) BCH_FS_USAGE_##f = nr,
+ BCH_FS_USAGE_TYPES()
+#undef x
+ BCH_FS_USAGE_NR
+};
+
+struct jset_entry_usage {
+ struct jset_entry entry;
+ __le64 v;
+} __packed;
+
+struct jset_entry_data_usage {
+ struct jset_entry entry;
+ __le64 v;
+ struct bch_replicas_entry_v1 r;
+} __packed;
+
+struct jset_entry_clock {
+ struct jset_entry entry;
+ __u8 rw;
+ __u8 pad[7];
+ __le64 time;
+} __packed;
+
+struct jset_entry_dev_usage_type {
+ __le64 buckets;
+ __le64 sectors;
+ __le64 fragmented;
+} __packed;
+
+struct jset_entry_dev_usage {
+ struct jset_entry entry;
+ __le32 dev;
+ __u32 pad;
+
+ __le64 _buckets_ec; /* No longer used */
+ __le64 _buckets_unavailable; /* No longer used */
+
+ struct jset_entry_dev_usage_type d[];
+};
+
+static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u)
+{
+ return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) /
+ sizeof(struct jset_entry_dev_usage_type);
+}
+
+struct jset_entry_log {
+ struct jset_entry entry;
+ u8 d[];
+} __packed __aligned(8);
+
+static inline unsigned jset_entry_log_msg_bytes(struct jset_entry_log *l)
+{
+ unsigned b = vstruct_bytes(&l->entry) - offsetof(struct jset_entry_log, d);
+
+ while (b && !l->d[b - 1])
+ --b;
+ return b;
+}
+
+struct jset_entry_datetime {
+ struct jset_entry entry;
+ __le64 seconds;
+} __packed __aligned(8);
+
/*
* On disk format for a journal entry:
* seq is monotonically increasing; every journal entry has its own unique
@@ -1360,54 +1258,131 @@ struct jset {
__u8 encrypted_start[0];
- __le16 read_clock;
- __le16 write_clock;
+ __le16 _read_clock; /* no longer used */
+ __le16 _write_clock;
/* Sequence number of oldest dirty journal entry */
__le64 last_seq;
- union {
- struct jset_entry start[0];
- __u64 _data[0];
- };
-} __attribute__((packed, aligned(8)));
+ struct jset_entry start[0];
+ __u64 _data[];
+} __packed __aligned(8);
LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4);
LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
+LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
-#define BCH_JOURNAL_BUCKETS_MIN 20
+#define BCH_JOURNAL_BUCKETS_MIN 8
/* Btree: */
-#define DEFINE_BCH_BTREE_IDS() \
- DEF_BTREE_ID(EXTENTS, 0, "extents") \
- DEF_BTREE_ID(INODES, 1, "inodes") \
- DEF_BTREE_ID(DIRENTS, 2, "dirents") \
- DEF_BTREE_ID(XATTRS, 3, "xattrs") \
- DEF_BTREE_ID(ALLOC, 4, "alloc") \
- DEF_BTREE_ID(QUOTAS, 5, "quotas")
+enum btree_id_flags {
+ BTREE_ID_EXTENTS = BIT(0),
+ BTREE_ID_SNAPSHOTS = BIT(1),
+ BTREE_ID_SNAPSHOT_FIELD = BIT(2),
+ BTREE_ID_DATA = BIT(3),
+};
-#define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val,
+#define BCH_BTREE_IDS() \
+ x(extents, 0, BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\
+ BIT_ULL(KEY_TYPE_whiteout)| \
+ BIT_ULL(KEY_TYPE_error)| \
+ BIT_ULL(KEY_TYPE_cookie)| \
+ BIT_ULL(KEY_TYPE_extent)| \
+ BIT_ULL(KEY_TYPE_reservation)| \
+ BIT_ULL(KEY_TYPE_reflink_p)| \
+ BIT_ULL(KEY_TYPE_inline_data)) \
+ x(inodes, 1, BTREE_ID_SNAPSHOTS, \
+ BIT_ULL(KEY_TYPE_whiteout)| \
+ BIT_ULL(KEY_TYPE_inode)| \
+ BIT_ULL(KEY_TYPE_inode_v2)| \
+ BIT_ULL(KEY_TYPE_inode_v3)| \
+ BIT_ULL(KEY_TYPE_inode_generation)) \
+ x(dirents, 2, BTREE_ID_SNAPSHOTS, \
+ BIT_ULL(KEY_TYPE_whiteout)| \
+ BIT_ULL(KEY_TYPE_hash_whiteout)| \
+ BIT_ULL(KEY_TYPE_dirent)) \
+ x(xattrs, 3, BTREE_ID_SNAPSHOTS, \
+ BIT_ULL(KEY_TYPE_whiteout)| \
+ BIT_ULL(KEY_TYPE_cookie)| \
+ BIT_ULL(KEY_TYPE_hash_whiteout)| \
+ BIT_ULL(KEY_TYPE_xattr)) \
+ x(alloc, 4, 0, \
+ BIT_ULL(KEY_TYPE_alloc)| \
+ BIT_ULL(KEY_TYPE_alloc_v2)| \
+ BIT_ULL(KEY_TYPE_alloc_v3)| \
+ BIT_ULL(KEY_TYPE_alloc_v4)) \
+ x(quotas, 5, 0, \
+ BIT_ULL(KEY_TYPE_quota)) \
+ x(stripes, 6, 0, \
+ BIT_ULL(KEY_TYPE_stripe)) \
+ x(reflink, 7, BTREE_ID_EXTENTS|BTREE_ID_DATA, \
+ BIT_ULL(KEY_TYPE_reflink_v)| \
+ BIT_ULL(KEY_TYPE_indirect_inline_data)| \
+ BIT_ULL(KEY_TYPE_error)) \
+ x(subvolumes, 8, 0, \
+ BIT_ULL(KEY_TYPE_subvolume)) \
+ x(snapshots, 9, 0, \
+ BIT_ULL(KEY_TYPE_snapshot)) \
+ x(lru, 10, 0, \
+ BIT_ULL(KEY_TYPE_set)) \
+ x(freespace, 11, BTREE_ID_EXTENTS, \
+ BIT_ULL(KEY_TYPE_set)) \
+ x(need_discard, 12, 0, \
+ BIT_ULL(KEY_TYPE_set)) \
+ x(backpointers, 13, 0, \
+ BIT_ULL(KEY_TYPE_backpointer)) \
+ x(bucket_gens, 14, 0, \
+ BIT_ULL(KEY_TYPE_bucket_gens)) \
+ x(snapshot_trees, 15, 0, \
+ BIT_ULL(KEY_TYPE_snapshot_tree)) \
+ x(deleted_inodes, 16, BTREE_ID_SNAPSHOT_FIELD, \
+ BIT_ULL(KEY_TYPE_set)) \
+ x(logged_ops, 17, 0, \
+ BIT_ULL(KEY_TYPE_logged_op_truncate)| \
+ BIT_ULL(KEY_TYPE_logged_op_finsert)) \
+ x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \
+ BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \
+ x(subvolume_children, 19, 0, \
+ BIT_ULL(KEY_TYPE_set)) \
+ x(accounting, 20, BTREE_ID_SNAPSHOT_FIELD, \
+ BIT_ULL(KEY_TYPE_accounting)) \
enum btree_id {
- DEFINE_BCH_BTREE_IDS()
+#define x(name, nr, ...) BTREE_ID_##name = nr,
+ BCH_BTREE_IDS()
+#undef x
BTREE_ID_NR
};
-#undef DEF_BTREE_ID
+/*
+ * Maximum number of btrees that we will _ever_ have under the current scheme,
+ * where we refer to them with 64-bit bitfields - and we also need a bit for
+ * the interior btree node type:
+ */
+#define BTREE_ID_NR_MAX 63
+
+static inline bool btree_id_is_alloc(enum btree_id id)
+{
+ switch (id) {
+ case BTREE_ID_alloc:
+ case BTREE_ID_backpointers:
+ case BTREE_ID_need_discard:
+ case BTREE_ID_freespace:
+ case BTREE_ID_bucket_gens:
+ case BTREE_ID_lru:
+ case BTREE_ID_accounting:
+ return true;
+ default:
+ return false;
+ }
+}
#define BTREE_MAX_DEPTH 4U
/* Btree nodes */
-/* Version 1: Seed pointer into btree node checksum
- */
-#define BCACHE_BSET_CSUM 1
-#define BCACHE_BSET_KEY_v1 2
-#define BCACHE_BSET_JOURNAL_SEQ 3
-#define BCACHE_BSET_VERSION 3
-
/*
* Btree nodes
*
@@ -1430,11 +1405,9 @@ struct bset {
__le16 version;
__le16 u64s; /* count of d[] in u64s */
- union {
- struct bkey_packed start[0];
- __u64 _data[0];
- };
-} __attribute__((packed, aligned(8)));
+ struct bkey_packed start[0];
+ __u64 _data[];
+} __packed __aligned(8);
LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4);
@@ -1442,6 +1415,9 @@ LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5);
LE32_BITMASK(BSET_SEPARATE_WHITEOUTS,
struct bset, flags, 5, 6);
+/* Sector offset within the btree node: */
+LE32_BITMASK(BSET_OFFSET, struct bset, flags, 16, 32);
+
struct btree_node {
struct bch_csum csum;
__le64 magic;
@@ -1452,7 +1428,7 @@ struct btree_node {
/* Closed interval: */
struct bpos min_key;
struct bpos max_key;
- struct bch_extent_ptr ptr;
+ struct bch_extent_ptr _ptr; /* not used anymore */
struct bkey_format format;
union {
@@ -1464,13 +1440,27 @@ struct btree_node {
};
};
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
-LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4);
+LE64_BITMASK(BTREE_NODE_ID_LO, struct btree_node, flags, 0, 4);
LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8);
-/* 8-32 unused */
+LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE,
+ struct btree_node, flags, 8, 9);
+LE64_BITMASK(BTREE_NODE_ID_HI, struct btree_node, flags, 9, 25);
+/* 25-32 unused */
LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64);
+static inline __u64 BTREE_NODE_ID(struct btree_node *n)
+{
+ return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4);
+}
+
+static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v)
+{
+ SET_BTREE_NODE_ID_LO(n, v);
+ SET_BTREE_NODE_ID_HI(n, v >> 4);
+}
+
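/*
 * Editorial worked example, not part of this diff: the btree node ID no longer
 * fits in the original 4-bit field, so it is split across the two bitfield
 * ranges above (ID_LO in flags bits 0-3, ID_HI in flags bits 9-24).  With a
 * made-up ID value, the round-trip through the accessors looks like this:
 *
 *	SET_BTREE_NODE_ID(n, 0x155):
 *		ID_LO field = 0x155 & 0xf  = 0x5
 *		ID_HI field = 0x155 >> 4   = 0x15
 *	BTREE_NODE_ID(n) = 0x5 | (0x15 << 4) = 0x155
 */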
struct btree_node_entry {
struct bch_csum csum;
@@ -1480,9 +1470,8 @@ struct btree_node_entry {
__u8 pad[22];
__le16 u64s;
__u64 _data[0];
-
};
};
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
#endif /* _BCACHEFS_FORMAT_H */
diff --git a/libbcachefs/bcachefs_ioctl.h b/libbcachefs/bcachefs_ioctl.h
index 73e5d887..3c23bdf7 100644
--- a/libbcachefs/bcachefs_ioctl.h
+++ b/libbcachefs/bcachefs_ioctl.h
@@ -1,9 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_IOCTL_H
#define _BCACHEFS_IOCTL_H
#include <linux/uuid.h>
#include <asm/ioctl.h>
#include "bcachefs_format.h"
+#include "bkey_types.h"
/*
* Flags common to multiple ioctls:
@@ -13,6 +15,9 @@
#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2)
#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3)
+#define BCH_FORCE_IF_LOST \
+ (BCH_FORCE_IF_DATA_LOST| \
+ BCH_FORCE_IF_METADATA_LOST)
#define BCH_FORCE_IF_DEGRADED \
(BCH_FORCE_IF_DATA_DEGRADED| \
BCH_FORCE_IF_METADATA_DEGRADED)
@@ -67,10 +72,25 @@ struct bch_ioctl_incremental {
#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk)
#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state)
#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data)
-#define BCH_IOCTL_USAGE _IOWR(0xbc, 11, struct bch_ioctl_usage)
+#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage)
+#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage)
#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super)
#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
-#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 13, struct bch_ioctl_disk_resize)
+#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize)
+#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal)
+
+#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume)
+#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume)
+
+#define BCH_IOCTL_DEV_USAGE_V2 _IOWR(0xbc, 18, struct bch_ioctl_dev_usage_v2)
+
+#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline)
+#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online)
+#define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting)
+
+/* ioctls below act on a particular file, not the filesystem as a whole: */
+
+#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *)
/*
* BCH_IOCTL_QUERY_UUID: get filesystem UUID
@@ -80,7 +100,7 @@ struct bch_ioctl_incremental {
* this UUID.
*/
struct bch_ioctl_query_uuid {
- uuid_le uuid;
+ __uuid_t uuid;
};
#if 0
@@ -160,11 +180,18 @@ struct bch_ioctl_disk_set_state {
__u64 dev;
};
+#define BCH_DATA_OPS() \
+ x(scrub, 0) \
+ x(rereplicate, 1) \
+ x(migrate, 2) \
+ x(rewrite_old_nodes, 3) \
+ x(drop_extra_replicas, 4)
+
enum bch_data_ops {
- BCH_DATA_OP_SCRUB = 0,
- BCH_DATA_OP_REREPLICATE = 1,
- BCH_DATA_OP_MIGRATE = 2,
- BCH_DATA_OP_NR = 3,
+#define x(t, n) BCH_DATA_OP_##t = n,
+ BCH_DATA_OPS()
+#undef x
+ BCH_DATA_OP_NR
};
/*
@@ -177,11 +204,13 @@ enum bch_data_ops {
* job. The file descriptor is O_CLOEXEC.
*/
struct bch_ioctl_data {
- __u32 op;
+ __u16 op;
+ __u8 start_btree;
+ __u8 end_btree;
__u32 flags;
- struct bpos start;
- struct bpos end;
+ struct bpos start_pos;
+ struct bpos end_pos;
union {
struct {
@@ -192,7 +221,7 @@ struct bch_ioctl_data {
__u64 pad[8];
};
};
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
enum bch_data_event {
BCH_DATA_EVENT_PROGRESS = 0,
@@ -208,7 +237,7 @@ struct bch_ioctl_data_progress {
__u64 sectors_done;
__u64 sectors_total;
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
struct bch_ioctl_data_event {
__u8 type;
@@ -217,48 +246,87 @@ struct bch_ioctl_data_event {
struct bch_ioctl_data_progress p;
__u64 pad2[15];
};
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
-struct bch_ioctl_dev_usage {
- __u8 state;
- __u8 alive;
- __u8 pad[6];
- __u32 dev;
+struct bch_replicas_usage {
+ __u64 sectors;
+ struct bch_replicas_entry_v1 r;
+} __packed;
- __u32 bucket_size;
- __u64 nr_buckets;
+static inline unsigned replicas_usage_bytes(struct bch_replicas_usage *u)
+{
+ return offsetof(struct bch_replicas_usage, r) + replicas_entry_bytes(&u->r);
+}
- __u64 buckets[BCH_DATA_NR];
- __u64 sectors[BCH_DATA_NR];
-};
+static inline struct bch_replicas_usage *
+replicas_usage_next(struct bch_replicas_usage *u)
+{
+ return (void *) u + replicas_usage_bytes(u);
+}
+/* Obsolete */
+/*
+ * BCH_IOCTL_FS_USAGE: query filesystem disk space usage
+ *
+ * Returns disk space usage broken out by data type, number of replicas, and
+ * by component device
+ *
+ * @replica_entries_bytes - size, in bytes, allocated for replica usage entries
+ *
+ * On success, @replica_entries_bytes will be changed to indicate the number of
+ * bytes actually used.
+ *
+ * Returns -ERANGE if @replica_entries_bytes was too small
+ */
struct bch_ioctl_fs_usage {
__u64 capacity;
__u64 used;
__u64 online_reserved;
__u64 persistent_reserved[BCH_REPLICAS_MAX];
- __u64 sectors[BCH_DATA_NR][BCH_REPLICAS_MAX];
+
+ __u32 replica_entries_bytes;
+ __u32 pad;
+
+ struct bch_replicas_usage replicas[];
};
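/*
 * Illustrative userspace sketch, not part of this diff: the buffer-sizing
 * contract documented above - grow the trailing replicas[] buffer and retry
 * on -ERANGE, then step through the variable-length entries with
 * replicas_usage_next().  `fs_fd` is assumed to be a filesystem ioctl
 * descriptor obtained elsewhere; error handling is minimal.
 */
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include "bcachefs_ioctl.h"

static __u64 total_replicated_sectors(int fs_fd)
{
	size_t replica_bytes = 4096;
	struct bch_ioctl_fs_usage *u = NULL;
	__u64 sectors = 0;

	while (1) {
		void *n = realloc(u, sizeof(*u) + replica_bytes);
		if (!n) {
			free(u);
			return 0;
		}
		u = n;

		memset(u, 0, sizeof(*u));
		u->replica_entries_bytes = replica_bytes;

		if (!ioctl(fs_fd, BCH_IOCTL_FS_USAGE, u))
			break;		/* replica_entries_bytes now = bytes used */

		if (errno != ERANGE) {
			free(u);
			return 0;
		}
		replica_bytes *= 2;	/* buffer too small - retry bigger */
	}

	/* entries are variable length; step with replicas_usage_next(): */
	for (struct bch_replicas_usage *r = u->replicas;
	     (void *) r < (void *) u->replicas + u->replica_entries_bytes;
	     r = replicas_usage_next(r))
		sectors += r->sectors;

	free(u);
	return sectors;
}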
+/* Obsolete */
/*
- * BCH_IOCTL_USAGE: query filesystem disk space usage
- *
- * Returns disk space usage broken out by data type, number of replicas, and
- * by component device
- *
- * @nr_devices - number of devices userspace allocated space for in @devs
- *
- * On success, @fs and @devs will be filled out appropriately and devs[i].alive
- * will indicate if a device was present in that slot
+ * BCH_IOCTL_DEV_USAGE: query device disk space usage
*
- * Returns -ERANGE if @nr_devices was too small
+ * Returns disk space usage broken out by data type - both by buckets and
+ * sectors.
*/
-struct bch_ioctl_usage {
- __u16 nr_devices;
- __u16 pad[3];
+struct bch_ioctl_dev_usage {
+ __u64 dev;
+ __u32 flags;
+ __u8 state;
+ __u8 pad[7];
+
+ __u32 bucket_size;
+ __u64 nr_buckets;
+
+ __u64 buckets_ec;
- struct bch_ioctl_fs_usage fs;
- struct bch_ioctl_dev_usage devs[0];
+ struct bch_ioctl_dev_usage_type {
+ __u64 buckets;
+ __u64 sectors;
+ __u64 fragmented;
+ } d[10];
+};
+
+/* Obsolete */
+struct bch_ioctl_dev_usage_v2 {
+ __u64 dev;
+ __u32 flags;
+ __u8 state;
+ __u8 nr_data_types;
+ __u8 pad[6];
+
+ __u32 bucket_size;
+ __u64 nr_buckets;
+
+ struct bch_ioctl_dev_usage_type d[];
};
/*
@@ -306,4 +374,73 @@ struct bch_ioctl_disk_resize {
__u64 nbuckets;
};
+/*
+ * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device
+ *
+ * @dev - member to resize
+ * @nbuckets - new number of buckets
+ */
+struct bch_ioctl_disk_resize_journal {
+ __u32 flags;
+ __u32 pad;
+ __u64 dev;
+ __u64 nbuckets;
+};
+
+struct bch_ioctl_subvolume {
+ __u32 flags;
+ __u32 dirfd;
+ __u16 mode;
+ __u16 pad[3];
+ __u64 dst_ptr;
+ __u64 src_ptr;
+};
+
+#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0)
+#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1)
+
+/*
+ * BCH_IOCTL_FSCK_OFFLINE: run fsck from the 'bcachefs fsck' userspace command,
+ * but with the kernel's implementation of fsck:
+ */
+struct bch_ioctl_fsck_offline {
+ __u64 flags;
+ __u64 opts; /* string */
+ __u64 nr_devs;
+ __u64 devs[] __counted_by(nr_devs);
+};
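/*
 * Illustrative userspace sketch, not part of this diff: `opts` and `devs[]`
 * carry userspace pointers passed as __u64, as the comments above hint.
 * `ctl_fd` is assumed to be a descriptor on the bcachefs control device, and
 * the option string / device paths are placeholders supplied by the caller.
 */
#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include "bcachefs_ioctl.h"

static int fsck_offline_example(int ctl_fd, const char **dev_paths,
				unsigned nr_devs, const char *opt_string)
{
	struct bch_ioctl_fsck_offline *arg =
		calloc(1, sizeof(*arg) + sizeof(arg->devs[0]) * nr_devs);
	int ret;

	if (!arg)
		return -1;

	arg->opts	= (uintptr_t) opt_string;	/* option string, e.g. from the caller */
	arg->nr_devs	= nr_devs;
	for (unsigned i = 0; i < nr_devs; i++)
		arg->devs[i] = (uintptr_t) dev_paths[i];

	ret = ioctl(ctl_fd, BCH_IOCTL_FSCK_OFFLINE, arg);
	free(arg);
	return ret;
}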
+
+/*
+ * BCH_IOCTL_FSCK_ONLINE: run fsck from the 'bcachefs fsck' userspace command,
+ * but with the kernel's implementation of fsck:
+ */
+struct bch_ioctl_fsck_online {
+ __u64 flags;
+ __u64 opts; /* string */
+};
+
+/*
+ * BCH_IOCTL_QUERY_ACCOUNTING: query filesystem disk accounting
+ *
+ * Returns disk space usage broken out by data type, number of replicas, and
+ * by component device
+ *
+ * @replica_entries_bytes - size, in bytes, allocated for replica usage entries
+ *
+ * On success, @replica_entries_bytes will be changed to indicate the number of
+ * bytes actually used.
+ *
+ * Returns -ERANGE if @replica_entries_bytes was too small
+ */
+struct bch_ioctl_query_accounting {
+ __u64 capacity;
+ __u64 used;
+ __u64 online_reserved;
+
+ __u32 accounting_u64s; /* input parameter */
+ __u32 accounting_types_mask; /* input parameter */
+
+ struct bkey_i_accounting accounting[];
+};
+
#endif /* _BCACHEFS_IOCTL_H */
diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c
index 850ba72c..995ba32e 100644
--- a/libbcachefs/bkey.c
+++ b/libbcachefs/bkey.c
@@ -1,50 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "bkey.h"
+#include "bkey_cmp.h"
#include "bkey_methods.h"
#include "bset.h"
#include "util.h"
-#undef EBUG_ON
-
-#ifdef DEBUG_BKEYS
-#define EBUG_ON(cond) BUG_ON(cond)
-#else
-#define EBUG_ON(cond)
-#endif
-
const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;
-struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
- const struct bkey_packed *);
-
-void bch2_to_binary(char *out, const u64 *p, unsigned nr_bits)
+void bch2_bkey_packed_to_binary_text(struct printbuf *out,
+ const struct bkey_format *f,
+ const struct bkey_packed *k)
{
- unsigned bit = high_bit_offset, done = 0;
+ const u64 *p = high_word(f, k);
+ unsigned word_bits = 64 - high_bit_offset;
+ unsigned nr_key_bits = bkey_format_key_bits(f) + high_bit_offset;
+ u64 v = *p & (~0ULL >> high_bit_offset);
+
+ if (!nr_key_bits) {
+ prt_str(out, "(empty)");
+ return;
+ }
while (1) {
- while (bit < 64) {
- if (done && !(done % 8))
- *out++ = ' ';
- *out++ = *p & (1ULL << (63 - bit)) ? '1' : '0';
- bit++;
- done++;
- if (done == nr_bits) {
- *out++ = '\0';
- return;
- }
+ unsigned next_key_bits = nr_key_bits;
+
+ if (nr_key_bits < 64) {
+ v >>= 64 - nr_key_bits;
+ next_key_bits = 0;
+ } else {
+ next_key_bits -= 64;
}
+ bch2_prt_u64_base2_nbits(out, v, min(word_bits, nr_key_bits));
+
+ if (!next_key_bits)
+ break;
+
+ prt_char(out, ' ');
+
p = next_word(p);
- bit = 0;
+ v = *p;
+ word_bits = 64;
+ nr_key_bits = next_key_bits;
}
}
#ifdef CONFIG_BCACHEFS_DEBUG
static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
- const struct bkey *unpacked,
- const struct bkey_format *format)
+ const struct bkey *unpacked,
+ const struct bkey_format *format)
{
struct bkey tmp;
@@ -56,22 +63,35 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
tmp = __bch2_bkey_unpack_key(format, packed);
if (memcmp(&tmp, unpacked, sizeof(struct bkey))) {
- char buf1[160], buf2[160];
- char buf3[160], buf4[160];
+ struct printbuf buf = PRINTBUF;
- bch2_bkey_to_text(buf1, sizeof(buf1), unpacked);
- bch2_bkey_to_text(buf2, sizeof(buf2), &tmp);
- bch2_to_binary(buf3, (void *) unpacked, 80);
- bch2_to_binary(buf4, high_word(format, packed), 80);
-
- panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n",
+ prt_printf(&buf, "keys differ: format u64s %u fields %u %u %u %u %u\n",
format->key_u64s,
format->bits_per_field[0],
format->bits_per_field[1],
format->bits_per_field[2],
format->bits_per_field[3],
- format->bits_per_field[4],
- buf1, buf2, buf3, buf4);
+ format->bits_per_field[4]);
+
+ prt_printf(&buf, "compiled unpack: ");
+ bch2_bkey_to_text(&buf, unpacked);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "c unpack: ");
+ bch2_bkey_to_text(&buf, &tmp);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "compiled unpack: ");
+ bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
+ (struct bkey_packed *) unpacked);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "c unpack: ");
+ bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
+ (struct bkey_packed *) &tmp);
+ prt_newline(&buf);
+
+ panic("%s", buf.buf);
}
}
@@ -107,7 +127,7 @@ static void pack_state_finish(struct pack_state *state,
struct bkey_packed *k)
{
EBUG_ON(state->p < k->_data);
- EBUG_ON(state->p >= k->_data + state->format->key_u64s);
+ EBUG_ON(state->p >= (u64 *) k->_data + state->format->key_u64s);
*state->p = state->w;
}
@@ -157,6 +177,28 @@ static u64 get_inc_field(struct unpack_state *state, unsigned field)
}
__always_inline
+static void __set_inc_field(struct pack_state *state, unsigned field, u64 v)
+{
+ unsigned bits = state->format->bits_per_field[field];
+
+ if (bits) {
+ if (bits > state->bits) {
+ bits -= state->bits;
+ /* avoid shift by 64 if bits is 64 - bits is never 0 here: */
+ state->w |= (v >> 1) >> (bits - 1);
+
+ *state->p = state->w;
+ state->p = next_word(state->p);
+ state->w = 0;
+ state->bits = 64;
+ }
+
+ state->bits -= bits;
+ state->w |= v << state->bits;
+ }
+}
+
+__always_inline
static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
{
unsigned bits = state->format->bits_per_field[field];
@@ -170,20 +212,7 @@ static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
if (fls64(v) > bits)
return false;
- if (bits > state->bits) {
- bits -= state->bits;
- /* avoid shift by 64 if bits is 0 - bits is never 64 here: */
- state->w |= (v >> 1) >> (bits - 1);
-
- *state->p = state->w;
- state->p = next_word(state->p);
- state->w = 0;
- state->bits = 64;
- }
-
- state->bits -= bits;
- state->w |= v << state->bits;
-
+ __set_inc_field(state, field, v);
return true;
}
@@ -200,9 +229,10 @@ static bool bch2_bkey_transform_key(const struct bkey_format *out_f,
{
struct pack_state out_s = pack_state_init(out_f, out);
struct unpack_state in_s = unpack_state_init(in_f, in);
+ u64 *w = out->_data;
unsigned i;
- out->_data[0] = 0;
+ *w = 0;
for (i = 0; i < BKEY_NR_FIELDS; i++)
if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i)))
@@ -233,14 +263,6 @@ bool bch2_bkey_transform(const struct bkey_format *out_f,
return true;
}
-#define bkey_fields() \
- x(BKEY_FIELD_INODE, p.inode) \
- x(BKEY_FIELD_OFFSET, p.offset) \
- x(BKEY_FIELD_SNAPSHOT, p.snapshot) \
- x(BKEY_FIELD_SIZE, size) \
- x(BKEY_FIELD_VERSION_HI, version.hi) \
- x(BKEY_FIELD_VERSION_LO, version.lo)
-
struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format,
const struct bkey_packed *in)
{
@@ -286,30 +308,27 @@ struct bpos __bkey_unpack_pos(const struct bkey_format *format,
/**
* bch2_bkey_pack_key -- pack just the key, not the value
+ * @out: packed result
+ * @in: key to pack
+ * @format: format of packed result
+ *
+ * Returns: true on success, false on failure
*/
bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
- const struct bkey_format *format)
+ const struct bkey_format *format)
{
struct pack_state state = pack_state_init(format, out);
+ u64 *w = out->_data;
EBUG_ON((void *) in == (void *) out);
EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
EBUG_ON(in->format != KEY_FORMAT_CURRENT);
- out->_data[0] = 0;
+ *w = 0;
#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false;
bkey_fields()
#undef x
-
- /*
- * Extents - we have to guarantee that if an extent is packed, a trimmed
- * version will also pack:
- */
- if (bkey_start_offset(in) <
- le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET]))
- return false;
-
pack_state_finish(&state, out);
out->u64s = format->key_u64s + in->u64s - BKEY_U64s;
out->format = KEY_FORMAT_LOCAL_BTREE;
@@ -322,11 +341,14 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
/**
* bch2_bkey_unpack -- unpack the key and the value
+ * @b: btree node of @src key (for packed format)
+ * @dst: unpacked result
+ * @src: packed input
*/
void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst,
- const struct bkey_packed *src)
+ const struct bkey_packed *src)
{
- dst->k = bkey_unpack_key(b, src);
+ __bkey_unpack_key(b, &dst->k, src);
memcpy_u64s(&dst->v,
bkeyp_val(&b->format, src),
@@ -335,19 +357,24 @@ void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst,
/**
* bch2_bkey_pack -- pack the key and the value
+ * @dst: packed result
+ * @src: unpacked input
+ * @format: format of packed result
+ *
+ * Returns: true on success, false on failure
*/
-bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in,
- const struct bkey_format *format)
+bool bch2_bkey_pack(struct bkey_packed *dst, const struct bkey_i *src,
+ const struct bkey_format *format)
{
struct bkey_packed tmp;
- if (!bch2_bkey_pack_key(&tmp, &in->k, format))
+ if (!bch2_bkey_pack_key(&tmp, &src->k, format))
return false;
- memmove_u64s((u64 *) out + format->key_u64s,
- &in->v,
- bkey_val_u64s(&in->k));
- memcpy_u64s(out, &tmp, format->key_u64s);
+ memmove_u64s((u64 *) dst + format->key_u64s,
+ &src->v,
+ bkey_val_u64s(&src->k));
+ memcpy_u64s_small(dst, &tmp, format->key_u64s);
return true;
}
@@ -367,19 +394,7 @@ static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v)
ret = false;
}
- if (bits > state->bits) {
- bits -= state->bits;
- state->w |= (v >> 1) >> (bits - 1);
-
- *state->p = state->w;
- state->p = next_word(state->p);
- state->w = 0;
- state->bits = 64;
- }
-
- state->bits -= bits;
- state->w |= v << state->bits;
-
+ __set_inc_field(state, field, v);
return ret;
}
@@ -410,7 +425,7 @@ static bool bkey_packed_successor(struct bkey_packed *out,
if ((*p & mask) != mask) {
*p += 1ULL << offset;
- EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0);
+ EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0);
return true;
}
@@ -422,6 +437,24 @@ static bool bkey_packed_successor(struct bkey_packed *out,
return false;
}
+
+static bool bkey_format_has_too_big_fields(const struct bkey_format *f)
+{
+ for (unsigned i = 0; i < f->nr_fields; i++) {
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+ u64 packed_max = f->bits_per_field[i]
+ ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
+ : 0;
+ u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+ if (packed_max + field_offset < packed_max ||
+ packed_max + field_offset > unpacked_max)
+ return true;
+ }
+
+ return false;
+}
#endif
/*
@@ -438,12 +471,20 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
{
const struct bkey_format *f = &b->format;
struct pack_state state = pack_state_init(f, out);
+ u64 *w = out->_data;
#ifdef CONFIG_BCACHEFS_DEBUG
struct bpos orig = in;
#endif
bool exact = true;
+ unsigned i;
- out->_data[0] = 0;
+ /*
+ * bch2_bkey_pack_key() will write to all of f->key_u64s, minus the 3
+ * byte header, but pack_pos() won't if the len/version fields are big
+ * enough - we need to make sure to zero them out:
+ */
+ for (i = 0; i < f->key_u64s; i++)
+ w[i] = 0;
if (unlikely(in.snapshot <
le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) {
@@ -467,24 +508,24 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
le64_to_cpu(f->field_offset[BKEY_FIELD_INODE])))
return BKEY_PACK_POS_FAIL;
- if (!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode)) {
+ if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode))) {
in.offset = KEY_OFFSET_MAX;
in.snapshot = KEY_SNAPSHOT_MAX;
exact = false;
}
- if (!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset)) {
+ if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset))) {
in.snapshot = KEY_SNAPSHOT_MAX;
exact = false;
}
- if (!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot))
+ if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)))
exact = false;
pack_state_finish(&state, out);
out->u64s = f->key_u64s;
out->format = KEY_FORMAT_LOCAL_BTREE;
- out->type = KEY_TYPE_DELETED;
+ out->type = KEY_TYPE_deleted;
#ifdef CONFIG_BCACHEFS_DEBUG
if (exact) {
@@ -494,7 +535,8 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
BUG_ON(bkey_packed_successor(&successor, b, *out) &&
- bkey_cmp_left_packed(b, &successor, &orig) < 0);
+ bkey_cmp_left_packed(b, &successor, &orig) < 0 &&
+ !bkey_format_has_too_big_fields(f));
}
#endif
@@ -515,24 +557,6 @@ void bch2_bkey_format_init(struct bkey_format_state *s)
s->field_min[BKEY_FIELD_SIZE] = 0;
}
-static void __bkey_format_add(struct bkey_format_state *s,
- unsigned field, u64 v)
-{
- s->field_min[field] = min(s->field_min[field], v);
- s->field_max[field] = max(s->field_max[field], v);
-}
-
-/*
- * Changes @format so that @k can be successfully packed with @format
- */
-void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k)
-{
-#define x(id, field) __bkey_format_add(s, id, k->field);
- bkey_fields()
-#undef x
- __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k));
-}
-
void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p)
{
unsigned field = 0;
@@ -550,7 +574,12 @@ void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p)
static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i,
unsigned bits, u64 offset)
{
- offset = bits == 64 ? 0 : min(offset, U64_MAX - ((1ULL << bits) - 1));
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+
+ bits = min(bits, unpacked_bits);
+
+ offset = bits == unpacked_bits ? 0 : min(offset, unpacked_max - ((1ULL << bits) - 1));
f->bits_per_field[i] = bits;
f->field_offset[i] = cpu_to_le64(offset);
@@ -575,8 +604,10 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
/* allow for extent merging: */
if (ret.bits_per_field[BKEY_FIELD_SIZE]) {
- ret.bits_per_field[BKEY_FIELD_SIZE] += 4;
- bits += 4;
+ unsigned b = min(4U, 32U - ret.bits_per_field[BKEY_FIELD_SIZE]);
+
+ ret.bits_per_field[BKEY_FIELD_SIZE] += b;
+ bits += b;
}
ret.key_u64s = DIV_ROUND_UP(bits, 64);
@@ -596,36 +627,71 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
}
}
- EBUG_ON(bch2_bkey_format_validate(&ret));
+#ifdef CONFIG_BCACHEFS_DEBUG
+ {
+ struct printbuf buf = PRINTBUF;
+
+ BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf));
+ printbuf_exit(&buf);
+ }
+#endif
return ret;
}
-const char *bch2_bkey_format_validate(struct bkey_format *f)
+int bch2_bkey_format_invalid(struct bch_fs *c,
+ struct bkey_format *f,
+ enum bch_validate_flags flags,
+ struct printbuf *err)
{
- unsigned i, bits = KEY_PACKED_BITS_START;
-
- if (f->nr_fields != BKEY_NR_FIELDS)
- return "incorrect number of fields";
-
- for (i = 0; i < f->nr_fields; i++) {
- u64 field_offset = le64_to_cpu(f->field_offset[i]);
+ unsigned bits = KEY_PACKED_BITS_START;
- if (f->bits_per_field[i] > 64)
- return "field too large";
+ if (f->nr_fields != BKEY_NR_FIELDS) {
+ prt_printf(err, "incorrect number of fields: got %u, should be %u",
+ f->nr_fields, BKEY_NR_FIELDS);
+ return -BCH_ERR_invalid;
+ }
- if (field_offset &&
- (f->bits_per_field[i] == 64 ||
- (field_offset + ((1ULL << f->bits_per_field[i]) - 1) <
- field_offset)))
- return "offset + bits overflow";
+ /*
+ * Verify that the packed format can't represent fields larger than the
+ * unpacked format:
+ */
+ for (unsigned i = 0; i < f->nr_fields; i++) {
+ if (bch2_bkey_format_field_overflows(f, i)) {
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+ unsigned packed_bits = min(64, f->bits_per_field[i]);
+ u64 packed_max = packed_bits
+ ? ~((~0ULL << 1) << (packed_bits - 1))
+ : 0;
+
+ prt_printf(err, "field %u too large: %llu + %llu > %llu",
+ i, packed_max, le64_to_cpu(f->field_offset[i]), unpacked_max);
+ return -BCH_ERR_invalid;
+ }
bits += f->bits_per_field[i];
}
- if (f->key_u64s != DIV_ROUND_UP(bits, 64))
- return "incorrect key_u64s";
+ if (f->key_u64s != DIV_ROUND_UP(bits, 64)) {
+ prt_printf(err, "incorrect key_u64s: got %u, should be %u",
+ f->key_u64s, DIV_ROUND_UP(bits, 64));
+ return -BCH_ERR_invalid;
+ }
+
+ return 0;
+}
- return NULL;
+void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f)
+{
+ prt_printf(out, "u64s %u fields ", f->key_u64s);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(f->bits_per_field); i++) {
+ if (i)
+ prt_str(out, ", ");
+ prt_printf(out, "%u:%llu",
+ f->bits_per_field[i],
+ le64_to_cpu(f->field_offset[i]));
+ }
}
/*
@@ -712,51 +778,7 @@ unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k)
return 0;
}
-#ifdef CONFIG_X86_64
-
-static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
- unsigned nr_key_bits)
-{
- long d0, d1, d2, d3;
- int cmp;
-
- /* we shouldn't need asm for this, but gcc is being retarded: */
-
- asm(".intel_syntax noprefix;"
- "xor eax, eax;"
- "xor edx, edx;"
- "1:;"
- "mov r8, [rdi];"
- "mov r9, [rsi];"
- "sub ecx, 64;"
- "jl 2f;"
-
- "cmp r8, r9;"
- "jnz 3f;"
-
- "lea rdi, [rdi - 8];"
- "lea rsi, [rsi - 8];"
- "jmp 1b;"
-
- "2:;"
- "not ecx;"
- "shr r8, 1;"
- "shr r9, 1;"
- "shr r8, cl;"
- "shr r9, cl;"
- "cmp r8, r9;"
-
- "3:\n"
- "seta al;"
- "setb dl;"
- "sub eax, edx;"
- ".att_syntax prefix;"
- : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp)
- : "0" (l), "1" (r), "3" (nr_key_bits)
- : "r8", "r9", "cc", "memory");
-
- return cmp;
-}
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
#define I(_x) (*(out)++ = (_x))
#define I1(i0) I(i0)
@@ -988,41 +1010,6 @@ int bch2_compile_bkey_format(const struct bkey_format *format, void *_out)
}
#else
-static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
- unsigned nr_key_bits)
-{
- u64 l_v, r_v;
-
- if (!nr_key_bits)
- return 0;
-
- /* for big endian, skip past header */
- nr_key_bits += high_bit_offset;
- l_v = *l & (~0ULL >> high_bit_offset);
- r_v = *r & (~0ULL >> high_bit_offset);
-
- while (1) {
- if (nr_key_bits < 64) {
- l_v >>= 64 - nr_key_bits;
- r_v >>= 64 - nr_key_bits;
- nr_key_bits = 0;
- } else {
- nr_key_bits -= 64;
- }
-
- if (l_v != r_v)
- return l_v < r_v ? -1 : 1;
-
- if (!nr_key_bits)
- return 0;
-
- l = next_word(l);
- r = next_word(r);
-
- l_v = *l;
- r_v = *r;
- }
-}
#endif
__pure
@@ -1030,19 +1017,7 @@ int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l,
const struct bkey_packed *r,
const struct btree *b)
{
- const struct bkey_format *f = &b->format;
- int ret;
-
- EBUG_ON(!bkey_packed(l) || !bkey_packed(r));
- EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
-
- ret = __bkey_cmp_bits(high_word(f, l),
- high_word(f, r),
- b->nr_key_bits);
-
- EBUG_ON(ret != bkey_cmp(bkey_unpack_pos(b, l),
- bkey_unpack_pos(b, r)));
- return ret;
+ return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
}
__pure __flatten
@@ -1050,34 +1025,15 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b,
const struct bkey_packed *l,
const struct bpos *r)
{
- return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r);
+ return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r);
}
__pure __flatten
-int __bch2_bkey_cmp_packed(const struct bkey_packed *l,
- const struct bkey_packed *r,
- const struct btree *b)
+int bch2_bkey_cmp_packed(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
{
- int packed = bkey_lr_packed(l, r);
-
- if (likely(packed == BKEY_PACKED_BOTH))
- return __bch2_bkey_cmp_packed_format_checked(l, r, b);
-
- switch (packed) {
- case BKEY_PACKED_NONE:
- return bkey_cmp(((struct bkey *) l)->p,
- ((struct bkey *) r)->p);
- case BKEY_PACKED_LEFT:
- return __bch2_bkey_cmp_left_packed_format_checked(b,
- (struct bkey_packed *) l,
- &((struct bkey *) r)->p);
- case BKEY_PACKED_RIGHT:
- return -__bch2_bkey_cmp_left_packed_format_checked(b,
- (struct bkey_packed *) r,
- &((struct bkey *) l)->p);
- default:
- unreachable();
- }
+ return bch2_bkey_cmp_packed_inlined(b, l, r);
}
__pure __flatten
@@ -1088,7 +1044,7 @@ int __bch2_bkey_cmp_left_packed(const struct btree *b,
const struct bkey *l_unpacked;
return unlikely(l_unpacked = packed_to_bkey_c(l))
- ? bkey_cmp(l_unpacked->p, *r)
+ ? bpos_cmp(l_unpacked->p, *r)
: __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
}
@@ -1108,7 +1064,7 @@ void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k)
{
const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current;
u8 *l = k->key_start;
- u8 *h = (u8 *) (k->_data + f->key_u64s) - 1;
+ u8 *h = (u8 *) ((u64 *) k->_data + f->key_u64s) - 1;
while (l < h) {
swap(*l, *h);
@@ -1124,11 +1080,12 @@ void bch2_bkey_pack_test(void)
struct bkey_packed p;
struct bkey_format test_format = {
- .key_u64s = 2,
+ .key_u64s = 3,
.nr_fields = BKEY_NR_FIELDS,
.bits_per_field = {
13,
64,
+ 32,
},
};
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index bd1d21b0..054e2d5e 100644
--- a/libbcachefs/bkey.h
+++ b/libbcachefs/bkey.h
@@ -1,68 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BKEY_H
#define _BCACHEFS_BKEY_H
#include <linux/bug.h>
#include "bcachefs_format.h"
-
+#include "bkey_types.h"
+#include "btree_types.h"
#include "util.h"
#include "vstructs.h"
+#if 0
+
+/*
+ * compiled unpack functions are disabled, pending a new interface for
+ * dynamically allocating executable memory:
+ */
+
#ifdef CONFIG_X86_64
#define HAVE_BCACHEFS_COMPILED_UNPACK 1
#endif
+#endif
-void bch2_to_binary(char *, const u64 *, unsigned);
-
-/* bkey with split value, const */
-struct bkey_s_c {
- const struct bkey *k;
- const struct bch_val *v;
-};
-
-/* bkey with split value */
-struct bkey_s {
- union {
- struct {
- struct bkey *k;
- struct bch_val *v;
- };
- struct bkey_s_c s_c;
- };
-};
-
-#define bkey_next(_k) vstruct_next(_k)
-
-static inline unsigned bkey_val_u64s(const struct bkey *k)
-{
- return k->u64s - BKEY_U64s;
-}
-
-static inline size_t bkey_val_bytes(const struct bkey *k)
-{
- return bkey_val_u64s(k) * sizeof(u64);
-}
-
-static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
-{
- k->u64s = BKEY_U64s + val_u64s;
-}
-
-static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
-{
- k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64));
-}
-
-#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_DELETED)
-
-#define bkey_whiteout(_k) \
- ((_k)->type == KEY_TYPE_DELETED || (_k)->type == KEY_TYPE_DISCARD)
-
-#define bkey_packed_typecheck(_k) \
-({ \
- BUILD_BUG_ON(!type_is(_k, struct bkey *) && \
- !type_is(_k, struct bkey_packed *)); \
- type_is(_k, struct bkey_packed *); \
-})
+void bch2_bkey_packed_to_binary_text(struct printbuf *,
+ const struct bkey_format *,
+ const struct bkey_packed *);
enum bkey_lr_packed {
BKEY_PACKED_BOTH,
@@ -71,38 +32,20 @@ enum bkey_lr_packed {
BKEY_PACKED_NONE,
};
-#define bkey_lr_packed_typecheck(_l, _r) \
- (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1))
-
#define bkey_lr_packed(_l, _r) \
((_l)->format + ((_r)->format << 1))
-#define bkey_copy(_dst, _src) \
-do { \
- BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \
- !type_is(_dst, struct bkey_packed *)); \
- BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \
- !type_is(_src, struct bkey_packed *)); \
- EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \
- (u64 *) (_dst) < (u64 *) (_src) + \
- ((struct bkey *) (_src))->u64s); \
- \
- __memmove_u64s_down((_dst), (_src), \
- ((struct bkey *) (_src))->u64s); \
-} while (0)
-
-struct btree;
+static inline void bkey_p_copy(struct bkey_packed *dst, const struct bkey_packed *src)
+{
+ memcpy_u64s_small(dst, src, src->u64s);
+}
-struct bkey_format_state {
- u64 field_min[BKEY_NR_FIELDS];
- u64 field_max[BKEY_NR_FIELDS];
-};
+static inline void bkey_copy(struct bkey_i *dst, const struct bkey_i *src)
+{
+ memcpy_u64s_small(dst, src, src->k.u64s);
+}
-void bch2_bkey_format_init(struct bkey_format_state *);
-void bch2_bkey_format_add_key(struct bkey_format_state *, const struct bkey *);
-void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
-struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
-const char *bch2_bkey_format_validate(struct bkey_format *);
+struct btree;
__pure
unsigned bch2_bkey_greatest_differing_bit(const struct btree *,
@@ -122,9 +65,9 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *,
const struct bpos *);
__pure
-int __bch2_bkey_cmp_packed(const struct bkey_packed *,
- const struct bkey_packed *,
- const struct btree *);
+int bch2_bkey_cmp_packed(const struct btree *,
+ const struct bkey_packed *,
+ const struct bkey_packed *);
__pure
int __bch2_bkey_cmp_left_packed(const struct btree *,
@@ -139,8 +82,9 @@ int bkey_cmp_left_packed(const struct btree *b,
}
/*
- * we prefer to pass bpos by ref, but it's often enough terribly convenient to
- * pass it by by val... as much as I hate c++, const ref would be nice here:
+ * The compiler generates better code when we pass bpos by ref, but it's often
+ * enough terribly convenient to pass it by val... as much as I hate c++, const
+ * ref would be nice here:
*/
__pure __flatten
static inline int bkey_cmp_left_packed_byval(const struct btree *b,
@@ -150,55 +94,105 @@ static inline int bkey_cmp_left_packed_byval(const struct btree *b,
return bkey_cmp_left_packed(b, l, &r);
}
-/*
- * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to
- * skip dispatching on k->format:
- */
-#define bkey_cmp_packed(_b, _l, _r) \
-({ \
- int _cmp; \
- \
- switch (bkey_lr_packed_typecheck(_l, _r)) { \
- case BKEY_PACKED_NONE: \
- _cmp = bkey_cmp(((struct bkey *) (_l))->p, \
- ((struct bkey *) (_r))->p); \
- break; \
- case BKEY_PACKED_LEFT: \
- _cmp = bkey_cmp_left_packed((_b), \
- (struct bkey_packed *) (_l), \
- &((struct bkey *) (_r))->p); \
- break; \
- case BKEY_PACKED_RIGHT: \
- _cmp = -bkey_cmp_left_packed((_b), \
- (struct bkey_packed *) (_r), \
- &((struct bkey *) (_l))->p); \
- break; \
- case BKEY_PACKED_BOTH: \
- _cmp = __bch2_bkey_cmp_packed((void *) (_l), \
- (void *) (_r), (_b)); \
- break; \
- } \
- _cmp; \
-})
-
-#if 1
-static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
+static __always_inline bool bpos_eq(struct bpos l, struct bpos r)
{
- if (l.inode != r.inode)
- return l.inode < r.inode ? -1 : 1;
- if (l.offset != r.offset)
- return l.offset < r.offset ? -1 : 1;
- if (l.snapshot != r.snapshot)
- return l.snapshot < r.snapshot ? -1 : 1;
- return 0;
+ return !((l.inode ^ r.inode) |
+ (l.offset ^ r.offset) |
+ (l.snapshot ^ r.snapshot));
+}
+
+static __always_inline bool bpos_lt(struct bpos l, struct bpos r)
+{
+ return l.inode != r.inode ? l.inode < r.inode :
+ l.offset != r.offset ? l.offset < r.offset :
+ l.snapshot != r.snapshot ? l.snapshot < r.snapshot : false;
+}
+
+static __always_inline bool bpos_le(struct bpos l, struct bpos r)
+{
+ return l.inode != r.inode ? l.inode < r.inode :
+ l.offset != r.offset ? l.offset < r.offset :
+ l.snapshot != r.snapshot ? l.snapshot < r.snapshot : true;
+}
+
+static __always_inline bool bpos_gt(struct bpos l, struct bpos r)
+{
+ return bpos_lt(r, l);
+}
+
+static __always_inline bool bpos_ge(struct bpos l, struct bpos r)
+{
+ return bpos_le(r, l);
+}
+
+static __always_inline int bpos_cmp(struct bpos l, struct bpos r)
+{
+ return cmp_int(l.inode, r.inode) ?:
+ cmp_int(l.offset, r.offset) ?:
+ cmp_int(l.snapshot, r.snapshot);
}
-#else
-int bkey_cmp(struct bpos l, struct bpos r);
-#endif
static inline struct bpos bpos_min(struct bpos l, struct bpos r)
{
- return bkey_cmp(l, r) < 0 ? l : r;
+ return bpos_lt(l, r) ? l : r;
+}
+
+static inline struct bpos bpos_max(struct bpos l, struct bpos r)
+{
+ return bpos_gt(l, r) ? l : r;
+}
+
+static __always_inline bool bkey_eq(struct bpos l, struct bpos r)
+{
+ return !((l.inode ^ r.inode) |
+ (l.offset ^ r.offset));
+}
+
+static __always_inline bool bkey_lt(struct bpos l, struct bpos r)
+{
+ return l.inode != r.inode
+ ? l.inode < r.inode
+ : l.offset < r.offset;
+}
+
+static __always_inline bool bkey_le(struct bpos l, struct bpos r)
+{
+ return l.inode != r.inode
+ ? l.inode < r.inode
+ : l.offset <= r.offset;
+}
+
+static __always_inline bool bkey_gt(struct bpos l, struct bpos r)
+{
+ return bkey_lt(r, l);
+}
+
+static __always_inline bool bkey_ge(struct bpos l, struct bpos r)
+{
+ return bkey_le(r, l);
+}
+
+static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
+{
+ return cmp_int(l.inode, r.inode) ?:
+ cmp_int(l.offset, r.offset);
+}
+
+static inline struct bpos bkey_min(struct bpos l, struct bpos r)
+{
+ return bkey_lt(l, r) ? l : r;
+}
+
+static inline struct bpos bkey_max(struct bpos l, struct bpos r)
+{
+ return bkey_gt(l, r) ? l : r;
+}
+
+static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
+{
+ return bpos_eq(l.k->p, r.k->p) &&
+ bkey_bytes(l.k) == bkey_bytes(r.k) &&
+ !memcmp(l.v, r.v, bkey_val_bytes(l.k));
}
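/*
 * Editorial worked example, not part of this diff: the bpos_* helpers above
 * compare the full (inode, offset, snapshot) triple, while the bkey_* helpers
 * ignore the snapshot field.  With made-up values:
 *
 *	l = {inode 1, offset 10, snapshot 2}
 *	r = {inode 1, offset 10, snapshot 7}
 *
 *	bpos_eq(l, r) == false,  bpos_cmp(l, r) < 0
 *	bkey_eq(l, r) == true,   bkey_cmp(l, r) == 0
 */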
void bch2_bpos_swab(struct bpos *);
@@ -206,16 +200,16 @@ void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
{
- return (l.hi > r.hi) - (l.hi < r.hi) ?:
- (l.lo > r.lo) - (l.lo < r.lo);
+ return cmp_int(l.hi, r.hi) ?:
+ cmp_int(l.lo, r.lo);
}
#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 })
#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL })
-static __always_inline int bversion_zero(struct bversion v)
+static __always_inline bool bversion_zero(struct bversion v)
{
- return !bversion_cmp(v, ZERO_VERSION);
+ return bversion_cmp(v, ZERO_VERSION) == 0;
}
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -257,24 +251,46 @@ static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
format->bits_per_field[BKEY_FIELD_SNAPSHOT];
}
-static inline struct bpos bkey_successor(struct bpos p)
+static inline struct bpos bpos_successor(struct bpos p)
+{
+ if (!++p.snapshot &&
+ !++p.offset &&
+ !++p.inode)
+ BUG();
+
+ return p;
+}
+
+static inline struct bpos bpos_predecessor(struct bpos p)
+{
+ if (!p.snapshot-- &&
+ !p.offset-- &&
+ !p.inode--)
+ BUG();
+
+ return p;
+}
+
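/*
 * Editorial worked example, not part of this diff: bpos_successor() and
 * bpos_predecessor() treat a position as one wide integer, carrying between
 * fields (snapshot is the 32-bit, least-significant field):
 *
 *	bpos_successor({inode 1, offset 2, snapshot U32_MAX})
 *		= {inode 1, offset 3, snapshot 0}
 *	bpos_predecessor({inode 1, offset 0, snapshot 0})
 *		= {inode 0, offset U64_MAX, snapshot U32_MAX}
 */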
+static inline struct bpos bpos_nosnap_successor(struct bpos p)
{
- struct bpos ret = p;
+ p.snapshot = 0;
- if (!++ret.offset)
- BUG_ON(!++ret.inode);
+ if (!++p.offset &&
+ !++p.inode)
+ BUG();
- return ret;
+ return p;
}
-static inline struct bpos bkey_predecessor(struct bpos p)
+static inline struct bpos bpos_nosnap_predecessor(struct bpos p)
{
- struct bpos ret = p;
+ p.snapshot = 0;
- if (!ret.offset--)
- BUG_ON(!ret.inode--);
+ if (!p.offset-- &&
+ !p.inode--)
+ BUG();
- return ret;
+ return p;
}
static inline u64 bkey_start_offset(const struct bkey *k)
@@ -296,10 +312,13 @@ static inline struct bpos bkey_start_pos(const struct bkey *k)
static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
const struct bkey_packed *k)
{
- unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s;
+ return bkey_packed(k) ? format->key_u64s : BKEY_U64s;
+}
- EBUG_ON(k->u64s < ret);
- return ret;
+static inline bool bkeyp_u64s_valid(const struct bkey_format *f,
+ const struct bkey_packed *k)
+{
+ return ((unsigned) k->u64s - bkeyp_key_u64s(f, k) <= U8_MAX - BKEY_U64s);
}
static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
@@ -327,7 +346,7 @@ static inline void set_bkeyp_val_u64s(const struct bkey_format *format,
}
#define bkeyp_val(_format, _k) \
- ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k)))
+ ((struct bch_val *) ((u64 *) (_k)->_data + bkeyp_key_u64s(_format, _k)))
extern const struct bkey_format bch2_bkey_format_current;
@@ -367,6 +386,99 @@ void bch2_bkey_unpack(const struct btree *, struct bkey_i *,
bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *,
const struct bkey_format *);
+typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
+
+static inline void
+__bkey_unpack_key_format_checked(const struct btree *b,
+ struct bkey *dst,
+ const struct bkey_packed *src)
+{
+ if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) {
+ compiled_unpack_fn unpack_fn = b->aux_data;
+ unpack_fn(dst, src);
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+ bch2_expensive_debug_checks) {
+ struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
+
+ BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
+ }
+ } else {
+ *dst = __bch2_bkey_unpack_key(&b->format, src);
+ }
+}
+
+static inline struct bkey
+bkey_unpack_key_format_checked(const struct btree *b,
+ const struct bkey_packed *src)
+{
+ struct bkey dst;
+
+ __bkey_unpack_key_format_checked(b, &dst, src);
+ return dst;
+}
+
+static inline void __bkey_unpack_key(const struct btree *b,
+ struct bkey *dst,
+ const struct bkey_packed *src)
+{
+ if (likely(bkey_packed(src)))
+ __bkey_unpack_key_format_checked(b, dst, src);
+ else
+ *dst = *packed_to_bkey_c(src);
+}
+
+/**
+ * bkey_unpack_key -- unpack just the key, not the value
+ */
+static inline struct bkey bkey_unpack_key(const struct btree *b,
+ const struct bkey_packed *src)
+{
+ return likely(bkey_packed(src))
+ ? bkey_unpack_key_format_checked(b, src)
+ : *packed_to_bkey_c(src);
+}
+
+static inline struct bpos
+bkey_unpack_pos_format_checked(const struct btree *b,
+ const struct bkey_packed *src)
+{
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
+ return bkey_unpack_key_format_checked(b, src).p;
+#else
+ return __bkey_unpack_pos(&b->format, src);
+#endif
+}
+
+static inline struct bpos bkey_unpack_pos(const struct btree *b,
+ const struct bkey_packed *src)
+{
+ return likely(bkey_packed(src))
+ ? bkey_unpack_pos_format_checked(b, src)
+ : packed_to_bkey_c(src)->p;
+}
+
+/* Disassembled bkeys */
+
+static inline struct bkey_s_c bkey_disassemble(const struct btree *b,
+ const struct bkey_packed *k,
+ struct bkey *u)
+{
+ __bkey_unpack_key(b, u, k);
+
+ return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
+}
+
+/* non const version: */
+static inline struct bkey_s __bkey_disassemble(const struct btree *b,
+ struct bkey_packed *k,
+ struct bkey *u)
+{
+ __bkey_unpack_key(b, u, k);
+
+ return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
+}
+
static inline u64 bkey_field_max(const struct bkey_format *f,
enum bch_bkey_fields nr)
{
@@ -390,195 +502,10 @@ static inline int bch2_compile_bkey_format(const struct bkey_format *format,
static inline void bkey_reassemble(struct bkey_i *dst,
struct bkey_s_c src)
{
- BUG_ON(bkey_packed(src.k));
dst->k = *src.k;
- memcpy_u64s(&dst->v, src.v, bkey_val_u64s(src.k));
-}
-
-#define bkey_s_null ((struct bkey_s) { .k = NULL })
-#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
-
-#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
-#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
-
-static inline struct bkey_s bkey_to_s(struct bkey *k)
-{
- return (struct bkey_s) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
-{
- return (struct bkey_s_c) { .k = k, .v = NULL };
+ memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
}
-static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
-{
- return (struct bkey_s) { .k = &k->k, .v = &k->v };
-}
-
-static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
-{
- return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
-}
-
-/*
- * For a given type of value (e.g. struct bch_extent), generates the types for
- * bkey + bch_extent - inline, split, split const - and also all the conversion
- * functions, which also check that the value is of the correct type.
- *
- * We use anonymous unions for upcasting - e.g. converting from e.g. a
- * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
- * functions.
- */
-#define __BKEY_VAL_ACCESSORS(name, nr, _assert) \
-struct bkey_s_c_##name { \
- union { \
- struct { \
- const struct bkey *k; \
- const struct bch_##name *v; \
- }; \
- struct bkey_s_c s_c; \
- }; \
-}; \
- \
-struct bkey_s_##name { \
- union { \
- struct { \
- struct bkey *k; \
- struct bch_##name *v; \
- }; \
- struct bkey_s_c_##name c; \
- struct bkey_s s; \
- struct bkey_s_c s_c; \
- }; \
-}; \
- \
-static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
-{ \
- _assert(k->k.type, nr); \
- return container_of(&k->k, struct bkey_i_##name, k); \
-} \
- \
-static inline const struct bkey_i_##name * \
-bkey_i_to_##name##_c(const struct bkey_i *k) \
-{ \
- _assert(k->k.type, nr); \
- return container_of(&k->k, struct bkey_i_##name, k); \
-} \
- \
-static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
-{ \
- _assert(k.k->type, nr); \
- return (struct bkey_s_##name) { \
- .k = k.k, \
- .v = container_of(k.v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
-{ \
- _assert(k.k->type, nr); \
- return (struct bkey_s_c_##name) { \
- .k = k.k, \
- .v = container_of(k.v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
-{ \
- return (struct bkey_s_##name) { \
- .k = &k->k, \
- .v = &k->v, \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name \
-name##_i_to_s_c(const struct bkey_i_##name *k) \
-{ \
- return (struct bkey_s_c_##name) { \
- .k = &k->k, \
- .v = &k->v, \
- }; \
-} \
- \
-static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
-{ \
- _assert(k->k.type, nr); \
- return (struct bkey_s_##name) { \
- .k = &k->k, \
- .v = container_of(&k->v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name \
-bkey_i_to_s_c_##name(const struct bkey_i *k) \
-{ \
- _assert(k->k.type, nr); \
- return (struct bkey_s_c_##name) { \
- .k = &k->k, \
- .v = container_of(&k->v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bch_##name * \
-bkey_p_##name##_val(const struct bkey_format *f, \
- struct bkey_packed *k) \
-{ \
- return container_of(bkeyp_val(f, k), struct bch_##name, v); \
-} \
- \
-static inline const struct bch_##name * \
-bkey_p_c_##name##_val(const struct bkey_format *f, \
- const struct bkey_packed *k) \
-{ \
- return container_of(bkeyp_val(f, k), struct bch_##name, v); \
-} \
- \
-static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
-{ \
- struct bkey_i_##name *k = \
- container_of(&_k->k, struct bkey_i_##name, k); \
- \
- bkey_init(&k->k); \
- memset(&k->v, 0, sizeof(k->v)); \
- k->k.type = nr; \
- set_bkey_val_bytes(&k->k, sizeof(k->v)); \
- \
- return k; \
-}
-
-#define __BKEY_VAL_ASSERT(_type, _nr) EBUG_ON(_type != _nr)
-
-#define BKEY_VAL_ACCESSORS(name, _nr) \
- static inline void __bch_##name##_assert(u8 type, u8 nr) \
- { \
- EBUG_ON(type != _nr); \
- } \
- \
- __BKEY_VAL_ACCESSORS(name, _nr, __bch_##name##_assert)
-
-BKEY_VAL_ACCESSORS(cookie, KEY_TYPE_COOKIE);
-
-static inline void __bch2_extent_assert(u8 type, u8 nr)
-{
- EBUG_ON(type != BCH_EXTENT && type != BCH_EXTENT_CACHED);
-}
-
-__BKEY_VAL_ACCESSORS(extent, BCH_EXTENT, __bch2_extent_assert);
-BKEY_VAL_ACCESSORS(reservation, BCH_RESERVATION);
-
-BKEY_VAL_ACCESSORS(inode, BCH_INODE_FS);
-BKEY_VAL_ACCESSORS(inode_blockdev, BCH_INODE_BLOCKDEV);
-BKEY_VAL_ACCESSORS(inode_generation, BCH_INODE_GENERATION);
-
-BKEY_VAL_ACCESSORS(dirent, BCH_DIRENT);
-
-BKEY_VAL_ACCESSORS(xattr, BCH_XATTR);
-
-BKEY_VAL_ACCESSORS(alloc, BCH_ALLOC);
-
-BKEY_VAL_ACCESSORS(quota, BCH_QUOTA);
-
/* byte order helpers */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
@@ -605,7 +532,7 @@ static inline unsigned high_word_offset(const struct bkey_format *f)
#error edit for your odd byteorder.
#endif
-#define high_word(f, k) ((k)->_data + high_word_offset(f))
+#define high_word(f, k) ((u64 *) (k)->_data + high_word_offset(f))
#define next_word(p) nth_word(p, 1)
#define prev_word(p) nth_word(p, -1)
@@ -615,4 +542,64 @@ void bch2_bkey_pack_test(void);
static inline void bch2_bkey_pack_test(void) {}
#endif
+#define bkey_fields() \
+ x(BKEY_FIELD_INODE, p.inode) \
+ x(BKEY_FIELD_OFFSET, p.offset) \
+ x(BKEY_FIELD_SNAPSHOT, p.snapshot) \
+ x(BKEY_FIELD_SIZE, size) \
+ x(BKEY_FIELD_VERSION_HI, bversion.hi) \
+ x(BKEY_FIELD_VERSION_LO, bversion.lo)
+
+struct bkey_format_state {
+ u64 field_min[BKEY_NR_FIELDS];
+ u64 field_max[BKEY_NR_FIELDS];
+};
+
+void bch2_bkey_format_init(struct bkey_format_state *);
+
+static inline void __bkey_format_add(struct bkey_format_state *s, unsigned field, u64 v)
+{
+ s->field_min[field] = min(s->field_min[field], v);
+ s->field_max[field] = max(s->field_max[field], v);
+}
+
+/*
+ * Changes @format so that @k can be successfully packed with @format
+ */
+static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k)
+{
+#define x(id, field) __bkey_format_add(s, id, k->field);
+ bkey_fields()
+#undef x
+}
+
+void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
+
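/*
 * Editorial sketch, not part of this diff: typical use of the format-state
 * API declared above - accumulate every key that must be representable, then
 * emit a packed format.  `keys`/`nr` are hypothetical; real callers also feed
 * the node's min/max positions via bch2_bkey_format_add_pos().
 */
#include "bkey.h"

static struct bkey_format example_build_format(const struct bkey *keys, unsigned nr)
{
	struct bkey_format_state s;

	bch2_bkey_format_init(&s);
	for (unsigned i = 0; i < nr; i++)
		bch2_bkey_format_add_key(&s, &keys[i]);

	return bch2_bkey_format_done(&s);
}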
+static inline bool bch2_bkey_format_field_overflows(struct bkey_format *f, unsigned i)
+{
+ unsigned f_bits = f->bits_per_field[i];
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
+ u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+ if (f_bits > unpacked_bits)
+ return true;
+
+ if ((f_bits == unpacked_bits) && field_offset)
+ return true;
+
+ u64 f_mask = f_bits
+ ? ~((~0ULL << (f_bits - 1)) << 1)
+ : 0;
+
+ if (((field_offset + f_mask) & unpacked_mask) < field_offset)
+ return true;
+ return false;
+}
+
+int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *,
+ enum bch_validate_flags, struct printbuf *);
+void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *);
+
#endif /* _BCACHEFS_BKEY_H */
diff --git a/libbcachefs/bkey_buf.h b/libbcachefs/bkey_buf.h
new file mode 100644
index 00000000..a30c4ae8
--- /dev/null
+++ b/libbcachefs/bkey_buf.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_BUF_H
+#define _BCACHEFS_BKEY_BUF_H
+
+#include "bcachefs.h"
+#include "bkey.h"
+
+struct bkey_buf {
+ struct bkey_i *k;
+ u64 onstack[12];
+};
+
+static inline void bch2_bkey_buf_realloc(struct bkey_buf *s,
+ struct bch_fs *c, unsigned u64s)
+{
+ if (s->k == (void *) s->onstack &&
+ u64s > ARRAY_SIZE(s->onstack)) {
+ s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
+ memcpy(s->k, s->onstack, sizeof(s->onstack));
+ }
+}
+
+static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s,
+ struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ bch2_bkey_buf_realloc(s, c, k.k->u64s);
+ bkey_reassemble(s->k, k);
+}
+
+static inline void bch2_bkey_buf_copy(struct bkey_buf *s,
+ struct bch_fs *c,
+ struct bkey_i *src)
+{
+ bch2_bkey_buf_realloc(s, c, src->k.u64s);
+ bkey_copy(s->k, src);
+}
+
+static inline void bch2_bkey_buf_unpack(struct bkey_buf *s,
+ struct bch_fs *c,
+ struct btree *b,
+ struct bkey_packed *src)
+{
+ bch2_bkey_buf_realloc(s, c, BKEY_U64s +
+ bkeyp_val_u64s(&b->format, src));
+ bch2_bkey_unpack(b, s->k, src);
+}
+
+static inline void bch2_bkey_buf_init(struct bkey_buf *s)
+{
+ s->k = (void *) s->onstack;
+}
+
+static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c)
+{
+ if (s->k != (void *) s->onstack)
+ mempool_free(s->k, &c->large_bkey_pool);
+ s->k = NULL;
+}
+
+#endif /* _BCACHEFS_BKEY_BUF_H */
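/*
 * Editorial sketch, not part of this diff: the intended bkey_buf usage
 * pattern - keys that fit in the 12-u64 onstack buffer never touch the
 * mempool, larger keys fall back to c->large_bkey_pool and are freed on exit.
 * `c` and `k` are assumed to come from the surrounding (kernel) context.
 */
#include "bkey_buf.h"

static void bkey_buf_usage_example(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_buf tmp;

	bch2_bkey_buf_init(&tmp);		/* tmp.k points at tmp.onstack */
	bch2_bkey_buf_reassemble(&tmp, c, k);	/* copy k, reallocating if too big */

	/* ... use tmp.k, e.g. across an iterator re-traverse ... */

	bch2_bkey_buf_exit(&tmp, c);		/* frees the mempool allocation, if any */
}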
diff --git a/libbcachefs/bkey_cmp.h b/libbcachefs/bkey_cmp.h
new file mode 100644
index 00000000..5f42a6e6
--- /dev/null
+++ b/libbcachefs/bkey_cmp.h
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_CMP_H
+#define _BCACHEFS_BKEY_CMP_H
+
+#include "bkey.h"
+
+#ifdef CONFIG_X86_64
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
+ unsigned nr_key_bits)
+{
+ long d0, d1, d2, d3;
+ int cmp;
+
+ /* we shouldn't need asm for this, but gcc is being retarded: */
+
+ asm(".intel_syntax noprefix;"
+ "xor eax, eax;"
+ "xor edx, edx;"
+ "1:;"
+ "mov r8, [rdi];"
+ "mov r9, [rsi];"
+ "sub ecx, 64;"
+ "jl 2f;"
+
+ "cmp r8, r9;"
+ "jnz 3f;"
+
+ "lea rdi, [rdi - 8];"
+ "lea rsi, [rsi - 8];"
+ "jmp 1b;"
+
+ "2:;"
+ "not ecx;"
+ "shr r8, 1;"
+ "shr r9, 1;"
+ "shr r8, cl;"
+ "shr r9, cl;"
+ "cmp r8, r9;"
+
+ "3:\n"
+ "seta al;"
+ "setb dl;"
+ "sub eax, edx;"
+ ".att_syntax prefix;"
+ : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp)
+ : "0" (l), "1" (r), "3" (nr_key_bits)
+ : "r8", "r9", "cc", "memory");
+
+ return cmp;
+}
+#else
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
+ unsigned nr_key_bits)
+{
+ u64 l_v, r_v;
+
+ if (!nr_key_bits)
+ return 0;
+
+ /* for big endian, skip past header */
+ nr_key_bits += high_bit_offset;
+ l_v = *l & (~0ULL >> high_bit_offset);
+ r_v = *r & (~0ULL >> high_bit_offset);
+
+ while (1) {
+ if (nr_key_bits < 64) {
+ l_v >>= 64 - nr_key_bits;
+ r_v >>= 64 - nr_key_bits;
+ nr_key_bits = 0;
+ } else {
+ nr_key_bits -= 64;
+ }
+
+ if (!nr_key_bits || l_v != r_v)
+ break;
+
+ l = next_word(l);
+ r = next_word(r);
+
+ l_v = *l;
+ r_v = *r;
+ }
+
+ return cmp_int(l_v, r_v);
+}
+#endif
+
+static inline __pure __flatten
+int __bch2_bkey_cmp_packed_format_checked_inlined(const struct bkey_packed *l,
+ const struct bkey_packed *r,
+ const struct btree *b)
+{
+ const struct bkey_format *f = &b->format;
+ int ret;
+
+ EBUG_ON(!bkey_packed(l) || !bkey_packed(r));
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
+
+ ret = __bkey_cmp_bits(high_word(f, l),
+ high_word(f, r),
+ b->nr_key_bits);
+
+ EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l),
+ bkey_unpack_pos(b, r)));
+ return ret;
+}
+
+static inline __pure __flatten
+int bch2_bkey_cmp_packed_inlined(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
+{
+ struct bkey unpacked;
+
+ if (likely(bkey_packed(l) && bkey_packed(r)))
+ return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
+
+ if (bkey_packed(l)) {
+ __bkey_unpack_key_format_checked(b, &unpacked, l);
+ l = (void *) &unpacked;
+ } else if (bkey_packed(r)) {
+ __bkey_unpack_key_format_checked(b, &unpacked, r);
+ r = (void *) &unpacked;
+ }
+
+ return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
+}
+
+#endif /* _BCACHEFS_BKEY_CMP_H */
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index e4f62f90..15c93576 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -1,191 +1,481 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "backpointers.h"
#include "bkey_methods.h"
+#include "btree_cache.h"
#include "btree_types.h"
-#include "alloc.h"
+#include "alloc_background.h"
#include "dirent.h"
+#include "disk_accounting.h"
+#include "ec.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
+#include "io_misc.h"
+#include "lru.h"
#include "quota.h"
+#include "reflink.h"
+#include "snapshot.h"
+#include "subvolume.h"
#include "xattr.h"
-const struct bkey_ops bch2_bkey_ops[] = {
- [BKEY_TYPE_EXTENTS] = bch2_bkey_extent_ops,
- [BKEY_TYPE_INODES] = bch2_bkey_inode_ops,
- [BKEY_TYPE_DIRENTS] = bch2_bkey_dirent_ops,
- [BKEY_TYPE_XATTRS] = bch2_bkey_xattr_ops,
- [BKEY_TYPE_ALLOC] = bch2_bkey_alloc_ops,
- [BKEY_TYPE_QUOTAS] = bch2_bkey_quota_ops,
- [BKEY_TYPE_BTREE] = bch2_bkey_btree_ops,
+const char * const bch2_bkey_types[] = {
+#define x(name, nr) #name,
+ BCH_BKEY_TYPES()
+#undef x
+ NULL
};
-const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
- struct bkey_s_c k)
+static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
- const struct bkey_ops *ops = &bch2_bkey_ops[type];
+ return 0;
+}
- switch (k.k->type) {
- case KEY_TYPE_DELETED:
- case KEY_TYPE_DISCARD:
- return NULL;
+#define bch2_bkey_ops_deleted ((struct bkey_ops) { \
+ .key_validate = deleted_key_validate, \
+})
- case KEY_TYPE_ERROR:
- return bkey_val_bytes(k.k) != 0
- ? "value size should be zero"
- : NULL;
+#define bch2_bkey_ops_whiteout ((struct bkey_ops) { \
+ .key_validate = deleted_key_validate, \
+})
- case KEY_TYPE_COOKIE:
- return bkey_val_bytes(k.k) != sizeof(struct bch_cookie)
- ? "incorrect value size"
- : NULL;
+static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_bytes(k.k),
+ c, bkey_val_size_nonzero,
+ "incorrect value size (%zu != 0)",
+ bkey_val_bytes(k.k));
+fsck_err:
+ return ret;
+}
- default:
- if (k.k->type < KEY_TYPE_GENERIC_NR)
- return "invalid type";
+#define bch2_bkey_ops_error ((struct bkey_ops) { \
+ .key_validate = empty_val_key_validate, \
+})
- return ops->key_invalid(c, k);
- }
+static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ return 0;
}
-const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
- struct bkey_s_c k)
+static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
{
- const struct bkey_ops *ops = &bch2_bkey_ops[type];
+ struct bkey_s_c_cookie ck = bkey_s_c_to_cookie(k);
- if (k.k->u64s < BKEY_U64s)
- return "u64s too small";
+ prt_printf(out, "%llu", le64_to_cpu(ck.v->cookie));
+}
- if (!ops->is_extents) {
- if (k.k->size)
- return "nonzero size field";
- } else {
- if ((k.k->size == 0) != bkey_deleted(k.k))
- return "bad size field";
- }
+#define bch2_bkey_ops_cookie ((struct bkey_ops) { \
+ .key_validate = key_type_cookie_validate, \
+ .val_to_text = key_type_cookie_to_text, \
+ .min_val_size = 8, \
+})
- if (ops->is_extents &&
- !k.k->size &&
- !bkey_deleted(k.k))
- return "zero size field";
+#define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\
+ .key_validate = empty_val_key_validate, \
+})
- if (k.k->p.snapshot)
- return "nonzero snapshot";
+static int key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ return 0;
+}
- if (type != BKEY_TYPE_BTREE &&
- !bkey_cmp(k.k->p, POS_MAX))
- return "POS_MAX key";
+static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
+ unsigned datalen = bkey_inline_data_bytes(k.k);
- return NULL;
+ prt_printf(out, "datalen %u: %*phN",
+ datalen, min(datalen, 32U), d.v->data);
}
-const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
- struct bkey_s_c k)
+#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \
+ .key_validate = key_type_inline_data_validate, \
+ .val_to_text = key_type_inline_data_to_text, \
+})
+
+static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
{
- return __bch2_bkey_invalid(c, type, k) ?:
- bch2_bkey_val_invalid(c, type, k);
+ bch2_key_resize(l.k, l.k->size + r.k->size);
+ return true;
}
-const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
+#define bch2_bkey_ops_set ((struct bkey_ops) { \
+ .key_validate = empty_val_key_validate, \
+ .key_merge = key_type_set_merge, \
+})
+
+const struct bkey_ops bch2_bkey_ops[] = {
+#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
+ BCH_BKEY_TYPES()
+#undef x
+};
+
+const struct bkey_ops bch2_bkey_null_ops = {
+};
+
+int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
- if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
- return "key before start of btree node";
+ if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
+ return 0;
+
+ const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size,
+ c, bkey_val_size_too_small,
+ "bad val size (%zu < %u)",
+ bkey_val_bytes(k.k), ops->min_val_size);
- if (bkey_cmp(k.k->p, b->data->max_key) > 0)
- return "key past end of btree node";
+ if (!ops->key_validate)
+ return 0;
- return NULL;
+ ret = ops->key_validate(c, k, from);
+fsck_err:
+ return ret;
}
-void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
-{
- enum bkey_type type = btree_node_type(b);
- const struct bkey_ops *ops = &bch2_bkey_ops[type];
- const char *invalid;
+static u64 bch2_key_types_allowed[] = {
+ [BKEY_TYPE_btree] =
+ BIT_ULL(KEY_TYPE_deleted)|
+ BIT_ULL(KEY_TYPE_btree_ptr)|
+ BIT_ULL(KEY_TYPE_btree_ptr_v2),
+#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys,
+ BCH_BTREE_IDS()
+#undef x
+};
- BUG_ON(!k.k->u64s);
+const char *bch2_btree_node_type_str(enum btree_node_type type)
+{
+ return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1);
+}
- invalid = bch2_bkey_invalid(c, type, k) ?:
- bch2_bkey_in_btree_node(b, k);
- if (invalid) {
- char buf[160];
+int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ enum btree_node_type type = __btree_node_type(from.level, from.btree);
+
+ if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
+ return 0;
+
+ int ret = 0;
+
+ bkey_fsck_err_on(k.k->u64s < BKEY_U64s,
+ c, bkey_u64s_too_small,
+ "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
+
+ if (type >= BKEY_TYPE_NR)
+ return 0;
+
+ bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX &&
+ (type == BKEY_TYPE_btree || (from.flags & BCH_VALIDATE_commit)) &&
+ !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)),
+ c, bkey_invalid_type_for_btree,
+ "invalid key type for btree %s (%s)",
+ bch2_btree_node_type_str(type),
+ k.k->type < KEY_TYPE_MAX
+ ? bch2_bkey_types[k.k->type]
+ : "(unknown)");
+
+ if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
+ bkey_fsck_err_on(k.k->size == 0,
+ c, bkey_extent_size_zero,
+ "size == 0");
+
+ bkey_fsck_err_on(k.k->size > k.k->p.offset,
+ c, bkey_extent_size_greater_than_offset,
+ "size greater than offset (%u > %llu)",
+ k.k->size, k.k->p.offset);
+ } else {
+ bkey_fsck_err_on(k.k->size,
+ c, bkey_size_nonzero,
+ "size != 0");
+ }
- bch2_bkey_val_to_text(c, type, buf, sizeof(buf), k);
- bch2_fs_bug(c, "invalid bkey %s: %s", buf, invalid);
- return;
+ if (type != BKEY_TYPE_btree) {
+ enum btree_id btree = type - 1;
+
+ if (btree_type_has_snapshots(btree)) {
+ bkey_fsck_err_on(!k.k->p.snapshot,
+ c, bkey_snapshot_zero,
+ "snapshot == 0");
+ } else if (!btree_type_has_snapshot_field(btree)) {
+ bkey_fsck_err_on(k.k->p.snapshot,
+ c, bkey_snapshot_nonzero,
+ "nonzero snapshot");
+ } else {
+ /*
+ * btree uses snapshot field but it's not required to be
+ * nonzero
+ */
+ }
+
+ bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX),
+ c, bkey_at_pos_max,
+ "key at POS_MAX");
}
+fsck_err:
+ return ret;
+}
- if (k.k->type >= KEY_TYPE_GENERIC_NR &&
- ops->key_debugcheck)
- ops->key_debugcheck(c, b, k);
+int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ return __bch2_bkey_validate(c, k, from) ?:
+ bch2_bkey_val_validate(c, k, from);
}
-#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b,
+ struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key),
+ c, bkey_before_start_of_btree_node,
+ "key before start of btree node");
+
+ bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key),
+ c, bkey_after_end_of_btree_node,
+ "key past end of btree node");
+fsck_err:
+ return ret;
+}
+
+void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
+{
+ if (bpos_eq(pos, POS_MIN))
+ prt_printf(out, "POS_MIN");
+ else if (bpos_eq(pos, POS_MAX))
+ prt_printf(out, "POS_MAX");
+ else if (bpos_eq(pos, SPOS_MAX))
+ prt_printf(out, "SPOS_MAX");
+ else {
+ if (pos.inode == U64_MAX)
+ prt_printf(out, "U64_MAX");
+ else
+ prt_printf(out, "%llu", pos.inode);
+ prt_printf(out, ":");
+ if (pos.offset == U64_MAX)
+ prt_printf(out, "U64_MAX");
+ else
+ prt_printf(out, "%llu", pos.offset);
+ prt_printf(out, ":");
+ if (pos.snapshot == U32_MAX)
+ prt_printf(out, "U32_MAX");
+ else
+ prt_printf(out, "%u", pos.snapshot);
+ }
+}
-int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
+void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
{
- char *out = buf, *end = buf + size;
+ if (k) {
+ prt_printf(out, "u64s %u type ", k->u64s);
+
+ if (k->type < KEY_TYPE_MAX)
+ prt_printf(out, "%s ", bch2_bkey_types[k->type]);
+ else
+ prt_printf(out, "%u ", k->type);
- p("u64s %u type %u ", k->u64s, k->type);
+ bch2_bpos_to_text(out, k->p);
- if (bkey_cmp(k->p, POS_MAX))
- p("%llu:%llu", k->p.inode, k->p.offset);
- else
- p("POS_MAX");
+ prt_printf(out, " len %u ver %llu", k->size, k->bversion.lo);
+ } else {
+ prt_printf(out, "(null)");
+ }
+}
- p(" snap %u len %u ver %llu", k->p.snapshot, k->size, k->version.lo);
+void bch2_val_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
- return out - buf;
+ if (likely(ops->val_to_text))
+ ops->val_to_text(out, c, k);
}
-int bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
- char *buf, size_t size, struct bkey_s_c k)
+void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
{
- const struct bkey_ops *ops = &bch2_bkey_ops[type];
- char *out = buf, *end = buf + size;
+ bch2_bkey_to_text(out, k.k);
- switch (k.k->type) {
- case KEY_TYPE_DELETED:
- p(" deleted");
- break;
- case KEY_TYPE_DISCARD:
- p(" discard");
- break;
- case KEY_TYPE_ERROR:
- p(" error");
- break;
- case KEY_TYPE_COOKIE:
- p(" cookie");
- break;
- default:
- if (k.k->type >= KEY_TYPE_GENERIC_NR && ops->val_to_text)
- ops->val_to_text(c, buf, size, k);
- break;
+ if (bkey_val_bytes(k.k)) {
+ prt_printf(out, ": ");
+ bch2_val_to_text(out, c, k);
}
+}
- return out - buf;
+void bch2_bkey_swab_val(struct bkey_s k)
+{
+ const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
+
+ if (ops->swab)
+ ops->swab(k);
}
-int bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
- char *buf, size_t size, struct bkey_s_c k)
+bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k)
{
- char *out = buf, *end = buf + size;
+ const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
- out += bch2_bkey_to_text(out, end - out, k.k);
- out += scnprintf(out, end - out, ": ");
- out += bch2_val_to_text(c, type, out, end - out, k);
+ return ops->key_normalize
+ ? ops->key_normalize(c, k)
+ : false;
+}
- return out - buf;
+bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
+{
+ const struct bkey_ops *ops = bch2_bkey_type_ops(l.k->type);
+
+ return ops->key_merge &&
+ bch2_bkey_maybe_mergable(l.k, r.k) &&
+ (u64) l.k->size + r.k->size <= KEY_SIZE_MAX &&
+ !bch2_key_merging_disabled &&
+ ops->key_merge(c, l, r);
}
-void bch2_bkey_swab(enum bkey_type type,
- const struct bkey_format *f,
- struct bkey_packed *k)
+static const struct old_bkey_type {
+ u8 btree_node_type;
+ u8 old;
+ u8 new;
+} bkey_renumber_table[] = {
+ {BKEY_TYPE_btree, 128, KEY_TYPE_btree_ptr },
+ {BKEY_TYPE_extents, 128, KEY_TYPE_extent },
+ {BKEY_TYPE_extents, 129, KEY_TYPE_extent },
+ {BKEY_TYPE_extents, 130, KEY_TYPE_reservation },
+ {BKEY_TYPE_inodes, 128, KEY_TYPE_inode },
+ {BKEY_TYPE_inodes, 130, KEY_TYPE_inode_generation },
+ {BKEY_TYPE_dirents, 128, KEY_TYPE_dirent },
+ {BKEY_TYPE_dirents, 129, KEY_TYPE_hash_whiteout },
+ {BKEY_TYPE_xattrs, 128, KEY_TYPE_xattr },
+ {BKEY_TYPE_xattrs, 129, KEY_TYPE_hash_whiteout },
+ {BKEY_TYPE_alloc, 128, KEY_TYPE_alloc },
+ {BKEY_TYPE_quotas, 128, KEY_TYPE_quota },
+};
+
+void bch2_bkey_renumber(enum btree_node_type btree_node_type,
+ struct bkey_packed *k,
+ int write)
{
- const struct bkey_ops *ops = &bch2_bkey_ops[type];
+ const struct old_bkey_type *i;
+
+ for (i = bkey_renumber_table;
+ i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table);
+ i++)
+ if (btree_node_type == i->btree_node_type &&
+ k->type == (write ? i->new : i->old)) {
+ k->type = write ? i->old : i->new;
+ break;
+ }
+}
- bch2_bkey_swab_key(f, k);
+void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
+ unsigned version, unsigned big_endian,
+ int write,
+ struct bkey_format *f,
+ struct bkey_packed *k)
+{
+ const struct bkey_ops *ops;
+ struct bkey uk;
+ unsigned nr_compat = 5;
+ int i;
+
+ /*
+ * Do these operations in reverse order in the write path:
+ */
+
+ for (i = 0; i < nr_compat; i++)
+ switch (!write ? i : nr_compat - 1 - i) {
+ case 0:
+ if (big_endian != CPU_BIG_ENDIAN) {
+ bch2_bkey_swab_key(f, k);
+ } else if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
+ bch2_bkey_swab_key(f, k);
+ bch2_bkey_swab_key(f, k);
+ }
+ break;
+ case 1:
+ if (version < bcachefs_metadata_version_bkey_renumber)
+ bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write);
+ break;
+ case 2:
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_id == BTREE_ID_inodes) {
+ if (!bkey_packed(k)) {
+ struct bkey_i *u = packed_to_bkey(k);
+
+ swap(u->k.p.inode, u->k.p.offset);
+ } else if (f->bits_per_field[BKEY_FIELD_INODE] &&
+ f->bits_per_field[BKEY_FIELD_OFFSET]) {
+ struct bkey_format tmp = *f, *in = f, *out = &tmp;
+
+ swap(tmp.bits_per_field[BKEY_FIELD_INODE],
+ tmp.bits_per_field[BKEY_FIELD_OFFSET]);
+ swap(tmp.field_offset[BKEY_FIELD_INODE],
+ tmp.field_offset[BKEY_FIELD_OFFSET]);
+
+ if (!write)
+ swap(in, out);
+
+ uk = __bch2_bkey_unpack_key(in, k);
+ swap(uk.p.inode, uk.p.offset);
+ BUG_ON(!bch2_bkey_pack_key(k, &uk, out));
+ }
+ }
+ break;
+ case 3:
+ if (version < bcachefs_metadata_version_snapshot &&
+ (level || btree_type_has_snapshots(btree_id))) {
+ struct bkey_i *u = packed_to_bkey(k);
+
+ if (u) {
+ u->k.p.snapshot = write
+ ? 0 : U32_MAX;
+ } else {
+ u64 min_packed = le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]);
+ u64 max_packed = min_packed +
+ ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
+
+ uk = __bch2_bkey_unpack_key(f, k);
+ uk.p.snapshot = write
+ ? min_packed : min_t(u64, U32_MAX, max_packed);
+
+ BUG_ON(!bch2_bkey_pack_key(k, &uk, f));
+ }
+ }
- if (ops->swab)
- ops->swab(f, k);
+ break;
+ case 4: {
+ struct bkey_s u;
+
+ if (!bkey_packed(k)) {
+ u = bkey_i_to_s(packed_to_bkey(k));
+ } else {
+ uk = __bch2_bkey_unpack_key(f, k);
+ u.k = &uk;
+ u.v = bkeyp_val(f, k);
+ }
+
+ if (big_endian != CPU_BIG_ENDIAN)
+ bch2_bkey_swab_val(u);
+
+ ops = bch2_bkey_type_ops(k->type);
+
+ if (ops->compat)
+ ops->compat(btree_id, version, big_endian, write, u);
+ break;
+ }
+ default:
+ BUG();
+ }
}
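
To illustrate: the rewritten bkey_methods.c replaces the old switch-based checks with a per-type ops table — each key type contributes a designated-initializer bch2_bkey_ops entry (key_validate, val_to_text, min_val_size), and bch2_bkey_type_ops() falls back to an empty null-ops entry for unknown types. A standalone sketch of that dispatch pattern (not bcachefs code; every name below is made up for illustration):

#include <stdio.h>
#include <stddef.h>

enum key_type { KEY_TYPE_deleted, KEY_TYPE_cookie, KEY_TYPE_MAX };

struct key { enum key_type type; unsigned val_bytes; };

struct key_ops {
	int  (*validate)(const struct key *);
	void (*to_text)(const struct key *);
	unsigned min_val_size;
};

static int cookie_validate(const struct key *k)
{
	return k->val_bytes >= 8 ? 0 : -22;	/* -EINVAL */
}

static void cookie_to_text(const struct key *k)
{
	printf("cookie, %u value bytes\n", k->val_bytes);
}

static const struct key_ops key_ops[] = {
	[KEY_TYPE_deleted]	= { 0 },
	[KEY_TYPE_cookie]	= { cookie_validate, cookie_to_text, 8 },
};

static const struct key_ops null_ops;

static const struct key_ops *type_ops(enum key_type t)
{
	/* unknown types get empty ops instead of an out-of-bounds lookup */
	return t < KEY_TYPE_MAX ? &key_ops[t] : &null_ops;
}

int main(void)
{
	struct key k = { KEY_TYPE_cookie, 8 };
	const struct key_ops *ops = type_ops(k.type);

	if (ops->validate && !ops->validate(&k) && ops->to_text)
		ops->to_text(&k);
	return 0;
}
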
diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h
index 9e2c90d5..bf34111c 100644
--- a/libbcachefs/bkey_methods.h
+++ b/libbcachefs/bkey_methods.h
@@ -1,86 +1,139 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BKEY_METHODS_H
#define _BCACHEFS_BKEY_METHODS_H
#include "bkey.h"
-#define DEF_BTREE_ID(kwd, val, name) BKEY_TYPE_##kwd = val,
-
-enum bkey_type {
- DEFINE_BCH_BTREE_IDS()
- BKEY_TYPE_BTREE,
+struct bch_fs;
+struct btree;
+struct btree_trans;
+struct bkey;
+enum btree_node_type;
+
+extern const char * const bch2_bkey_types[];
+extern const struct bkey_ops bch2_bkey_null_ops;
+
+/*
+ * key_validate: checks validity of @k, returns 0 if good or -EINVAL if bad. If
+ * invalid, the entire key will be dropped.
+ *
+ * Validation context (btree, level, and BCH_VALIDATE_* flags) is passed via
+ * @from; stricter checks can be enabled for commits and writes.
+ */
+struct bkey_ops {
+ int (*key_validate)(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from);
+ void (*val_to_text)(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+ void (*swab)(struct bkey_s);
+ bool (*key_normalize)(struct bch_fs *, struct bkey_s);
+ bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+ int (*trigger)(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
+ void (*compat)(enum btree_id id, unsigned version,
+ unsigned big_endian, int write,
+ struct bkey_s);
+
+ /* Size of value type when first created: */
+ unsigned min_val_size;
};
-#undef DEF_BTREE_ID
+extern const struct bkey_ops bch2_bkey_ops[];
-/* Type of a key in btree @id at level @level: */
-static inline enum bkey_type bkey_type(unsigned level, enum btree_id id)
+static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type)
{
- return level ? BKEY_TYPE_BTREE : (enum bkey_type) id;
+ return likely(type < KEY_TYPE_MAX)
+ ? &bch2_bkey_ops[type]
+ : &bch2_bkey_null_ops;
}
-static inline bool btree_type_has_ptrs(enum bkey_type type)
+int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c,
+ struct bkey_validate_context from);
+
+void bch2_bpos_to_text(struct printbuf *, struct bpos);
+void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
+void bch2_val_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+
+void bch2_bkey_swab_val(struct bkey_s);
+
+bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s);
+
+static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r)
{
- switch (type) {
- case BKEY_TYPE_BTREE:
- case BKEY_TYPE_EXTENTS:
- return true;
- default:
- return false;
- }
+ return l->type == r->type &&
+ !bversion_cmp(l->bversion, r->bversion) &&
+ bpos_eq(l->p, bkey_start_pos(r));
}
-struct bch_fs;
-struct btree;
-struct bkey;
+bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+
+static inline int bch2_key_trigger(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ enum btree_iter_update_trigger_flags flags)
+{
+ const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type);
-enum merge_result {
- BCH_MERGE_NOMERGE,
+ return ops->trigger
+ ? ops->trigger(trans, btree, level, old, new, flags)
+ : 0;
+}
- /*
- * The keys were mergeable, but would have overflowed size - so instead
- * l was changed to the maximum size, and both keys were modified:
- */
- BCH_MERGE_PARTIAL,
- BCH_MERGE_MERGE,
-};
+static inline int bch2_key_trigger_old(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct bkey_i deleted;
-typedef bool (*key_filter_fn)(struct bch_fs *, struct btree *,
- struct bkey_s);
-typedef enum merge_result (*key_merge_fn)(struct bch_fs *,
- struct btree *,
- struct bkey_i *, struct bkey_i *);
+ bkey_init(&deleted.k);
+ deleted.k.p = old.k->p;
-struct bkey_ops {
- /* Returns reason for being invalid if invalid, else NULL: */
- const char * (*key_invalid)(const struct bch_fs *,
- struct bkey_s_c);
- void (*key_debugcheck)(struct bch_fs *, struct btree *,
- struct bkey_s_c);
- void (*val_to_text)(struct bch_fs *, char *,
- size_t, struct bkey_s_c);
- void (*swab)(const struct bkey_format *, struct bkey_packed *);
- key_filter_fn key_normalize;
- key_merge_fn key_merge;
- bool is_extents;
-};
+ return bch2_key_trigger(trans, btree_id, level, old, bkey_i_to_s(&deleted),
+ BTREE_TRIGGER_overwrite|flags);
+}
-const char *bch2_bkey_val_invalid(struct bch_fs *, enum bkey_type,
- struct bkey_s_c);
-const char *__bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
-const char *bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
-const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
+static inline int bch2_key_trigger_new(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s new,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct bkey_i deleted;
+
+ bkey_init(&deleted.k);
+ deleted.k.p = new.k->p;
-void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
+ return bch2_key_trigger(trans, btree_id, level, bkey_i_to_s_c(&deleted), new,
+ BTREE_TRIGGER_insert|flags);
+}
-int bch2_bkey_to_text(char *, size_t, const struct bkey *);
-int bch2_val_to_text(struct bch_fs *, enum bkey_type,
- char *, size_t, struct bkey_s_c);
-int bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
- char *, size_t, struct bkey_s_c);
+void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
-void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
- struct bkey_packed *);
+void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned,
+ int, struct bkey_format *, struct bkey_packed *);
-extern const struct bkey_ops bch2_bkey_ops[];
+static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id,
+ unsigned version, unsigned big_endian,
+ int write,
+ struct bkey_format *f,
+ struct bkey_packed *k)
+{
+ if (version < bcachefs_metadata_version_current ||
+ big_endian != CPU_BIG_ENDIAN ||
+ IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
+ __bch2_bkey_compat(level, btree_id, version,
+ big_endian, write, f, k);
+
+}
#endif /* _BCACHEFS_BKEY_METHODS_H */
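
To illustrate: bch2_key_trigger_old() and bch2_key_trigger_new() above let callers run the single per-type trigger hook when only one side of an update exists, by synthesizing a deleted key at the same position for the missing side. A standalone sketch of that wrapper idea (not bcachefs code; types and flag values are illustrative only):

#include <stdio.h>
#include <stdbool.h>

struct key { unsigned long long pos; bool deleted; };

enum trigger_flags { TRIGGER_insert = 1, TRIGGER_overwrite = 2 };

/* the one hook: always sees an (old, new) pair */
static int trigger(struct key old, struct key new, unsigned flags)
{
	printf("old %llu%s -> new %llu%s (flags %u)\n",
	       old.pos, old.deleted ? " (deleted)" : "",
	       new.pos, new.deleted ? " (deleted)" : "", flags);
	return 0;
}

static int trigger_old(struct key old, unsigned flags)
{
	struct key deleted = { .pos = old.pos, .deleted = true };

	return trigger(old, deleted, TRIGGER_overwrite|flags);
}

static int trigger_new(struct key new, unsigned flags)
{
	struct key deleted = { .pos = new.pos, .deleted = true };

	return trigger(deleted, new, TRIGGER_insert|flags);
}

int main(void)
{
	struct key k = { .pos = 42 };

	trigger_new(k, 0);
	trigger_old(k, 0);
	return 0;
}
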
diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c
new file mode 100644
index 00000000..4536eb50
--- /dev/null
+++ b/libbcachefs/bkey_sort.c
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "bkey_cmp.h"
+#include "bkey_sort.h"
+#include "bset.h"
+#include "extents.h"
+
+typedef int (*sort_cmp_fn)(const struct btree *,
+ const struct bkey_packed *,
+ const struct bkey_packed *);
+
+static inline bool sort_iter_end(struct sort_iter *iter)
+{
+ return !iter->used;
+}
+
+static inline void sort_iter_sift(struct sort_iter *iter, unsigned from,
+ sort_cmp_fn cmp)
+{
+ unsigned i;
+
+ for (i = from;
+ i + 1 < iter->used &&
+ cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
+ i++)
+ swap(iter->data[i], iter->data[i + 1]);
+}
+
+static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+ unsigned i = iter->used;
+
+ while (i--)
+ sort_iter_sift(iter, i, cmp);
+}
+
+static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
+{
+ return !sort_iter_end(iter) ? iter->data->k : NULL;
+}
+
+static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+ struct sort_iter_set *i = iter->data;
+
+ BUG_ON(!iter->used);
+
+ i->k = bkey_p_next(i->k);
+
+ BUG_ON(i->k > i->end);
+
+ if (i->k == i->end)
+ array_remove_item(iter->data, iter->used, 0);
+ else
+ sort_iter_sift(iter, 0, cmp);
+}
+
+static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
+ sort_cmp_fn cmp)
+{
+ struct bkey_packed *ret = sort_iter_peek(iter);
+
+ if (ret)
+ sort_iter_advance(iter, cmp);
+
+ return ret;
+}
+
+/*
+ * If keys compare equal, compare by pointer order:
+ */
+static inline int key_sort_fix_overlapping_cmp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
+{
+ return bch2_bkey_cmp_packed(b, l, r) ?:
+ cmp_int((unsigned long) l, (unsigned long) r);
+}
+
+static inline bool should_drop_next_key(struct sort_iter *iter)
+{
+ /*
+ * key_sort_fix_overlapping_cmp() ensures that when keys compare equal the
+ * older key comes first; so if data[0].k compares equal to data[1].k then
+ * data[0].k is older and should be dropped.
+ */
+ return iter->used >= 2 &&
+ !bch2_bkey_cmp_packed(iter->b,
+ iter->data[0].k,
+ iter->data[1].k);
+}
+
+struct btree_nr_keys
+bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
+ struct sort_iter *iter)
+{
+ struct bkey_packed *out = dst->start;
+ struct bkey_packed *k;
+ struct btree_nr_keys nr;
+
+ memset(&nr, 0, sizeof(nr));
+
+ sort_iter_sort(iter, key_sort_fix_overlapping_cmp);
+
+ while ((k = sort_iter_peek(iter))) {
+ if (!bkey_deleted(k) &&
+ !should_drop_next_key(iter)) {
+ bkey_p_copy(out, k);
+ btree_keys_account_key_add(&nr, 0, out);
+ out = bkey_p_next(out);
+ }
+
+ sort_iter_advance(iter, key_sort_fix_overlapping_cmp);
+ }
+
+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+ return nr;
+}
+
+/* Sort + repack in a new format: */
+struct btree_nr_keys
+bch2_sort_repack(struct bset *dst, struct btree *src,
+ struct btree_node_iter *src_iter,
+ struct bkey_format *out_f,
+ bool filter_whiteouts)
+{
+ struct bkey_format *in_f = &src->format;
+ struct bkey_packed *in, *out = vstruct_last(dst);
+ struct btree_nr_keys nr;
+ bool transform = memcmp(out_f, &src->format, sizeof(*out_f));
+
+ memset(&nr, 0, sizeof(nr));
+
+ while ((in = bch2_btree_node_iter_next_all(src_iter, src))) {
+ if (filter_whiteouts && bkey_deleted(in))
+ continue;
+
+ if (!transform)
+ bkey_p_copy(out, in);
+ else if (bch2_bkey_transform(out_f, out, bkey_packed(in)
+ ? in_f : &bch2_bkey_format_current, in))
+ out->format = KEY_FORMAT_LOCAL_BTREE;
+ else
+ bch2_bkey_unpack(src, (void *) out, in);
+
+ out->needs_whiteout = false;
+
+ btree_keys_account_key_add(&nr, 0, out);
+ out = bkey_p_next(out);
+ }
+
+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+ return nr;
+}
+
+static inline int keep_unwritten_whiteouts_cmp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
+{
+ return bch2_bkey_cmp_packed_inlined(b, l, r) ?:
+ (int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
+ (long) l - (long) r;
+}
+
+#include "btree_update_interior.h"
+
+/*
+ * For sorting in the btree node write path: whiteouts not in the unwritten
+ * whiteouts area are dropped, whiteouts in the unwritten whiteouts area are
+ * dropped if overwritten by real keys:
+ */
+unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *dst, struct sort_iter *iter)
+{
+ struct bkey_packed *in, *next, *out = dst;
+
+ sort_iter_sort(iter, keep_unwritten_whiteouts_cmp);
+
+ while ((in = sort_iter_next(iter, keep_unwritten_whiteouts_cmp))) {
+ if (bkey_deleted(in) && in < unwritten_whiteouts_start(iter->b))
+ continue;
+
+ if ((next = sort_iter_peek(iter)) &&
+ !bch2_bkey_cmp_packed_inlined(iter->b, in, next))
+ continue;
+
+ bkey_p_copy(out, in);
+ out = bkey_p_next(out);
+ }
+
+ return (u64 *) out - (u64 *) dst;
+}
+
+/*
+ * Main sort routine for compacting a btree node in memory: we always drop
+ * whiteouts because any whiteouts that need to be written are in the unwritten
+ * whiteouts area:
+ */
+unsigned bch2_sort_keys(struct bkey_packed *dst, struct sort_iter *iter)
+{
+ struct bkey_packed *in, *out = dst;
+
+ sort_iter_sort(iter, bch2_bkey_cmp_packed_inlined);
+
+ while ((in = sort_iter_next(iter, bch2_bkey_cmp_packed_inlined))) {
+ if (bkey_deleted(in))
+ continue;
+
+ bkey_p_copy(out, in);
+ out = bkey_p_next(out);
+ }
+
+ return (u64 *) out - (u64 *) dst;
+}
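
To illustrate: sort_iter above merges keys from several bsets without a heap — it keeps its small array of (k, end) cursors sorted by current key, always emits from slot 0, then either removes the exhausted cursor or sifts the advanced one back into place. A standalone sketch of the same strategy on int arrays (not bcachefs code):

#include <stdio.h>

struct cursor { const int *k, *end; };

static void sift(struct cursor *c, unsigned used, unsigned from)
{
	for (unsigned i = from; i + 1 < used && *c[i].k > *c[i + 1].k; i++) {
		struct cursor tmp = c[i];

		c[i] = c[i + 1];
		c[i + 1] = tmp;
	}
}

int main(void)
{
	int a[] = { 1, 4, 9 }, b[] = { 2, 3, 10 };
	struct cursor c[] = { { a, a + 3 }, { b, b + 3 } };
	unsigned used = 2;

	for (unsigned i = used; i--; )
		sift(c, used, i);		/* initial sort, as in sort_iter_sort() */

	while (used) {
		printf("%d ", *c[0].k++);	/* emit the smallest current element */

		if (c[0].k == c[0].end) {
			/* cursor exhausted: remove slot 0, as array_remove_item() does */
			for (unsigned i = 1; i < used; i++)
				c[i - 1] = c[i];
			used--;
		} else {
			sift(c, used, 0);	/* re-place the advanced cursor */
		}
	}
	printf("\n");				/* prints: 1 2 3 4 9 10 */
	return 0;
}
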
diff --git a/libbcachefs/bkey_sort.h b/libbcachefs/bkey_sort.h
new file mode 100644
index 00000000..9be969d4
--- /dev/null
+++ b/libbcachefs/bkey_sort.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_SORT_H
+#define _BCACHEFS_BKEY_SORT_H
+
+struct sort_iter {
+ struct btree *b;
+ unsigned used;
+ unsigned size;
+
+ struct sort_iter_set {
+ struct bkey_packed *k, *end;
+ } data[];
+};
+
+static inline void sort_iter_init(struct sort_iter *iter, struct btree *b, unsigned size)
+{
+ iter->b = b;
+ iter->used = 0;
+ iter->size = size;
+}
+
+struct sort_iter_stack {
+ struct sort_iter iter;
+ struct sort_iter_set sets[MAX_BSETS + 1];
+};
+
+static inline void sort_iter_stack_init(struct sort_iter_stack *iter, struct btree *b)
+{
+ sort_iter_init(&iter->iter, b, ARRAY_SIZE(iter->sets));
+}
+
+static inline void sort_iter_add(struct sort_iter *iter,
+ struct bkey_packed *k,
+ struct bkey_packed *end)
+{
+ BUG_ON(iter->used >= iter->size);
+
+ if (k != end)
+ iter->data[iter->used++] = (struct sort_iter_set) { k, end };
+}
+
+struct btree_nr_keys
+bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *,
+ struct sort_iter *);
+
+struct btree_nr_keys
+bch2_sort_repack(struct bset *, struct btree *,
+ struct btree_node_iter *,
+ struct bkey_format *, bool);
+
+unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *, struct sort_iter *);
+unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *);
+
+#endif /* _BCACHEFS_BKEY_SORT_H */
diff --git a/libbcachefs/bkey_types.h b/libbcachefs/bkey_types.h
new file mode 100644
index 00000000..2af6279b
--- /dev/null
+++ b/libbcachefs/bkey_types.h
@@ -0,0 +1,239 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_TYPES_H
+#define _BCACHEFS_BKEY_TYPES_H
+
+#include "bcachefs_format.h"
+
+/*
+ * bkey_i - bkey with inline value
+ * bkey_s - bkey with split value
+ * bkey_s_c - bkey with split value, const
+ */
+
+#define bkey_p_next(_k) vstruct_next(_k)
+
+static inline struct bkey_i *bkey_next(struct bkey_i *k)
+{
+ return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
+}
+
+#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
+
+static inline size_t bkey_val_bytes(const struct bkey *k)
+{
+ return bkey_val_u64s(k) * sizeof(u64);
+}
+
+static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
+{
+ unsigned u64s = BKEY_U64s + val_u64s;
+
+ BUG_ON(u64s > U8_MAX);
+ k->u64s = u64s;
+}
+
+static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
+{
+ set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
+}
+
+#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
+
+#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
+
+#define bkey_whiteout(_k) \
+ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
+
+/* bkey with split value, const */
+struct bkey_s_c {
+ const struct bkey *k;
+ const struct bch_val *v;
+};
+
+/* bkey with split value */
+struct bkey_s {
+ union {
+ struct {
+ struct bkey *k;
+ struct bch_val *v;
+ };
+ struct bkey_s_c s_c;
+ };
+};
+
+#define bkey_s_null ((struct bkey_s) { .k = NULL })
+#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
+
+#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
+#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
+
+static inline struct bkey_s bkey_to_s(struct bkey *k)
+{
+ return (struct bkey_s) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
+{
+ return (struct bkey_s_c) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
+{
+ return (struct bkey_s) { .k = &k->k, .v = &k->v };
+}
+
+static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
+{
+ return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
+}
+
+/*
+ * For a given type of value (e.g. struct bch_extent), generates the types for
+ * bkey + bch_extent - inline, split, split const - and also all the conversion
+ * functions, which also check that the value is of the correct type.
+ *
+ * We use anonymous unions for upcasting - e.g. converting from a
+ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
+ * functions.
+ */
+#define x(name, ...) \
+struct bkey_i_##name { \
+ union { \
+ struct bkey k; \
+ struct bkey_i k_i; \
+ }; \
+ struct bch_##name v; \
+}; \
+ \
+struct bkey_s_c_##name { \
+ union { \
+ struct { \
+ const struct bkey *k; \
+ const struct bch_##name *v; \
+ }; \
+ struct bkey_s_c s_c; \
+ }; \
+}; \
+ \
+struct bkey_s_##name { \
+ union { \
+ struct { \
+ struct bkey *k; \
+ struct bch_##name *v; \
+ }; \
+ struct bkey_s_c_##name c; \
+ struct bkey_s s; \
+ struct bkey_s_c s_c; \
+ }; \
+}; \
+ \
+static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return container_of(&k->k, struct bkey_i_##name, k); \
+} \
+ \
+static inline const struct bkey_i_##name * \
+bkey_i_to_##name##_c(const struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return container_of(&k->k, struct bkey_i_##name, k); \
+} \
+ \
+static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
+ return (struct bkey_s_##name) { \
+ .k = k.k, \
+ .v = container_of(k.v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
+ return (struct bkey_s_c_##name) { \
+ .k = k.k, \
+ .v = container_of(k.v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
+{ \
+ return (struct bkey_s_##name) { \
+ .k = &k->k, \
+ .v = &k->v, \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name \
+name##_i_to_s_c(const struct bkey_i_##name *k) \
+{ \
+ return (struct bkey_s_c_##name) { \
+ .k = &k->k, \
+ .v = &k->v, \
+ }; \
+} \
+ \
+static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return (struct bkey_s_##name) { \
+ .k = &k->k, \
+ .v = container_of(&k->v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name \
+bkey_i_to_s_c_##name(const struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return (struct bkey_s_c_##name) { \
+ .k = &k->k, \
+ .v = container_of(&k->v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
+{ \
+ struct bkey_i_##name *k = \
+ container_of(&_k->k, struct bkey_i_##name, k); \
+ \
+ bkey_init(&k->k); \
+ memset(&k->v, 0, sizeof(k->v)); \
+ k->k.type = KEY_TYPE_##name; \
+ set_bkey_val_bytes(&k->k, sizeof(k->v)); \
+ \
+ return k; \
+}
+
+BCH_BKEY_TYPES();
+#undef x
+
+enum bch_validate_flags {
+ BCH_VALIDATE_write = BIT(0),
+ BCH_VALIDATE_commit = BIT(1),
+ BCH_VALIDATE_journal = BIT(2),
+ BCH_VALIDATE_silent = BIT(3),
+};
+
+#define BKEY_VALIDATE_CONTEXTS() \
+ x(unknown) \
+ x(commit) \
+ x(journal) \
+ x(btree_root) \
+ x(btree_node)
+
+struct bkey_validate_context {
+ enum {
+#define x(n) BKEY_VALIDATE_##n,
+ BKEY_VALIDATE_CONTEXTS()
+#undef x
+ } from:8;
+ u8 level;
+ enum btree_id btree;
+ bool root:1;
+ enum bch_validate_flags flags:8;
+};
+
+#endif /* _BCACHEFS_BKEY_TYPES_H */
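
To illustrate: the x() macro block above generates, for every value type, the bkey_i_##name / bkey_s_##name / bkey_s_c_##name wrapper structs plus checked conversion helpers; anonymous unions make the upcast to the generic key types free, while downcasts assert the type tag. A standalone sketch of that wrapper pattern for a single made-up "widget" value type (not bcachefs code):

#include <assert.h>
#include <stdio.h>

enum { KEY_TYPE_widget = 1 };

struct key { unsigned type; unsigned long long pos; };
struct widget_val { unsigned long long payload; };

/* generic key + inline value */
struct key_i { struct key k; };

/* typed view: same header, concrete value */
struct key_i_widget {
	union {
		struct key   k;
		struct key_i k_i;	/* upcast to the generic type is free */
	};
	struct widget_val v;
};

static struct key_i_widget *key_i_to_widget(struct key_i *k)
{
	assert(k->k.type == KEY_TYPE_widget);	/* mirrors the EBUG_ON() checks */
	return (struct key_i_widget *) k;
}

int main(void)
{
	struct key_i_widget w = {
		.k = { .type = KEY_TYPE_widget, .pos = 7 },
		.v = { .payload = 123 },
	};

	struct key_i *generic = &w.k_i;			/* upcast */
	struct key_i_widget *back = key_i_to_widget(generic);	/* checked downcast */

	printf("pos %llu payload %llu\n", back->k.pos, back->v.payload);
	return 0;
}
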
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 8c77fc50..9a4a83d6 100644
--- a/libbcachefs/bset.c
+++ b/libbcachefs/bset.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Code for working with individual keys, and sorted sets of keys within a
* btree node
@@ -9,28 +10,30 @@
#include "btree_cache.h"
#include "bset.h"
#include "eytzinger.h"
+#include "trace.h"
#include "util.h"
-#include <asm/unaligned.h>
-#include <linux/dynamic_fault.h>
+#include <linux/unaligned.h>
#include <linux/console.h>
#include <linux/random.h>
#include <linux/prefetch.h>
-/* hack.. */
-#include "alloc_types.h"
-#include <trace/events/bcachefs.h>
+static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *,
+ struct btree *);
-struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
+static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
{
- struct bset_tree *t;
+ unsigned n = ARRAY_SIZE(iter->data);
- for_each_bset(b, t)
- if (k >= btree_bkey_first(b, t) &&
- k < btree_bkey_last(b, t))
- return t;
+ while (n && __btree_node_iter_set_end(iter, n - 1))
+ --n;
- BUG();
+ return n;
+}
+
+struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
+{
+ return bch2_bkey_to_bset_inlined(b, k);
}
/*
@@ -49,54 +52,60 @@ struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
* by the time we actually do the insert will all be deleted.
*/
-void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set)
+void bch2_dump_bset(struct bch_fs *c, struct btree *b,
+ struct bset *i, unsigned set)
{
struct bkey_packed *_k, *_n;
- struct bkey k, n;
- char buf[120];
+ struct bkey uk, n;
+ struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
if (!i->u64s)
return;
- for (_k = i->start, k = bkey_unpack_key(b, _k);
+ for (_k = i->start;
_k < vstruct_last(i);
- _k = _n, k = n) {
- _n = bkey_next(_k);
+ _k = _n) {
+ _n = bkey_p_next(_k);
+
+ if (!_k->u64s) {
+ printk(KERN_ERR "block %u key %5zu - u64s 0? aieee!\n", set,
+ _k->_data - i->_data);
+ break;
+ }
+
+ k = bkey_disassemble(b, _k, &uk);
- bch2_bkey_to_text(buf, sizeof(buf), &k);
- printk(KERN_ERR "block %u key %zi/%u: %s\n", set,
- _k->_data - i->_data, i->u64s, buf);
+ printbuf_reset(&buf);
+ if (c)
+ bch2_bkey_val_to_text(&buf, c, k);
+ else
+ bch2_bkey_to_text(&buf, k.k);
+ printk(KERN_ERR "block %u key %5zu: %s\n", set,
+ _k->_data - i->_data, buf.buf);
if (_n == vstruct_last(i))
continue;
n = bkey_unpack_key(b, _n);
- if (bkey_cmp(bkey_start_pos(&n), k.p) < 0) {
+ if (bpos_lt(n.p, k.k->p)) {
printk(KERN_ERR "Key skipped backwards\n");
continue;
}
- /*
- * Weird check for duplicate non extent keys: extents are
- * deleted iff they have 0 size, so if it has zero size and it's
- * not deleted these aren't extents:
- */
- if (((!k.size && !bkey_deleted(&k)) ||
- (!n.size && !bkey_deleted(&n))) &&
- !bkey_deleted(&k) &&
- !bkey_cmp(n.p, k.p))
+ if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p))
printk(KERN_ERR "Duplicate keys\n");
}
+
+ printbuf_exit(&buf);
}
-void bch2_dump_btree_node(struct btree *b)
+void bch2_dump_btree_node(struct bch_fs *c, struct btree *b)
{
- struct bset_tree *t;
-
console_lock();
for_each_bset(b, t)
- bch2_dump_bset(b, bset(b, t), t - b->set);
+ bch2_dump_bset(c, b, bset(b, t), t - b->set);
console_unlock();
}
@@ -104,207 +113,200 @@ void bch2_dump_btree_node_iter(struct btree *b,
struct btree_node_iter *iter)
{
struct btree_node_iter_set *set;
+ struct printbuf buf = PRINTBUF;
- printk(KERN_ERR "btree node iter with %u sets:\n", b->nsets);
+ printk(KERN_ERR "btree node iter with %u/%u sets:\n",
+ __btree_node_iter_used(iter), b->nsets);
btree_node_iter_for_each(iter, set) {
struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
struct bset_tree *t = bch2_bkey_to_bset(b, k);
struct bkey uk = bkey_unpack_key(b, k);
- char buf[100];
- bch2_bkey_to_text(buf, sizeof(buf), &uk);
- printk(KERN_ERR "set %zu key %zi/%u: %s\n", t - b->set,
- k->_data - bset(b, t)->_data, bset(b, t)->u64s, buf);
+ printbuf_reset(&buf);
+ bch2_bkey_to_text(&buf, &uk);
+ printk(KERN_ERR "set %zu key %u: %s\n",
+ t - b->set, set->k, buf.buf);
}
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-static bool keys_out_of_order(struct btree *b,
- const struct bkey_packed *prev,
- const struct bkey_packed *next,
- bool is_extents)
-{
- struct bkey nextu = bkey_unpack_key(b, next);
- return bkey_cmp_left_packed_byval(b, prev, bkey_start_pos(&nextu)) > 0 ||
- ((is_extents
- ? !bkey_deleted(next)
- : !bkey_deleted(prev)) &&
- !bkey_cmp_packed(b, prev, next));
+ printbuf_exit(&buf);
}
-void __bch2_verify_btree_nr_keys(struct btree *b)
+struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b)
{
- struct bset_tree *t;
struct bkey_packed *k;
- struct btree_nr_keys nr = { 0 };
+ struct btree_nr_keys nr = {};
for_each_bset(b, t)
- for (k = btree_bkey_first(b, t);
- k != btree_bkey_last(b, t);
- k = bkey_next(k))
- if (!bkey_whiteout(k))
+ bset_tree_for_each_key(b, t, k)
+ if (!bkey_deleted(k))
btree_keys_account_key_add(&nr, t - b->set, k);
+ return nr;
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+void __bch2_verify_btree_nr_keys(struct btree *b)
+{
+ struct btree_nr_keys nr = bch2_btree_node_count_keys(b);
BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
}
-static void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
- struct btree *b,
- struct bkey_packed *k)
+static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
+ struct btree *b)
{
- const struct bkey_packed *n = bch2_btree_node_iter_peek_all(iter, b);
+ struct btree_node_iter iter = *_iter;
+ const struct bkey_packed *k, *n;
+
+ k = bch2_btree_node_iter_peek_all(&iter, b);
+ __bch2_btree_node_iter_advance(&iter, b);
+ n = bch2_btree_node_iter_peek_all(&iter, b);
bkey_unpack_key(b, k);
if (n &&
- keys_out_of_order(b, k, n, iter->is_extents)) {
+ bkey_iter_cmp(b, k, n) > 0) {
+ struct btree_node_iter_set *set;
struct bkey ku = bkey_unpack_key(b, k);
struct bkey nu = bkey_unpack_key(b, n);
- char buf1[80], buf2[80];
-
- bch2_dump_btree_node(b);
- bch2_bkey_to_text(buf1, sizeof(buf1), &ku);
- bch2_bkey_to_text(buf2, sizeof(buf2), &nu);
- panic("out of order/overlapping:\n%s\n%s\n", buf1, buf2);
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+
+ bch2_dump_btree_node(NULL, b);
+ bch2_bkey_to_text(&buf1, &ku);
+ bch2_bkey_to_text(&buf2, &nu);
+ printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n",
+ buf1.buf, buf2.buf);
+ printk(KERN_ERR "iter was:");
+
+ btree_node_iter_for_each(_iter, set) {
+ struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k);
+ struct bset_tree *t = bch2_bkey_to_bset(b, k2);
+ printk(" [%zi %zi]", t - b->set,
+ k2->_data - bset(b, t)->_data);
+ }
+ panic("\n");
}
}
void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
- struct btree *b)
+ struct btree *b)
{
- struct btree_node_iter_set *set, *prev = NULL;
- struct bset_tree *t;
- struct bkey_packed *k, *first;
+ struct btree_node_iter_set *set, *s2;
+ struct bkey_packed *k, *p;
if (bch2_btree_node_iter_end(iter))
return;
+ /* Verify no duplicates: */
btree_node_iter_for_each(iter, set) {
- k = __btree_node_offset_to_key(b, set->k);
- t = bch2_bkey_to_bset(b, k);
-
- BUG_ON(__btree_node_offset_to_key(b, set->end) !=
- btree_bkey_last(b, t));
-
- BUG_ON(prev &&
- btree_node_iter_cmp(iter, b, *prev, *set) > 0);
-
- prev = set;
+ BUG_ON(set->k > set->end);
+ btree_node_iter_for_each(iter, s2)
+ BUG_ON(set != s2 && set->end == s2->end);
}
- first = __btree_node_offset_to_key(b, iter->data[0].k);
-
- for_each_bset(b, t)
- if (bch2_btree_node_iter_bset_pos(iter, b, t) ==
- btree_bkey_last(b, t) &&
- (k = bch2_bkey_prev_all(b, t, btree_bkey_last(b, t))))
- BUG_ON(__btree_node_iter_cmp(iter->is_extents, b,
- k, first) > 0);
-}
-
-void bch2_verify_key_order(struct btree *b,
- struct btree_node_iter *iter,
- struct bkey_packed *where)
-{
- struct bset_tree *t = bch2_bkey_to_bset(b, where);
- struct bkey_packed *k, *prev;
- struct bkey uk, uw = bkey_unpack_key(b, where);
-
- k = bch2_bkey_prev_all(b, t, where);
- if (k &&
- keys_out_of_order(b, k, where, iter->is_extents)) {
- char buf1[100], buf2[100];
-
- bch2_dump_btree_node(b);
- uk = bkey_unpack_key(b, k);
- bch2_bkey_to_text(buf1, sizeof(buf1), &uk);
- bch2_bkey_to_text(buf2, sizeof(buf2), &uw);
- panic("out of order with prev:\n%s\n%s\n",
- buf1, buf2);
+ /* Verify that set->end is correct: */
+ btree_node_iter_for_each(iter, set) {
+ for_each_bset(b, t)
+ if (set->end == t->end_offset) {
+ BUG_ON(set->k < btree_bkey_first_offset(t) ||
+ set->k >= t->end_offset);
+ goto found;
+ }
+ BUG();
+found:
+ do {} while (0);
}
- k = bkey_next(where);
- BUG_ON(k != btree_bkey_last(b, t) &&
- keys_out_of_order(b, where, k, iter->is_extents));
+ /* Verify iterator is sorted: */
+ btree_node_iter_for_each(iter, set)
+ BUG_ON(set != iter->data &&
+ btree_node_iter_cmp(b, set[-1], set[0]) > 0);
+
+ k = bch2_btree_node_iter_peek_all(iter, b);
for_each_bset(b, t) {
- if (where >= btree_bkey_first(b, t) ||
- where < btree_bkey_last(b, t))
+ if (iter->data[0].end == t->end_offset)
continue;
- k = bch2_btree_node_iter_bset_pos(iter, b, t);
-
- if (k == btree_bkey_last(b, t))
- k = bch2_bkey_prev_all(b, t, k);
-
- while (bkey_cmp_left_packed_byval(b, k, bkey_start_pos(&uw)) > 0 &&
- (prev = bch2_bkey_prev_all(b, t, k)))
- k = prev;
+ p = bch2_bkey_prev_all(b, t,
+ bch2_btree_node_iter_bset_pos(iter, b, t));
- for (;
- k != btree_bkey_last(b, t);
- k = bkey_next(k)) {
- uk = bkey_unpack_key(b, k);
-
- if (iter->is_extents) {
- BUG_ON(!(bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0 ||
- bkey_cmp(uk.p, bkey_start_pos(&uw)) <= 0));
- } else {
- BUG_ON(!bkey_cmp(uw.p, uk.p) &&
- !bkey_deleted(&uk));
- }
+ BUG_ON(p && bkey_iter_cmp(b, k, p) < 0);
+ }
+}
- if (bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0)
- break;
- }
+void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
+ struct bkey_packed *insert, unsigned clobber_u64s)
+{
+ struct bset_tree *t = bch2_bkey_to_bset(b, where);
+ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
+ struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s);
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+#if 0
+ BUG_ON(prev &&
+ bkey_iter_cmp(b, prev, insert) > 0);
+#else
+ if (prev &&
+ bkey_iter_cmp(b, prev, insert) > 0) {
+ struct bkey k1 = bkey_unpack_key(b, prev);
+ struct bkey k2 = bkey_unpack_key(b, insert);
+
+ bch2_dump_btree_node(NULL, b);
+ bch2_bkey_to_text(&buf1, &k1);
+ bch2_bkey_to_text(&buf2, &k2);
+
+ panic("prev > insert:\n"
+ "prev key %s\n"
+ "insert key %s\n",
+ buf1.buf, buf2.buf);
+ }
+#endif
+#if 0
+ BUG_ON(next != btree_bkey_last(b, t) &&
+ bkey_iter_cmp(b, insert, next) > 0);
+#else
+ if (next != btree_bkey_last(b, t) &&
+ bkey_iter_cmp(b, insert, next) > 0) {
+ struct bkey k1 = bkey_unpack_key(b, insert);
+ struct bkey k2 = bkey_unpack_key(b, next);
+
+ bch2_dump_btree_node(NULL, b);
+ bch2_bkey_to_text(&buf1, &k1);
+ bch2_bkey_to_text(&buf2, &k2);
+
+ panic("insert > next:\n"
+ "insert key %s\n"
+ "next key %s\n",
+ buf1.buf, buf2.buf);
}
+#endif
}
#else
static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
- struct btree *b,
- struct bkey_packed *k) {}
+ struct btree *b) {}
#endif
/* Auxiliary search trees */
-#define BFLOAT_FAILED_UNPACKED (U8_MAX - 0)
-#define BFLOAT_FAILED_PREV (U8_MAX - 1)
-#define BFLOAT_FAILED_OVERFLOW (U8_MAX - 2)
-#define BFLOAT_FAILED (U8_MAX - 2)
-
-#define KEY_WORDS BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS)
+#define BFLOAT_FAILED_UNPACKED U8_MAX
+#define BFLOAT_FAILED U8_MAX
struct bkey_float {
u8 exponent;
u8 key_offset;
- union {
- u32 mantissa32;
- struct {
- u16 mantissa16;
- u16 _pad;
- };
- };
-} __packed;
-
-#define BFLOAT_32BIT_NR 32U
-
-static unsigned bkey_float_byte_offset(unsigned idx)
-{
- int d = (idx - BFLOAT_32BIT_NR) << 1;
-
- d &= ~(d >> 31);
-
- return idx * 6 - d;
-}
+ u16 mantissa;
+};
+#define BKEY_MANTISSA_BITS 16
struct ro_aux_tree {
- struct bkey_float _d[0];
+ u8 nothing[0];
+ struct bkey_float f[];
};
struct rw_aux_tree {
@@ -312,44 +314,6 @@ struct rw_aux_tree {
struct bpos k;
};
-/*
- * BSET_CACHELINE was originally intended to match the hardware cacheline size -
- * it used to be 64, but I realized the lookup code would touch slightly less
- * memory if it was 128.
- *
- * It definites the number of bytes (in struct bset) per struct bkey_float in
- * the auxiliar search tree - when we're done searching the bset_float tree we
- * have this many bytes left that we do a linear search over.
- *
- * Since (after level 5) every level of the bset_tree is on a new cacheline,
- * we're touching one fewer cacheline in the bset tree in exchange for one more
- * cacheline in the linear search - but the linear search might stop before it
- * gets to the second cacheline.
- */
-
-#define BSET_CACHELINE 128
-
-/* Space required for the btree node keys */
-static inline size_t btree_keys_bytes(struct btree *b)
-{
- return PAGE_SIZE << b->page_order;
-}
-
-static inline size_t btree_keys_cachelines(struct btree *b)
-{
- return btree_keys_bytes(b) / BSET_CACHELINE;
-}
-
-static inline size_t btree_aux_data_bytes(struct btree *b)
-{
- return btree_keys_cachelines(b) * 8;
-}
-
-static inline size_t btree_aux_data_u64s(struct btree *b)
-{
- return btree_aux_data_bytes(b) / sizeof(u64);
-}
-
static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
{
BUG_ON(t->aux_data_offset == U16_MAX);
@@ -359,8 +323,7 @@ static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
return t->aux_data_offset;
case BSET_RO_AUX_TREE:
return t->aux_data_offset +
- DIV_ROUND_UP(bkey_float_byte_offset(t->size) +
- sizeof(u8) * t->size, 8);
+ DIV_ROUND_UP(t->size * sizeof(struct bkey_float), 8);
case BSET_RW_AUX_TREE:
return t->aux_data_offset +
DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
@@ -391,32 +354,16 @@ static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
return __aux_tree_base(b, t);
}
-static u8 *ro_aux_tree_prev(const struct btree *b,
- const struct bset_tree *t)
-{
- EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
-
- return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
-}
-
-static struct bkey_float *bkey_float_get(struct ro_aux_tree *b,
- unsigned idx)
-{
- return (void *) b + bkey_float_byte_offset(idx);
-}
-
static struct bkey_float *bkey_float(const struct btree *b,
const struct bset_tree *t,
unsigned idx)
{
- return bkey_float_get(ro_aux_tree_base(b, t), idx);
+ return ro_aux_tree_base(b, t)->f + idx;
}
static void bset_aux_tree_verify(struct btree *b)
{
#ifdef CONFIG_BCACHEFS_DEBUG
- struct bset_tree *t;
-
for_each_bset(b, t) {
if (t->aux_data_offset == U16_MAX)
continue;
@@ -431,38 +378,13 @@ static void bset_aux_tree_verify(struct btree *b)
#endif
}
-/* Memory allocation */
-
-void bch2_btree_keys_free(struct btree *b)
-{
- vfree(b->aux_data);
- b->aux_data = NULL;
-}
-
-#ifndef PAGE_KERNEL_EXEC
-# define PAGE_KERNEL_EXEC PAGE_KERNEL
-#endif
-
-int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp)
-{
- b->page_order = page_order;
- b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp,
- PAGE_KERNEL_EXEC);
- if (!b->aux_data)
- return -ENOMEM;
-
- return 0;
-}
-
-void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks)
+void bch2_btree_keys_init(struct btree *b)
{
unsigned i;
b->nsets = 0;
memset(&b->nr, 0, sizeof(b->nr));
-#ifdef CONFIG_BCACHEFS_DEBUG
- b->expensive_debug_checks = expensive_debug_checks;
-#endif
+
for (i = 0; i < MAX_BSETS; i++)
b->set[i].data_offset = U16_MAX;
@@ -539,19 +461,10 @@ static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
unsigned j)
{
return cacheline_to_bkey(b, t,
- __eytzinger1_to_inorder(j, t->size, t->extra),
+ __eytzinger1_to_inorder(j, t->size - 1, t->extra),
bkey_float(b, t, j)->key_offset);
}
-static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
- const struct bset_tree *t,
- unsigned j)
-{
- unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
-
- return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s);
-}
-
static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
const struct bset_tree *t)
{
@@ -588,7 +501,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b,
struct bkey_packed *k = btree_bkey_first(b, t);
unsigned j = 0;
- if (!btree_keys_expensive_checks(b))
+ if (!bch2_expensive_debug_checks)
return;
BUG_ON(bset_has_ro_aux_tree(t));
@@ -602,7 +515,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b,
goto start;
while (1) {
if (rw_aux_to_bkey(b, t, j) == k) {
- BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k,
+ BUG_ON(!bpos_eq(rw_aux_tree(b, t)[j].k,
bkey_unpack_pos(b, k)));
start:
if (++j == t->size)
@@ -612,7 +525,7 @@ start:
rw_aux_tree(b, t)[j - 1].offset);
}
- k = bkey_next(k);
+ k = bkey_p_next(k);
BUG_ON(k >= btree_bkey_last(b, t));
}
}
@@ -622,48 +535,34 @@ static unsigned rw_aux_tree_bsearch(struct btree *b,
struct bset_tree *t,
unsigned offset)
{
- unsigned l = 0, r = t->size;
+ unsigned bset_offs = offset - btree_bkey_first_offset(t);
+ unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t);
+ unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0;
EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
+ EBUG_ON(!t->size);
+ EBUG_ON(idx > t->size);
- while (l < r) {
- unsigned m = (l + r) >> 1;
-
- if (rw_aux_tree(b, t)[m].offset < offset)
- l = m + 1;
- else
- r = m;
- }
+ while (idx < t->size &&
+ rw_aux_tree(b, t)[idx].offset < offset)
+ idx++;
- EBUG_ON(l < t->size &&
- rw_aux_tree(b, t)[l].offset < offset);
- EBUG_ON(l &&
- rw_aux_tree(b, t)[l - 1].offset >= offset);
+ while (idx &&
+ rw_aux_tree(b, t)[idx - 1].offset >= offset)
+ idx--;
- EBUG_ON(l > r);
- EBUG_ON(l > t->size);
+ EBUG_ON(idx < t->size &&
+ rw_aux_tree(b, t)[idx].offset < offset);
+ EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset);
+ EBUG_ON(idx + 1 < t->size &&
+ rw_aux_tree(b, t)[idx].offset ==
+ rw_aux_tree(b, t)[idx + 1].offset);
- return l;
-}
-
-static inline unsigned bfloat_mantissa(const struct bkey_float *f,
- unsigned idx)
-{
- return idx < BFLOAT_32BIT_NR ? f->mantissa32 : f->mantissa16;
-}
-
-static inline void bfloat_mantissa_set(struct bkey_float *f,
- unsigned idx, unsigned mantissa)
-{
- if (idx < BFLOAT_32BIT_NR)
- f->mantissa32 = mantissa;
- else
- f->mantissa16 = mantissa;
+ return idx;
}
static inline unsigned bkey_mantissa(const struct bkey_packed *k,
- const struct bkey_float *f,
- unsigned idx)
+ const struct bkey_float *f)
{
u64 v;
@@ -680,69 +579,33 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k,
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
v >>= f->exponent & 7;
#else
- v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16);
+ v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS;
#endif
- return idx < BFLOAT_32BIT_NR ? (u32) v : (u16) v;
+ return (u16) v;
}
-static void make_bfloat(struct btree *b, struct bset_tree *t,
- unsigned j,
- struct bkey_packed *min_key,
- struct bkey_packed *max_key)
+static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
+ unsigned j,
+ struct bkey_packed *min_key,
+ struct bkey_packed *max_key)
{
struct bkey_float *f = bkey_float(b, t, j);
struct bkey_packed *m = tree_to_bkey(b, t, j);
- struct bkey_packed *p = tree_to_prev_bkey(b, t, j);
- struct bkey_packed *l, *r;
- unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16;
+ struct bkey_packed *l = is_power_of_2(j)
+ ? min_key
+ : tree_to_bkey(b, t, j >> ffs(j));
+ struct bkey_packed *r = is_power_of_2(j + 1)
+ ? max_key
+ : tree_to_bkey(b, t, j >> (ffz(j) + 1));
unsigned mantissa;
int shift, exponent, high_bit;
- EBUG_ON(bkey_next(p) != m);
-
- if (is_power_of_2(j)) {
- l = min_key;
-
- if (!l->u64s) {
- if (!bkey_pack_pos(l, b->data->min_key, b)) {
- struct bkey_i tmp;
-
- bkey_init(&tmp.k);
- tmp.k.p = b->data->min_key;
- bkey_copy(l, &tmp);
- }
- }
- } else {
- l = tree_to_prev_bkey(b, t, j >> ffs(j));
-
- EBUG_ON(m < l);
- }
-
- if (is_power_of_2(j + 1)) {
- r = max_key;
-
- if (!r->u64s) {
- if (!bkey_pack_pos(r, t->max_key, b)) {
- struct bkey_i tmp;
-
- bkey_init(&tmp.k);
- tmp.k.p = t->max_key;
- bkey_copy(r, &tmp);
- }
- }
- } else {
- r = tree_to_bkey(b, t, j >> (ffz(j) + 1));
-
- EBUG_ON(m > r);
- }
-
/*
* for failed bfloats, the lookup code falls back to comparing against
* the original key.
*/
- if (!bkey_packed(l) || !bkey_packed(r) ||
- !bkey_packed(p) || !bkey_packed(m) ||
+ if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) ||
!b->nr_key_bits) {
f->exponent = BFLOAT_FAILED_UNPACKED;
return;
@@ -759,8 +622,8 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
* of the key: we handle this later:
*/
high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
- min_t(unsigned, bits, b->nr_key_bits) - 1);
- exponent = high_bit - (bits - 1);
+ min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1);
+ exponent = high_bit - (BKEY_MANTISSA_BITS - 1);
/*
* Then we calculate the actual shift value, from the start of the key
@@ -769,19 +632,19 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
- EBUG_ON(shift + bits > b->format.key_u64s * 64);
+ EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64);
#else
shift = high_bit_offset +
b->nr_key_bits -
exponent -
- bits;
+ BKEY_MANTISSA_BITS;
EBUG_ON(shift < KEY_PACKED_BITS_START);
#endif
EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
f->exponent = shift;
- mantissa = bkey_mantissa(m, f, j);
+ mantissa = bkey_mantissa(m, f);
/*
* If we've got garbage bits, set them to all 1s - it's legal for the
@@ -790,65 +653,28 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
if (exponent < 0)
mantissa |= ~(~0U << -exponent);
- bfloat_mantissa_set(f, j, mantissa);
-
- /*
- * The bfloat must be able to tell its key apart from the previous key -
- * if its key and the previous key don't differ in the required bits,
- * flag as failed - unless the keys are actually equal, in which case
- * we aren't required to return a specific one:
- */
- if (exponent > 0 &&
- bfloat_mantissa(f, j) == bkey_mantissa(p, f, j) &&
- bkey_cmp_packed(b, p, m)) {
- f->exponent = BFLOAT_FAILED_PREV;
- return;
- }
-
- /*
- * f->mantissa must compare >= the original key - for transitivity with
- * the comparison in bset_search_tree. If we're dropping set bits,
- * increment it:
- */
- if (exponent > (int) bch2_bkey_ffs(b, m)) {
- if (j < BFLOAT_32BIT_NR
- ? f->mantissa32 == U32_MAX
- : f->mantissa16 == U16_MAX)
- f->exponent = BFLOAT_FAILED_OVERFLOW;
-
- if (j < BFLOAT_32BIT_NR)
- f->mantissa32++;
- else
- f->mantissa16++;
- }
+ f->mantissa = mantissa;
}
/* bytes remaining - only valid for last bset: */
-static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t)
+static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t)
{
bset_aux_tree_verify(b);
return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
}
-static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t)
+static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t)
{
- unsigned bytes = __bset_tree_capacity(b, t);
-
- if (bytes < 7 * BFLOAT_32BIT_NR)
- return bytes / 7;
-
- bytes -= 7 * BFLOAT_32BIT_NR;
-
- return BFLOAT_32BIT_NR + bytes / 5;
+ return __bset_tree_capacity(b, t) / sizeof(struct bkey_float);
}
-static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t)
+static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t)
{
return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
}
-static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
+static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
{
struct bkey_packed *k;
@@ -857,9 +683,7 @@ static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
rw_aux_tree(b, t)[0].offset =
__btree_node_key_to_offset(b, btree_bkey_first(b, t));
- for (k = btree_bkey_first(b, t);
- k != btree_bkey_last(b, t);
- k = bkey_next(k)) {
+ bset_tree_for_each_key(b, t, k) {
if (t->size == bset_rw_tree_capacity(b, t))
break;
@@ -869,14 +693,11 @@ static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
}
}
-static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
+static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
{
- struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
- struct bkey_packed min_key, max_key;
- unsigned j, cacheline = 1;
-
- /* signal to make_bfloat() that they're uninitialized: */
- min_key.u64s = max_key.u64s = 0;
+ struct bkey_packed *k = btree_bkey_first(b, t);
+ struct bkey_i min_key, max_key;
+ unsigned cacheline = 1;
t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
bset_ro_tree_capacity(b, t));
@@ -887,12 +708,12 @@ retry:
return;
}
- t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
+ t->extra = eytzinger1_extra(t->size - 1);
/* First we figure out where the first key in each cacheline is */
- eytzinger1_for_each(j, t->size) {
+ eytzinger1_for_each(j, t->size - 1) {
while (bkey_to_cacheline(b, t, k) < cacheline)
- prev = k, k = bkey_next(k);
+ k = bkey_p_next(k);
if (k >= btree_bkey_last(b, t)) {
/* XXX: this path sucks */
@@ -900,22 +721,27 @@ retry:
goto retry;
}
- ro_aux_tree_prev(b, t)[j] = prev->u64s;
bkey_float(b, t, j)->key_offset =
bkey_to_cacheline_offset(b, t, cacheline++, k);
- EBUG_ON(tree_to_prev_bkey(b, t, j) != prev);
EBUG_ON(tree_to_bkey(b, t, j) != k);
}
- while (bkey_next(k) != btree_bkey_last(b, t))
- k = bkey_next(k);
+ if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
+ bkey_init(&min_key.k);
+ min_key.k.p = b->data->min_key;
+ }
- t->max_key = bkey_unpack_pos(b, k);
+ if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) {
+ bkey_init(&max_key.k);
+ max_key.k.p = b->data->max_key;
+ }
/* Then we build the tree */
- eytzinger1_for_each(j, t->size)
- make_bfloat(b, t, j, &min_key, &max_key);
+ eytzinger1_for_each(j, t->size - 1)
+ make_bfloat(b, t, j,
+ bkey_to_packed(&min_key),
+ bkey_to_packed(&max_key));
}
static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
@@ -969,13 +795,12 @@ void bch2_bset_init_first(struct btree *b, struct bset *i)
set_btree_bset(b, t, i);
}
-void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
- struct btree_node_entry *bne)
+void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne)
{
struct bset *i = &bne->keys;
struct bset_tree *t;
- BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
+ BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b));
BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
BUG_ON(b->nsets >= MAX_BSETS);
@@ -1014,7 +839,7 @@ static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
do {
p = j ? tree_to_bkey(b, t,
__inorder_to_eytzinger1(j--,
- t->size, t->extra))
+ t->size - 1, t->extra))
: btree_bkey_first(b, t);
} while (p >= k);
break;
@@ -1037,19 +862,21 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
while ((p = __bkey_prev(b, t, k)) && !ret) {
- for (i = p; i != k; i = bkey_next(i))
+ for (i = p; i != k; i = bkey_p_next(i))
if (i->type >= min_key_type)
ret = i;
k = p;
}
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
+ if (bch2_expensive_debug_checks) {
BUG_ON(ret >= orig_k);
- for (i = ret ? bkey_next(ret) : btree_bkey_first(b, t);
+ for (i = ret
+ ? bkey_p_next(ret)
+ : btree_bkey_first(b, t);
i != orig_k;
- i = bkey_next(i))
+ i = bkey_p_next(i))
BUG_ON(i->type >= min_key_type);
}
@@ -1058,92 +885,38 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
/* Insert */
-static void rw_aux_tree_fix_invalidated_key(struct btree *b,
- struct bset_tree *t,
- struct bkey_packed *k)
-{
- unsigned offset = __btree_node_key_to_offset(b, k);
- unsigned j = rw_aux_tree_bsearch(b, t, offset);
-
- if (j < t->size &&
- rw_aux_tree(b, t)[j].offset == offset)
- rw_aux_tree_set(b, t, j, k);
-
- bch2_bset_verify_rw_aux_tree(b, t);
-}
-
-static void ro_aux_tree_fix_invalidated_key(struct btree *b,
- struct bset_tree *t,
- struct bkey_packed *k)
+static void rw_aux_tree_insert_entry(struct btree *b,
+ struct bset_tree *t,
+ unsigned idx)
{
- struct bkey_packed min_key, max_key;
- unsigned inorder, j;
-
- EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
-
- /* signal to make_bfloat() that they're uninitialized: */
- min_key.u64s = max_key.u64s = 0;
-
- if (bkey_next(k) == btree_bkey_last(b, t)) {
- t->max_key = bkey_unpack_pos(b, k);
+ EBUG_ON(!idx || idx > t->size);
+ struct bkey_packed *start = rw_aux_to_bkey(b, t, idx - 1);
+ struct bkey_packed *end = idx < t->size
+ ? rw_aux_to_bkey(b, t, idx)
+ : btree_bkey_last(b, t);
- for (j = 1; j < t->size; j = j * 2 + 1)
- make_bfloat(b, t, j, &min_key, &max_key);
- }
-
- inorder = bkey_to_cacheline(b, t, k);
-
- if (inorder &&
- inorder < t->size) {
- j = __inorder_to_eytzinger1(inorder, t->size, t->extra);
-
- if (k == tree_to_bkey(b, t, j)) {
- /* Fix the node this key corresponds to */
- make_bfloat(b, t, j, &min_key, &max_key);
-
- /* Children for which this key is the right boundary */
- for (j = eytzinger1_left_child(j);
- j < t->size;
- j = eytzinger1_right_child(j))
- make_bfloat(b, t, j, &min_key, &max_key);
- }
- }
-
- if (inorder + 1 < t->size) {
- j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra);
+ if (t->size < bset_rw_tree_capacity(b, t) &&
+ (void *) end - (void *) start > L1_CACHE_BYTES) {
+ struct bkey_packed *k = start;
- if (k == tree_to_prev_bkey(b, t, j)) {
- make_bfloat(b, t, j, &min_key, &max_key);
+ while (1) {
+ k = bkey_p_next(k);
+ if (k == end)
+ break;
- /* Children for which this key is the left boundary */
- for (j = eytzinger1_right_child(j);
- j < t->size;
- j = eytzinger1_left_child(j))
- make_bfloat(b, t, j, &min_key, &max_key);
+ if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
+ memmove(&rw_aux_tree(b, t)[idx + 1],
+ &rw_aux_tree(b, t)[idx],
+ (void *) &rw_aux_tree(b, t)[t->size] -
+ (void *) &rw_aux_tree(b, t)[idx]);
+ t->size++;
+ rw_aux_tree_set(b, t, idx, k);
+ break;
+ }
}
}
}
-/**
- * bch2_bset_fix_invalidated_key() - given an existing key @k that has been
- * modified, fix any auxiliary search tree by remaking all the nodes in the
- * auxiliary search tree that @k corresponds to
- */
-void bch2_bset_fix_invalidated_key(struct btree *b, struct bset_tree *t,
- struct bkey_packed *k)
-{
- switch (bset_aux_tree_type(t)) {
- case BSET_NO_AUX_TREE:
- break;
- case BSET_RO_AUX_TREE:
- ro_aux_tree_fix_invalidated_key(b, t, k);
- break;
- case BSET_RW_AUX_TREE:
- rw_aux_tree_fix_invalidated_key(b, t, k);
- break;
- }
-}
-
static void bch2_bset_fix_lookup_table(struct btree *b,
struct bset_tree *t,
struct bkey_packed *_where,
@@ -1151,88 +924,59 @@ static void bch2_bset_fix_lookup_table(struct btree *b,
unsigned new_u64s)
{
int shift = new_u64s - clobber_u64s;
- unsigned l, j, where = __btree_node_key_to_offset(b, _where);
+ unsigned idx, j, where = __btree_node_key_to_offset(b, _where);
EBUG_ON(bset_has_ro_aux_tree(t));
if (!bset_has_rw_aux_tree(t))
return;
- l = rw_aux_tree_bsearch(b, t, where);
-
- /* l is first >= than @where */
-
- EBUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset < where);
- EBUG_ON(l && rw_aux_tree(b, t)[l - 1].offset >= where);
-
- if (!l) /* never delete first entry */
- l++;
- else if (l < t->size &&
- where < t->end_offset &&
- rw_aux_tree(b, t)[l].offset == where)
- rw_aux_tree_set(b, t, l++, _where);
-
- /* l now > where */
-
- for (j = l;
- j < t->size &&
- rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
- j++)
- ;
-
- if (j < t->size &&
- rw_aux_tree(b, t)[j].offset + shift ==
- rw_aux_tree(b, t)[l - 1].offset)
- j++;
+ if (where > rw_aux_tree(b, t)[t->size - 1].offset) {
+ rw_aux_tree_insert_entry(b, t, t->size);
+ goto verify;
+ }
- memmove(&rw_aux_tree(b, t)[l],
- &rw_aux_tree(b, t)[j],
- (void *) &rw_aux_tree(b, t)[t->size] -
- (void *) &rw_aux_tree(b, t)[j]);
- t->size -= j - l;
+ /* returns first entry >= where */
+ idx = rw_aux_tree_bsearch(b, t, where);
+
+ if (rw_aux_tree(b, t)[idx].offset == where) {
+ if (!idx) { /* never delete first entry */
+ idx++;
+ } else if (where < t->end_offset) {
+ rw_aux_tree_set(b, t, idx++, _where);
+ } else {
+ EBUG_ON(where != t->end_offset);
+ rw_aux_tree_insert_entry(b, t, --t->size);
+ goto verify;
+ }
+ }
- for (j = l; j < t->size; j++)
- rw_aux_tree(b, t)[j].offset += shift;
+ EBUG_ON(idx < t->size && rw_aux_tree(b, t)[idx].offset <= where);
+ if (idx < t->size &&
+ rw_aux_tree(b, t)[idx].offset + shift ==
+ rw_aux_tree(b, t)[idx - 1].offset) {
+ memmove(&rw_aux_tree(b, t)[idx],
+ &rw_aux_tree(b, t)[idx + 1],
+ (void *) &rw_aux_tree(b, t)[t->size] -
+ (void *) &rw_aux_tree(b, t)[idx + 1]);
+ t->size -= 1;
+ }
- EBUG_ON(l < t->size &&
- rw_aux_tree(b, t)[l].offset ==
- rw_aux_tree(b, t)[l - 1].offset);
+ for (j = idx; j < t->size; j++)
+ rw_aux_tree(b, t)[j].offset += shift;
- if (t->size < bset_rw_tree_capacity(b, t) &&
- (l < t->size
- ? rw_aux_tree(b, t)[l].offset
- : t->end_offset) -
- rw_aux_tree(b, t)[l - 1].offset >
- L1_CACHE_BYTES / sizeof(u64)) {
- struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
- struct bkey_packed *end = l < t->size
- ? rw_aux_to_bkey(b, t, l)
- : btree_bkey_last(b, t);
- struct bkey_packed *k = start;
+ EBUG_ON(idx < t->size &&
+ rw_aux_tree(b, t)[idx].offset ==
+ rw_aux_tree(b, t)[idx - 1].offset);
- while (1) {
- k = bkey_next(k);
- if (k == end)
- break;
-
- if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
- memmove(&rw_aux_tree(b, t)[l + 1],
- &rw_aux_tree(b, t)[l],
- (void *) &rw_aux_tree(b, t)[t->size] -
- (void *) &rw_aux_tree(b, t)[l]);
- t->size++;
- rw_aux_tree_set(b, t, l, k);
- break;
- }
- }
- }
+ rw_aux_tree_insert_entry(b, t, idx);
+verify:
bch2_bset_verify_rw_aux_tree(b, t);
bset_aux_tree_verify(b);
}
void bch2_bset_insert(struct btree *b,
- struct btree_node_iter *iter,
struct bkey_packed *where,
struct bkey_i *insert,
unsigned clobber_u64s)
@@ -1242,16 +986,17 @@ void bch2_bset_insert(struct btree *b,
struct bkey_packed packed, *src = bkey_to_packed(insert);
bch2_bset_verify_rw_aux_tree(b, t);
+ bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s);
if (bch2_bkey_pack_key(&packed, &insert->k, f))
src = &packed;
- if (!bkey_whiteout(&insert->k))
+ if (!bkey_deleted(&insert->k))
btree_keys_account_key_add(&b->nr, t - b->set, src);
if (src->u64s != clobber_u64s) {
- u64 *src_p = where->_data + clobber_u64s;
- u64 *dst_p = where->_data + src->u64s;
+ u64 *src_p = (u64 *) where->_data + clobber_u64s;
+ u64 *dst_p = (u64 *) where->_data + src->u64s;
EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
(int) clobber_u64s - src->u64s);
@@ -1261,14 +1006,14 @@ void bch2_bset_insert(struct btree *b,
set_btree_bset_end(b, t);
}
- memcpy_u64s(where, src,
+ memcpy_u64s_small(where, src,
bkeyp_key_u64s(f, src));
memcpy_u64s(bkeyp_val(f, where), &insert->v,
bkeyp_val_u64s(f, src));
- bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
+ if (src->u64s != clobber_u64s)
+ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
- bch2_verify_key_order(b, iter, where);
bch2_verify_btree_nr_keys(b);
}
@@ -1277,7 +1022,7 @@ void bch2_bset_delete(struct btree *b,
unsigned clobber_u64s)
{
struct bset_tree *t = bset_tree_last(b);
- u64 *src_p = where->_data + clobber_u64s;
+ u64 *src_p = (u64 *) where->_data + clobber_u64s;
u64 *dst_p = where->_data;
bch2_bset_verify_rw_aux_tree(b, t);
@@ -1296,15 +1041,14 @@ void bch2_bset_delete(struct btree *b,
__flatten
static struct bkey_packed *bset_search_write_set(const struct btree *b,
struct bset_tree *t,
- struct bpos search,
- const struct bkey_packed *packed_search)
+ struct bpos *search)
{
unsigned l = 0, r = t->size;
while (l + 1 != r) {
unsigned m = (l + r) >> 1;
- if (bkey_cmp(rw_aux_tree(b, t)[m].k, search) < 0)
+ if (bpos_lt(rw_aux_tree(b, t)[m].k, *search))
l = m;
else
r = m;
@@ -1313,93 +1057,97 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b,
return rw_aux_to_bkey(b, t, l);
}
-noinline
-static int bset_search_tree_slowpath(const struct btree *b,
- struct bset_tree *t, struct bpos *search,
- const struct bkey_packed *packed_search,
- unsigned n)
+static inline void prefetch_four_cachelines(void *p)
{
- return bkey_cmp_p_or_unp(b, tree_to_bkey(b, t, n),
- packed_search, search) < 0;
+#ifdef CONFIG_X86_64
+ asm("prefetcht0 (-127 + 64 * 0)(%0);"
+ "prefetcht0 (-127 + 64 * 1)(%0);"
+ "prefetcht0 (-127 + 64 * 2)(%0);"
+ "prefetcht0 (-127 + 64 * 3)(%0);"
+ :
+ : "r" (p + 127));
+#else
+ prefetch(p + L1_CACHE_BYTES * 0);
+ prefetch(p + L1_CACHE_BYTES * 1);
+ prefetch(p + L1_CACHE_BYTES * 2);
+ prefetch(p + L1_CACHE_BYTES * 3);
+#endif
+}
+
+static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
+ const struct bkey_float *f)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits;
+
+ return f->exponent > key_bits_start;
+#else
+ unsigned key_bits_end = high_bit_offset + b->nr_key_bits;
+
+ return f->exponent + BKEY_MANTISSA_BITS < key_bits_end;
+#endif
}
__flatten
static struct bkey_packed *bset_search_tree(const struct btree *b,
- struct bset_tree *t,
- struct bpos search,
+ const struct bset_tree *t,
+ const struct bpos *search,
const struct bkey_packed *packed_search)
{
struct ro_aux_tree *base = ro_aux_tree_base(b, t);
- struct bkey_float *f = bkey_float_get(base, 1);
- void *p;
- unsigned inorder, n = 1;
+ struct bkey_float *f;
+ struct bkey_packed *k;
+ unsigned inorder, n = 1, l, r;
+ int cmp;
- while (1) {
- if (likely(n << 4 < t->size)) {
- p = bkey_float_get(base, n << 4);
- prefetch(p);
- } else if (n << 3 < t->size) {
- inorder = __eytzinger1_to_inorder(n, t->size, t->extra);
- p = bset_cacheline(b, t, inorder);
-#ifdef CONFIG_X86_64
- asm(".intel_syntax noprefix;"
- "prefetcht0 [%0 - 127 + 64 * 0];"
- "prefetcht0 [%0 - 127 + 64 * 1];"
- "prefetcht0 [%0 - 127 + 64 * 2];"
- "prefetcht0 [%0 - 127 + 64 * 3];"
- ".att_syntax prefix;"
- :
- : "r" (p + 127));
-#else
- prefetch(p + L1_CACHE_BYTES * 0);
- prefetch(p + L1_CACHE_BYTES * 1);
- prefetch(p + L1_CACHE_BYTES * 2);
- prefetch(p + L1_CACHE_BYTES * 3);
-#endif
- } else if (n >= t->size)
- break;
+ do {
+ if (likely(n << 4 < t->size))
+ prefetch(&base->f[n << 4]);
- f = bkey_float_get(base, n);
+ f = &base->f[n];
+ if (unlikely(f->exponent >= BFLOAT_FAILED))
+ goto slowpath;
- if (packed_search &&
- likely(f->exponent < BFLOAT_FAILED))
- n = n * 2 + (bfloat_mantissa(f, n) <
- bkey_mantissa(packed_search, f, n));
- else
- n = n * 2 + bset_search_tree_slowpath(b, t,
- &search, packed_search, n);
+ l = f->mantissa;
+ r = bkey_mantissa(packed_search, f);
+
+ if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f))
+ goto slowpath;
+
+ n = n * 2 + (l < r);
+ continue;
+slowpath:
+ k = tree_to_bkey(b, t, n);
+ cmp = bkey_cmp_p_or_unp(b, k, packed_search, search);
+ if (!cmp)
+ return k;
+
+ n = n * 2 + (cmp < 0);
} while (n < t->size);
- inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra);
+ inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra);
/*
* n would have been the node we recursed to - the low bit tells us if
* we recursed left or recursed right.
*/
- if (n & 1) {
- return cacheline_to_bkey(b, t, inorder, f->key_offset);
- } else {
- if (--inorder) {
- n = eytzinger1_prev(n >> 1, t->size);
- f = bkey_float_get(base, n);
- return cacheline_to_bkey(b, t, inorder, f->key_offset);
- } else
+ if (likely(!(n & 1))) {
+ --inorder;
+ if (unlikely(!inorder))
return btree_bkey_first(b, t);
+
+ f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)];
}
+
+ return cacheline_to_bkey(b, t, inorder, f->key_offset);
}
-/*
- * Returns the first key greater than or equal to @search
- */
-__always_inline __flatten
-static struct bkey_packed *bch2_bset_search(struct btree *b,
+static __always_inline __flatten
+struct bkey_packed *__bch2_bset_search(struct btree *b,
struct bset_tree *t,
- struct bpos search,
- struct bkey_packed *packed_search,
- const struct bkey_packed *lossy_packed_search,
- bool strictly_greater)
+ struct bpos *search,
+ const struct bkey_packed *lossy_packed_search)
{
- struct bkey_packed *m;
/*
* First, we search for a cacheline, then lastly we do a linear search
@@ -1418,43 +1166,41 @@ static struct bkey_packed *bch2_bset_search(struct btree *b,
switch (bset_aux_tree_type(t)) {
case BSET_NO_AUX_TREE:
- m = btree_bkey_first(b, t);
- break;
+ return btree_bkey_first(b, t);
case BSET_RW_AUX_TREE:
- m = bset_search_write_set(b, t, search, lossy_packed_search);
- break;
+ return bset_search_write_set(b, t, search);
case BSET_RO_AUX_TREE:
- /*
- * Each node in the auxiliary search tree covers a certain range
- * of bits, and keys above and below the set it covers might
- * differ outside those bits - so we have to special case the
- * start and end - handle that here:
- */
-
- if (bkey_cmp(search, t->max_key) > 0)
- return btree_bkey_last(b, t);
-
- m = bset_search_tree(b, t, search, lossy_packed_search);
- break;
+ return bset_search_tree(b, t, search, lossy_packed_search);
+ default:
+ BUG();
}
+}
+static __always_inline __flatten
+struct bkey_packed *bch2_bset_search_linear(struct btree *b,
+ struct bset_tree *t,
+ struct bpos *search,
+ struct bkey_packed *packed_search,
+ const struct bkey_packed *lossy_packed_search,
+ struct bkey_packed *m)
+{
if (lossy_packed_search)
while (m != btree_bkey_last(b, t) &&
- !btree_iter_pos_cmp_p_or_unp(b, search, lossy_packed_search,
- m, strictly_greater))
- m = bkey_next(m);
+ bkey_iter_cmp_p_or_unp(b, m,
+ lossy_packed_search, search) < 0)
+ m = bkey_p_next(m);
if (!packed_search)
while (m != btree_bkey_last(b, t) &&
- !btree_iter_pos_cmp_packed(b, &search, m, strictly_greater))
- m = bkey_next(m);
+ bkey_iter_pos_cmp(b, m, search) < 0)
+ m = bkey_p_next(m);
- if (btree_keys_expensive_checks(b)) {
+ if (bch2_expensive_debug_checks) {
struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
BUG_ON(prev &&
- btree_iter_pos_cmp_p_or_unp(b, search, packed_search,
- prev, strictly_greater));
+ bkey_iter_cmp_p_or_unp(b, prev,
+ packed_search, search) >= 0);
}
return m;
@@ -1462,6 +1208,25 @@ static struct bkey_packed *bch2_bset_search(struct btree *b,
/* Btree node iterator */
+static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
+ struct btree *b,
+ const struct bkey_packed *k,
+ const struct bkey_packed *end)
+{
+ if (k != end) {
+ struct btree_node_iter_set *pos;
+
+ btree_node_iter_for_each(iter, pos)
+ ;
+
+ BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data));
+ *pos = (struct btree_node_iter_set) {
+ __btree_node_key_to_offset(b, k),
+ __btree_node_key_to_offset(b, end)
+ };
+ }
+}
+
void bch2_btree_node_iter_push(struct btree_node_iter *iter,
struct btree *b,
const struct bkey_packed *k,
@@ -1471,28 +1236,29 @@ void bch2_btree_node_iter_push(struct btree_node_iter *iter,
bch2_btree_node_iter_sort(iter, b);
}
-noinline __flatten __attribute__((cold))
+noinline __flatten __cold
static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
- struct btree *b, struct bpos search,
- bool strictly_greater, bool is_extents)
+ struct btree *b, struct bpos *search)
{
- struct bset_tree *t;
+ struct bkey_packed *k;
trace_bkey_pack_pos_fail(search);
- for_each_bset(b, t)
- __bch2_btree_node_iter_push(iter, b,
- bch2_bset_search(b, t, search, NULL, NULL,
- strictly_greater),
- btree_bkey_last(b, t));
+ bch2_btree_node_iter_init_from_start(iter, b);
- bch2_btree_node_iter_sort(iter, b);
+ while ((k = bch2_btree_node_iter_peek(iter, b)) &&
+ bkey_iter_pos_cmp(b, k, search) < 0)
+ bch2_btree_node_iter_advance(iter, b);
}
/**
- * bch_btree_node_iter_init - initialize a btree node iterator, starting from a
+ * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
* given position
*
+ * @iter: iterator to initialize
+ * @b: btree node to search
+ * @search: search key
+ *
* Main entry point to the lookup code for individual btree nodes:
*
* NOTE:
@@ -1520,7 +1286,7 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
* to the search key is going to have 0 sectors after the search key.
*
* But this does mean that we can't just search for
- * bkey_successor(start_of_range) to get the first extent that overlaps with
+ * bpos_successor(start_of_range) to get the first extent that overlaps with
* the range we want - if we're unlucky and there's an extent that ends
* exactly where we searched, then there could be a deleted key at the same
* position and we'd get that when we search instead of the preceding extent
@@ -1529,19 +1295,22 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
* So we've got to search for start_of_range, then after the lookup iterate
* past any extents that compare equal to the position we searched for.
*/
+__flatten
void bch2_btree_node_iter_init(struct btree_node_iter *iter,
- struct btree *b, struct bpos search,
- bool strictly_greater, bool is_extents)
+ struct btree *b, struct bpos *search)
{
- struct bset_tree *t;
struct bkey_packed p, *packed_search = NULL;
+ struct btree_node_iter_set *pos = iter->data;
+ struct bkey_packed *k[MAX_BSETS];
+ unsigned i;
- EBUG_ON(bkey_cmp(search, b->data->min_key) < 0);
+ EBUG_ON(bpos_lt(*search, b->data->min_key));
+ EBUG_ON(bpos_gt(*search, b->data->max_key));
bset_aux_tree_verify(b);
- __bch2_btree_node_iter_init(iter, is_extents);
+ memset(iter, 0, sizeof(*iter));
- switch (bch2_bkey_pack_pos_lossy(&p, search, b)) {
+ switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) {
case BKEY_PACK_POS_EXACT:
packed_search = &p;
break;
@@ -1549,28 +1318,35 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter,
packed_search = NULL;
break;
case BKEY_PACK_POS_FAIL:
- btree_node_iter_init_pack_failed(iter, b, search,
- strictly_greater, is_extents);
+ btree_node_iter_init_pack_failed(iter, b, search);
return;
}
- for_each_bset(b, t)
- __bch2_btree_node_iter_push(iter, b,
- bch2_bset_search(b, t, search,
- packed_search, &p,
- strictly_greater),
- btree_bkey_last(b, t));
+ for (i = 0; i < b->nsets; i++) {
+ k[i] = __bch2_bset_search(b, b->set + i, search, &p);
+ prefetch_four_cachelines(k[i]);
+ }
+
+ for (i = 0; i < b->nsets; i++) {
+ struct bset_tree *t = b->set + i;
+ struct bkey_packed *end = btree_bkey_last(b, t);
+
+ k[i] = bch2_bset_search_linear(b, t, search,
+ packed_search, &p, k[i]);
+ if (k[i] != end)
+ *pos++ = (struct btree_node_iter_set) {
+ __btree_node_key_to_offset(b, k[i]),
+ __btree_node_key_to_offset(b, end)
+ };
+ }
bch2_btree_node_iter_sort(iter, b);
}
void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
- struct btree *b,
- bool is_extents)
+ struct btree *b)
{
- struct bset_tree *t;
-
- __bch2_btree_node_iter_init(iter, is_extents);
+ memset(iter, 0, sizeof(*iter));
for_each_bset(b, t)
__bch2_btree_node_iter_push(iter, b,
@@ -1598,7 +1374,7 @@ static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter,
{
bool ret;
- if ((ret = (btree_node_iter_cmp(iter, b,
+ if ((ret = (btree_node_iter_cmp(b,
iter->data[first],
iter->data[first + 1]) > 0)))
swap(iter->data[first], iter->data[first + 1]);
@@ -1637,7 +1413,10 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
EBUG_ON(iter->data->k > iter->data->end);
if (unlikely(__btree_node_iter_set_end(iter, 0))) {
- bch2_btree_node_iter_set_drop(iter, iter->data);
+ /* avoid an expensive memmove call: */
+ iter->data[0] = iter->data[1];
+ iter->data[1] = iter->data[2];
+ iter->data[2] = (struct btree_node_iter_set) { 0, 0 };
return;
}
@@ -1653,64 +1432,42 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
btree_node_iter_sort_two(iter, b, 1);
}
-/**
- * bch_btree_node_iter_advance - advance @iter by one key
- *
- * Doesn't do debugchecks - for cases where (insert_fixup_extent()) a bset might
- * momentarily have out of order extents.
- */
void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
struct btree *b)
{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct bkey_packed *k = bch2_btree_node_iter_peek_all(iter, b);
+ if (bch2_expensive_debug_checks) {
+ bch2_btree_node_iter_verify(iter, b);
+ bch2_btree_node_iter_next_check(iter, b);
+ }
__bch2_btree_node_iter_advance(iter, b);
- bch2_btree_node_iter_next_check(iter, b, k);
-#else
- __bch2_btree_node_iter_advance(iter, b);
-#endif
-}
-
-static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
-{
- unsigned n = ARRAY_SIZE(iter->data);
-
- while (n && __btree_node_iter_set_end(iter, n - 1))
- --n;
-
- return n;
}
/*
* Expensive:
*/
-struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter,
- struct btree *b,
- unsigned min_key_type)
+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
+ struct btree *b)
{
struct bkey_packed *k, *prev = NULL;
- struct bkey_packed *orig_pos = bch2_btree_node_iter_peek_all(iter, b);
struct btree_node_iter_set *set;
- struct bset_tree *t;
- unsigned end;
+ unsigned end = 0;
- bch2_btree_node_iter_verify(iter, b);
+ if (bch2_expensive_debug_checks)
+ bch2_btree_node_iter_verify(iter, b);
for_each_bset(b, t) {
- k = bch2_bkey_prev_filter(b, t,
- bch2_btree_node_iter_bset_pos(iter, b, t),
- min_key_type);
+ k = bch2_bkey_prev_all(b, t,
+ bch2_btree_node_iter_bset_pos(iter, b, t));
if (k &&
- (!prev || __btree_node_iter_cmp(iter->is_extents, b,
- k, prev) > 0)) {
+ (!prev || bkey_iter_cmp(b, k, prev) > 0)) {
prev = k;
end = t->end_offset;
}
}
if (!prev)
- goto out;
+ return NULL;
/*
* We're manually memmoving instead of just calling sort() to ensure the
@@ -1731,18 +1488,20 @@ found:
iter->data[0].k = __btree_node_key_to_offset(b, prev);
iter->data[0].end = end;
-out:
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
- struct btree_node_iter iter2 = *iter;
- if (prev)
- bch2_btree_node_iter_advance(&iter2, b);
+ if (bch2_expensive_debug_checks)
+ bch2_btree_node_iter_verify(iter, b);
+ return prev;
+}
- while ((k = bch2_btree_node_iter_peek_all(&iter2, b)) != orig_pos) {
- BUG_ON(k->type >= min_key_type);
- bch2_btree_node_iter_advance(&iter2, b);
- }
- }
+struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ struct bkey_packed *prev;
+
+ do {
+ prev = bch2_btree_node_iter_prev_all(iter, b);
+ } while (prev && bkey_deleted(prev));
return prev;
}
@@ -1758,11 +1517,9 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
/* Mergesort */
-void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats)
+void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats)
{
- struct bset_tree *t;
-
- for_each_bset(b, t) {
+ for_each_bset_c(b, t) {
enum bset_aux_tree_type type = bset_aux_tree_type(t);
size_t j;
@@ -1774,84 +1531,40 @@ void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats)
stats->floats += t->size - 1;
for (j = 1; j < t->size; j++)
- switch (bkey_float(b, t, j)->exponent) {
- case BFLOAT_FAILED_UNPACKED:
- stats->failed_unpacked++;
- break;
- case BFLOAT_FAILED_PREV:
- stats->failed_prev++;
- break;
- case BFLOAT_FAILED_OVERFLOW:
- stats->failed_overflow++;
- break;
- }
+ stats->failed +=
+ bkey_float(b, t, j)->exponent ==
+ BFLOAT_FAILED;
}
}
}
-int bch2_bkey_print_bfloat(struct btree *b, struct bkey_packed *k,
- char *buf, size_t size)
+void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
+ struct bkey_packed *k)
{
struct bset_tree *t = bch2_bkey_to_bset(b, k);
- struct bkey_packed *l, *r, *p;
- struct bkey uk, up;
- char buf1[200], buf2[200];
- unsigned j;
-
- if (!size)
- return 0;
+ struct bkey uk;
+ unsigned j, inorder;
if (!bset_has_ro_aux_tree(t))
- goto out;
-
- j = __inorder_to_eytzinger1(bkey_to_cacheline(b, t, k), t->size, t->extra);
- if (j &&
- j < t->size &&
- k == tree_to_bkey(b, t, j))
- switch (bkey_float(b, t, j)->exponent) {
- case BFLOAT_FAILED_UNPACKED:
- uk = bkey_unpack_key(b, k);
- return scnprintf(buf, size,
- " failed unpacked at depth %u\n"
- "\t%llu:%llu\n",
- ilog2(j),
- uk.p.inode, uk.p.offset);
- case BFLOAT_FAILED_PREV:
- p = tree_to_prev_bkey(b, t, j);
- l = is_power_of_2(j)
- ? btree_bkey_first(b, t)
- : tree_to_prev_bkey(b, t, j >> ffs(j));
- r = is_power_of_2(j + 1)
- ? bch2_bkey_prev_all(b, t, btree_bkey_last(b, t))
- : tree_to_bkey(b, t, j >> (ffz(j) + 1));
-
- up = bkey_unpack_key(b, p);
- uk = bkey_unpack_key(b, k);
- bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits);
- bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits);
-
- return scnprintf(buf, size,
- " failed prev at depth %u\n"
- "\tkey starts at bit %u but first differing bit at %u\n"
- "\t%llu:%llu\n"
- "\t%llu:%llu\n"
- "\t%s\n"
- "\t%s\n",
- ilog2(j),
- bch2_bkey_greatest_differing_bit(b, l, r),
- bch2_bkey_greatest_differing_bit(b, p, k),
- uk.p.inode, uk.p.offset,
- up.p.inode, up.p.offset,
- buf1, buf2);
- case BFLOAT_FAILED_OVERFLOW:
- uk = bkey_unpack_key(b, k);
- return scnprintf(buf, size,
- " failed overflow at depth %u\n"
- "\t%llu:%llu\n",
- ilog2(j),
- uk.p.inode, uk.p.offset);
- }
-out:
- *buf = '\0';
- return 0;
+ return;
+
+ inorder = bkey_to_cacheline(b, t, k);
+ if (!inorder || inorder >= t->size)
+ return;
+
+ j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra);
+ if (k != tree_to_bkey(b, t, j))
+ return;
+
+ switch (bkey_float(b, t, j)->exponent) {
+ case BFLOAT_FAILED:
+ uk = bkey_unpack_key(b, k);
+ prt_printf(out,
+ " failed unpacked at depth %u\n"
+ "\t",
+ ilog2(j));
+ bch2_bpos_to_text(out, uk.p);
+ prt_printf(out, "\n");
+ break;
+ }
}
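
The rewritten bset_search_tree() above walks the read-only auxiliary tree in its 1-indexed eytzinger layout: the children of node n sit at 2n and 2n+1, which is why the descent is simply n = n * 2 + (l < r) and why the code prefetches &base->f[n << 4], four levels ahead of the current node. As a minimal standalone sketch of that layout - illustrative only, over a plain int array, with a hypothetical helper name rather than anything from this patch:

#include <strings.h>	/* ffs() */

/*
 * Hypothetical illustration, not part of the patch: lower-bound search
 * over a 1-indexed eytzinger-ordered array a[1..size], mirroring the
 * n = n * 2 + (l < r) descent in bset_search_tree().
 */
static unsigned eytzinger1_lower_bound(const int *a, unsigned size, int key)
{
	unsigned n = 1;

	while (n <= size)
		n = n * 2 + (a[n] < key);	/* go right if a[n] < key */

	/*
	 * Right turns appended 1 bits, left turns 0 bits: stripping the
	 * trailing 1s plus one 0 recovers the last node where we turned
	 * left, i.e. the first element >= key (0 if there is none).
	 */
	return n >> ffs(~n);
}
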
diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h
index 296c05b4..6953d55b 100644
--- a/libbcachefs/bset.h
+++ b/libbcachefs/bset.h
@@ -1,10 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BSET_H
#define _BCACHEFS_BSET_H
#include <linux/kernel.h>
#include <linux/types.h>
-#include "bcachefs_format.h"
+#include "bcachefs.h"
#include "bkey.h"
#include "bkey_methods.h"
#include "btree_types.h"
@@ -146,17 +147,6 @@
* first key in that range of bytes again.
*/
-extern bool bch2_expensive_debug_checks;
-
-static inline bool btree_keys_expensive_checks(const struct btree *b)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- return bch2_expensive_debug_checks || *b->expensive_debug_checks;
-#else
- return false;
-#endif
-}
-
enum bset_aux_tree_type {
BSET_NO_AUX_TREE,
BSET_RO_AUX_TREE,
@@ -183,110 +173,50 @@ static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree
}
}
-typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
-
-static inline void
-__bkey_unpack_key_format_checked(const struct btree *b,
- struct bkey *dst,
- const struct bkey_packed *src)
-{
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
- {
- compiled_unpack_fn unpack_fn = b->aux_data;
- unpack_fn(dst, src);
-
- if (btree_keys_expensive_checks(b)) {
- struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
-
- /*
- * hack around a harmless race when compacting whiteouts
- * for a write:
- */
- dst2.needs_whiteout = dst->needs_whiteout;
-
- BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
- }
- }
-#else
- *dst = __bch2_bkey_unpack_key(&b->format, src);
-#endif
-}
-
-static inline struct bkey
-bkey_unpack_key_format_checked(const struct btree *b,
- const struct bkey_packed *src)
-{
- struct bkey dst;
-
- __bkey_unpack_key_format_checked(b, &dst, src);
- return dst;
-}
-
-static inline void __bkey_unpack_key(const struct btree *b,
- struct bkey *dst,
- const struct bkey_packed *src)
-{
- if (likely(bkey_packed(src)))
- __bkey_unpack_key_format_checked(b, dst, src);
- else
- *dst = *packed_to_bkey_c(src);
-}
-
-/**
- * bkey_unpack_key -- unpack just the key, not the value
+/*
+ * BSET_CACHELINE was originally intended to match the hardware cacheline size -
+ * it used to be 64, but I realized the lookup code would touch slightly less
+ * memory if it was 128.
+ *
+ * It defines the number of bytes (in struct bset) per struct bkey_float in
+ * the auxiliary search tree - when we're done searching the bset_float tree we
+ * have this many bytes left that we do a linear search over.
+ *
+ * Since (after level 5) every level of the bset_tree is on a new cacheline,
+ * we're touching one fewer cacheline in the bset tree in exchange for one more
+ * cacheline in the linear search - but the linear search might stop before it
+ * gets to the second cacheline.
*/
-static inline struct bkey bkey_unpack_key(const struct btree *b,
- const struct bkey_packed *src)
-{
- return likely(bkey_packed(src))
- ? bkey_unpack_key_format_checked(b, src)
- : *packed_to_bkey_c(src);
-}
-static inline struct bpos
-bkey_unpack_pos_format_checked(const struct btree *b,
- const struct bkey_packed *src)
+#define BSET_CACHELINE 256
+
+static inline size_t btree_keys_cachelines(const struct btree *b)
{
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
- return bkey_unpack_key_format_checked(b, src).p;
-#else
- return __bkey_unpack_pos(&b->format, src);
-#endif
+ return (1U << b->byte_order) / BSET_CACHELINE;
}
-static inline struct bpos bkey_unpack_pos(const struct btree *b,
- const struct bkey_packed *src)
+static inline size_t btree_aux_data_bytes(const struct btree *b)
{
- return likely(bkey_packed(src))
- ? bkey_unpack_pos_format_checked(b, src)
- : packed_to_bkey_c(src)->p;
+ return btree_keys_cachelines(b) * 8;
}
-/* Disassembled bkeys */
-
-static inline struct bkey_s_c bkey_disassemble(struct btree *b,
- const struct bkey_packed *k,
- struct bkey *u)
+static inline size_t btree_aux_data_u64s(const struct btree *b)
{
- __bkey_unpack_key(b, u, k);
-
- return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
+ return btree_aux_data_bytes(b) / sizeof(u64);
}
-/* non const version: */
-static inline struct bkey_s __bkey_disassemble(struct btree *b,
- struct bkey_packed *k,
- struct bkey *u)
-{
- __bkey_unpack_key(b, u, k);
+#define for_each_bset(_b, _t) \
+ for (struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
- return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
-}
+#define for_each_bset_c(_b, _t) \
+ for (const struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
-#define for_each_bset(_b, _t) \
- for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
+#define bset_tree_for_each_key(_b, _t, _k) \
+ for (_k = btree_bkey_first(_b, _t); \
+ _k != btree_bkey_last(_b, _t); \
+ _k = bkey_p_next(_k))
-static inline bool bset_has_ro_aux_tree(struct bset_tree *t)
+static inline bool bset_has_ro_aux_tree(const struct bset_tree *t)
{
return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;
}
@@ -334,19 +264,14 @@ static inline struct bset *bset_next_set(struct btree *b,
return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);
}
-void bch2_btree_keys_free(struct btree *);
-int bch2_btree_keys_alloc(struct btree *, unsigned, gfp_t);
-void bch2_btree_keys_init(struct btree *, bool *);
+void bch2_btree_keys_init(struct btree *);
void bch2_bset_init_first(struct btree *, struct bset *);
-void bch2_bset_init_next(struct bch_fs *, struct btree *,
- struct btree_node_entry *);
+void bch2_bset_init_next(struct btree *, struct btree_node_entry *);
void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
-void bch2_bset_fix_invalidated_key(struct btree *, struct bset_tree *,
- struct bkey_packed *);
-void bch2_bset_insert(struct btree *, struct btree_node_iter *,
- struct bkey_packed *, struct bkey_i *, unsigned);
+void bch2_bset_insert(struct btree *, struct bkey_packed *, struct bkey_i *,
+ unsigned);
void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
/* Bkey utility code */
@@ -355,12 +280,12 @@ void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
static inline int bkey_cmp_p_or_unp(const struct btree *b,
const struct bkey_packed *l,
const struct bkey_packed *r_packed,
- struct bpos *r)
+ const struct bpos *r)
{
EBUG_ON(r_packed && !bkey_packed(r_packed));
if (unlikely(!bkey_packed(l)))
- return bkey_cmp(packed_to_bkey_c(l)->p, *r);
+ return bpos_cmp(packed_to_bkey_c(l)->p, *r);
if (likely(r_packed))
return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b);
@@ -368,28 +293,18 @@ static inline int bkey_cmp_p_or_unp(const struct btree *b,
return __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
}
-/* Returns true if @k is after iterator position @pos */
-static inline bool btree_iter_pos_cmp_packed(const struct btree *b,
- struct bpos *pos,
- const struct bkey_packed *k,
- bool strictly_greater)
+static inline struct bset_tree *
+bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k)
{
- int cmp = bkey_cmp_left_packed(b, k, pos);
+ unsigned offset = __btree_node_key_to_offset(b, k);
- return cmp > 0 ||
- (cmp == 0 && !strictly_greater && !bkey_deleted(k));
-}
-
-static inline bool btree_iter_pos_cmp_p_or_unp(const struct btree *b,
- struct bpos pos,
- const struct bkey_packed *pos_packed,
- const struct bkey_packed *k,
- bool strictly_greater)
-{
- int cmp = bkey_cmp_p_or_unp(b, k, pos_packed, &pos);
+ for_each_bset(b, t)
+ if (offset <= t->end_offset) {
+ EBUG_ON(offset < btree_bkey_first_offset(t));
+ return t;
+ }
- return cmp > 0 ||
- (cmp == 0 && !strictly_greater && !bkey_deleted(k));
+ BUG();
}
struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *);
@@ -406,43 +321,18 @@ bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
static inline struct bkey_packed *
bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
{
- return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_DISCARD + 1);
-}
-
-enum bch_extent_overlap {
- BCH_EXTENT_OVERLAP_ALL = 0,
- BCH_EXTENT_OVERLAP_BACK = 1,
- BCH_EXTENT_OVERLAP_FRONT = 2,
- BCH_EXTENT_OVERLAP_MIDDLE = 3,
-};
-
-/* Returns how k overlaps with m */
-static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
- const struct bkey *m)
-{
- int cmp1 = bkey_cmp(k->p, m->p) < 0;
- int cmp2 = bkey_cmp(bkey_start_pos(k),
- bkey_start_pos(m)) > 0;
-
- return (cmp1 << 1) + cmp2;
+ return bch2_bkey_prev_filter(b, t, k, 1);
}
/* Btree key iteration */
-static inline void __bch2_btree_node_iter_init(struct btree_node_iter *iter,
- bool is_extents)
-{
- iter->is_extents = is_extents;
- memset(iter->data, 0, sizeof(iter->data));
-}
-
void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *,
const struct bkey_packed *,
const struct bkey_packed *);
void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *,
- struct bpos, bool, bool);
+ struct bpos *);
void bch2_btree_node_iter_init_from_start(struct btree_node_iter *,
- struct btree *, bool);
+ struct btree *);
struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *,
struct btree *,
struct bset_tree *);
@@ -469,53 +359,46 @@ static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter)
return __btree_node_iter_set_end(iter, 0);
}
-static inline int __btree_node_iter_cmp(bool is_extents,
- struct btree *b,
- struct bkey_packed *l,
- struct bkey_packed *r)
+/*
+ * When keys compare equal, deleted keys compare first:
+ *
+ * XXX: only need to compare pointers for keys that are both within a
+ * btree_node_iterator - we need to break ties for prev() to work correctly
+ */
+static inline int bkey_iter_cmp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
{
- /*
- * For non extents, when keys compare equal the deleted keys have to
- * come first - so that bch2_btree_node_iter_next_check() can detect
- * duplicate nondeleted keys (and possibly other reasons?)
- *
- * For extents, bkey_deleted() is used as a proxy for k->size == 0, so
- * deleted keys have to sort last.
- */
- return bkey_cmp_packed(b, l, r)
- ?: (is_extents
- ? (int) bkey_deleted(l) - (int) bkey_deleted(r)
- : (int) bkey_deleted(r) - (int) bkey_deleted(l))
- ?: (l > r) - (l < r);
+ return bch2_bkey_cmp_packed(b, l, r)
+ ?: (int) bkey_deleted(r) - (int) bkey_deleted(l)
+ ?: cmp_int(l, r);
}
-static inline int btree_node_iter_cmp(struct btree_node_iter *iter,
- struct btree *b,
+static inline int btree_node_iter_cmp(const struct btree *b,
struct btree_node_iter_set l,
struct btree_node_iter_set r)
{
- return __btree_node_iter_cmp(iter->is_extents, b,
+ return bkey_iter_cmp(b,
__btree_node_offset_to_key(b, l.k),
__btree_node_offset_to_key(b, r.k));
}
-static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
- struct btree *b,
- const struct bkey_packed *k,
- const struct bkey_packed *end)
+/* These assume r (the search key) is not a deleted key: */
+static inline int bkey_iter_pos_cmp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bpos *r)
{
- if (k != end) {
- struct btree_node_iter_set *pos;
-
- btree_node_iter_for_each(iter, pos)
- ;
+ return bkey_cmp_left_packed(b, l, r)
+ ?: -((int) bkey_deleted(l));
+}
- BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data));
- *pos = (struct btree_node_iter_set) {
- __btree_node_key_to_offset(b, k),
- __btree_node_key_to_offset(b, end)
- };
- }
+static inline int bkey_iter_cmp_p_or_unp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r_packed,
+ const struct bpos *r)
+{
+ return bkey_cmp_p_or_unp(b, l, r_packed, r)
+ ?: -((int) bkey_deleted(l));
}
static inline struct bkey_packed *
@@ -526,33 +409,23 @@ __bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
}
static inline struct bkey_packed *
-bch2_btree_node_iter_peek_filter(struct btree_node_iter *iter,
- struct btree *b,
- unsigned min_key_type)
-{
- while (!bch2_btree_node_iter_end(iter)) {
- struct bkey_packed *k = __bch2_btree_node_iter_peek_all(iter, b);
-
- if (k->type >= min_key_type)
- return k;
-
- bch2_btree_node_iter_advance(iter, b);
- }
-
- return NULL;
-}
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
- struct btree *b)
+bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b)
{
- return bch2_btree_node_iter_peek_filter(iter, b, 0);
+ return !bch2_btree_node_iter_end(iter)
+ ? __btree_node_offset_to_key(b, iter->data->k)
+ : NULL;
}
static inline struct bkey_packed *
bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b)
{
- return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_DISCARD + 1);
+ struct bkey_packed *k;
+
+ while ((k = bch2_btree_node_iter_peek_all(iter, b)) &&
+ bkey_deleted(k))
+ bch2_btree_node_iter_advance(iter, b);
+
+ return k;
}
static inline struct bkey_packed *
@@ -566,41 +439,29 @@ bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b)
return ret;
}
-struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *,
- struct btree *, unsigned);
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct btree *b)
-{
- return bch2_btree_node_iter_prev_filter(iter, b, 0);
-}
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b)
-{
- return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_DISCARD + 1);
-}
-
-/*
- * Iterates over all _live_ keys - skipping deleted (and potentially
- * overlapping) keys
- */
-#define for_each_btree_node_key(b, k, iter, _is_extents) \
- for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\
- ((k) = bch2_btree_node_iter_peek(iter, b)); \
- bch2_btree_node_iter_advance(iter, b))
+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *,
+ struct btree *);
+struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *,
+ struct btree *);
struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
struct btree *,
struct bkey *);
-#define for_each_btree_node_key_unpack(b, k, iter, _is_extents, unpacked)\
- for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\
+#define for_each_btree_node_key(b, k, iter) \
+ for (bch2_btree_node_iter_init_from_start((iter), (b)); \
+ (k = bch2_btree_node_iter_peek((iter), (b))); \
+ bch2_btree_node_iter_advance(iter, b))
+
+#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \
+ for (bch2_btree_node_iter_init_from_start((iter), (b)); \
(k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
bch2_btree_node_iter_advance(iter, b))
/* Accounting: */
+struct btree_nr_keys bch2_btree_node_count_keys(struct btree *);
+
static inline void btree_keys_account_key(struct btree_nr_keys *n,
unsigned bset,
struct bkey_packed *k,
@@ -615,52 +476,68 @@ static inline void btree_keys_account_key(struct btree_nr_keys *n,
n->unpacked_keys += sign;
}
+static inline void btree_keys_account_val_delta(struct btree *b,
+ struct bkey_packed *k,
+ int delta)
+{
+ struct bset_tree *t = bch2_bkey_to_bset(b, k);
+
+ b->nr.live_u64s += delta;
+ b->nr.bset_u64s[t - b->set] += delta;
+}
+
#define btree_keys_account_key_add(_nr, _bset_idx, _k) \
btree_keys_account_key(_nr, _bset_idx, _k, 1)
#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \
btree_keys_account_key(_nr, _bset_idx, _k, -1)
+#define btree_account_key_add(_b, _k) \
+ btree_keys_account_key(&(_b)->nr, \
+ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1)
+#define btree_account_key_drop(_b, _k) \
+ btree_keys_account_key(&(_b)->nr, \
+ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1)
+
struct bset_stats {
struct {
size_t nr, bytes;
} sets[BSET_TREE_NR_TYPES];
size_t floats;
- size_t failed_unpacked;
- size_t failed_prev;
- size_t failed_overflow;
+ size_t failed;
};
-void bch2_btree_keys_stats(struct btree *, struct bset_stats *);
-int bch2_bkey_print_bfloat(struct btree *, struct bkey_packed *,
- char *, size_t);
+void bch2_btree_keys_stats(const struct btree *, struct bset_stats *);
+void bch2_bfloat_to_text(struct printbuf *, struct btree *,
+ struct bkey_packed *);
/* Debug stuff */
-void bch2_dump_bset(struct btree *, struct bset *, unsigned);
-void bch2_dump_btree_node(struct btree *);
+void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned);
+void bch2_dump_btree_node(struct bch_fs *, struct btree *);
void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
#ifdef CONFIG_BCACHEFS_DEBUG
void __bch2_verify_btree_nr_keys(struct btree *);
void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
-void bch2_verify_key_order(struct btree *, struct btree_node_iter *,
- struct bkey_packed *);
+void bch2_verify_insert_pos(struct btree *, struct bkey_packed *,
+ struct bkey_packed *, unsigned);
#else
static inline void __bch2_verify_btree_nr_keys(struct btree *b) {}
static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
struct btree *b) {}
-static inline void bch2_verify_key_order(struct btree *b,
- struct btree_node_iter *iter,
- struct bkey_packed *where) {}
+static inline void bch2_verify_insert_pos(struct btree *b,
+ struct bkey_packed *where,
+ struct bkey_packed *insert,
+ unsigned clobber_u64s) {}
#endif
static inline void bch2_verify_btree_nr_keys(struct btree *b)
{
- if (btree_keys_expensive_checks(b))
+ if (bch2_debug_check_btree_accounting)
__bch2_verify_btree_nr_keys(b);
}
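
The new bkey_iter_cmp() above chains comparisons with GCC's a ?: b extension: position first, then "deleted keys sort first", then the key pointers themselves, so equal keys still get a total order and prev() can break ties deterministically. A throwaway sketch of the same pattern - the demo_* names and the local cmp_int() macro are stand-ins, not part of the patch:

/* stand-in for the kernel's cmp_int(): -1, 0 or 1 */
#define cmp_int(l, r)	(((l) > (r)) - ((l) < (r)))

struct demo_key {
	unsigned	pos;
	unsigned	deleted:1;
};

static int demo_iter_cmp(const struct demo_key *l, const struct demo_key *r)
{
	/*
	 * Compare positions; on a tie, deleted keys come first; as a final
	 * tie-break compare the pointers so equal keys still have a stable,
	 * total order within one iterator.
	 */
	return cmp_int(l->pos, r->pos)
		?: (int) r->deleted - (int) l->deleted
		?: cmp_int(l, r);
}
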
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index db3712a8..1117be90 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -1,60 +1,130 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bbpos.h"
+#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
-#include "extents.h"
+#include "errcode.h"
+#include "error.h"
+#include "journal.h"
+#include "trace.h"
#include <linux/prefetch.h>
-#include <trace/events/bcachefs.h>
-
-#define DEF_BTREE_ID(kwd, val, name) name,
-
-const char * const bch2_btree_ids[] = {
- DEFINE_BCH_BTREE_IDS()
+#include <linux/sched/mm.h>
+#include <linux/swap.h>
+
+#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
+do { \
+ if (shrinker_counter) \
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \
+} while (0)
+
+const char * const bch2_btree_node_flags[] = {
+#define x(f) #f,
+ BTREE_FLAGS()
+#undef x
NULL
};
-#undef DEF_BTREE_ID
-
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
- unsigned i, reserve = 16;
+ unsigned reserve = 16;
- if (!c->btree_roots[0].b)
+ if (!c->btree_roots_known[0].b)
reserve += 8;
- for (i = 0; i < BTREE_ID_NR; i++)
- if (c->btree_roots[i].b)
- reserve += min_t(unsigned, 1,
- c->btree_roots[i].b->level) * 8;
+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
+ struct btree_root *r = bch2_btree_id_root(c, i);
- c->btree_cache.reserve = reserve;
+ if (r->b)
+ reserve += min_t(unsigned, 1, r->b->c.level) * 8;
+ }
+
+ c->btree_cache.nr_reserve = reserve;
}
-static inline unsigned btree_cache_can_free(struct btree_cache *bc)
+static inline size_t btree_cache_can_free(struct btree_cache_list *list)
{
- return max_t(int, 0, bc->used - bc->reserve);
+ struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
+
+ size_t can_free = list->nr;
+ if (!list->idx)
+ can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
+ return can_free;
}
-static void __btree_node_data_free(struct bch_fs *c, struct btree *b)
+static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
- EBUG_ON(btree_node_write_in_flight(b));
+ BUG_ON(!list_empty(&b->list));
- kvpfree(b->data, btree_bytes(c));
- b->data = NULL;
- bch2_btree_keys_free(b);
+ if (b->c.lock.readers)
+ list_add(&b->list, &bc->freed_pcpu);
+ else
+ list_add(&b->list, &bc->freed_nonpcpu);
}
-static void btree_node_data_free(struct bch_fs *c, struct btree *b)
+static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b)
+{
+ BUG_ON(!list_empty(&b->list));
+ BUG_ON(!b->data);
+
+ bc->nr_freeable++;
+ list_add(&b->list, &bc->freeable);
+}
+
+void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
{
struct btree_cache *bc = &c->btree_cache;
- __btree_node_data_free(c, b);
- bc->used--;
- list_move(&b->list, &bc->freed);
+ mutex_lock(&bc->lock);
+ __bch2_btree_node_to_freelist(bc, b);
+ mutex_unlock(&bc->lock);
+
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+}
+
+static void __btree_node_data_free(struct btree_cache *bc, struct btree *b)
+{
+ BUG_ON(!list_empty(&b->list));
+ BUG_ON(btree_node_hashed(b));
+
+ /*
+ * This should really be done in slub/vmalloc, but we're using the
+ * kmalloc_large() path, so we're working around a slub bug by doing
+ * this here:
+ */
+ if (b->data)
+ mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
+ if (b->aux_data)
+ mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);
+
+ EBUG_ON(btree_node_write_in_flight(b));
+
+ clear_btree_node_just_written(b);
+
+ kvfree(b->data);
+ b->data = NULL;
+#ifdef __KERNEL__
+ kvfree(b->aux_data);
+#else
+ munmap(b->aux_data, btree_aux_data_bytes(b));
+#endif
+ b->aux_data = NULL;
+
+ btree_node_to_freedlist(bc, b);
+}
+
+static void btree_node_data_free(struct btree_cache *bc, struct btree *b)
+{
+ BUG_ON(list_empty(&b->list));
+ list_del_init(&b->list);
+ --bc->nr_freeable;
+ __btree_node_data_free(bc, b);
}
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
@@ -63,181 +133,352 @@ static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
const struct btree *b = obj;
const u64 *v = arg->key;
- return PTR_HASH(&b->key) == *v ? 0 : 1;
+ return b->hash_val == *v ? 0 : 1;
}
static const struct rhashtable_params bch_btree_cache_params = {
- .head_offset = offsetof(struct btree, hash),
- .key_offset = offsetof(struct btree, key.v),
- .key_len = sizeof(struct bch_extent_ptr),
- .obj_cmpfn = bch2_btree_cache_cmp_fn,
+ .head_offset = offsetof(struct btree, hash),
+ .key_offset = offsetof(struct btree, hash_val),
+ .key_len = sizeof(u64),
+ .obj_cmpfn = bch2_btree_cache_cmp_fn,
+ .automatic_shrinking = true,
};
-static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
+static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
- struct btree_cache *bc = &c->btree_cache;
+ BUG_ON(b->data || b->aux_data);
- b->data = kvpmalloc(btree_bytes(c), gfp);
- if (!b->data)
- goto err;
+ gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
- if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp))
- goto err;
+ b->data = kvmalloc(btree_buf_bytes(b), gfp);
+ if (!b->data)
+ return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
+#ifdef __KERNEL__
+ b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
+#else
+ b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
+ PROT_READ|PROT_WRITE|PROT_EXEC,
+ MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
+ if (b->aux_data == MAP_FAILED)
+ b->aux_data = NULL;
+#endif
+ if (!b->aux_data) {
+ kvfree(b->data);
+ b->data = NULL;
+ return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
+ }
- bc->used++;
- list_move(&b->list, &bc->freeable);
- return;
-err:
- kvpfree(b->data, btree_bytes(c));
- b->data = NULL;
- list_move(&b->list, &bc->freed);
+ return 0;
}
-static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
+static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
- struct btree *b = kzalloc(sizeof(struct btree), gfp);
+ struct btree *b;
+
+ b = kzalloc(sizeof(struct btree), gfp);
if (!b)
return NULL;
- bkey_extent_init(&b->key);
- six_lock_init(&b->lock);
+ bkey_btree_ptr_init(&b->key);
INIT_LIST_HEAD(&b->list);
INIT_LIST_HEAD(&b->write_blocked);
+ b->byte_order = ilog2(c->opts.btree_node_size);
+ return b;
+}
+
+struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b;
+
+ b = __btree_node_mem_alloc(c, GFP_KERNEL);
+ if (!b)
+ return NULL;
- btree_node_data_alloc(c, b, gfp);
- return b->data ? b : NULL;
+ if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
+ kfree(b);
+ return NULL;
+ }
+
+ bch2_btree_lock_init(&b->c, 0);
+
+ __bch2_btree_node_to_freelist(bc, b);
+ return b;
+}
+
+static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
+{
+ struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
+
+ u64 mask = bc->pinned_nodes_mask[!!b->c.level];
+
+ return ((mask & BIT_ULL(b->c.btree_id)) &&
+ bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
+ bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
+}
+
+void bch2_node_pin(struct bch_fs *c, struct btree *b)
+{
+ struct btree_cache *bc = &c->btree_cache;
+
+ mutex_lock(&bc->lock);
+ BUG_ON(!__btree_node_pinned(bc, b));
+ if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
+ set_btree_node_pinned(b);
+ list_move(&b->list, &bc->live[1].list);
+ bc->live[0].nr--;
+ bc->live[1].nr++;
+ }
+ mutex_unlock(&bc->lock);
+}
+
+void bch2_btree_cache_unpin(struct bch_fs *c)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b, *n;
+
+ mutex_lock(&bc->lock);
+ c->btree_cache.pinned_nodes_mask[0] = 0;
+ c->btree_cache.pinned_nodes_mask[1] = 0;
+
+ list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
+ clear_btree_node_pinned(b);
+ list_move(&b->list, &bc->live[0].list);
+ bc->live[0].nr++;
+ bc->live[1].nr--;
+ }
+
+ mutex_unlock(&bc->lock);
}
/* Btree in memory cache - hash table */
-void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
+void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
- rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
+ lockdep_assert_held(&bc->lock);
+
+ int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
+ BUG_ON(ret);
/* Cause future lookups for this node to fail: */
- bkey_i_to_extent(&b->key)->v._data[0] = 0;
+ b->hash_val = 0;
+
+ if (b->c.btree_id < BTREE_ID_NR)
+ --bc->nr_by_btree[b->c.btree_id];
+ --bc->live[btree_node_pinned(b)].nr;
+ list_del_init(&b->list);
+}
+
+void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
+{
+ __bch2_btree_node_hash_remove(bc, b);
+ __bch2_btree_node_to_freelist(bc, b);
}
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
- return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
- bch_btree_cache_params);
+ BUG_ON(!list_empty(&b->list));
+ BUG_ON(b->hash_val);
+
+ b->hash_val = btree_ptr_hash_val(&b->key);
+ int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
+ bch_btree_cache_params);
+ if (ret)
+ return ret;
+
+ if (b->c.btree_id < BTREE_ID_NR)
+ bc->nr_by_btree[b->c.btree_id]++;
+
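+ /* pinned nodes go on the live[1] list, which has its own shrinker: */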
+ bool p = __btree_node_pinned(bc, b);
+ mod_bit(BTREE_NODE_pinned, &b->flags, p);
+
+ list_add_tail(&b->list, &bc->live[p].list);
+ bc->live[p].nr++;
+ return 0;
}
int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
unsigned level, enum btree_id id)
{
- int ret;
-
- b->level = level;
- b->btree_id = id;
+ b->c.level = level;
+ b->c.btree_id = id;
mutex_lock(&bc->lock);
- ret = __bch2_btree_node_hash_insert(bc, b);
- if (!ret)
- list_add(&b->list, &bc->live);
+ int ret = __bch2_btree_node_hash_insert(bc, b);
mutex_unlock(&bc->lock);
return ret;
}
+void bch2_btree_node_update_key_early(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b;
+ struct bkey_buf tmp;
+ int ret;
+
+ bch2_bkey_buf_init(&tmp);
+ bch2_bkey_buf_reassemble(&tmp, c, old);
+
+ b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
+ if (!IS_ERR_OR_NULL(b)) {
+ mutex_lock(&c->btree_cache.lock);
+
+ __bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ bkey_copy(&b->key, new);
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+ BUG_ON(ret);
+
+ mutex_unlock(&c->btree_cache.lock);
+ six_unlock_read(&b->c.lock);
+ }
+
+ bch2_bkey_buf_exit(&tmp, c);
+}
+
__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
const struct bkey_i *k)
{
- return rhashtable_lookup_fast(&bc->table, &PTR_HASH(k),
- bch_btree_cache_params);
+ u64 v = btree_ptr_hash_val(k);
+
+ return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}
/*
* this version is for btree nodes that have already been freed (we're not
* reaping a real btree node)
*/
-static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
{
struct btree_cache *bc = &c->btree_cache;
int ret = 0;
lockdep_assert_held(&bc->lock);
+wait_on_io:
+ if (b->flags & ((1U << BTREE_NODE_dirty)|
+ (1U << BTREE_NODE_read_in_flight)|
+ (1U << BTREE_NODE_write_in_flight))) {
+ if (!flush) {
+ if (btree_node_dirty(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
+ else if (btree_node_read_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
+ else if (btree_node_write_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
+ return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ }
+
+ /* XXX: waiting on IO with btree cache lock held */
+ bch2_btree_node_wait_on_read(b);
+ bch2_btree_node_wait_on_write(b);
+ }
- if (!six_trylock_intent(&b->lock))
- return -ENOMEM;
+ if (!six_trylock_intent(&b->c.lock)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
+ return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ }
- if (!six_trylock_write(&b->lock))
+ if (!six_trylock_write(&b->c.lock)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
goto out_unlock_intent;
+ }
- if (btree_node_noevict(b))
- goto out_unlock;
+ /* recheck under lock */
+ if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
+ (1U << BTREE_NODE_write_in_flight))) {
+ if (!flush) {
+ if (btree_node_read_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
+ else if (btree_node_write_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
+ goto out_unlock;
+ }
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ goto wait_on_io;
+ }
- if (!btree_node_may_write(b))
+ if (btree_node_noevict(b)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
goto out_unlock;
+ }
+ if (btree_node_write_blocked(b)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
+ goto out_unlock;
+ }
+ if (btree_node_will_make_reachable(b)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
+ goto out_unlock;
+ }
- if (btree_node_dirty(b) ||
- btree_node_write_in_flight(b) ||
- btree_node_read_in_flight(b)) {
- if (!flush)
+ if (btree_node_dirty(b)) {
+ if (!flush) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
goto out_unlock;
-
- wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
- TASK_UNINTERRUPTIBLE);
-
+ }
/*
* Using the underscore version because we don't want to compact
* bsets after the write, since this node is about to be evicted
* - unless btree verify mode is enabled, since it runs out of
* the post write cleanup:
*/
- if (verify_btree_ondisk(c))
- bch2_btree_node_write(c, b, SIX_LOCK_intent);
+ if (bch2_verify_btree_ondisk)
+ bch2_btree_node_write(c, b, SIX_LOCK_intent,
+ BTREE_WRITE_cache_reclaim);
else
- __bch2_btree_node_write(c, b, SIX_LOCK_read);
+ __bch2_btree_node_write(c, b,
+ BTREE_WRITE_cache_reclaim);
- /* wait for any in flight btree write */
- btree_node_wait_on_io(b);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ goto wait_on_io;
}
out:
- if (PTR_HASH(&b->key) && !ret)
- trace_btree_node_reap(c, b);
+ if (b->hash_val && !ret)
+ trace_and_count(c, btree_cache_reap, c, b);
return ret;
out_unlock:
- six_unlock_write(&b->lock);
+ six_unlock_write(&b->c.lock);
out_unlock_intent:
- six_unlock_intent(&b->lock);
- ret = -ENOMEM;
+ six_unlock_intent(&b->c.lock);
+ ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
goto out;
}
-static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
+static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
{
- return __btree_node_reclaim(c, b, false);
+ return __btree_node_reclaim(c, b, false, shrinker_counter);
}
static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
- return __btree_node_reclaim(c, b, true);
+ return __btree_node_reclaim(c, b, true, false);
}
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct bch_fs *c = container_of(shrink, struct bch_fs,
- btree_cache.shrink);
- struct btree_cache *bc = &c->btree_cache;
+ struct btree_cache_list *list = shrink->private_data;
+ struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
struct btree *b, *t;
unsigned long nr = sc->nr_to_scan;
- unsigned long can_free;
- unsigned long touched = 0;
+ unsigned long can_free = 0;
unsigned long freed = 0;
- unsigned i;
+ unsigned long touched = 0;
+ unsigned i, flags;
+ unsigned long ret = SHRINK_STOP;
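+ /* if most of the cache is dirty, write back nodes during the scan so they can be freed later: */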
+ bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;
- if (btree_shrinker_disabled(c))
+ if (bch2_btree_shrinker_disabled)
return SHRINK_STOP;
- /* Return -1 if we can't do anything right now */
- if (sc->gfp_mask & __GFP_IO)
- mutex_lock(&bc->lock);
- else if (!mutex_trylock(&bc->lock))
- return -1;
+ mutex_lock(&bc->lock);
+ flags = memalloc_nofs_save();
/*
* It's _really_ critical that we don't free too many btree nodes - we
@@ -246,124 +487,148 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
* succeed, so that inserting keys into the btree can always succeed and
* IO can always make forward progress:
*/
- nr /= btree_pages(c);
- can_free = btree_cache_can_free(bc);
+ can_free = btree_cache_can_free(list);
nr = min_t(unsigned long, nr, can_free);
i = 0;
list_for_each_entry_safe(b, t, &bc->freeable, list) {
+ /*
+ * Leave a few nodes on the freeable list, so that a btree split
+ * won't have to hit the system allocator:
+ */
+ if (++i <= 3)
+ continue;
+
touched++;
- if (freed >= nr)
- break;
+ if (touched >= nr)
+ goto out;
- if (++i > 3 &&
- !btree_node_reclaim(c, b)) {
- btree_node_data_free(c, b);
- six_unlock_write(&b->lock);
- six_unlock_intent(&b->lock);
+ if (!btree_node_reclaim(c, b, true)) {
+ btree_node_data_free(bc, b);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
freed++;
+ bc->nr_freed++;
}
}
restart:
- list_for_each_entry_safe(b, t, &bc->live, list) {
+ list_for_each_entry_safe(b, t, &list->list, list) {
touched++;
- if (freed >= nr) {
- /* Save position */
- if (&t->list != &bc->live)
- list_move_tail(&bc->live, &t->list);
- break;
- }
+ if (btree_node_accessed(b)) {
+ clear_btree_node_accessed(b);
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
+ --touched;
+ } else if (!btree_node_reclaim(c, b, true)) {
+ __bch2_btree_node_hash_remove(bc, b);
+ __btree_node_data_free(bc, b);
- if (!btree_node_accessed(b) &&
- !btree_node_reclaim(c, b)) {
- /* can't call bch2_btree_node_hash_remove under lock */
freed++;
- if (&t->list != &bc->live)
- list_move_tail(&bc->live, &t->list);
-
- btree_node_data_free(c, b);
+ bc->nr_freed++;
+
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+
+ if (freed == nr)
+ goto out_rotate;
+ } else if (trigger_writes &&
+ btree_node_dirty(b) &&
+ !btree_node_will_make_reachable(b) &&
+ !btree_node_write_blocked(b) &&
+ six_trylock_read(&b->c.lock)) {
+ list_move(&list->list, &b->list);
mutex_unlock(&bc->lock);
-
- bch2_btree_node_hash_remove(bc, b);
- six_unlock_write(&b->lock);
- six_unlock_intent(&b->lock);
-
- if (freed >= nr)
- goto out;
-
- if (sc->gfp_mask & __GFP_IO)
- mutex_lock(&bc->lock);
- else if (!mutex_trylock(&bc->lock))
- goto out;
+ __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
+ six_unlock_read(&b->c.lock);
+ if (touched >= nr)
+ goto out_nounlock;
+ mutex_lock(&bc->lock);
goto restart;
- } else
- clear_btree_node_accessed(b);
- }
+ }
- mutex_unlock(&bc->lock);
+ if (touched >= nr)
+ break;
+ }
+out_rotate:
+ if (&t->list != &list->list)
+ list_move_tail(&list->list, &t->list);
out:
- return (unsigned long) freed * btree_pages(c);
+ mutex_unlock(&bc->lock);
+out_nounlock:
+ ret = freed;
+ memalloc_nofs_restore(flags);
+ trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
+ return ret;
}
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct bch_fs *c = container_of(shrink, struct bch_fs,
- btree_cache.shrink);
- struct btree_cache *bc = &c->btree_cache;
+ struct btree_cache_list *list = shrink->private_data;
- if (btree_shrinker_disabled(c))
+ if (bch2_btree_shrinker_disabled)
return 0;
- return btree_cache_can_free(bc) * btree_pages(c);
+ return btree_cache_can_free(list);
}
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
- unsigned i;
+ struct btree *b, *t;
+ unsigned long flags;
- if (bc->shrink.list.next)
- unregister_shrinker(&bc->shrink);
+ shrinker_free(bc->live[1].shrink);
+ shrinker_free(bc->live[0].shrink);
+ /* vfree() can allocate memory: */
+ flags = memalloc_nofs_save();
mutex_lock(&bc->lock);
-#ifdef CONFIG_BCACHEFS_DEBUG
if (c->verify_data)
- list_move(&c->verify_data->list, &bc->live);
+ list_move(&c->verify_data->list, &bc->live[0].list);
- kvpfree(c->verify_ondisk, btree_bytes(c));
-#endif
+ kvfree(c->verify_ondisk);
- for (i = 0; i < BTREE_ID_NR; i++)
- if (c->btree_roots[i].b)
- list_add(&c->btree_roots[i].b->list, &bc->live);
+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
+ struct btree_root *r = bch2_btree_id_root(c, i);
- list_splice(&bc->freeable, &bc->live);
+ if (r->b)
+ list_add(&r->b->list, &bc->live[0].list);
+ }
- while (!list_empty(&bc->live)) {
- b = list_first_entry(&bc->live, struct btree, list);
+ list_for_each_entry_safe(b, t, &bc->live[1].list, list)
+ bch2_btree_node_hash_remove(bc, b);
+ list_for_each_entry_safe(b, t, &bc->live[0].list, list)
+ bch2_btree_node_hash_remove(bc, b);
+ list_for_each_entry_safe(b, t, &bc->freeable, list) {
BUG_ON(btree_node_read_in_flight(b) ||
btree_node_write_in_flight(b));
- if (btree_node_dirty(b))
- bch2_btree_complete_write(c, b, btree_current_write(b));
- clear_btree_node_dirty(b);
-
- btree_node_data_free(c, b);
+ btree_node_data_free(bc, b);
}
- while (!list_empty(&bc->freed)) {
- b = list_first_entry(&bc->freed, struct btree, list);
+ BUG_ON(!bch2_journal_error(&c->journal) &&
+ atomic_long_read(&c->btree_cache.nr_dirty));
+
+ list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
+
+ list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
list_del(&b->list);
+ six_lock_exit(&b->c.lock);
kfree(b);
}
mutex_unlock(&bc->lock);
+ memalloc_nofs_restore(flags);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
+ BUG_ON(bc->nr_by_btree[i]);
+ BUG_ON(bc->live[0].nr);
+ BUG_ON(bc->live[1].nr);
+ BUG_ON(bc->nr_freeable);
if (bc->table_init_done)
rhashtable_destroy(&bc->table);
@@ -372,61 +637,61 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
int bch2_fs_btree_cache_init(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
+ struct shrinker *shrink;
unsigned i;
int ret = 0;
- pr_verbose_init(c->opts, "");
-
ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
if (ret)
- goto out;
+ goto err;
bc->table_init_done = true;
bch2_recalc_btree_reserve(c);
- for (i = 0; i < bc->reserve; i++)
- if (!btree_node_mem_alloc(c, GFP_KERNEL)) {
- ret = -ENOMEM;
- goto out;
- }
+ for (i = 0; i < bc->nr_reserve; i++)
+ if (!__bch2_btree_node_mem_alloc(c))
+ goto err;
- list_splice_init(&bc->live, &bc->freeable);
+ list_splice_init(&bc->live[0].list, &bc->freeable);
-#ifdef CONFIG_BCACHEFS_DEBUG
mutex_init(&c->verify_lock);
- c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
- if (!c->verify_ondisk) {
- ret = -ENOMEM;
- goto out;
- }
-
- c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL);
- if (!c->verify_data) {
- ret = -ENOMEM;
- goto out;
- }
-
- list_del_init(&c->verify_data->list);
-#endif
+ shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
+ if (!shrink)
+ goto err;
+ bc->live[0].shrink = shrink;
+ shrink->count_objects = bch2_btree_cache_count;
+ shrink->scan_objects = bch2_btree_cache_scan;
+ shrink->seeks = 2;
+ shrink->private_data = &bc->live[0];
+ shrinker_register(shrink);
+
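+ /* pinned nodes get a separate shrinker with higher seeks, so they're reclaimed less aggressively: */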
+ shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
+ if (!shrink)
+ goto err;
+ bc->live[1].shrink = shrink;
+ shrink->count_objects = bch2_btree_cache_count;
+ shrink->scan_objects = bch2_btree_cache_scan;
+ shrink->seeks = 8;
+ shrink->private_data = &bc->live[1];
+ shrinker_register(shrink);
- bc->shrink.count_objects = bch2_btree_cache_count;
- bc->shrink.scan_objects = bch2_btree_cache_scan;
- bc->shrink.seeks = 4;
- bc->shrink.batch = btree_pages(c) * 2;
- register_shrinker(&bc->shrink);
-out:
- pr_verbose_init(c->opts, "ret %i", ret);
- return ret;
+ return 0;
+err:
+ return -BCH_ERR_ENOMEM_fs_btree_cache_init;
}
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
mutex_init(&bc->lock);
- INIT_LIST_HEAD(&bc->live);
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
+ bc->live[i].idx = i;
+ INIT_LIST_HEAD(&bc->live[i].list);
+ }
INIT_LIST_HEAD(&bc->freeable);
- INIT_LIST_HEAD(&bc->freed);
+ INIT_LIST_HEAD(&bc->freed_pcpu);
+ INIT_LIST_HEAD(&bc->freed_nonpcpu);
}
/*
@@ -435,46 +700,48 @@ void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
* cannibalize_bucket() will take. This means every time we unlock the root of
* the btree, we need to release this lock if we have it held.
*/
-void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
+void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
if (bc->alloc_lock == current) {
- trace_btree_node_cannibalize_unlock(c);
+ trace_and_count(c, btree_cache_cannibalize_unlock, trans);
bc->alloc_lock = NULL;
closure_wake_up(&bc->alloc_wait);
}
}
-int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
+int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct task_struct *old;
- old = cmpxchg(&bc->alloc_lock, NULL, current);
- if (old == NULL || old == current)
+ old = NULL;
+ if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current)
goto success;
if (!cl) {
- trace_btree_node_cannibalize_lock_fail(c);
- return -ENOMEM;
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
+ return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
}
closure_wait(&bc->alloc_wait, cl);
/* Try again, after adding ourselves to waitlist */
- old = cmpxchg(&bc->alloc_lock, NULL, current);
- if (old == NULL || old == current) {
+ old = NULL;
+ if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) {
/* We raced */
closure_wake_up(&bc->alloc_wait);
goto success;
}
- trace_btree_node_cannibalize_lock_fail(c);
- return -EAGAIN;
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
+ return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
success:
- trace_btree_node_cannibalize_lock(c);
+ trace_and_count(c, btree_cache_cannibalize_lock, trans);
return 0;
}
@@ -483,14 +750,16 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- list_for_each_entry_reverse(b, &bc->live, list)
- if (!btree_node_reclaim(c, b))
- return b;
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
+ list_for_each_entry_reverse(b, &bc->live[i].list, list)
+ if (!btree_node_reclaim(c, b, false))
+ return b;
while (1) {
- list_for_each_entry_reverse(b, &bc->live, list)
- if (!btree_node_write_and_reclaim(c, b))
- return b;
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
+ list_for_each_entry_reverse(b, &bc->live[i].list, list)
+ if (!btree_node_write_and_reclaim(c, b))
+ return b;
/*
* Rare case: all nodes were intent-locked.
@@ -501,49 +770,76 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
}
}
-struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
+struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
+ struct list_head *freed = pcpu_read_locks
+ ? &bc->freed_pcpu
+ : &bc->freed_nonpcpu;
+ struct btree *b, *b2;
u64 start_time = local_clock();
mutex_lock(&bc->lock);
/*
- * btree_free() doesn't free memory; it sticks the node on the end of
- * the list. Check if there's any freed nodes there:
- */
- list_for_each_entry(b, &bc->freeable, list)
- if (!btree_node_reclaim(c, b))
- goto out_unlock;
-
- /*
* We never free struct btree itself, just the memory that holds the on
* disk node. Check the freed list before allocating a new one:
*/
- list_for_each_entry(b, &bc->freed, list)
- if (!btree_node_reclaim(c, b)) {
- btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO);
- if (b->data)
- goto out_unlock;
-
- six_unlock_write(&b->lock);
- six_unlock_intent(&b->lock);
+ list_for_each_entry(b, freed, list)
+ if (!btree_node_reclaim(c, b, false)) {
+ list_del_init(&b->list);
+ goto got_node;
+ }
+
+ b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
+ if (!b) {
+ mutex_unlock(&bc->lock);
+ bch2_trans_unlock(trans);
+ b = __btree_node_mem_alloc(c, GFP_KERNEL);
+ if (!b)
goto err;
+ mutex_lock(&bc->lock);
+ }
+
+ bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
+
+ BUG_ON(!six_trylock_intent(&b->c.lock));
+ BUG_ON(!six_trylock_write(&b->c.lock));
+
+got_node:
+ /*
+ * btree_free() doesn't free memory; it sticks the node on the end of
+ * the list. Check if there are any freed nodes there:
+ */
+ list_for_each_entry(b2, &bc->freeable, list)
+ if (!btree_node_reclaim(c, b2, false)) {
+ swap(b->data, b2->data);
+ swap(b->aux_data, b2->aux_data);
+
+ list_del_init(&b2->list);
+ --bc->nr_freeable;
+ btree_node_to_freedlist(bc, b2);
+ mutex_unlock(&bc->lock);
+
+ six_unlock_write(&b2->c.lock);
+ six_unlock_intent(&b2->c.lock);
+ goto got_mem;
}
- b = btree_node_mem_alloc(c, __GFP_NOWARN|GFP_NOIO);
- if (!b)
- goto err;
+ mutex_unlock(&bc->lock);
- BUG_ON(!six_trylock_intent(&b->lock));
- BUG_ON(!six_trylock_write(&b->lock));
-out_unlock:
+ if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
+ bch2_trans_unlock(trans);
+ if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
+ goto err;
+ }
+
+got_mem:
+ BUG_ON(!list_empty(&b->list));
BUG_ON(btree_node_hashed(b));
+ BUG_ON(btree_node_dirty(b));
BUG_ON(btree_node_write_in_flight(b));
-
- list_del_init(&b->list);
- mutex_unlock(&bc->lock);
out:
b->flags = 0;
b->written = 0;
@@ -551,135 +847,215 @@ out:
b->sib_u64s[0] = 0;
b->sib_u64s[1] = 0;
b->whiteout_u64s = 0;
- b->uncompacted_whiteout_u64s = 0;
- bch2_btree_keys_init(b, &c->expensive_debug_checks);
+ bch2_btree_keys_init(b);
+ set_btree_node_accessed(b);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
start_time);
+ int ret = bch2_trans_relock(trans);
+ if (unlikely(ret)) {
+ bch2_btree_node_to_freelist(c, b);
+ return ERR_PTR(ret);
+ }
+
return b;
err:
+ mutex_lock(&bc->lock);
+
/* Try to cannibalize another cached btree node: */
if (bc->alloc_lock == current) {
- b = btree_node_cannibalize(c);
- list_del_init(&b->list);
- mutex_unlock(&bc->lock);
+ b2 = btree_node_cannibalize(c);
+ clear_btree_node_just_written(b2);
+ __bch2_btree_node_hash_remove(bc, b2);
+
+ if (b) {
+ swap(b->data, b2->data);
+ swap(b->aux_data, b2->aux_data);
+ btree_node_to_freedlist(bc, b2);
+ six_unlock_write(&b2->c.lock);
+ six_unlock_intent(&b2->c.lock);
+ } else {
+ b = b2;
+ }
- bch2_btree_node_hash_remove(bc, b);
+ BUG_ON(!list_empty(&b->list));
+ mutex_unlock(&bc->lock);
- trace_btree_node_cannibalize(c);
+ trace_and_count(c, btree_cache_cannibalize, trans);
goto out;
}
mutex_unlock(&bc->lock);
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}
/* Slowpath, don't want it inlined into btree_iter_traverse() */
-static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
- struct btree_iter *iter,
+static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
+ struct btree_path *path,
const struct bkey_i *k,
+ enum btree_id btree_id,
unsigned level,
enum six_lock_type lock_type,
bool sync)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
+ if (unlikely(level >= BTREE_MAX_DEPTH)) {
+ int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
+ level, BTREE_MAX_DEPTH);
+ return ERR_PTR(ret);
+ }
+
+ if (unlikely(!bkey_is_btree_ptr(&k->k))) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+
+ int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
+ printbuf_exit(&buf);
+ return ERR_PTR(ret);
+ }
+
+ if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+
+ int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
+ printbuf_exit(&buf);
+ return ERR_PTR(ret);
+ }
+
/*
* Parent node must be locked, else we could read in a btree node that's
* been freed:
*/
- BUG_ON(!btree_node_locked(iter, level + 1));
- BUG_ON(level >= BTREE_MAX_DEPTH);
+ if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
+ trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
+ }
+
+ b = bch2_btree_node_mem_alloc(trans, level != 0);
+
+ if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
+ if (!path)
+ return b;
+
+ trans->memory_allocation_failure = true;
+ trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
+ }
- b = bch2_btree_node_mem_alloc(c);
if (IS_ERR(b))
return b;
bkey_copy(&b->key, k);
- if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) {
+ if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
/* raced with another fill: */
/* mark as unhashed... */
- bkey_i_to_extent(&b->key)->v._data[0] = 0;
+ b->hash_val = 0;
mutex_lock(&bc->lock);
- list_add(&b->list, &bc->freeable);
+ __bch2_btree_node_to_freelist(bc, b);
mutex_unlock(&bc->lock);
- six_unlock_write(&b->lock);
- six_unlock_intent(&b->lock);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
return NULL;
}
- /*
- * If the btree node wasn't cached, we can't drop our lock on
- * the parent until after it's added to the cache - because
- * otherwise we could race with a btree_split() freeing the node
- * we're trying to lock.
- *
- * But the deadlock described below doesn't exist in this case,
- * so it's safe to not drop the parent lock until here:
- */
- if (btree_node_read_locked(iter, level + 1))
- btree_node_unlock(iter, level + 1);
+ set_btree_node_read_in_flight(b);
+ six_unlock_write(&b->c.lock);
- bch2_btree_node_read(c, b, sync);
+ if (path) {
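+ /* remember the lock sequence so we can cheaply relock to the same state after the read: */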
+ u32 seq = six_lock_seq(&b->c.lock);
- six_unlock_write(&b->lock);
+ /* Unlock before doing IO: */
+ six_unlock_intent(&b->c.lock);
+ bch2_trans_unlock_noassert(trans);
- if (!sync) {
- six_unlock_intent(&b->lock);
- return NULL;
- }
+ bch2_btree_node_read(trans, b, sync);
+
+ int ret = bch2_trans_relock(trans);
+ if (ret)
+ return ERR_PTR(ret);
- if (lock_type == SIX_LOCK_read)
- six_lock_downgrade(&b->lock);
+ if (!sync)
+ return NULL;
+
+ if (!six_relock_type(&b->c.lock, lock_type, seq))
+ b = NULL;
+ } else {
+ bch2_btree_node_read(trans, b, sync);
+ if (lock_type == SIX_LOCK_read)
+ six_lock_downgrade(&b->c.lock);
+ }
return b;
}
-/**
- * bch_btree_node_get - find a btree node in the cache and lock it, reading it
- * in from disk if necessary.
- *
- * If IO is necessary and running under generic_make_request, returns -EAGAIN.
- *
- * The btree node will have either a read or a write lock held, depending on
- * the @write parameter.
- */
-struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
- const struct bkey_i *k, unsigned level,
- enum six_lock_type lock_type,
- bool may_drop_locks)
+static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
+ struct printbuf buf = PRINTBUF;
+
+ if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
+ return;
+
+ prt_printf(&buf,
+ "btree node header doesn't match ptr: ");
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_str(&buf, "\nptr: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ prt_str(&buf, "\nheader: ");
+ bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data));
+ prt_str(&buf, "\nmin ");
+ bch2_bpos_to_text(&buf, b->data->min_key);
+
+ prt_printf(&buf, "\nmax ");
+ bch2_bpos_to_text(&buf, b->data->max_key);
+
+ bch2_fs_topology_error(c, "%s", buf.buf);
+
+ printbuf_exit(&buf);
+}
+
+static inline void btree_check_header(struct bch_fs *c, struct btree *b)
+{
+ if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
+ b->c.level != BTREE_NODE_LEVEL(b->data) ||
+ !bpos_eq(b->data->max_key, b->key.k.p) ||
+ (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ !bpos_eq(b->data->min_key,
+ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
+ btree_bad_header(c, b);
+}
+
+static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
+ const struct bkey_i *k, unsigned level,
+ enum six_lock_type lock_type,
+ unsigned long trace_ip)
+{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- struct bset_tree *t;
+ bool need_relock = false;
+ int ret;
- /*
- * XXX: locking optimization
- *
- * we can make the locking looser here - caller can drop lock on parent
- * node before locking child node (and potentially blocking): we just
- * have to have bch2_btree_node_fill() call relock on the parent and
- * return -EINTR if that fails
- */
- EBUG_ON(!btree_node_locked(iter, level + 1));
EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
- rcu_read_lock();
b = btree_cache_find(bc, k);
- rcu_read_unlock();
-
if (unlikely(!b)) {
/*
* We must have the parent locked to call bch2_btree_node_fill(),
* else we could read in a btree node from disk that's been
* freed:
*/
- b = bch2_btree_node_fill(c, iter, k, level, lock_type, true);
+ b = bch2_btree_node_fill(trans, path, k, path->btree_id,
+ level, lock_type, true);
+ need_relock = true;
/* We raced and found the btree node in the cache */
if (!b)
@@ -688,55 +1064,60 @@ retry:
if (IS_ERR(b))
return b;
} else {
- /*
- * There's a potential deadlock with splits and insertions into
- * interior nodes we have to avoid:
- *
- * The other thread might be holding an intent lock on the node
- * we want, and they want to update its parent node so they're
- * going to upgrade their intent lock on the parent node to a
- * write lock.
- *
- * But if we're holding a read lock on the parent, and we're
- * trying to get the intent lock they're holding, we deadlock.
- *
- * So to avoid this we drop the read locks on parent nodes when
- * we're starting to take intent locks - and handle the race.
- *
- * The race is that they might be about to free the node we
- * want, and dropping our read lock on the parent node lets them
- * update the parent marking the node we want as freed, and then
- * free it:
- *
- * To guard against this, btree nodes are evicted from the cache
- * when they're freed - and PTR_HASH() is zeroed out, which we
- * check for after we lock the node.
- *
- * Then, bch2_btree_node_relock() on the parent will fail - because
- * the parent was modified, when the pointer to the node we want
- * was removed - and we'll bail out:
- */
- if (btree_node_read_locked(iter, level + 1))
- btree_node_unlock(iter, level + 1);
+ if (btree_node_read_locked(path, level + 1))
+ btree_node_unlock(trans, path, level + 1);
+
+ ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ERR_PTR(ret);
- if (!btree_node_lock(b, k->k.p, level, iter,
- lock_type, may_drop_locks))
- return ERR_PTR(-EINTR);
+ BUG_ON(ret);
- if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) ||
- b->level != level ||
+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
+ b->c.level != level ||
race_fault())) {
- six_unlock_type(&b->lock, lock_type);
- if (bch2_btree_node_relock(iter, level + 1))
+ six_unlock_type(&b->c.lock, lock_type);
+ if (bch2_btree_node_relock(trans, path, level + 1))
goto retry;
- trans_restart();
- return ERR_PTR(-EINTR);
+ trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
}
+
+ /* avoid atomic set bit if it's not needed: */
+ if (!btree_node_accessed(b))
+ set_btree_node_accessed(b);
}
- wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
- TASK_UNINTERRUPTIBLE);
+ if (unlikely(btree_node_read_in_flight(b))) {
+ u32 seq = six_lock_seq(&b->c.lock);
+
+ six_unlock_type(&b->c.lock, lock_type);
+ bch2_trans_unlock(trans);
+ need_relock = true;
+
+ bch2_btree_node_wait_on_read(b);
+
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ return ERR_PTR(ret);
+
+ /*
+ * should_be_locked is not set on this path yet, so we need to
+ * relock it specifically:
+ */
+ if (!six_relock_type(&b->c.lock, lock_type, seq))
+ goto retry;
+ }
+
+ if (unlikely(need_relock)) {
+ ret = bch2_trans_relock(trans) ?:
+ bch2_btree_path_relock_intent(trans, path);
+ if (ret) {
+ six_unlock_type(&b->c.lock, lock_type);
+ return ERR_PTR(ret);
+ }
+ }
prefetch(b->aux_data);
@@ -748,193 +1129,377 @@ retry:
prefetch(p + L1_CACHE_BYTES * 2);
}
- /* avoid atomic set bit if it's not needed: */
- if (btree_node_accessed(b))
- set_btree_node_accessed(b);
-
if (unlikely(btree_node_read_error(b))) {
- six_unlock_type(&b->lock, lock_type);
- return ERR_PTR(-EIO);
+ six_unlock_type(&b->c.lock, lock_type);
+ return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
}
- EBUG_ON(b->btree_id != iter->btree_id ||
- BTREE_NODE_LEVEL(b->data) != level ||
- bkey_cmp(b->data->max_key, k->k.p));
+ EBUG_ON(b->c.btree_id != path->btree_id);
+ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+ btree_check_header(c, b);
return b;
}
-struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
- struct btree_iter *iter,
- struct btree *b,
- bool may_drop_locks,
- enum btree_node_sibling sib)
+/**
+ * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
+ * in from disk if necessary.
+ *
+ * @trans: btree transaction object
+ * @path: btree_path being traversed
+ * @k: pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
+ * @level: level of btree node being looked up (0 == leaf node)
+ * @lock_type: SIX_LOCK_read or SIX_LOCK_intent
+ * @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
+ *
+ * The btree node will have either a read or a write lock held, depending on
+ * the @lock_type parameter.
+ *
+ * Returns: btree node or ERR_PTR()
+ */
+struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
+ const struct bkey_i *k, unsigned level,
+ enum six_lock_type lock_type,
+ unsigned long trace_ip)
{
- struct btree *parent;
- struct btree_node_iter node_iter;
- struct bkey_packed *k;
- BKEY_PADDED(k) tmp;
- struct btree *ret = NULL;
- unsigned level = b->level;
+ struct bch_fs *c = trans->c;
+ struct btree *b;
+ int ret;
- parent = btree_iter_node(iter, level + 1);
- if (!parent)
- return NULL;
+ EBUG_ON(level >= BTREE_MAX_DEPTH);
- if (!bch2_btree_node_relock(iter, level + 1))
- goto out_upgrade;
+ b = btree_node_mem_ptr(k);
- node_iter = iter->l[parent->level].iter;
+ /*
+ * Check b->hash_val _before_ calling btree_node_lock() - this might not
+ * be the node we want anymore, and trying to lock the wrong node could
+ * cause an unnecessary transaction restart:
+ */
+ if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
+ !b ||
+ b->hash_val != btree_ptr_hash_val(k)))
+ return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
- k = bch2_btree_node_iter_peek_all(&node_iter, parent);
- BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));
+ if (btree_node_read_locked(path, level + 1))
+ btree_node_unlock(trans, path, level + 1);
- k = sib == btree_prev_sib
- ? bch2_btree_node_iter_prev(&node_iter, parent)
- : (bch2_btree_node_iter_advance(&node_iter, parent),
- bch2_btree_node_iter_peek(&node_iter, parent));
- if (!k)
- goto out;
+ ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ERR_PTR(ret);
- bch2_bkey_unpack(parent, &tmp.k, k);
+ BUG_ON(ret);
- ret = bch2_btree_node_get(c, iter, &tmp.k, level,
- SIX_LOCK_intent, may_drop_locks);
+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
+ b->c.level != level ||
+ race_fault())) {
+ six_unlock_type(&b->c.lock, lock_type);
+ if (bch2_btree_node_relock(trans, path, level + 1))
+ return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
- if (PTR_ERR_OR_ZERO(ret) == -EINTR && may_drop_locks) {
- struct btree_iter *linked;
+ trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
+ }
- if (!bch2_btree_node_relock(iter, level + 1))
- goto out_upgrade;
+ if (unlikely(btree_node_read_in_flight(b))) {
+ six_unlock_type(&b->c.lock, lock_type);
+ return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
+ }
- /*
- * We might have got -EINTR because trylock failed, and we're
- * holding other locks that would cause us to deadlock:
- */
- for_each_linked_btree_iter(iter, linked)
- if (btree_iter_cmp(iter, linked) < 0)
- __bch2_btree_iter_unlock(linked);
+ prefetch(b->aux_data);
+
+ for_each_bset(b, t) {
+ void *p = (u64 *) b->aux_data + t->aux_data_offset;
- if (sib == btree_prev_sib)
- btree_node_unlock(iter, level);
+ prefetch(p + L1_CACHE_BYTES * 0);
+ prefetch(p + L1_CACHE_BYTES * 1);
+ prefetch(p + L1_CACHE_BYTES * 2);
+ }
- ret = bch2_btree_node_get(c, iter, &tmp.k, level,
- SIX_LOCK_intent, may_drop_locks);
+ /* avoid atomic set bit if it's not needed: */
+ if (!btree_node_accessed(b))
+ set_btree_node_accessed(b);
- /*
- * before btree_iter_relock() calls btree_iter_verify_locks():
- */
- if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
- btree_node_unlock(iter, level + 1);
+ if (unlikely(btree_node_read_error(b))) {
+ six_unlock_type(&b->c.lock, lock_type);
+ return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
+ }
- if (!bch2_btree_node_relock(iter, level)) {
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
+ EBUG_ON(b->c.btree_id != path->btree_id);
+ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+ btree_check_header(c, b);
- if (!IS_ERR(ret)) {
- six_unlock_intent(&ret->lock);
- ret = ERR_PTR(-EINTR);
- }
- }
+ return b;
+}
- bch2_btree_iter_relock(iter);
+struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
+ const struct bkey_i *k,
+ enum btree_id btree_id,
+ unsigned level,
+ bool nofill)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b;
+ int ret;
+
+ EBUG_ON(level >= BTREE_MAX_DEPTH);
+
+ if (c->opts.btree_node_mem_ptr_optimization) {
+ b = btree_node_mem_ptr(k);
+ if (b)
+ goto lock_node;
}
-out:
- if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
- btree_node_unlock(iter, level + 1);
+retry:
+ b = btree_cache_find(bc, k);
+ if (unlikely(!b)) {
+ if (nofill)
+ goto out;
+
+ b = bch2_btree_node_fill(trans, NULL, k, btree_id,
+ level, SIX_LOCK_read, true);
- bch2_btree_iter_verify_locks(iter);
+ /* We raced and found the btree node in the cache */
+ if (!b)
+ goto retry;
- BUG_ON((!may_drop_locks || !IS_ERR(ret)) &&
- (iter->uptodate >= BTREE_ITER_NEED_RELOCK ||
- !btree_node_locked(iter, level)));
+ if (IS_ERR(b) &&
+ !bch2_btree_cache_cannibalize_lock(trans, NULL))
+ goto retry;
- if (!IS_ERR_OR_NULL(ret)) {
- struct btree *n1 = ret, *n2 = b;
+ if (IS_ERR(b))
+ goto out;
+ } else {
+lock_node:
+ ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ERR_PTR(ret);
- if (sib != btree_prev_sib)
- swap(n1, n2);
+ BUG_ON(ret);
- BUG_ON(bkey_cmp(btree_type_successor(n1->btree_id,
- n1->key.k.p),
- n2->data->min_key));
+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
+ b->c.btree_id != btree_id ||
+ b->c.level != level)) {
+ six_unlock_read(&b->c.lock);
+ goto retry;
+ }
}
- return ret;
-out_upgrade:
- if (may_drop_locks)
- bch2_btree_iter_upgrade(iter, level + 2, true);
- ret = ERR_PTR(-EINTR);
- goto out;
+ /* XXX: waiting on IO with btree locks held: */
+ __bch2_btree_node_wait_on_read(b);
+
+ prefetch(b->aux_data);
+
+ for_each_bset(b, t) {
+ void *p = (u64 *) b->aux_data + t->aux_data_offset;
+
+ prefetch(p + L1_CACHE_BYTES * 0);
+ prefetch(p + L1_CACHE_BYTES * 1);
+ prefetch(p + L1_CACHE_BYTES * 2);
+ }
+
+ /* avoid atomic set bit if it's not needed: */
+ if (!btree_node_accessed(b))
+ set_btree_node_accessed(b);
+
+ if (unlikely(btree_node_read_error(b))) {
+ six_unlock_read(&b->c.lock);
+ b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
+ goto out;
+ }
+
+ EBUG_ON(b->c.btree_id != btree_id);
+ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+ btree_check_header(c, b);
+out:
+ bch2_btree_cache_cannibalize_unlock(trans);
+ return b;
}
-void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
- const struct bkey_i *k, unsigned level)
+int bch2_btree_node_prefetch(struct btree_trans *trans,
+ struct btree_path *path,
+ const struct bkey_i *k,
+ enum btree_id btree_id, unsigned level)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
- BUG_ON(!btree_node_locked(iter, level + 1));
+ BUG_ON(path && !btree_node_locked(path, level + 1));
BUG_ON(level >= BTREE_MAX_DEPTH);
- rcu_read_lock();
- b = btree_cache_find(bc, k);
- rcu_read_unlock();
+ struct btree *b = btree_cache_find(bc, k);
+ if (b)
+ return 0;
+ b = bch2_btree_node_fill(trans, path, k, btree_id,
+ level, SIX_LOCK_read, false);
+ int ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ return ret;
if (b)
+ six_unlock_read(&b->c.lock);
+ return 0;
+}
+
+void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b;
+
+ b = btree_cache_find(bc, k);
+ if (!b)
return;
- bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false);
+ BUG_ON(b == btree_node_root(trans->c, b));
+wait_on_io:
+ /* not allowed to wait on io with btree locks held: */
+
+ /* XXX we're called from btree_gc which will be holding other btree
+ * nodes locked
+ */
+ __bch2_btree_node_wait_on_read(b);
+ __bch2_btree_node_wait_on_write(b);
+
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
+ if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
+ goto out;
+
+ if (btree_node_dirty(b)) {
+ __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ goto wait_on_io;
+ }
+
+ BUG_ON(btree_node_dirty(b));
+
+ mutex_lock(&bc->lock);
+ bch2_btree_node_hash_remove(bc, b);
+ btree_node_data_free(bc, b);
+ mutex_unlock(&bc->lock);
+out:
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+}
+
+const char *bch2_btree_id_str(enum btree_id btree)
+{
+ return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
+}
+
+void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree)
+{
+ if (btree < BTREE_ID_NR)
+ prt_str(out, __bch2_btree_ids[btree]);
+ else
+ prt_printf(out, "(unknown btree %u)", btree);
+}
+
+void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsigned level)
+{
+ prt_str(out, "btree=");
+ bch2_btree_id_to_text(out, btree);
+ prt_printf(out, " level=%u", level);
+}
+
+void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
+{
+ bch2_btree_id_to_text(out, b->c.btree_id);
+ prt_printf(out, " level %u/", b->c.level);
+ struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id);
+ if (r)
+ prt_printf(out, "%u", r->level);
+ else
+ prt_printf(out, "(unknown)");
+ prt_printf(out, "\n ");
+
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
}
-int bch2_print_btree_node(struct bch_fs *c, struct btree *b,
- char *buf, size_t len)
+void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
- const struct bkey_format *f = &b->format;
struct bset_stats stats;
- char ptrs[100];
memset(&stats, 0, sizeof(stats));
- bch2_val_to_text(c, BKEY_TYPE_BTREE, ptrs, sizeof(ptrs),
- bkey_i_to_s_c(&b->key));
bch2_btree_keys_stats(b, &stats);
- return scnprintf(buf, len,
- "l %u %llu:%llu - %llu:%llu:\n"
- " ptrs: %s\n"
- " format: u64s %u fields %u %u %u %u %u\n"
- " unpack fn len: %u\n"
- " bytes used %zu/%zu (%zu%% full)\n"
- " sib u64s: %u, %u (merge threshold %zu)\n"
- " nr packed keys %u\n"
- " nr unpacked keys %u\n"
- " floats %zu\n"
- " failed unpacked %zu\n"
- " failed prev %zu\n"
- " failed overflow %zu\n",
- b->level,
- b->data->min_key.inode,
- b->data->min_key.offset,
- b->data->max_key.inode,
- b->data->max_key.offset,
- ptrs,
- f->key_u64s,
- f->bits_per_field[0],
- f->bits_per_field[1],
- f->bits_per_field[2],
- f->bits_per_field[3],
- f->bits_per_field[4],
- b->unpack_fn_len,
- b->nr.live_u64s * sizeof(u64),
- btree_bytes(c) - sizeof(struct btree_node),
- b->nr.live_u64s * 100 / btree_max_u64s(c),
- b->sib_u64s[0],
- b->sib_u64s[1],
- BTREE_FOREGROUND_MERGE_THRESHOLD(c),
- b->nr.packed_keys,
- b->nr.unpacked_keys,
- stats.floats,
- stats.failed_unpacked,
- stats.failed_prev,
- stats.failed_overflow);
+ prt_printf(out, "l %u ", b->c.level);
+ bch2_bpos_to_text(out, b->data->min_key);
+ prt_printf(out, " - ");
+ bch2_bpos_to_text(out, b->data->max_key);
+ prt_printf(out, ":\n"
+ " ptrs: ");
+ bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+ prt_newline(out);
+
+ prt_printf(out,
+ " format: ");
+ bch2_bkey_format_to_text(out, &b->format);
+
+ prt_printf(out,
+ " unpack fn len: %u\n"
+ " bytes used %zu/%zu (%zu%% full)\n"
+ " sib u64s: %u, %u (merge threshold %u)\n"
+ " nr packed keys %u\n"
+ " nr unpacked keys %u\n"
+ " floats %zu\n"
+ " failed unpacked %zu\n",
+ b->unpack_fn_len,
+ b->nr.live_u64s * sizeof(u64),
+ btree_buf_bytes(b) - sizeof(struct btree_node),
+ b->nr.live_u64s * 100 / btree_max_u64s(c),
+ b->sib_u64s[0],
+ b->sib_u64s[1],
+ c->btree_foreground_merge_threshold,
+ b->nr.packed_keys,
+ b->nr.unpacked_keys,
+ stats.floats,
+ stats.failed);
+}
+
+static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
+ const char *label, size_t nr)
+{
+ prt_printf(out, "%s\t", label);
+ prt_human_readable_u64(out, nr * c->opts.btree_node_size);
+ prt_printf(out, " (%zu)\n", nr);
+}
+
+static const char * const bch2_btree_cache_not_freed_reasons_strs[] = {
+#define x(n) #n,
+ BCH_BTREE_CACHE_NOT_FREED_REASONS()
+#undef x
+ NULL
+};
+
+void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
+{
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
+
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);
+
+ prt_btree_cache_line(out, c, "live:", bc->live[0].nr);
+ prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr);
+ prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable);
+ prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty));
+ prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
+ prt_newline(out);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) {
+ bch2_btree_id_to_text(out, i);
+ prt_printf(out, "\t");
+ prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size);
+ prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]);
+ }
+
+ prt_newline(out);
+ prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
+ prt_printf(out, "not freed:\n");
+
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++)
+ prt_printf(out, " %s\t%llu\n",
+ bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]);
}
diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index 96d134f4..dcc34fe4 100644
--- a/libbcachefs/btree_cache.h
+++ b/libbcachefs/btree_cache.h
@@ -1,47 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_CACHE_H
#define _BCACHEFS_BTREE_CACHE_H
#include "bcachefs.h"
#include "btree_types.h"
-#include "extents.h"
+#include "bkey_methods.h"
-struct btree_iter;
+extern const char * const bch2_btree_node_flags[];
-extern const char * const bch2_btree_ids[];
+struct btree_iter;
void bch2_recalc_btree_reserve(struct bch_fs *);
+void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *);
+
+void __bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
+
int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
unsigned, enum btree_id);
-void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
-int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
+void bch2_node_pin(struct bch_fs *, struct btree *);
+void bch2_btree_cache_unpin(struct bch_fs *);
+
+void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *);
+
+void bch2_btree_cache_cannibalize_unlock(struct btree_trans *);
+int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *);
-struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
+struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
+struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool);
-struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
+struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
const struct bkey_i *, unsigned,
- enum six_lock_type, bool);
+ enum six_lock_type, unsigned long);
-struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
- struct btree *, bool,
- enum btree_node_sibling);
+struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *,
+ enum btree_id, unsigned, bool);
-void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *,
- const struct bkey_i *, unsigned);
+int bch2_btree_node_prefetch(struct btree_trans *, struct btree_path *,
+ const struct bkey_i *, enum btree_id, unsigned);
+
+void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *);
void bch2_fs_btree_cache_exit(struct bch_fs *);
int bch2_fs_btree_cache_init(struct bch_fs *);
void bch2_fs_btree_cache_init_early(struct btree_cache *);
-#define PTR_HASH(_k) (bkey_i_to_extent_c(_k)->v._data[0])
+static inline u64 btree_ptr_hash_val(const struct bkey_i *k)
+{
+ switch (k->k.type) {
+ case KEY_TYPE_btree_ptr:
+ return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start);
+ case KEY_TYPE_btree_ptr_v2:
+ /*
+ * The cast/deref is only necessary to avoid sparse endianness
+ * warnings:
+ */
+ return *((u64 *) &bkey_i_to_btree_ptr_v2_c(k)->v.seq);
+ default:
+ return 0;
+ }
+}
+
+static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k)
+{
+ return k->k.type == KEY_TYPE_btree_ptr_v2
+ ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr
+ : NULL;
+}
/* is btree node in hash table? */
static inline bool btree_node_hashed(struct btree *b)
{
- return bkey_extent_is_data(&b->key.k) && PTR_HASH(&b->key);
+ return b->hash_val != 0;
}
#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \
@@ -50,41 +84,71 @@ static inline bool btree_node_hashed(struct btree *b)
_iter = 0; _iter < (_tbl)->size; _iter++) \
rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
-static inline size_t btree_bytes(struct bch_fs *c)
+static inline size_t btree_buf_bytes(const struct btree *b)
{
- return c->opts.btree_node_size << 9;
+ return 1UL << b->byte_order;
}
-static inline size_t btree_max_u64s(struct bch_fs *c)
+static inline size_t btree_buf_max_u64s(const struct btree *b)
{
- return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);
+ return (btree_buf_bytes(b) - sizeof(struct btree_node)) / sizeof(u64);
}
-static inline size_t btree_page_order(struct bch_fs *c)
+static inline size_t btree_max_u64s(const struct bch_fs *c)
{
- return get_order(btree_bytes(c));
+ return (c->opts.btree_node_size - sizeof(struct btree_node)) / sizeof(u64);
}
-static inline size_t btree_pages(struct bch_fs *c)
+static inline size_t btree_sectors(const struct bch_fs *c)
{
- return 1 << btree_page_order(c);
+ return c->opts.btree_node_size >> SECTOR_SHIFT;
}
-static inline unsigned btree_blocks(struct bch_fs *c)
+static inline unsigned btree_blocks(const struct bch_fs *c)
{
- return c->opts.btree_node_size >> c->block_bits;
+ return btree_sectors(c) >> c->block_bits;
}
-#define BTREE_SPLIT_THRESHOLD(c) (btree_blocks(c) * 3 / 4)
+#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3)
#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3)
#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \
(BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \
- (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2))
+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2))
+
+static inline unsigned btree_id_nr_alive(struct bch_fs *c)
+{
+ return BTREE_ID_NR + c->btree_roots_extra.nr;
+}
+
+static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned id)
+{
+ if (likely(id < BTREE_ID_NR)) {
+ return &c->btree_roots_known[id];
+ } else {
+ unsigned idx = id - BTREE_ID_NR;
+
+ /* This can happen when we're called from btree_node_scan */
+ if (idx >= c->btree_roots_extra.nr)
+ return NULL;
+
+ return &c->btree_roots_extra.data[idx];
+ }
+}
+
+static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b)
+{
+ struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id);
+
+ return r ? r->b : NULL;
+}
-#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->btree_id].b)
+const char *bch2_btree_id_str(enum btree_id); /* avoid */
+void bch2_btree_id_to_text(struct printbuf *, enum btree_id);
+void bch2_btree_id_level_to_text(struct printbuf *, enum btree_id, unsigned);
-int bch2_print_btree_node(struct bch_fs *, struct btree *,
- char *, size_t);
+void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
+void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
+void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *);
#endif /* _BCACHEFS_BTREE_CACHE_H */
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 969c1f19..e59924cf 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -1,25 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
* Copyright (C) 2014 Datera Inc.
*/
#include "bcachefs.h"
-#include "alloc.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "backpointers.h"
#include "bkey_methods.h"
+#include "bkey_buf.h"
+#include "btree_journal_iter.h"
+#include "btree_key_cache.h"
#include "btree_locking.h"
+#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
#include "debug.h"
+#include "disk_accounting.h"
+#include "ec.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "keylist.h"
#include "move.h"
+#include "recovery_passes.h"
+#include "reflink.h"
#include "replicas.h"
#include "super-io.h"
+#include "trace.h"
#include <linux/slab.h>
#include <linux/bitops.h>
@@ -28,1069 +40,1224 @@
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
-#include <trace/events/bcachefs.h>
-
-struct range_checks {
- struct range_level {
- struct bpos min;
- struct bpos max;
- } l[BTREE_MAX_DEPTH];
- unsigned depth;
+
+#define DROP_THIS_NODE 10
+#define DROP_PREV_NODE 11
+#define DID_FILL_FROM_SCAN 12
+
+static const char * const bch2_gc_phase_strs[] = {
+#define x(n) #n,
+ GC_PHASES()
+#undef x
+ NULL
};
-static void btree_node_range_checks_init(struct range_checks *r, unsigned depth)
+void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p)
{
- unsigned i;
+ prt_str(out, bch2_gc_phase_strs[p->phase]);
+ prt_char(out, ' ');
+ bch2_btree_id_level_to_text(out, p->btree, p->level);
+ prt_char(out, ' ');
+ bch2_bpos_to_text(out, p->pos);
+}
+
+static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
+{
+ return (struct bkey_s) {{{
+ (struct bkey *) k.k,
+ (struct bch_val *) k.v
+ }}};
+}
+
+static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
+{
+ preempt_disable();
+ write_seqcount_begin(&c->gc_pos_lock);
+ c->gc_pos = new_pos;
+ write_seqcount_end(&c->gc_pos_lock);
+ preempt_enable();
+}
- for (i = 0; i < BTREE_MAX_DEPTH; i++)
- r->l[i].min = r->l[i].max = POS_MIN;
- r->depth = depth;
+static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
+{
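+ /* GC position must only move forwards: */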
+ BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0);
+ __gc_pos_set(c, new_pos);
}
-static void btree_node_range_checks(struct bch_fs *c, struct btree *b,
- struct range_checks *r)
+static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
{
- struct range_level *l = &r->l[b->level];
-
- struct bpos expected_min = bkey_cmp(l->min, l->max)
- ? btree_type_successor(b->btree_id, l->max)
- : l->max;
-
- bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c,
- "btree node has incorrect min key: %llu:%llu != %llu:%llu",
- b->data->min_key.inode,
- b->data->min_key.offset,
- expected_min.inode,
- expected_min.offset);
-
- l->max = b->data->max_key;
-
- if (b->level > r->depth) {
- l = &r->l[b->level - 1];
-
- bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c,
- "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu",
- b->data->min_key.inode,
- b->data->min_key.offset,
- l->min.inode,
- l->min.offset);
-
- bch2_fs_inconsistent_on(bkey_cmp(b->data->max_key, l->max), c,
- "btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu",
- b->data->max_key.inode,
- b->data->max_key.offset,
- l->max.inode,
- l->max.offset);
-
- if (bkey_cmp(b->data->max_key, POS_MAX))
- l->min = l->max =
- btree_type_successor(b->btree_id,
- b->data->max_key);
+ switch (b->key.k.type) {
+ case KEY_TYPE_btree_ptr: {
+ struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);
+
+ dst->k.p = src->k.p;
+ dst->v.mem_ptr = 0;
+ dst->v.seq = b->data->keys.seq;
+ dst->v.sectors_written = 0;
+ dst->v.flags = 0;
+ dst->v.min_key = b->data->min_key;
+ set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
+ memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
+ break;
+ }
+ case KEY_TYPE_btree_ptr_v2:
+ bkey_copy(&dst->k_i, &b->key);
+ break;
+ default:
+ BUG();
}
}
-u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
+static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
{
- const struct bch_extent_ptr *ptr;
- u8 max_stale = 0;
+ struct bkey_i_btree_ptr_v2 *new;
+ int ret;
- if (bkey_extent_is_data(k.k)) {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ if (c->opts.verbose) {
+ struct printbuf buf = PRINTBUF;
- extent_for_each_ptr(e, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- size_t b = PTR_BUCKET_NR(ca, ptr);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, " -> ");
+ bch2_bpos_to_text(&buf, new_min);
- if (gen_after(ca->oldest_gens[b], ptr->gen))
- ca->oldest_gens[b] = ptr->gen;
+ bch_info(c, "%s(): %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+ }
- max_stale = max(max_stale, ptr_stale(ca, ptr));
- }
+ new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
+ if (!new)
+ return -BCH_ERR_ENOMEM_gc_repair_key;
+
+ btree_ptr_to_v2(b, new);
+ b->data->min_key = new_min;
+ new->v.min_key = new_min;
+ SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
+
+ ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
+ if (ret) {
+ kfree(new);
+ return ret;
}
- return max_stale;
+ bch2_btree_node_drop_keys_outside_node(b);
+ bkey_copy(&b->key, &new->k_i);
+ return 0;
}
-/*
- * For runtime mark and sweep:
- */
-static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
- struct bkey_s_c k, unsigned flags)
+static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
{
- struct gc_pos pos = { 0 };
- u8 ret = 0;
-
- switch (type) {
- case BKEY_TYPE_BTREE:
- bch2_mark_key(c, k, c->opts.btree_node_size, true, pos, NULL,
- 0, flags|
- BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
- BCH_BUCKET_MARK_GC_LOCK_HELD);
- break;
- case BKEY_TYPE_EXTENTS:
- bch2_mark_key(c, k, k.k->size, false, pos, NULL,
- 0, flags|
- BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
- BCH_BUCKET_MARK_GC_LOCK_HELD);
- ret = bch2_btree_key_recalc_oldest_gen(c, k);
- break;
- default:
- BUG();
+ struct bkey_i_btree_ptr_v2 *new;
+ int ret;
+
+ if (c->opts.verbose) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, " -> ");
+ bch2_bpos_to_text(&buf, new_max);
+
+ bch_info(c, "%s(): %s", __func__, buf.buf);
+ printbuf_exit(&buf);
}
- return ret;
+ ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
+ if (ret)
+ return ret;
+
+ new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
+ if (!new)
+ return -BCH_ERR_ENOMEM_gc_repair_key;
+
+ btree_ptr_to_v2(b, new);
+ b->data->max_key = new_max;
+ new->k.p = new_max;
+ SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
+
+ ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
+ if (ret) {
+ kfree(new);
+ return ret;
+ }
+
+ bch2_btree_node_drop_keys_outside_node(b);
+
+ mutex_lock(&c->btree_cache.lock);
+ __bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ bkey_copy(&b->key, &new->k_i);
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+ BUG_ON(ret);
+ mutex_unlock(&c->btree_cache.lock);
+ return 0;
}
-int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
- struct bkey_s_c k)
+static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *b,
+ struct btree *prev, struct btree *cur,
+ struct bpos *pulled_from_scan)
{
- enum bch_data_type data_type = type == BKEY_TYPE_BTREE
- ? BCH_DATA_BTREE : BCH_DATA_USER;
+ struct bch_fs *c = trans->c;
+ struct bpos expected_start = !prev
+ ? b->data->min_key
+ : bpos_successor(prev->key.k.p);
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- BUG_ON(journal_seq_verify(c) &&
- k.k->version.lo > journal_cur_seq(&c->journal));
+ BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
+ b->data->min_key));
- if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
- fsck_err_on(!bch2_bkey_replicas_marked(c, data_type, k), c,
- "superblock not marked as containing replicas (type %u)",
- data_type)) {
- ret = bch2_mark_bkey_replicas(c, data_type, k);
- if (ret)
- return ret;
+ if (bpos_eq(expected_start, cur->data->min_key))
+ return 0;
+
+ prt_printf(&buf, " at ");
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_printf(&buf, ":\n parent: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ if (prev) {
+ prt_printf(&buf, "\n prev: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
}
- switch (k.k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED: {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const struct bch_extent_ptr *ptr;
-
- extent_for_each_ptr(e, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- size_t b = PTR_BUCKET_NR(ca, ptr);
- struct bucket *g = PTR_BUCKET(ca, ptr);
-
- if (mustfix_fsck_err_on(!g->mark.gen_valid, c,
- "found ptr with missing gen in alloc btree,\n"
- "type %s gen %u",
- bch2_data_types[data_type],
- ptr->gen)) {
- g->_mark.gen = ptr->gen;
- g->_mark.gen_valid = 1;
- set_bit(b, ca->buckets_dirty);
- }
+ prt_str(&buf, "\n next: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));
- if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
- "%s ptr gen in the future: %u > %u",
- bch2_data_types[data_type],
- ptr->gen, g->mark.gen)) {
- g->_mark.gen = ptr->gen;
- g->_mark.gen_valid = 1;
- set_bit(b, ca->buckets_dirty);
- set_bit(BCH_FS_FIXED_GENS, &c->flags);
- }
+ if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */
+ if (b->c.level == 1 &&
+ bpos_lt(*pulled_from_scan, cur->data->min_key)) {
+ ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
+ expected_start,
+ bpos_predecessor(cur->data->min_key));
+ if (ret)
+ goto err;
+ *pulled_from_scan = cur->data->min_key;
+ ret = DID_FILL_FROM_SCAN;
+ } else {
+ if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
+ "btree node with incorrect min_key%s", buf.buf))
+ ret = set_node_min(c, cur, expected_start);
+ }
+ } else { /* overlap */
+ if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */
+ if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */
+ if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node,
+ "btree node overwritten by next node%s", buf.buf))
+ ret = DROP_PREV_NODE;
+ } else {
+ if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
+ "btree node with incorrect max_key%s", buf.buf))
+ ret = set_node_max(c, prev,
+ bpos_predecessor(cur->data->min_key));
+ }
+ } else {
+ if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */
+ if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node,
+ "btree node overwritten by prev node%s", buf.buf))
+ ret = DROP_THIS_NODE;
+ } else {
+ if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
+ "btree node with incorrect min_key%s", buf.buf))
+ ret = set_node_min(c, cur, expected_start);
+ }
}
- break;
- }
}
-
- atomic64_set(&c->key_version,
- max_t(u64, k.k->version.lo,
- atomic64_read(&c->key_version)));
-
- bch2_gc_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC);
+err:
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
-static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b)
+static int btree_repair_node_end(struct btree_trans *trans, struct btree *b,
+ struct btree *child, struct bpos *pulled_from_scan)
{
- enum bkey_type type = btree_node_type(b);
- struct btree_node_iter iter;
- struct bkey unpacked;
- struct bkey_s_c k;
- u8 stale = 0;
-
- if (btree_node_has_ptrs(b))
- for_each_btree_node_key_unpack(b, k, &iter,
- btree_node_is_extents(b),
- &unpacked) {
- bch2_bkey_debugcheck(c, b, k);
- stale = max(stale, bch2_gc_mark_key(c, type, k, 0));
- }
-
- return stale;
-}
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
-static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
-{
- write_seqcount_begin(&c->gc_pos_lock);
- c->gc_pos = new_pos;
- write_seqcount_end(&c->gc_pos_lock);
-}
+ if (bpos_eq(child->key.k.p, b->key.k.p))
+ return 0;
-static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
-{
- BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
- __gc_pos_set(c, new_pos);
+ prt_printf(&buf, " at ");
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_printf(&buf, ":\n parent: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ prt_str(&buf, "\n child: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
+
+ if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
+ "btree node with incorrect max_key%s", buf.buf)) {
+ if (b->c.level == 1 &&
+ bpos_lt(*pulled_from_scan, b->key.k.p)) {
+ ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
+ bpos_successor(child->key.k.p), b->key.k.p);
+ if (ret)
+ goto err;
+
+ *pulled_from_scan = b->key.k.p;
+ ret = DID_FILL_FROM_SCAN;
+ } else {
+ ret = set_node_max(c, child, b->key.k.p);
+ }
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
}
-static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
+static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b,
+ struct bpos *pulled_from_scan)
{
- struct btree_iter iter;
- struct btree *b;
- struct range_checks r;
- unsigned depth = btree_id == BTREE_ID_EXTENTS ? 0 : 1;
- unsigned max_stale;
+ struct bch_fs *c = trans->c;
+ struct btree_and_journal_iter iter;
+ struct bkey_s_c k;
+ struct bkey_buf prev_k, cur_k;
+ struct btree *prev = NULL, *cur = NULL;
+ bool have_child, new_pass = false;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
-
- if (!c->btree_roots[btree_id].b)
+ if (!b->c.level)
return 0;
- /*
- * if expensive_debug_checks is on, run range_checks on all leaf nodes:
- */
- if (expensive_debug_checks(c))
- depth = 0;
+ bch2_bkey_buf_init(&prev_k);
+ bch2_bkey_buf_init(&cur_k);
+again:
+ cur = prev = NULL;
+ have_child = new_pass = false;
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+ iter.prefetch = true;
+
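+ /* First pass: check each child's boundaries against its previous sibling: */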
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ BUG_ON(bpos_lt(k.k->p, b->data->min_key));
+ BUG_ON(bpos_gt(k.k->p, b->data->max_key));
+
+ bch2_btree_and_journal_iter_advance(&iter);
+ bch2_bkey_buf_reassemble(&cur_k, c, k);
+
+ cur = bch2_btree_node_get_noiter(trans, cur_k.k,
+ b->c.btree_id, b->c.level - 1,
+ false);
+ ret = PTR_ERR_OR_ZERO(cur);
+
+ printbuf_reset(&buf);
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level - 1);
+ prt_char(&buf, ' ');
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
+
+ if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
+ trans, btree_node_read_error,
+ "Topology repair: unreadable btree node at\n"
+ " %s",
+ buf.buf)) {
+ bch2_btree_node_evict(trans, cur_k.k);
+ cur = NULL;
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur_k.k->k.p);
+ if (ret)
+ break;
+
+ if (!btree_id_is_alloc(b->c.btree_id)) {
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ if (ret)
+ break;
+ }
+ continue;
+ }
- btree_node_range_checks_init(&r, depth);
+ bch_err_msg(c, ret, "getting btree node");
+ if (ret)
+ break;
+
+ if (bch2_btree_node_is_stale(c, cur)) {
+ bch_info(c, "btree node older than nodes found by scanning\n %s", buf.buf);
+ six_unlock_read(&cur->c.lock);
+ bch2_btree_node_evict(trans, cur_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur_k.k->k.p);
+ cur = NULL;
+ if (ret)
+ break;
+ continue;
+ }
- __for_each_btree_node(&iter, c, btree_id, POS_MIN,
- 0, depth, BTREE_ITER_PREFETCH, b) {
- btree_node_range_checks(c, b, &r);
+ ret = btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan);
+ if (ret == DID_FILL_FROM_SCAN) {
+ new_pass = true;
+ ret = 0;
+ }
+
+ if (ret == DROP_THIS_NODE) {
+ six_unlock_read(&cur->c.lock);
+ bch2_btree_node_evict(trans, cur_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur_k.k->k.p);
+ cur = NULL;
+ if (ret)
+ break;
+ continue;
+ }
- bch2_verify_btree_nr_keys(b);
+ if (prev)
+ six_unlock_read(&prev->c.lock);
+ prev = NULL;
- max_stale = btree_gc_mark_node(c, b);
+ if (ret == DROP_PREV_NODE) {
+ bch_info(c, "dropped prev node");
+ bch2_btree_node_evict(trans, prev_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, prev_k.k->k.p);
+ if (ret)
+ break;
- gc_pos_set(c, gc_pos_btree_node(b));
+ bch2_btree_and_journal_iter_exit(&iter);
+ goto again;
+ } else if (ret)
+ break;
- if (max_stale > 64)
- bch2_btree_node_rewrite(c, &iter,
- b->data->keys.seq,
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_NOWAIT|
- BTREE_INSERT_GC_LOCK_HELD);
- else if (!btree_gc_rewrite_disabled(c) &&
- (btree_gc_always_rewrite(c) || max_stale > 16))
- bch2_btree_node_rewrite(c, &iter,
- b->data->keys.seq,
- BTREE_INSERT_NOWAIT|
- BTREE_INSERT_GC_LOCK_HELD);
+ prev = cur;
+ cur = NULL;
+ bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
+ }
- bch2_btree_iter_cond_resched(&iter);
+ if (!ret && !IS_ERR_OR_NULL(prev)) {
+ BUG_ON(cur);
+ ret = btree_repair_node_end(trans, b, prev, pulled_from_scan);
+ if (ret == DID_FILL_FROM_SCAN) {
+ new_pass = true;
+ ret = 0;
+ }
}
- ret = bch2_btree_iter_unlock(&iter);
+
+ if (!IS_ERR_OR_NULL(prev))
+ six_unlock_read(&prev->c.lock);
+ prev = NULL;
+ if (!IS_ERR_OR_NULL(cur))
+ six_unlock_read(&cur->c.lock);
+ cur = NULL;
+
if (ret)
- return ret;
+ goto err;
- mutex_lock(&c->btree_root_lock);
+ bch2_btree_and_journal_iter_exit(&iter);
- b = c->btree_roots[btree_id].b;
- if (!btree_node_fake(b))
- bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0);
- gc_pos_set(c, gc_pos_btree_root(b->btree_id));
+ if (new_pass)
+ goto again;
- mutex_unlock(&c->btree_root_lock);
- return 0;
-}
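+ /* Second pass: recurse into each child node now that boundaries are repaired: */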
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+ iter.prefetch = true;
-static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
- u64 start, u64 end,
- enum bch_data_type type,
- unsigned flags)
-{
- u64 b = sector_to_bucket(ca, start);
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ bch2_bkey_buf_reassemble(&cur_k, c, k);
+ bch2_btree_and_journal_iter_advance(&iter);
- do {
- unsigned sectors =
- min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
-
- bch2_mark_metadata_bucket(c, ca, b, type, sectors,
- gc_phase(GC_PHASE_SB), flags);
- b++;
- start += sectors;
- } while (start < end);
-}
+ cur = bch2_btree_node_get_noiter(trans, cur_k.k,
+ b->c.btree_id, b->c.level - 1,
+ false);
+ ret = PTR_ERR_OR_ZERO(cur);
-void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
- unsigned flags)
-{
- struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
- unsigned i;
- u64 b;
+ bch_err_msg(c, ret, "getting btree node");
+ if (ret)
+ goto err;
- if (c) {
- lockdep_assert_held(&c->sb_lock);
- percpu_down_read_preempt_disable(&c->usage_lock);
- }
+ ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan);
+ six_unlock_read(&cur->c.lock);
+ cur = NULL;
- for (i = 0; i < layout->nr_superblocks; i++) {
- u64 offset = le64_to_cpu(layout->sb_offset[i]);
+ if (ret == DROP_THIS_NODE) {
+ bch2_btree_node_evict(trans, cur_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur_k.k->k.p);
+ new_pass = true;
+ }
- if (offset == BCH_SB_SECTOR)
- mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
- BCH_DATA_SB, flags);
+ if (ret)
+ goto err;
- mark_metadata_sectors(c, ca, offset,
- offset + (1 << layout->sb_max_size_bits),
- BCH_DATA_SB, flags);
+ have_child = true;
}
- if (c)
- spin_lock(&c->journal.lock);
+ printbuf_reset(&buf);
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- for (i = 0; i < ca->journal.nr; i++) {
- b = ca->journal.buckets[i];
- bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL,
- ca->mi.bucket_size,
- gc_phase(GC_PHASE_SB), flags);
- }
+ if (mustfix_fsck_err_on(!have_child,
+ trans, btree_node_topology_interior_node_empty,
+ "empty interior btree node at %s", buf.buf))
+ ret = DROP_THIS_NODE;
+err:
+fsck_err:
+ if (!IS_ERR_OR_NULL(prev))
+ six_unlock_read(&prev->c.lock);
+ if (!IS_ERR_OR_NULL(cur))
+ six_unlock_read(&cur->c.lock);
- if (c) {
- percpu_up_read_preempt_enable(&c->usage_lock);
- spin_unlock(&c->journal.lock);
- }
-}
+ bch2_btree_and_journal_iter_exit(&iter);
-static void bch2_mark_superblocks(struct bch_fs *c)
-{
- struct bch_dev *ca;
- unsigned i;
+ if (!ret && new_pass)
+ goto again;
- mutex_lock(&c->sb_lock);
- gc_pos_set(c, gc_phase(GC_PHASE_SB));
+ BUG_ON(!ret && bch2_btree_node_check_topology(trans, b));
- for_each_online_member(ca, c, i)
- bch2_mark_dev_superblock(c, ca,
- BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
- BCH_BUCKET_MARK_GC_LOCK_HELD);
- mutex_unlock(&c->sb_lock);
+ bch2_bkey_buf_exit(&prev_k, c);
+ bch2_bkey_buf_exit(&cur_k, c);
+ printbuf_exit(&buf);
+ return ret;
}
-/* Also see bch2_pending_btree_node_free_insert_done() */
-static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
+int bch2_check_topology(struct bch_fs *c)
{
- struct gc_pos pos = { 0 };
- struct bch_fs_usage stats = { 0 };
- struct btree_update *as;
- struct pending_btree_node_free *d;
-
- mutex_lock(&c->btree_interior_update_lock);
- gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));
-
- for_each_pending_btree_node_free(c, as, d)
- if (d->index_update_done)
- bch2_mark_key(c, bkey_i_to_s_c(&d->key),
- c->opts.btree_node_size, true, pos,
- &stats, 0,
- BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
- BCH_BUCKET_MARK_GC_LOCK_HELD);
- /*
- * Don't apply stats - pending deletes aren't tracked in
- * bch_alloc_stats:
- */
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct bpos pulled_from_scan = POS_MIN;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
- mutex_unlock(&c->btree_interior_update_lock);
-}
+ bch2_trans_srcu_unlock(trans);
-static void bch2_mark_allocator_buckets(struct bch_fs *c)
-{
- struct bch_dev *ca;
- struct open_bucket *ob;
- size_t i, j, iter;
- unsigned ci;
+ for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
+ struct btree_root *r = bch2_btree_id_root(c, i);
+ bool reconstructed_root = false;
- percpu_down_read_preempt_disable(&c->usage_lock);
+ printbuf_reset(&buf);
+ bch2_btree_id_to_text(&buf, i);
- spin_lock(&c->freelist_lock);
- gc_pos_set(c, gc_pos_alloc(c, NULL));
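+ /* Root could not be read: rebuild this btree from nodes found by scanning: */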
+ if (r->error) {
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ if (ret)
+ break;
+reconstruct_root:
+ bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);
+
+ r->alive = false;
+ r->error = 0;
+
+ if (!bch2_btree_has_scanned_nodes(c, i)) {
+ mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing,
+ "no nodes found for btree %s, continue?", buf.buf);
+ bch2_btree_root_alloc_fake_trans(trans, i, 0);
+ } else {
+ bch2_btree_root_alloc_fake_trans(trans, i, 1);
+ bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
+ if (ret)
+ break;
+ }
- for_each_member_device(ca, c, ci) {
- fifo_for_each_entry(i, &ca->free_inc, iter)
- bch2_mark_alloc_bucket(c, ca, i, true,
- gc_pos_alloc(c, NULL),
- BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
- BCH_BUCKET_MARK_GC_LOCK_HELD);
+ reconstructed_root = true;
+ }
+ struct btree *b = r->b;
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+ ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
+ six_unlock_read(&b->c.lock);
- for (j = 0; j < RESERVE_NR; j++)
- fifo_for_each_entry(i, &ca->free[j], iter)
- bch2_mark_alloc_bucket(c, ca, i, true,
- gc_pos_alloc(c, NULL),
- BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
- BCH_BUCKET_MARK_GC_LOCK_HELD);
- }
+ if (ret == DROP_THIS_NODE) {
+ mutex_lock(&c->btree_cache.lock);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+ mutex_unlock(&c->btree_cache.lock);
- spin_unlock(&c->freelist_lock);
-
- for (ob = c->open_buckets;
- ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
- ob++) {
- spin_lock(&ob->lock);
- if (ob->valid) {
- gc_pos_set(c, gc_pos_alloc(c, ob));
- ca = bch_dev_bkey_exists(c, ob->ptr.dev);
- bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true,
- gc_pos_alloc(c, ob),
- BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
- BCH_BUCKET_MARK_GC_LOCK_HELD);
+ r->b = NULL;
+
+ if (!reconstructed_root)
+ goto reconstruct_root;
+
+ bch_err(c, "empty btree root %s", buf.buf);
+ bch2_btree_root_alloc_fake_trans(trans, i, 0);
+ r->alive = false;
+ ret = 0;
}
- spin_unlock(&ob->lock);
}
-
- percpu_up_read_preempt_enable(&c->usage_lock);
+fsck_err:
+ printbuf_exit(&buf);
+ bch2_trans_put(trans);
+ return ret;
}
-static void bch2_gc_start(struct bch_fs *c)
+/* marking of btree keys/nodes: */
+
+static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
+ unsigned level, struct btree **prev,
+ struct btree_iter *iter, struct bkey_s_c k,
+ bool initial)
{
- struct bch_dev *ca;
- struct bucket_array *buckets;
- struct bucket_mark new;
- unsigned i;
- size_t b;
- int cpu;
+ struct bch_fs *c = trans->c;
- percpu_down_write(&c->usage_lock);
+ if (iter) {
+ struct btree_path *path = btree_iter_path(trans, iter);
+ struct btree *b = path_l(path)->b;
- /*
- * Indicates to buckets code that gc is now in progress - done under
- * usage_lock to avoid racing with bch2_mark_key():
- */
- __gc_pos_set(c, gc_phase(GC_PHASE_START));
-
- /* Save a copy of the existing bucket stats while we recompute them: */
- for_each_member_device(ca, c, i) {
- ca->usage_cached = __bch2_dev_usage_read(ca);
- for_each_possible_cpu(cpu) {
- struct bch_dev_usage *p =
- per_cpu_ptr(ca->usage_percpu, cpu);
- memset(p, 0, sizeof(*p));
+ if (*prev != b) {
+ int ret = bch2_btree_node_check_topology(trans, b);
+ if (ret)
+ return ret;
}
+ *prev = b;
}
- c->usage_cached = __bch2_fs_usage_read(c);
- for_each_possible_cpu(cpu) {
- struct bch_fs_usage *p =
- per_cpu_ptr(c->usage_percpu, cpu);
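+ /* gc triggers take (old, new) keys; use a deleted key at the same position as "old": */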
+ struct bkey deleted = KEY(0, 0, 0);
+ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
- memset(p->s, 0, sizeof(p->s));
- }
+ deleted.p = k.k->p;
- percpu_up_write(&c->usage_lock);
-
- /* Clear bucket marks: */
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
- bucket_cmpxchg(buckets->b + b, new, ({
- new.owned_by_allocator = 0;
- new.data_type = 0;
- new.cached_sectors = 0;
- new.dirty_sectors = 0;
- }));
- ca->oldest_gens[b] = new.gen;
- }
- up_read(&ca->bucket_lock);
+ if (initial) {
+ BUG_ON(bch2_journal_seq_verify &&
+ k.k->bversion.lo > atomic64_read(&c->journal.seq));
+
+ if (fsck_err_on(btree_id != BTREE_ID_accounting &&
+ k.k->bversion.lo > atomic64_read(&c->key_version),
+ trans, bkey_version_in_future,
+ "key version number higher than recorded %llu\n %s",
+ atomic64_read(&c->key_version),
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ atomic64_set(&c->key_version, k.k->bversion.lo);
}
-}
-/**
- * bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes
- */
-void bch2_gc(struct bch_fs *c)
-{
- struct bch_dev *ca;
- u64 start_time = local_clock();
- unsigned i;
+ if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
+ trans, btree_bitmap_not_marked,
+ "btree ptr not marked in member info btree allocated bitmap\n %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf))) {
+ mutex_lock(&c->sb_lock);
+ bch2_dev_btree_bitmap_mark(c, k);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
/*
- * Walk _all_ references to buckets, and recompute them:
- *
- * Order matters here:
- * - Concurrent GC relies on the fact that we have a total ordering for
- * everything that GC walks - see gc_will_visit_node(),
- * gc_will_visit_root()
- *
- * - also, references move around in the course of index updates and
- * various other crap: everything needs to agree on the ordering
- * references are allowed to move around in - e.g., we're allowed to
- * start with a reference owned by an open_bucket (the allocator) and
- * move it to the btree, but not the reverse.
- *
- * This is necessary to ensure that gc doesn't miss references that
- * move around - if references move backwards in the ordering GC
- * uses, GC could skip past them
+ * We require a commit before key_trigger() because
+	 * key_trigger(BTREE_TRIGGER_GC) is not idempotent; we'll calculate the
+ * wrong result if we run it multiple times.
*/
- trace_gc_start(c);
+ unsigned flags = !iter ? BTREE_TRIGGER_is_root : 0;
- /*
- * Do this before taking gc_lock - bch2_disk_reservation_get() blocks on
- * gc_lock if sectors_available goes to 0:
- */
- bch2_recalc_sectors_available(c);
+ ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
+ BTREE_TRIGGER_check_repair|flags);
+ if (ret)
+ goto out;
- down_write(&c->gc_lock);
- if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
+ if (trans->nr_updates) {
+ ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
+ -BCH_ERR_transaction_restart_nested;
goto out;
+ }
- bch2_gc_start(c);
+ ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
+ BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags);
+out:
+fsck_err:
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
- bch2_mark_superblocks(c);
+static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial)
+{
+ struct bch_fs *c = trans->c;
+ unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1;
+ int ret = 0;
- /* Walk btree: */
- for (i = 0; i < BTREE_ID_NR; i++) {
- int ret = bch2_gc_btree(c, i);
- if (ret) {
- bch_err(c, "btree gc failed: %d", ret);
- set_bit(BCH_FS_GC_FAILURE, &c->flags);
- goto out;
- }
- }
+ /* We need to make sure every leaf node is readable before going RW */
+ if (initial)
+ target_depth = 0;
- bch2_mark_pending_btree_node_frees(c);
- bch2_mark_allocator_buckets(c);
+ for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) {
+ struct btree *prev = NULL;
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level,
+ BTREE_ITER_prefetch);
- for_each_member_device(ca, c, i)
- atomic_long_set(&ca->saturated_count, 0);
+ ret = for_each_btree_key_continue(trans, iter, 0, k, ({
+ gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
+ bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
+ }));
+ if (ret)
+ goto err;
+ }
- /* Indicates that gc is no longer in progress: */
- gc_pos_set(c, gc_phase(GC_PHASE_DONE));
- c->gc_count++;
-out:
- up_write(&c->gc_lock);
- trace_gc_end(c);
- bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
+ /* root */
+ do {
+retry_root:
+ bch2_trans_begin(trans);
+
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN,
+ 0, bch2_btree_id_root(c, btree)->b->c.level, 0);
+ struct btree *b = bch2_btree_iter_peek_node(&iter);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto err_root;
- /*
- * Wake up allocator in case it was waiting for buckets
- * because of not being able to inc gens
- */
- for_each_member_device(ca, c, i)
- bch2_wake_allocator(ca);
+ if (b != btree_node_root(c, b)) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto retry_root;
+ }
- /*
- * At startup, allocations can happen directly instead of via the
- * allocator thread - issue wakeup in case they blocked on gc_lock:
- */
- closure_wake_up(&c->freelist_wait);
+ gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
+ struct bkey_s_c k = bkey_i_to_s_c(&b->key);
+ ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial);
+err_root:
+ bch2_trans_iter_exit(trans, &iter);
+ } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
+err:
+ bch_err_fn(c, ret);
+ return ret;
}
-/* Btree coalescing */
+static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
+{
+ return cmp_int(gc_btree_order(l), gc_btree_order(r));
+}
-static void recalc_packed_keys(struct btree *b)
+static int bch2_gc_btrees(struct bch_fs *c)
{
- struct bkey_packed *k;
+ struct btree_trans *trans = bch2_trans_get(c);
+ enum btree_id ids[BTREE_ID_NR];
+ struct printbuf buf = PRINTBUF;
+ unsigned i;
+ int ret = 0;
- memset(&b->nr, 0, sizeof(b->nr));
+ for (i = 0; i < BTREE_ID_NR; i++)
+ ids[i] = i;
+ bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
- BUG_ON(b->nsets != 1);
+ for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
+ unsigned btree = i < BTREE_ID_NR ? ids[i] : i;
- for (k = btree_bkey_first(b, b->set);
- k != btree_bkey_last(b, b->set);
- k = bkey_next(k))
- btree_keys_account_key_add(&b->nr, 0, k);
-}
+ if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
+ continue;
-static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
- struct btree *old_nodes[GC_MERGE_NODES])
-{
- struct btree *parent = btree_node_parent(iter, old_nodes[0]);
- unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0;
- unsigned blocks = btree_blocks(c) * 2 / 3;
- struct btree *new_nodes[GC_MERGE_NODES];
- struct btree_update *as;
- struct keylist keylist;
- struct bkey_format_state format_state;
- struct bkey_format new_format;
-
- memset(new_nodes, 0, sizeof(new_nodes));
- bch2_keylist_init(&keylist, NULL);
-
- /* Count keys that are not deleted */
- for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++)
- u64s += old_nodes[i]->nr.live_u64s;
-
- nr_old_nodes = nr_new_nodes = i;
-
- /* Check if all keys in @old_nodes could fit in one fewer node */
- if (nr_old_nodes <= 1 ||
- __vstruct_blocks(struct btree_node, c->block_bits,
- DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks)
- return;
-
- /* Find a format that all keys in @old_nodes can pack into */
- bch2_bkey_format_init(&format_state);
-
- for (i = 0; i < nr_old_nodes; i++)
- __bch2_btree_calc_format(&format_state, old_nodes[i]);
-
- new_format = bch2_bkey_format_done(&format_state);
-
- /* Check if repacking would make any nodes too big to fit */
- for (i = 0; i < nr_old_nodes; i++)
- if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) {
- trace_btree_gc_coalesce_fail(c,
- BTREE_GC_COALESCE_FAIL_FORMAT_FITS);
- return;
- }
+ ret = bch2_gc_btree(trans, btree, true);
- if (bch2_keylist_realloc(&keylist, NULL, 0,
- (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
- trace_btree_gc_coalesce_fail(c,
- BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
- return;
+ if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
+ trans, btree_node_read_error,
+ "btree node read error for %s",
+ (printbuf_reset(&buf),
+ bch2_btree_id_to_text(&buf, btree),
+ buf.buf)))
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
}
+fsck_err:
+ printbuf_exit(&buf);
+ bch2_trans_put(trans);
+ bch_err_fn(c, ret);
+ return ret;
+}
- as = bch2_btree_update_start(c, iter->btree_id,
- btree_update_reserve_required(c, parent) + nr_old_nodes,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE,
- NULL);
- if (IS_ERR(as)) {
- trace_btree_gc_coalesce_fail(c,
- BTREE_GC_COALESCE_FAIL_RESERVE_GET);
- bch2_keylist_free(&keylist, NULL);
- return;
- }
+static int bch2_mark_superblocks(struct bch_fs *c)
+{
+ gc_pos_set(c, gc_phase(GC_PHASE_sb));
- trace_btree_gc_coalesce(c, old_nodes[0]);
+ return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc);
+}
- for (i = 0; i < nr_old_nodes; i++)
- bch2_btree_interior_update_will_free_node(as, old_nodes[i]);
+static void bch2_gc_free(struct bch_fs *c)
+{
+ bch2_accounting_gc_free(c);
- /* Repack everything with @new_format and sort down to one bset */
- for (i = 0; i < nr_old_nodes; i++)
- new_nodes[i] =
- __bch2_btree_node_alloc_replacement(as, old_nodes[i],
- new_format);
+ genradix_free(&c->reflink_gc_table);
+ genradix_free(&c->gc_stripes);
- /*
- * Conceptually we concatenate the nodes together and slice them
- * up at different boundaries.
- */
- for (i = nr_new_nodes - 1; i > 0; --i) {
- struct btree *n1 = new_nodes[i];
- struct btree *n2 = new_nodes[i - 1];
-
- struct bset *s1 = btree_bset_first(n1);
- struct bset *s2 = btree_bset_first(n2);
- struct bkey_packed *k, *last = NULL;
-
- /* Calculate how many keys from @n2 we could fit inside @n1 */
- u64s = 0;
-
- for (k = s2->start;
- k < vstruct_last(s2) &&
- vstruct_blocks_plus(n1->data, c->block_bits,
- u64s + k->u64s) <= blocks;
- k = bkey_next(k)) {
- last = k;
- u64s += k->u64s;
- }
+ for_each_member_device(c, ca)
+ genradix_free(&ca->buckets_gc);
+}
- if (u64s == le16_to_cpu(s2->u64s)) {
- /* n2 fits entirely in n1 */
- n1->key.k.p = n1->data->max_key = n2->data->max_key;
-
- memcpy_u64s(vstruct_last(s1),
- s2->start,
- le16_to_cpu(s2->u64s));
- le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s));
-
- set_btree_bset_end(n1, n1->set);
-
- six_unlock_write(&n2->lock);
- bch2_btree_node_free_never_inserted(c, n2);
- six_unlock_intent(&n2->lock);
-
- memmove(new_nodes + i - 1,
- new_nodes + i,
- sizeof(new_nodes[0]) * (nr_new_nodes - i));
- new_nodes[--nr_new_nodes] = NULL;
- } else if (u64s) {
- /* move part of n2 into n1 */
- n1->key.k.p = n1->data->max_key =
- bkey_unpack_pos(n1, last);
-
- n2->data->min_key =
- btree_type_successor(iter->btree_id,
- n1->data->max_key);
-
- memcpy_u64s(vstruct_last(s1),
- s2->start, u64s);
- le16_add_cpu(&s1->u64s, u64s);
-
- memmove(s2->start,
- vstruct_idx(s2, u64s),
- (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64));
- s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s);
-
- set_btree_bset_end(n1, n1->set);
- set_btree_bset_end(n2, n2->set);
+static int bch2_gc_start(struct bch_fs *c)
+{
+ for_each_member_device(c, ca) {
+ int ret = bch2_dev_usage_init(ca, true);
+ if (ret) {
+ bch2_dev_put(ca);
+ return ret;
}
}
- for (i = 0; i < nr_new_nodes; i++) {
- struct btree *n = new_nodes[i];
+ return 0;
+}
+
+/* returns true if not equal */
+static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
+ struct bch_alloc_v4 r)
+{
+ return l.gen != r.gen ||
+ l.oldest_gen != r.oldest_gen ||
+ l.data_type != r.data_type ||
+ l.dirty_sectors != r.dirty_sectors ||
+ l.stripe_sectors != r.stripe_sectors ||
+ l.cached_sectors != r.cached_sectors ||
+ l.stripe_redundancy != r.stripe_redundancy ||
+ l.stripe != r.stripe;
+}
+
+static int bch2_alloc_write_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_dev *ca,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i_alloc_v4 *a;
+ struct bch_alloc_v4 old_gc, gc, old_convert, new;
+ const struct bch_alloc_v4 *old;
+ int ret;
+
+ if (!bucket_valid(ca, k.k->p.offset))
+ return 0;
+
+ old = bch2_alloc_to_v4(k, &old_convert);
+ gc = new = *old;
- recalc_packed_keys(n);
- btree_node_reset_sib_u64s(n);
+ percpu_down_read(&c->mark_lock);
+ __bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset));
- bch2_btree_build_aux_trees(n);
- six_unlock_write(&n->lock);
+ old_gc = gc;
- bch2_btree_node_write(c, n, SIX_LOCK_intent);
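+ /* gc doesn't see sb/journal buckets on offline devices; keep the existing counts: */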
+ if ((old->data_type == BCH_DATA_sb ||
+ old->data_type == BCH_DATA_journal) &&
+ !bch2_dev_is_online(ca)) {
+ gc.data_type = old->data_type;
+ gc.dirty_sectors = old->dirty_sectors;
}
+ percpu_up_read(&c->mark_lock);
/*
- * The keys for the old nodes get deleted. We don't want to insert keys
- * that compare equal to the keys for the new nodes we'll also be
- * inserting - we can't because keys on a keylist must be strictly
- * greater than the previous keys, and we also don't need to since the
- * key for the new node will serve the same purpose (overwriting the key
- * for the old node).
+ * gc.data_type doesn't yet include need_discard & need_gc_gen states -
+ * fix that here:
*/
- for (i = 0; i < nr_old_nodes; i++) {
- struct bkey_i delete;
- unsigned j;
-
- for (j = 0; j < nr_new_nodes; j++)
- if (!bkey_cmp(old_nodes[i]->key.k.p,
- new_nodes[j]->key.k.p))
- goto next;
-
- bkey_init(&delete.k);
- delete.k.p = old_nodes[i]->key.k.p;
- bch2_keylist_add_in_order(&keylist, &delete);
-next:
- i = i;
+ alloc_data_type_set(&gc, gc.data_type);
+ if (gc.data_type != old_gc.data_type ||
+ gc.dirty_sectors != old_gc.dirty_sectors) {
+ ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc);
+ if (ret)
+ return ret;
+
+ /*
+ * Ugly: alloc_key_to_dev_counters(..., BTREE_TRIGGER_gc) is not
+	 * safe w.r.t. transaction restarts, so fix up the gc_bucket so
+ * we don't run it twice:
+ */
+ percpu_down_read(&c->mark_lock);
+ struct bucket *gc_m = gc_bucket(ca, iter->pos.offset);
+ gc_m->data_type = gc.data_type;
+ gc_m->dirty_sectors = gc.dirty_sectors;
+ percpu_up_read(&c->mark_lock);
}
+ if (fsck_err_on(new.data_type != gc.data_type,
+ trans, alloc_key_data_type_wrong,
+ "bucket %llu:%llu gen %u has wrong data_type"
+ ": got %s, should be %s",
+ iter->pos.inode, iter->pos.offset,
+ gc.gen,
+ bch2_data_type_str(new.data_type),
+ bch2_data_type_str(gc.data_type)))
+ new.data_type = gc.data_type;
+
+#define copy_bucket_field(_errtype, _f) \
+ if (fsck_err_on(new._f != gc._f, \
+ trans, _errtype, \
+ "bucket %llu:%llu gen %u data type %s has wrong " #_f \
+ ": got %llu, should be %llu", \
+ iter->pos.inode, iter->pos.offset, \
+ gc.gen, \
+ bch2_data_type_str(gc.data_type), \
+ (u64) new._f, (u64) gc._f)) \
+ new._f = gc._f; \
+
+ copy_bucket_field(alloc_key_gen_wrong, gen);
+ copy_bucket_field(alloc_key_dirty_sectors_wrong, dirty_sectors);
+ copy_bucket_field(alloc_key_stripe_sectors_wrong, stripe_sectors);
+ copy_bucket_field(alloc_key_cached_sectors_wrong, cached_sectors);
+ copy_bucket_field(alloc_key_stripe_wrong, stripe);
+ copy_bucket_field(alloc_key_stripe_redundancy_wrong, stripe_redundancy);
+#undef copy_bucket_field
+
+ if (!bch2_alloc_v4_cmp(*old, new))
+ return 0;
+
+ a = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ return ret;
+
+ a->v = new;
+
/*
- * Keys for the new nodes get inserted: bch2_btree_insert_keys() only
- * does the lookup once and thus expects the keys to be in sorted order
- * so we have to make sure the new keys are correctly ordered with
- * respect to the deleted keys added in the previous loop
+ * The trigger normally makes sure these are set, but we're not running
+ * triggers:
*/
- for (i = 0; i < nr_new_nodes; i++)
- bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key);
-
- /* Insert the newly coalesced nodes */
- bch2_btree_insert_node(as, parent, iter, &keylist, 0);
+ if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
+ a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
- BUG_ON(!bch2_keylist_empty(&keylist));
+ ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun);
+fsck_err:
+ return ret;
+}
- BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]);
+static int bch2_gc_alloc_done(struct bch_fs *c)
+{
+ int ret = 0;
- bch2_btree_iter_node_replace(iter, new_nodes[0]);
+ for_each_member_device(c, ca) {
+ ret = bch2_trans_run(c,
+ for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, ca->mi.first_bucket),
+ POS(ca->dev_idx, ca->mi.nbuckets - 1),
+ BTREE_ITER_slots|BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_alloc_write_key(trans, &iter, ca, k)));
+ if (ret) {
+ bch2_dev_put(ca);
+ break;
+ }
+ }
- for (i = 0; i < nr_new_nodes; i++)
- bch2_btree_open_bucket_put(c, new_nodes[i]);
+ bch_err_fn(c, ret);
+ return ret;
+}
- /* Free the old nodes and update our sliding window */
- for (i = 0; i < nr_old_nodes; i++) {
- bch2_btree_node_free_inmem(c, old_nodes[i], iter);
- six_unlock_intent(&old_nodes[i]->lock);
+static int bch2_gc_alloc_start(struct bch_fs *c)
+{
+ int ret = 0;
- /*
- * the index update might have triggered a split, in which case
- * the nodes we coalesced - the new nodes we just created -
- * might not be sibling nodes anymore - don't add them to the
- * sliding window (except the first):
- */
- if (!i) {
- old_nodes[i] = new_nodes[i];
- } else {
- old_nodes[i] = NULL;
- if (new_nodes[i])
- six_unlock_intent(&new_nodes[i]->lock);
+ for_each_member_device(c, ca) {
+ ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
+ if (ret) {
+ bch2_dev_put(ca);
+ ret = -BCH_ERR_ENOMEM_gc_alloc_start;
+ break;
}
}
- bch2_btree_update_done(as);
- bch2_keylist_free(&keylist, NULL);
+ bch_err_fn(c, ret);
+ return ret;
}
-static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
+static int bch2_gc_write_stripes_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
{
- struct btree_iter iter;
- struct btree *b;
- bool kthread = (current->flags & PF_KTHREAD) != 0;
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ const struct bch_stripe *s;
+ struct gc_stripe *m;
+ bool bad = false;
unsigned i;
+ int ret = 0;
- /* Sliding window of adjacent btree nodes */
- struct btree *merge[GC_MERGE_NODES];
- u32 lock_seq[GC_MERGE_NODES];
-
- /*
- * XXX: We don't have a good way of positively matching on sibling nodes
- * that have the same parent - this code works by handling the cases
- * where they might not have the same parent, and is thus fragile. Ugh.
- *
- * Perhaps redo this to use multiple linked iterators?
- */
- memset(merge, 0, sizeof(merge));
-
- __for_each_btree_node(&iter, c, btree_id, POS_MIN,
- BTREE_MAX_DEPTH, 0,
- BTREE_ITER_PREFETCH, b) {
- memmove(merge + 1, merge,
- sizeof(merge) - sizeof(merge[0]));
- memmove(lock_seq + 1, lock_seq,
- sizeof(lock_seq) - sizeof(lock_seq[0]));
+ if (k.k->type != KEY_TYPE_stripe)
+ return 0;
- merge[0] = b;
+ s = bkey_s_c_to_stripe(k).v;
+ m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
- for (i = 1; i < GC_MERGE_NODES; i++) {
- if (!merge[i] ||
- !six_relock_intent(&merge[i]->lock, lock_seq[i]))
- break;
+ for (i = 0; i < s->nr_blocks; i++) {
+ u32 old = stripe_blockcount_get(s, i);
+ u32 new = (m ? m->block_sectors[i] : 0);
- if (merge[i]->level != merge[0]->level) {
- six_unlock_intent(&merge[i]->lock);
- break;
- }
+ if (old != new) {
+ prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n",
+ i, old, new);
+ bad = true;
}
- memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0]));
+ }
- bch2_coalesce_nodes(c, &iter, merge);
+ if (bad)
+ bch2_bkey_val_to_text(&buf, c, k);
- for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
- lock_seq[i] = merge[i]->lock.state.seq;
- six_unlock_intent(&merge[i]->lock);
- }
+ if (fsck_err_on(bad,
+ trans, stripe_sector_count_wrong,
+ "%s", buf.buf)) {
+ struct bkey_i_stripe *new;
- lock_seq[0] = merge[0]->lock.state.seq;
+ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
- if (kthread && kthread_should_stop()) {
- bch2_btree_iter_unlock(&iter);
- return -ESHUTDOWN;
- }
+ bkey_reassemble(&new->k_i, k);
- bch2_btree_iter_cond_resched(&iter);
+ for (i = 0; i < new->v.nr_blocks; i++)
+ stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
- /*
- * If the parent node wasn't relocked, it might have been split
- * and the nodes in our sliding window might not have the same
- * parent anymore - blow away the sliding window:
- */
- if (btree_iter_node(&iter, iter.level + 1) &&
- !btree_node_intent_locked(&iter, iter.level + 1))
- memset(merge + 1, 0,
- (GC_MERGE_NODES - 1) * sizeof(merge[0]));
+ ret = bch2_trans_update(trans, iter, &new->k_i, 0);
}
- return bch2_btree_iter_unlock(&iter);
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int bch2_gc_stripes_done(struct bch_fs *c)
+{
+ return bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_gc_write_stripes_key(trans, &iter, k)));
}
/**
- * bch_coalesce - coalesce adjacent nodes with low occupancy
+ * bch2_check_allocations - walk all references to buckets and recompute them
+ *
+ * @c: filesystem object
+ *
+ * Returns: 0 on success, or standard errcode on failure
+ *
+ * Order matters here:
+ * - Concurrent GC relies on the fact that we have a total ordering for
+ * everything that GC walks - see gc_will_visit_node(),
+ * gc_will_visit_root()
+ *
+ * - also, references move around in the course of index updates and
+ * various other crap: everything needs to agree on the ordering
+ * references are allowed to move around in - e.g., we're allowed to
+ * start with a reference owned by an open_bucket (the allocator) and
+ * move it to the btree, but not the reverse.
+ *
+ * This is necessary to ensure that gc doesn't miss references that
+ * move around - if references move backwards in the ordering GC
+ * uses, GC could skip past them.
*/
-void bch2_coalesce(struct bch_fs *c)
+int bch2_check_allocations(struct bch_fs *c)
{
- enum btree_id id;
+ int ret;
- if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
- return;
+ lockdep_assert_held(&c->state_lock);
- down_read(&c->gc_lock);
- trace_gc_coalesce_start(c);
-
- for (id = 0; id < BTREE_ID_NR; id++) {
- int ret = c->btree_roots[id].b
- ? bch2_coalesce_btree(c, id)
- : 0;
+ down_write(&c->gc_lock);
- if (ret) {
- if (ret != -ESHUTDOWN)
- bch_err(c, "btree coalescing failed: %d", ret);
- set_bit(BCH_FS_GC_FAILURE, &c->flags);
- return;
- }
- }
+ bch2_btree_interior_updates_flush(c);
- trace_gc_coalesce_end(c);
- up_read(&c->gc_lock);
-}
+ ret = bch2_gc_accounting_start(c) ?:
+ bch2_gc_start(c) ?:
+ bch2_gc_alloc_start(c) ?:
+ bch2_gc_reflink_start(c);
+ if (ret)
+ goto out;
-static int bch2_gc_thread(void *arg)
-{
- struct bch_fs *c = arg;
- struct io_clock *clock = &c->io_clock[WRITE];
- unsigned long last = atomic_long_read(&clock->now);
- unsigned last_kick = atomic_read(&c->kick_gc);
+ gc_pos_set(c, gc_phase(GC_PHASE_start));
- set_freezable();
+ ret = bch2_mark_superblocks(c);
+ bch_err_msg(c, ret, "marking superblocks");
+ if (ret)
+ goto out;
- while (1) {
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
+ ret = bch2_gc_btrees(c);
+ if (ret)
+ goto out;
- if (kthread_should_stop()) {
- __set_current_state(TASK_RUNNING);
- return 0;
- }
+ c->gc_count++;
- if (atomic_read(&c->kick_gc) != last_kick)
- break;
+ ret = bch2_gc_alloc_done(c) ?:
+ bch2_gc_accounting_done(c) ?:
+ bch2_gc_stripes_done(c) ?:
+ bch2_gc_reflink_done(c);
+out:
+ percpu_down_write(&c->mark_lock);
+ /* Indicates that gc is no longer in progress: */
+ __gc_pos_set(c, gc_phase(GC_PHASE_not_running));
- if (c->btree_gc_periodic) {
- unsigned long next = last + c->capacity / 16;
+ bch2_gc_free(c);
+ percpu_up_write(&c->mark_lock);
- if (atomic_long_read(&clock->now) >= next)
- break;
+ up_write(&c->gc_lock);
- bch2_io_clock_schedule_timeout(clock, next);
- } else {
- schedule();
- }
+ /*
+ * At startup, allocations can happen directly instead of via the
+ * allocator thread - issue wakeup in case they blocked on gc_lock:
+ */
+ closure_wake_up(&c->freelist_wait);
+ bch_err_fn(c, ret);
+ return ret;
+}
- try_to_freeze();
+static int gc_btree_gens_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bkey_i *u;
+ int ret;
+
+ if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
+ return -EROFS;
+
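+ /* If any pointer is too stale, rewrite the key via the update path below: */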
+ percpu_down_read(&c->mark_lock);
+ rcu_read_lock();
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ca)
+ continue;
+
+ if (dev_ptr_stale(ca, ptr) > 16) {
+ rcu_read_unlock();
+ percpu_up_read(&c->mark_lock);
+ goto update;
}
- __set_current_state(TASK_RUNNING);
-
- last = atomic_long_read(&clock->now);
- last_kick = atomic_read(&c->kick_gc);
+ }
- bch2_gc(c);
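+ /* Otherwise, track the oldest gen still referenced for each bucket: */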
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ca)
+ continue;
- debug_check_no_locks_held();
+ u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
+ if (gen_after(*gen, ptr->gen))
+ *gen = ptr->gen;
}
+ rcu_read_unlock();
+ percpu_up_read(&c->mark_lock);
+ return 0;
+update:
+ u = bch2_bkey_make_mut(trans, iter, &k, 0);
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
+ bch2_extent_normalize(c, bkey_i_to_s(u));
return 0;
}
-void bch2_gc_thread_stop(struct bch_fs *c)
+static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca,
+ struct btree_iter *iter, struct bkey_s_c k)
{
- struct task_struct *p;
-
- p = c->gc_thread;
- c->gc_thread = NULL;
-
- if (p) {
- kthread_stop(p);
- put_task_struct(p);
- }
-}
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
+ struct bkey_i_alloc_v4 *a_mut;
+ int ret;
-int bch2_gc_thread_start(struct bch_fs *c)
-{
- struct task_struct *p;
+ if (a->oldest_gen == ca->oldest_gen[iter->pos.offset])
+ return 0;
- BUG_ON(c->gc_thread);
+ a_mut = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a_mut);
+ if (ret)
+ return ret;
- p = kthread_create(bch2_gc_thread, c, "bch_gc");
- if (IS_ERR(p))
- return PTR_ERR(p);
+ a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
+ alloc_data_type_set(&a_mut->v, a_mut->v.data_type);
- get_task_struct(p);
- c->gc_thread = p;
- wake_up_process(p);
- return 0;
+ return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
}
-/* Initial GC computes bucket marks during startup */
-
-static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id)
+int bch2_gc_gens(struct bch_fs *c)
{
- struct btree_iter iter;
- struct btree *b;
- struct range_checks r;
- int ret = 0;
-
- btree_node_range_checks_init(&r, 0);
+ u64 b, start_time = local_clock();
+ int ret;
- gc_pos_set(c, gc_pos_btree(id, POS_MIN, 0));
-
- if (!c->btree_roots[id].b)
+ if (!mutex_trylock(&c->gc_gens_lock))
return 0;
- b = c->btree_roots[id].b;
- if (!btree_node_fake(b))
- ret = bch2_btree_mark_key_initial(c, BKEY_TYPE_BTREE,
- bkey_i_to_s_c(&b->key));
- if (ret)
- return ret;
+ trace_and_count(c, gc_gens_start, c);
/*
- * We have to hit every btree node before starting journal replay, in
- * order for the journal seq blacklist machinery to work:
+ * We have to use trylock here. Otherwise, we would
+ * introduce a deadlock in the RO path - we take the
+ * state lock at the start of going RO.
*/
- for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
- btree_node_range_checks(c, b, &r);
-
- if (btree_node_has_ptrs(b)) {
- struct btree_node_iter node_iter;
- struct bkey unpacked;
- struct bkey_s_c k;
-
- for_each_btree_node_key_unpack(b, k, &node_iter,
- btree_node_is_extents(b),
- &unpacked) {
- ret = bch2_btree_mark_key_initial(c,
- btree_node_type(b), k);
- if (ret)
- goto err;
- }
- }
-
- bch2_btree_iter_cond_resched(&iter);
+ if (!down_read_trylock(&c->state_lock)) {
+ mutex_unlock(&c->gc_gens_lock);
+ return 0;
}
-err:
- return bch2_btree_iter_unlock(&iter) ?: ret;
-}
-
-int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
-{
- unsigned iter = 0;
- enum btree_id id;
- int ret = 0;
- down_write(&c->gc_lock);
-again:
- bch2_gc_start(c);
+ for_each_member_device(c, ca) {
+ struct bucket_gens *gens = bucket_gens(ca);
- bch2_mark_superblocks(c);
+ BUG_ON(ca->oldest_gen);
- for (id = 0; id < BTREE_ID_NR; id++) {
- ret = bch2_initial_gc_btree(c, id);
- if (ret)
+ ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
+ if (!ca->oldest_gen) {
+ bch2_dev_put(ca);
+ ret = -BCH_ERR_ENOMEM_gc_gens;
goto err;
+ }
+
+ for (b = gens->first_bucket;
+ b < gens->nbuckets; b++)
+ ca->oldest_gen[b] = gens->b[b];
}
- ret = bch2_journal_mark(c, journal);
+ for (unsigned i = 0; i < BTREE_ID_NR; i++)
+ if (btree_type_has_ptrs(i)) {
+ c->gc_gens_btree = i;
+ c->gc_gens_pos = POS_MIN;
+
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, i,
+ POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
+ k,
+ NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc,
+ gc_btree_gens_key(trans, &iter, k)));
+ if (ret)
+ goto err;
+ }
+
+ struct bch_dev *ca = NULL;
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
+ POS_MIN,
+ BTREE_ITER_prefetch,
+ k,
+ NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc, ({
+ ca = bch2_dev_iterate(c, ca, k.k->p.inode);
+ if (!ca) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ continue;
+ }
+ bch2_alloc_write_oldest_gen(trans, ca, &iter, k);
+ })));
+ bch2_dev_put(ca);
+
if (ret)
goto err;
- if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
- if (iter++ > 2) {
- bch_info(c, "Unable to fix bucket gens, looping");
- ret = -EINVAL;
- goto err;
- }
-
- bch_info(c, "Fixed gens, restarting initial mark and sweep:");
- clear_bit(BCH_FS_FIXED_GENS, &c->flags);
- goto again;
- }
+ c->gc_gens_btree = 0;
+ c->gc_gens_pos = POS_MIN;
- /*
- * Skip past versions that might have possibly been used (as nonces),
- * but hadn't had their pointers written:
- */
- if (c->sb.encryption_type)
- atomic64_add(1 << 16, &c->key_version);
+ c->gc_count++;
- gc_pos_set(c, gc_phase(GC_PHASE_DONE));
- set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
+ trace_and_count(c, gc_gens_end, c);
err:
- up_write(&c->gc_lock);
+ for_each_member_device(c, ca) {
+ kvfree(ca->oldest_gen);
+ ca->oldest_gen = NULL;
+ }
+
+ up_read(&c->state_lock);
+ mutex_unlock(&c->gc_gens_lock);
+ if (!bch2_err_matches(ret, EROFS))
+ bch_err_fn(c, ret);
return ret;
}
+
+static void bch2_gc_gens_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work);
+ bch2_gc_gens(c);
+ bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
+}
+
+void bch2_gc_gens_async(struct bch_fs *c)
+{
+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) &&
+ !queue_work(c->write_ref_wq, &c->gc_gens_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
+}
+
+void bch2_fs_gc_init(struct bch_fs *c)
+{
+ seqcount_init(&c->gc_pos_lock);
+
+ INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);
+}
diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h
index 214a3fe3..8a47e8bd 100644
--- a/libbcachefs/btree_gc.h
+++ b/libbcachefs/btree_gc.h
@@ -1,19 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_GC_H
#define _BCACHEFS_BTREE_GC_H
+#include "bkey.h"
+#include "btree_gc_types.h"
#include "btree_types.h"
-enum bkey_type;
-
-void bch2_coalesce(struct bch_fs *);
-void bch2_gc(struct bch_fs *);
-void bch2_gc_thread_stop(struct bch_fs *);
-int bch2_gc_thread_start(struct bch_fs *);
-int bch2_initial_gc(struct bch_fs *, struct list_head *);
-u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *, struct bkey_s_c);
-int bch2_btree_mark_key_initial(struct bch_fs *, enum bkey_type,
- struct bkey_s_c);
-void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned);
+int bch2_check_topology(struct bch_fs *);
+int bch2_check_allocations(struct bch_fs *);
/*
* For concurrent mark and sweep (with other index updates), we define a total
@@ -39,74 +33,55 @@ void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned);
/* Position of (the start of) a gc phase: */
static inline struct gc_pos gc_phase(enum gc_phase phase)
{
- return (struct gc_pos) {
- .phase = phase,
- .pos = POS_MIN,
- .level = 0,
- };
+ return (struct gc_pos) { .phase = phase, };
}
-static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
-{
- if (l.phase != r.phase)
- return l.phase < r.phase ? -1 : 1;
- if (bkey_cmp(l.pos, r.pos))
- return bkey_cmp(l.pos, r.pos);
- if (l.level != r.level)
- return l.level < r.level ? -1 : 1;
- return 0;
-}
-
-static inline struct gc_pos gc_pos_btree(enum btree_id id,
- struct bpos pos, unsigned level)
+static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level,
+ struct bpos pos)
{
return (struct gc_pos) {
- .phase = GC_PHASE_BTREE_EXTENTS + id,
- .pos = pos,
+ .phase = GC_PHASE_btree,
+ .btree = btree,
.level = level,
+ .pos = pos,
};
}
-/*
- * GC position of the pointers within a btree node: note, _not_ for &b->key
- * itself, that lives in the parent node:
- */
-static inline struct gc_pos gc_pos_btree_node(struct btree *b)
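+/* alloc and stripes are gc'd before the other btrees, so they sort first: */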
+static inline int gc_btree_order(enum btree_id btree)
{
- return gc_pos_btree(b->btree_id, b->key.k.p, b->level);
+ if (btree == BTREE_ID_alloc)
+ return -2;
+ if (btree == BTREE_ID_stripes)
+ return -1;
+ return btree;
}
-/*
- * GC position of the pointer to a btree root: we don't use
- * gc_pos_pointer_to_btree_node() here to avoid a potential race with
- * btree_split() increasing the tree depth - the new root will have level > the
- * old root and thus have a greater gc position than the old root, but that
- * would be incorrect since once gc has marked the root it's not coming back.
- */
-static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
-{
- return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH);
-}
-
-static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob)
+static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
{
- return (struct gc_pos) {
- .phase = GC_PHASE_ALLOC,
- .pos = POS(ob ? ob - c->open_buckets : 0, 0),
- };
+ return cmp_int(l.phase, r.phase) ?:
+ cmp_int(gc_btree_order(l.btree),
+ gc_btree_order(r.btree)) ?:
+ cmp_int(l.level, r.level) ?:
+ bpos_cmp(l.pos, r.pos);
}
-static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos)
+static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
{
unsigned seq;
bool ret;
do {
seq = read_seqcount_begin(&c->gc_pos_lock);
- ret = gc_pos_cmp(c->gc_pos, pos) < 0;
+ ret = gc_pos_cmp(pos, c->gc_pos) <= 0;
} while (read_seqcount_retry(&c->gc_pos_lock, seq));
return ret;
}
+void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *);
+
+int bch2_gc_gens(struct bch_fs *);
+void bch2_gc_gens_async(struct bch_fs *);
+void bch2_fs_gc_init(struct bch_fs *);
+
#endif /* _BCACHEFS_BTREE_GC_H */
diff --git a/libbcachefs/btree_gc_types.h b/libbcachefs/btree_gc_types.h
new file mode 100644
index 00000000..c24dd6ed
--- /dev/null
+++ b/libbcachefs/btree_gc_types.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_GC_TYPES_H
+#define _BCACHEFS_BTREE_GC_TYPES_H
+
+#include <linux/generic-radix-tree.h>
+
+#define GC_PHASES() \
+ x(not_running) \
+ x(start) \
+ x(sb) \
+ x(btree)
+
+enum gc_phase {
+#define x(n) GC_PHASE_##n,
+ GC_PHASES()
+#undef x
+};
+
+struct gc_pos {
+ enum gc_phase phase:8;
+ enum btree_id btree:8;
+ u16 level;
+ struct bpos pos;
+};
+
+struct reflink_gc {
+ u64 offset;
+ u32 size;
+ u32 refcount;
+};
+
+typedef GENRADIX(struct reflink_gc) reflink_gc_table;
+
+#endif /* _BCACHEFS_BTREE_GC_TYPES_H */
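
For clarity, the GC_PHASES()/x() x-macro above expands to the following enum; only the macro form exists in the header, the expansion is shown purely for illustration:

enum gc_phase {
	GC_PHASE_not_running,
	GC_PHASE_start,
	GC_PHASE_sb,
	GC_PHASE_btree,
};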
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 94f56dbb..9df9fc1c 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1,6 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "bkey_methods.h"
+#include "bkey_sort.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
@@ -12,357 +14,231 @@
#include "debug.h"
#include "error.h"
#include "extents.h"
-#include "io.h"
+#include "io_write.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
+#include "recovery.h"
#include "super-io.h"
+#include "trace.h"
-#include <trace/events/bcachefs.h>
+#include <linux/sched/mm.h>
-/* btree_node_iter_large: */
-
-#define btree_node_iter_cmp_heap(h, _l, _r) \
- __btree_node_iter_cmp((iter)->is_extents, b, \
- __btree_node_offset_to_key(b, (_l).k), \
- __btree_node_offset_to_key(b, (_r).k))
-
-void bch2_btree_node_iter_large_push(struct btree_node_iter_large *iter,
- struct btree *b,
- const struct bkey_packed *k,
- const struct bkey_packed *end)
+static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn)
{
- if (k != end) {
- struct btree_node_iter_set n =
- ((struct btree_node_iter_set) {
- __btree_node_key_to_offset(b, k),
- __btree_node_key_to_offset(b, end)
- });
-
- __heap_add(iter, n, btree_node_iter_cmp_heap);
- }
+ bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn));
+ prt_printf(out, " seq %llux\n", bn->keys.seq);
+ prt_str(out, "min: ");
+ bch2_bpos_to_text(out, bn->min_key);
+ prt_newline(out);
+ prt_str(out, "max: ");
+ bch2_bpos_to_text(out, bn->max_key);
}
-void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *iter,
- struct btree *b)
+void bch2_btree_node_io_unlock(struct btree *b)
{
- iter->data->k += __btree_node_offset_to_key(b, iter->data->k)->u64s;
+ EBUG_ON(!btree_node_write_in_flight(b));
- EBUG_ON(!iter->used);
- EBUG_ON(iter->data->k > iter->data->end);
-
- if (iter->data->k == iter->data->end)
- heap_del(iter, 0, btree_node_iter_cmp_heap);
- else
- heap_sift_down(iter, 0, btree_node_iter_cmp_heap);
+ clear_btree_node_write_in_flight_inner(b);
+ clear_btree_node_write_in_flight(b);
+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}
-static void verify_no_dups(struct btree *b,
- struct bkey_packed *start,
- struct bkey_packed *end)
+void bch2_btree_node_io_lock(struct btree *b)
{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct bkey_packed *k;
-
- for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) {
- struct bkey l = bkey_unpack_key(b, k);
- struct bkey r = bkey_unpack_key(b, bkey_next(k));
-
- BUG_ON(btree_node_is_extents(b)
- ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
- : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
- //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0);
- }
-#endif
+ wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
+ TASK_UNINTERRUPTIBLE);
}
-static void clear_needs_whiteout(struct bset *i)
+void __bch2_btree_node_wait_on_read(struct btree *b)
{
- struct bkey_packed *k;
-
- for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
- k->needs_whiteout = false;
+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+ TASK_UNINTERRUPTIBLE);
}
-static void set_needs_whiteout(struct bset *i)
+void __bch2_btree_node_wait_on_write(struct btree *b)
{
- struct bkey_packed *k;
-
- for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
- k->needs_whiteout = true;
+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
+ TASK_UNINTERRUPTIBLE);
}
-static void btree_bounce_free(struct bch_fs *c, unsigned order,
- bool used_mempool, void *p)
+void bch2_btree_node_wait_on_read(struct btree *b)
{
- if (used_mempool)
- mempool_free(p, &c->btree_bounce_pool);
- else
- vpfree(p, PAGE_SIZE << order);
+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+ TASK_UNINTERRUPTIBLE);
}
-static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
- bool *used_mempool)
+void bch2_btree_node_wait_on_write(struct btree *b)
{
- void *p;
-
- BUG_ON(order > btree_page_order(c));
-
- *used_mempool = false;
- p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order);
- if (p)
- return p;
-
- *used_mempool = true;
- return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
+ TASK_UNINTERRUPTIBLE);
}
-typedef int (*sort_cmp_fn)(struct btree *,
- struct bkey_packed *,
- struct bkey_packed *);
-
-struct sort_iter {
- struct btree *b;
- unsigned used;
-
- struct sort_iter_set {
- struct bkey_packed *k, *end;
- } data[MAX_BSETS + 1];
-};
-
-static void sort_iter_init(struct sort_iter *iter, struct btree *b)
-{
- memset(iter, 0, sizeof(*iter));
- iter->b = b;
-}
-
-static inline void __sort_iter_sift(struct sort_iter *iter,
- unsigned from,
- sort_cmp_fn cmp)
-{
- unsigned i;
-
- for (i = from;
- i + 1 < iter->used &&
- cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
- i++)
- swap(iter->data[i], iter->data[i + 1]);
-}
-
-static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp)
+static void verify_no_dups(struct btree *b,
+ struct bkey_packed *start,
+ struct bkey_packed *end)
{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct bkey_packed *k, *p;
- __sort_iter_sift(iter, 0, cmp);
-}
+ if (start == end)
+ return;
-static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
-{
- unsigned i = iter->used;
+ for (p = start, k = bkey_p_next(start);
+ k != end;
+ p = k, k = bkey_p_next(k)) {
+ struct bkey l = bkey_unpack_key(b, p);
+ struct bkey r = bkey_unpack_key(b, k);
- while (i--)
- __sort_iter_sift(iter, i, cmp);
+ BUG_ON(bpos_ge(l.p, bkey_start_pos(&r)));
+ }
+#endif
}
-static void sort_iter_add(struct sort_iter *iter,
- struct bkey_packed *k,
- struct bkey_packed *end)
+static void set_needs_whiteout(struct bset *i, int v)
{
- BUG_ON(iter->used >= ARRAY_SIZE(iter->data));
+ struct bkey_packed *k;
- if (k != end)
- iter->data[iter->used++] = (struct sort_iter_set) { k, end };
+ for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
+ k->needs_whiteout = v;
}
-static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
-{
- return iter->used ? iter->data->k : NULL;
-}
-
-static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
+static void btree_bounce_free(struct bch_fs *c, size_t size,
+ bool used_mempool, void *p)
{
- iter->data->k = bkey_next(iter->data->k);
-
- BUG_ON(iter->data->k > iter->data->end);
-
- if (iter->data->k == iter->data->end)
- array_remove_item(iter->data, iter->used, 0);
+ if (used_mempool)
+ mempool_free(p, &c->btree_bounce_pool);
else
- sort_iter_sift(iter, cmp);
+ kvfree(p);
}
-static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
- sort_cmp_fn cmp)
+static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
+ bool *used_mempool)
{
- struct bkey_packed *ret = sort_iter_peek(iter);
+ unsigned flags = memalloc_nofs_save();
+ void *p;
- if (ret)
- sort_iter_advance(iter, cmp);
+ BUG_ON(size > c->opts.btree_node_size);
- return ret;
+ *used_mempool = false;
+ p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
+ if (!p) {
+ *used_mempool = true;
+ p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
+ }
+ memalloc_nofs_restore(flags);
+ return p;
}
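
Typical use of the bounce-buffer helpers above, following the same pattern as bch2_sort_whiteouts() below (shown for illustration): the allocation runs under memalloc_nofs_save(), trying a non-blocking kvmalloc() first and falling back to the pre-sized mempool, so it cannot fail or recurse into filesystem reclaim on the write path.

	bool used_mempool;
	void *buf = btree_bounce_alloc(c, bytes, &used_mempool);

	/* ... build the sorted output in buf ... */

	btree_bounce_free(c, bytes, used_mempool, buf);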
-static inline int sort_key_whiteouts_cmp(struct btree *b,
- struct bkey_packed *l,
- struct bkey_packed *r)
+static void sort_bkey_ptrs(const struct btree *bt,
+ struct bkey_packed **ptrs, unsigned nr)
{
- return bkey_cmp_packed(b, l, r);
-}
+ unsigned n = nr, a = nr / 2, b, c, d;
-static unsigned sort_key_whiteouts(struct bkey_packed *dst,
- struct sort_iter *iter)
-{
- struct bkey_packed *in, *out = dst;
+ if (!a)
+ return;
- sort_iter_sort(iter, sort_key_whiteouts_cmp);
+ /* Heap sort: see lib/sort.c: */
+ while (1) {
+ if (a)
+ a--;
+ else if (--n)
+ swap(ptrs[0], ptrs[n]);
+ else
+ break;
- while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) {
- bkey_copy(out, in);
- out = bkey_next(out);
+ for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
+ b = bch2_bkey_cmp_packed(bt,
+ ptrs[c],
+ ptrs[d]) >= 0 ? c : d;
+ if (d == n)
+ b = c;
+
+ while (b != a &&
+ bch2_bkey_cmp_packed(bt,
+ ptrs[a],
+ ptrs[b]) >= 0)
+ b = (b - 1) / 2;
+ c = b;
+ while (b != a) {
+ b = (b - 1) / 2;
+ swap(ptrs[b], ptrs[c]);
+ }
}
-
- return (u64 *) out - (u64 *) dst;
-}
-
-static inline int sort_extent_whiteouts_cmp(struct btree *b,
- struct bkey_packed *l,
- struct bkey_packed *r)
-{
- struct bkey ul = bkey_unpack_key(b, l);
- struct bkey ur = bkey_unpack_key(b, r);
-
- return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur));
}
-static unsigned sort_extent_whiteouts(struct bkey_packed *dst,
- struct sort_iter *iter)
+static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
{
- const struct bkey_format *f = &iter->b->format;
- struct bkey_packed *in, *out = dst;
- struct bkey_i l, r;
- bool prev = false, l_packed = false;
- u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE);
- u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET);
- u64 new_size;
-
- max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX);
-
- sort_iter_sort(iter, sort_extent_whiteouts_cmp);
-
- while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
- EBUG_ON(bkeyp_val_u64s(f, in));
- EBUG_ON(in->type != KEY_TYPE_DISCARD);
-
- r.k = bkey_unpack_key(iter->b, in);
-
- if (prev &&
- bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) {
- if (bkey_cmp(l.k.p, r.k.p) >= 0)
- continue;
-
- new_size = l_packed
- ? min(max_packed_size, max_packed_offset -
- bkey_start_offset(&l.k))
- : KEY_SIZE_MAX;
+ struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k;
+ bool used_mempool = false;
+ size_t bytes = b->whiteout_u64s * sizeof(u64);
- new_size = min(new_size, r.k.p.offset -
- bkey_start_offset(&l.k));
+ if (!b->whiteout_u64s)
+ return;
- BUG_ON(new_size < l.k.size);
+ new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool);
- bch2_key_resize(&l.k, new_size);
+ ptrs = ptrs_end = ((void *) new_whiteouts + bytes);
- if (bkey_cmp(l.k.p, r.k.p) >= 0)
- continue;
+ for (k = unwritten_whiteouts_start(b);
+ k != unwritten_whiteouts_end(b);
+ k = bkey_p_next(k))
+ *--ptrs = k;
- bch2_cut_front(l.k.p, &r);
- }
+ sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs);
- if (prev) {
- if (!bch2_bkey_pack(out, &l, f)) {
- BUG_ON(l_packed);
- bkey_copy(out, &l);
- }
- out = bkey_next(out);
- }
+ k = new_whiteouts;
- l = r;
- prev = true;
- l_packed = bkey_packed(in);
+ while (ptrs != ptrs_end) {
+ bkey_p_copy(k, *ptrs);
+ k = bkey_p_next(k);
+ ptrs++;
}
- if (prev) {
- if (!bch2_bkey_pack(out, &l, f)) {
- BUG_ON(l_packed);
- bkey_copy(out, &l);
- }
- out = bkey_next(out);
- }
+ verify_no_dups(b, new_whiteouts,
+ (void *) ((u64 *) new_whiteouts + b->whiteout_u64s));
+
+ memcpy_u64s(unwritten_whiteouts_start(b),
+ new_whiteouts, b->whiteout_u64s);
- return (u64 *) out - (u64 *) dst;
+ btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
}
-static unsigned should_compact_bset(struct btree *b, struct bset_tree *t,
- bool compacting,
- enum compact_mode mode)
+static bool should_compact_bset(struct btree *b, struct bset_tree *t,
+ bool compacting, enum compact_mode mode)
{
- unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
- unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set];
+ if (!bset_dead_u64s(b, t))
+ return false;
- if (mode == COMPACT_LAZY) {
- if (should_compact_bset_lazy(b, t) ||
- (compacting && bset_unwritten(b, bset(b, t))))
- return dead_u64s;
- } else {
- if (bset_written(b, bset(b, t)))
- return dead_u64s;
+ switch (mode) {
+ case COMPACT_LAZY:
+ return should_compact_bset_lazy(b, t) ||
+ (compacting && !bset_written(b, bset(b, t)));
+ case COMPACT_ALL:
+ return true;
+ default:
+ BUG();
}
-
- return 0;
}
-bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
- enum compact_mode mode)
+static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
{
- const struct bkey_format *f = &b->format;
- struct bset_tree *t;
- struct bkey_packed *whiteouts = NULL;
- struct bkey_packed *u_start, *u_pos;
- struct sort_iter sort_iter;
- unsigned order, whiteout_u64s = 0, u64s;
- bool used_mempool, compacting = false;
-
- for_each_bset(b, t)
- whiteout_u64s += should_compact_bset(b, t,
- whiteout_u64s != 0, mode);
-
- if (!whiteout_u64s)
- return false;
-
- sort_iter_init(&sort_iter, b);
-
- whiteout_u64s += b->whiteout_u64s;
- order = get_order(whiteout_u64s * sizeof(u64));
-
- whiteouts = btree_bounce_alloc(c, order, &used_mempool);
- u_start = u_pos = whiteouts;
-
- memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b),
- b->whiteout_u64s);
- u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64);
-
- sort_iter_add(&sort_iter, u_start, u_pos);
+ bool ret = false;
for_each_bset(b, t) {
struct bset *i = bset(b, t);
struct bkey_packed *k, *n, *out, *start, *end;
struct btree_node_entry *src = NULL, *dst = NULL;
- if (t != b->set && bset_unwritten(b, i)) {
+ if (t != b->set && !bset_written(b, i)) {
src = container_of(i, struct btree_node_entry, keys);
dst = max(write_block(b),
- (void *) btree_bkey_last(b, t -1));
+ (void *) btree_bkey_last(b, t - 1));
}
- if (!should_compact_bset(b, t, compacting, mode)) {
+ if (src != dst)
+ ret = true;
+
+ if (!should_compact_bset(b, t, ret, mode)) {
if (src != dst) {
memmove(dst, src, sizeof(*src) +
le16_to_cpu(src->keys.u64s) *
@@ -373,10 +249,8 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
continue;
}
- compacting = true;
- u_start = u_pos;
- start = i->start;
- end = vstruct_last(i);
+ start = btree_bkey_first(b, t);
+ end = btree_bkey_last(b, t);
if (src != dst) {
memmove(dst, src, sizeof(*src));
@@ -387,247 +261,76 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
out = i->start;
for (k = start; k != end; k = n) {
- n = bkey_next(k);
-
- if (bkey_deleted(k) && btree_node_is_extents(b))
- continue;
-
- if (bkey_whiteout(k) && !k->needs_whiteout)
- continue;
+ n = bkey_p_next(k);
- if (bkey_whiteout(k)) {
- unreserve_whiteout(b, t, k);
- memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k));
- set_bkeyp_val_u64s(f, u_pos, 0);
- u_pos = bkey_next(u_pos);
- } else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
- bkey_copy(out, k);
- out = bkey_next(out);
- }
- }
-
- sort_iter_add(&sort_iter, u_start, u_pos);
-
- if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
- i->u64s = cpu_to_le16((u64 *) out - i->_data);
- set_btree_bset_end(b, t);
- bch2_bset_set_no_aux_tree(b, t);
- }
- }
-
- b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts;
-
- BUG_ON((void *) unwritten_whiteouts_start(c, b) <
- (void *) btree_bkey_last(b, bset_tree_last(b)));
-
- u64s = btree_node_is_extents(b)
- ? sort_extent_whiteouts(unwritten_whiteouts_start(c, b),
- &sort_iter)
- : sort_key_whiteouts(unwritten_whiteouts_start(c, b),
- &sort_iter);
-
- BUG_ON(u64s > b->whiteout_u64s);
- BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b));
- BUG_ON(u_pos != whiteouts && !u64s);
-
- if (u64s != b->whiteout_u64s) {
- void *src = unwritten_whiteouts_start(c, b);
-
- b->whiteout_u64s = u64s;
- memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s);
- }
-
- verify_no_dups(b,
- unwritten_whiteouts_start(c, b),
- unwritten_whiteouts_end(c, b));
-
- btree_bounce_free(c, order, used_mempool, whiteouts);
-
- if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK)
- bch2_btree_build_aux_trees(b);
-
- bch_btree_keys_u64s_remaining(c, b);
- bch2_verify_btree_nr_keys(b);
-
- return true;
-}
-
-static bool bch2_drop_whiteouts(struct btree *b)
-{
- struct bset_tree *t;
- bool ret = false;
-
- for_each_bset(b, t) {
- struct bset *i = bset(b, t);
- struct bkey_packed *k, *n, *out, *start, *end;
-
- if (!should_compact_bset(b, t, true, COMPACT_WRITTEN))
- continue;
-
- start = btree_bkey_first(b, t);
- end = btree_bkey_last(b, t);
-
- if (bset_unwritten(b, i) &&
- t != b->set) {
- struct bset *dst =
- max_t(struct bset *, write_block(b),
- (void *) btree_bkey_last(b, t -1));
-
- memmove(dst, i, sizeof(struct bset));
- i = dst;
- set_btree_bset(b, t, i);
- }
-
- out = i->start;
-
- for (k = start; k != end; k = n) {
- n = bkey_next(k);
-
- if (!bkey_whiteout(k)) {
- bkey_copy(out, k);
- out = bkey_next(out);
+ if (!bkey_deleted(k)) {
+ bkey_p_copy(out, k);
+ out = bkey_p_next(out);
+ } else {
+ BUG_ON(k->needs_whiteout);
}
}
i->u64s = cpu_to_le16((u64 *) out - i->_data);
+ set_btree_bset_end(b, t);
bch2_bset_set_no_aux_tree(b, t);
ret = true;
}
bch2_verify_btree_nr_keys(b);
- return ret;
-}
-
-static inline int sort_keys_cmp(struct btree *b,
- struct bkey_packed *l,
- struct bkey_packed *r)
-{
- return bkey_cmp_packed(b, l, r) ?:
- (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?:
- (int) l->needs_whiteout - (int) r->needs_whiteout;
-}
-
-static unsigned sort_keys(struct bkey_packed *dst,
- struct sort_iter *iter,
- bool filter_whiteouts)
-{
- const struct bkey_format *f = &iter->b->format;
- struct bkey_packed *in, *next, *out = dst;
-
- sort_iter_sort(iter, sort_keys_cmp);
-
- while ((in = sort_iter_next(iter, sort_keys_cmp))) {
- if (bkey_whiteout(in) &&
- (filter_whiteouts || !in->needs_whiteout))
- continue;
-
- if (bkey_whiteout(in) &&
- (next = sort_iter_peek(iter)) &&
- !bkey_cmp_packed(iter->b, in, next)) {
- BUG_ON(in->needs_whiteout &&
- next->needs_whiteout);
- /*
- * XXX racy, called with read lock from write path
- *
- * leads to spurious BUG_ON() in bkey_unpack_key() in
- * debug mode
- */
- next->needs_whiteout |= in->needs_whiteout;
- continue;
- }
-
- if (bkey_whiteout(in)) {
- memcpy_u64s(out, in, bkeyp_key_u64s(f, in));
- set_bkeyp_val_u64s(f, out, 0);
- } else {
- bkey_copy(out, in);
- }
- out = bkey_next(out);
- }
-
- return (u64 *) out - (u64 *) dst;
-}
+ bch2_btree_build_aux_trees(b);
-static inline int sort_extents_cmp(struct btree *b,
- struct bkey_packed *l,
- struct bkey_packed *r)
-{
- return bkey_cmp_packed(b, l, r) ?:
- (int) bkey_deleted(l) - (int) bkey_deleted(r);
+ return ret;
}
-static unsigned sort_extents(struct bkey_packed *dst,
- struct sort_iter *iter,
- bool filter_whiteouts)
+bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
+ enum compact_mode mode)
{
- struct bkey_packed *in, *out = dst;
-
- sort_iter_sort(iter, sort_extents_cmp);
-
- while ((in = sort_iter_next(iter, sort_extents_cmp))) {
- if (bkey_deleted(in))
- continue;
-
- if (bkey_whiteout(in) &&
- (filter_whiteouts || !in->needs_whiteout))
- continue;
-
- bkey_copy(out, in);
- out = bkey_next(out);
- }
-
- return (u64 *) out - (u64 *) dst;
+ return bch2_drop_whiteouts(b, mode);
}
static void btree_node_sort(struct bch_fs *c, struct btree *b,
- struct btree_iter *iter,
unsigned start_idx,
- unsigned end_idx,
- bool filter_whiteouts)
+ unsigned end_idx)
{
struct btree_node *out;
- struct sort_iter sort_iter;
+ struct sort_iter_stack sort_iter;
struct bset_tree *t;
struct bset *start_bset = bset(b, &b->set[start_idx]);
bool used_mempool = false;
u64 start_time, seq = 0;
- unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
+ unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1;
bool sorting_entire_node = start_idx == 0 &&
end_idx == b->nsets;
- sort_iter_init(&sort_iter, b);
+ sort_iter_stack_init(&sort_iter, b);
for (t = b->set + start_idx;
t < b->set + end_idx;
t++) {
u64s += le16_to_cpu(bset(b, t)->u64s);
- sort_iter_add(&sort_iter,
+ sort_iter_add(&sort_iter.iter,
btree_bkey_first(b, t),
btree_bkey_last(b, t));
}
- order = sorting_entire_node
- ? btree_page_order(c)
- : get_order(__vstruct_bytes(struct btree_node, u64s));
+ bytes = sorting_entire_node
+ ? btree_buf_bytes(b)
+ : __vstruct_bytes(struct btree_node, u64s);
- out = btree_bounce_alloc(c, order, &used_mempool);
+ out = btree_bounce_alloc(c, bytes, &used_mempool);
start_time = local_clock();
- if (btree_node_is_extents(b))
- filter_whiteouts = bset_written(b, start_bset);
-
- u64s = btree_node_is_extents(b)
- ? sort_extents(out->keys.start, &sort_iter, filter_whiteouts)
- : sort_keys(out->keys.start, &sort_iter, filter_whiteouts);
+ u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter);
out->keys.u64s = cpu_to_le16(u64s);
- BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));
+ BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);
if (sorting_entire_node)
- bch2_time_stats_update(&c->times[BCH_TIME_btree_sort],
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
start_time);
/* Make sure we preserve bset journal_seq: */
@@ -636,9 +339,9 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
start_bset->journal_seq = cpu_to_le64(seq);
if (sorting_entire_node) {
- unsigned u64s = le16_to_cpu(out->keys.u64s);
+ u64s = le16_to_cpu(out->keys.u64s);
- BUG_ON(order != btree_page_order(c));
+ BUG_ON(bytes != btree_buf_bytes(b));
/*
* Our temporary buffer is the same size as the btree node's
@@ -673,106 +376,11 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
set_btree_bset_end(b, &b->set[start_idx]);
bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);
- btree_bounce_free(c, order, used_mempool, out);
+ btree_bounce_free(c, bytes, used_mempool, out);
bch2_verify_btree_nr_keys(b);
}
-/* Sort + repack in a new format: */
-static struct btree_nr_keys sort_repack(struct bset *dst,
- struct btree *src,
- struct btree_node_iter *src_iter,
- struct bkey_format *out_f,
- bool filter_whiteouts)
-{
- struct bkey_format *in_f = &src->format;
- struct bkey_packed *in, *out = vstruct_last(dst);
- struct btree_nr_keys nr;
-
- memset(&nr, 0, sizeof(nr));
-
- while ((in = bch2_btree_node_iter_next_all(src_iter, src))) {
- if (filter_whiteouts && bkey_whiteout(in))
- continue;
-
- if (bch2_bkey_transform(out_f, out, bkey_packed(in)
- ? in_f : &bch2_bkey_format_current, in))
- out->format = KEY_FORMAT_LOCAL_BTREE;
- else
- bch2_bkey_unpack(src, (void *) out, in);
-
- btree_keys_account_key_add(&nr, 0, out);
- out = bkey_next(out);
- }
-
- dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
- return nr;
-}
-
-/* Sort, repack, and merge: */
-static struct btree_nr_keys sort_repack_merge(struct bch_fs *c,
- struct bset *dst,
- struct btree *src,
- struct btree_node_iter *iter,
- struct bkey_format *out_f,
- bool filter_whiteouts,
- key_filter_fn filter,
- key_merge_fn merge)
-{
- struct bkey_packed *k, *prev = NULL, *out;
- struct btree_nr_keys nr;
- BKEY_PADDED(k) tmp;
-
- memset(&nr, 0, sizeof(nr));
-
- while ((k = bch2_btree_node_iter_next_all(iter, src))) {
- if (filter_whiteouts && bkey_whiteout(k))
- continue;
-
- /*
- * The filter might modify pointers, so we have to unpack the
- * key and values to &tmp.k:
- */
- bch2_bkey_unpack(src, &tmp.k, k);
-
- if (filter && filter(c, src, bkey_i_to_s(&tmp.k)))
- continue;
-
- /* prev is always unpacked, for key merging: */
-
- if (prev &&
- merge &&
- merge(c, src, (void *) prev, &tmp.k) == BCH_MERGE_MERGE)
- continue;
-
- /*
- * the current key becomes the new prev: advance prev, then
- * copy the current key - but first pack prev (in place):
- */
- if (prev) {
- bch2_bkey_pack(prev, (void *) prev, out_f);
-
- btree_keys_account_key_add(&nr, 0, prev);
- prev = bkey_next(prev);
- } else {
- prev = vstruct_last(dst);
- }
-
- bkey_copy(prev, &tmp.k);
- }
-
- if (prev) {
- bch2_bkey_pack(prev, (void *) prev, out_f);
- btree_keys_account_key_add(&nr, 0, prev);
- out = bkey_next(prev);
- } else {
- out = vstruct_last(dst);
- }
-
- dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
- return nr;
-}
-
void bch2_btree_sort_into(struct bch_fs *c,
struct btree *dst,
struct btree *src)
@@ -785,24 +393,15 @@ void bch2_btree_sort_into(struct bch_fs *c,
bch2_bset_set_no_aux_tree(dst, dst->set);
- bch2_btree_node_iter_init_from_start(&src_iter, src,
- btree_node_is_extents(src));
-
- if (btree_node_ops(src)->key_normalize ||
- btree_node_ops(src)->key_merge)
- nr = sort_repack_merge(c, btree_bset_first(dst),
- src, &src_iter,
- &dst->format,
- true,
- btree_node_ops(src)->key_normalize,
- btree_node_ops(src)->key_merge);
- else
- nr = sort_repack(btree_bset_first(dst),
- src, &src_iter,
- &dst->format,
- true);
+ bch2_btree_node_iter_init_from_start(&src_iter, src);
- bch2_time_stats_update(&c->times[BCH_TIME_btree_sort], start_time);
+ nr = bch2_sort_repack(btree_bset_first(dst),
+ src, &src_iter,
+ &dst->format,
+ true);
+
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
+ start_time);
set_btree_bset_end(dst, dst->set);
@@ -814,14 +413,11 @@ void bch2_btree_sort_into(struct bch_fs *c,
bch2_verify_btree_nr_keys(dst);
}
-#define SORT_CRIT (4096 / sizeof(u64))
-
/*
* We're about to add another bset to the btree node, so if there's currently
* too many bsets - sort some of them together:
*/
-static bool btree_node_compact(struct bch_fs *c, struct btree *b,
- struct btree_iter *iter)
+static bool btree_node_compact(struct bch_fs *c, struct btree *b)
{
unsigned unwritten_idx;
bool ret = false;
@@ -829,17 +425,16 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b,
for (unwritten_idx = 0;
unwritten_idx < b->nsets;
unwritten_idx++)
- if (bset_unwritten(b, bset(b, &b->set[unwritten_idx])))
+ if (!bset_written(b, bset(b, &b->set[unwritten_idx])))
break;
if (b->nsets - unwritten_idx > 1) {
- btree_node_sort(c, b, iter, unwritten_idx,
- b->nsets, false);
+ btree_node_sort(c, b, unwritten_idx, b->nsets);
ret = true;
}
if (unwritten_idx > 1) {
- btree_node_sort(c, b, iter, 0, unwritten_idx, false);
+ btree_node_sort(c, b, 0, unwritten_idx);
ret = true;
}
@@ -848,15 +443,31 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b,
void bch2_btree_build_aux_trees(struct btree *b)
{
- struct bset_tree *t;
-
for_each_bset(b, t)
bch2_bset_build_aux_tree(b, t,
- bset_unwritten(b, bset(b, t)) &&
+ !bset_written(b, bset(b, t)) &&
t == bset_tree_last(b));
}
/*
+ * If we have MAX_BSETS (3) bsets, should we sort them all down to just one?
+ *
+ * The first bset is going to be of similar order to the size of the node, the
+ * last bset is bounded by btree_write_set_buffer(), which is set to keep the
+ * memmove on insert from being too expensive: the middle bset should, ideally,
+ * be the geometric mean of the first and the last.
+ *
+ * Returns true if the middle bset is greater than that geometric mean:
+ */
+static inline bool should_compact_all(struct bch_fs *c, struct btree *b)
+{
+ unsigned mid_u64s_bits =
+ (ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2;
+
+ return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits;
+}
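
As a worked example of the heuristic above (the constants are assumptions for illustration, not values from this patch): if btree_max_u64s(c) is about 2^15 and BTREE_WRITE_SET_U64s_BITS is 9, then mid_u64s_bits = (15 + 9) / 2 = 12, so a middle bset larger than 2^12 = 4096 u64s exceeds the geometric mean of ~32768 and 512, should_compact_all() returns true, and the whole node is sorted down to a single bset.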
+
+/*
* @bch_btree_init_next - initialize a new (unwritten) bset that can then be
* inserted into
*
@@ -865,420 +476,759 @@ void bch2_btree_build_aux_trees(struct btree *b)
*
 * Returns true if we sorted (i.e. invalidated iterators)
*/
-void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
- struct btree_iter *iter)
+void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
{
+ struct bch_fs *c = trans->c;
struct btree_node_entry *bne;
- bool did_sort;
+ bool reinit_iter = false;
+
+ EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]);
+ BUG_ON(bset_written(b, bset(b, &b->set[1])));
+ BUG_ON(btree_node_just_written(b));
+
+ if (b->nsets == MAX_BSETS &&
+ !btree_node_write_in_flight(b) &&
+ should_compact_all(c, b)) {
+ bch2_btree_node_write(c, b, SIX_LOCK_write,
+ BTREE_WRITE_init_next_bset);
+ reinit_iter = true;
+ }
- EBUG_ON(!(b->lock.state.seq & 1));
- EBUG_ON(iter && iter->l[b->level].b != b);
+ if (b->nsets == MAX_BSETS &&
+ btree_node_compact(c, b))
+ reinit_iter = true;
- did_sort = btree_node_compact(c, b, iter);
+ BUG_ON(b->nsets >= MAX_BSETS);
bne = want_new_bset(c, b);
if (bne)
- bch2_bset_init_next(c, b, bne);
+ bch2_bset_init_next(b, bne);
bch2_btree_build_aux_trees(b);
- if (iter && did_sort)
- bch2_btree_iter_reinit_node(iter, b);
+ if (reinit_iter)
+ bch2_trans_node_reinit_iter(trans, b);
}
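
A minimal sketch of the expected calling convention (inferred from the assertions above, not a call site from the patch): the caller holds a write lock on the node within a btree_trans, and after the call there is room for a new unwritten bset; iterator reinitialization is handled internally via bch2_trans_node_reinit_iter().

static void example_prepare_insert(struct btree_trans *trans, struct btree *b)
{
	/* assumed: @b is write locked by this transaction */
	bch2_btree_init_next(trans, b);

	/* keys can now be inserted into the last (unwritten) bset of @b */
}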
-static struct nonce btree_nonce(struct bset *i, unsigned offset)
+static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
+ struct bch_dev *ca,
+ struct btree *b, struct bset *i, struct bkey_packed *k,
+ unsigned offset, int write)
{
- return (struct nonce) {{
- [0] = cpu_to_le32(offset),
- [1] = ((__le32 *) &i->seq)[0],
- [2] = ((__le32 *) &i->seq)[1],
- [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
- }};
+ prt_printf(out, bch2_log_msg(c, "%s"),
+ write == READ
+ ? "error validating btree node "
+ : "corrupt btree node before write ");
+ if (ca)
+ prt_printf(out, "on %s ", ca->name);
+ prt_printf(out, "at btree ");
+ bch2_btree_pos_to_text(out, c, b);
+
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "\nnode offset %u/%u",
+ b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)));
+ if (i)
+ prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
+ if (k)
+ prt_printf(out, " bset byte offset %lu",
+ (unsigned long)(void *)k -
+ ((unsigned long)(void *)i & ~511UL));
+ prt_str(out, ": ");
}
-static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
+__printf(10, 11)
+static int __btree_err(int ret,
+ struct bch_fs *c,
+ struct bch_dev *ca,
+ struct btree *b,
+ struct bset *i,
+ struct bkey_packed *k,
+ int write,
+ bool have_retry,
+ enum bch_sb_error_id err_type,
+ const char *fmt, ...)
{
- struct nonce nonce = btree_nonce(i, offset);
+ struct printbuf out = PRINTBUF;
+ bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes;
+ va_list args;
- if (!offset) {
- struct btree_node *bn = container_of(i, struct btree_node, keys);
- unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
+ btree_err_msg(&out, c, ca, b, i, k, b->written, write);
- bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
- bytes);
+ va_start(args, fmt);
+ prt_vprintf(&out, fmt, args);
+ va_end(args);
- nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE));
+ if (write == WRITE) {
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
+ ret = c->opts.errors == BCH_ON_ERROR_continue
+ ? 0
+ : -BCH_ERR_fsck_errors_not_fixed;
+ goto out;
}
- bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
- vstruct_end(i) - (void *) i->_data);
-}
-
-static int btree_err_msg(struct bch_fs *c, struct btree *b, struct bset *i,
- unsigned offset, int write, char *buf, size_t len)
-{
- char *out = buf, *end = buf + len;
-
- out += scnprintf(out, end - out,
- "error validating btree node %s"
- "at btree %u level %u/%u\n"
- "pos %llu:%llu node offset %u",
- write ? "before write " : "",
- b->btree_id, b->level,
- c->btree_roots[b->btree_id].level,
- b->key.k.p.inode, b->key.k.p.offset,
- b->written);
- if (i)
- out += scnprintf(out, end - out,
- " bset u64s %u",
- le16_to_cpu(i->u64s));
-
- return out - buf;
+ if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
+ ret = -BCH_ERR_btree_node_read_err_fixable;
+ if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
+ ret = -BCH_ERR_btree_node_read_err_bad_node;
+
+ if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable)
+ bch2_sb_error_count(c, err_type);
+
+ switch (ret) {
+ case -BCH_ERR_btree_node_read_err_fixable:
+ ret = !silent
+ ? __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf)
+ : -BCH_ERR_fsck_fix;
+ if (ret != -BCH_ERR_fsck_fix &&
+ ret != -BCH_ERR_fsck_ignore)
+ goto fsck_err;
+ ret = -BCH_ERR_fsck_fix;
+ break;
+ case -BCH_ERR_btree_node_read_err_want_retry:
+ case -BCH_ERR_btree_node_read_err_must_retry:
+ if (!silent)
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
+ break;
+ case -BCH_ERR_btree_node_read_err_bad_node:
+ if (!silent)
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
+ ret = bch2_topology_error(c);
+ break;
+ case -BCH_ERR_btree_node_read_err_incompatible:
+ if (!silent)
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
+ ret = -BCH_ERR_fsck_errors_not_fixed;
+ break;
+ default:
+ BUG();
+ }
+out:
+fsck_err:
+ printbuf_exit(&out);
+ return ret;
}
-enum btree_err_type {
- BTREE_ERR_FIXABLE,
- BTREE_ERR_WANT_RETRY,
- BTREE_ERR_MUST_RETRY,
- BTREE_ERR_FATAL,
-};
-
-enum btree_validate_ret {
- BTREE_RETRY_READ = 64,
-};
-
-#define btree_err(type, c, b, i, msg, ...) \
+#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \
({ \
- __label__ out; \
- char _buf[300], *out = _buf, *end = out + sizeof(_buf); \
- \
- out += btree_err_msg(c, b, i, b->written, write, out, end - out);\
- out += scnprintf(out, end - out, ": " msg, ##__VA_ARGS__); \
+ int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry, \
+ BCH_FSCK_ERR_##_err_type, \
+ msg, ##__VA_ARGS__); \
\
- if (type == BTREE_ERR_FIXABLE && \
- write == READ && \
- !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \
- mustfix_fsck_err(c, "%s", _buf); \
- goto out; \
+ if (_ret != -BCH_ERR_fsck_fix) { \
+ ret = _ret; \
+ goto fsck_err; \
} \
\
- switch (write) { \
- case READ: \
- bch_err(c, "%s", _buf); \
- \
- switch (type) { \
- case BTREE_ERR_FIXABLE: \
- ret = BCH_FSCK_ERRORS_NOT_FIXED; \
- goto fsck_err; \
- case BTREE_ERR_WANT_RETRY: \
- if (have_retry) { \
- ret = BTREE_RETRY_READ; \
- goto fsck_err; \
- } \
- break; \
- case BTREE_ERR_MUST_RETRY: \
- ret = BTREE_RETRY_READ; \
- goto fsck_err; \
- case BTREE_ERR_FATAL: \
- ret = BCH_FSCK_ERRORS_NOT_FIXED; \
- goto fsck_err; \
- } \
- break; \
- case WRITE: \
- bch_err(c, "corrupt metadata before write: %s", _buf); \
- \
- if (bch2_fs_inconsistent(c)) { \
- ret = BCH_FSCK_ERRORS_NOT_FIXED; \
- goto fsck_err; \
- } \
- break; \
- } \
-out: \
- true; \
+ *saw_error = true; \
})
#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false)
-static int validate_bset(struct bch_fs *c, struct btree *b,
- struct bset *i, unsigned sectors,
- unsigned *whiteout_u64s, int write,
- bool have_retry)
+/*
+ * When btree topology repair changes the start or end of a node, that might
+ * mean we have to drop keys that are no longer inside the node:
+ */
+__cold
+void bch2_btree_node_drop_keys_outside_node(struct btree *b)
{
- struct bkey_packed *k, *prev = NULL;
- struct bpos prev_pos = POS_MIN;
- enum bkey_type type = btree_node_type(b);
- bool seen_non_whiteout = false;
- const char *err;
- int ret = 0;
+ for_each_bset(b, t) {
+ struct bset *i = bset(b, t);
+ struct bkey_packed *k;
- if (i == &b->data->keys) {
- /* These indicate that we read the wrong btree node: */
- btree_err_on(BTREE_NODE_ID(b->data) != b->btree_id,
- BTREE_ERR_MUST_RETRY, c, b, i,
- "incorrect btree id");
+ for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
+ if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0)
+ break;
- btree_err_on(BTREE_NODE_LEVEL(b->data) != b->level,
- BTREE_ERR_MUST_RETRY, c, b, i,
- "incorrect level");
+ if (k != i->start) {
+ unsigned shift = (u64 *) k - (u64 *) i->start;
+
+ memmove_u64s_down(i->start, k,
+ (u64 *) vstruct_end(i) - (u64 *) k);
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift);
+ set_btree_bset_end(b, t);
+ }
- if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
- u64 *p = (u64 *) &b->data->ptr;
+ for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
+ if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0)
+ break;
- *p = swab64(*p);
- bch2_bpos_swab(&b->data->min_key);
- bch2_bpos_swab(&b->data->max_key);
+ if (k != vstruct_last(i)) {
+ i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start);
+ set_btree_bset_end(b, t);
}
+ }
- btree_err_on(bkey_cmp(b->data->max_key, b->key.k.p),
- BTREE_ERR_MUST_RETRY, c, b, i,
- "incorrect max key");
+ /*
+ * Always rebuild search trees: eytzinger search tree nodes directly
+ * depend on the values of min/max key:
+ */
+ bch2_bset_set_no_aux_tree(b, b->set);
+ bch2_btree_build_aux_trees(b);
+ b->nr = bch2_btree_node_count_keys(b);
+
+ struct bkey_s_c k;
+ struct bkey unpacked;
+ struct btree_node_iter iter;
+ for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
+ BUG_ON(bpos_lt(k.k->p, b->data->min_key));
+ BUG_ON(bpos_gt(k.k->p, b->data->max_key));
+ }
+}
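
Usage note, drawn from the read path later in this patch: the helper above is only invoked when the node's btree_ptr_v2 key has BTREE_PTR_RANGE_UPDATED set, i.e. when topology repair rewrote the node's range:

	/* in bch2_btree_node_read_done(), below: */
	if (updated_range)
		bch2_btree_node_drop_keys_outside_node(b);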
- /* XXX: ideally we would be validating min_key too */
-#if 0
- /*
- * not correct anymore, due to btree node write error
- * handling
- *
- * need to add b->data->seq to btree keys and verify
- * against that
- */
- btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key),
- b->data->ptr),
- BTREE_ERR_FATAL, c, b, i,
- "incorrect backpointer");
-#endif
- err = bch2_bkey_format_validate(&b->data->format);
- btree_err_on(err,
- BTREE_ERR_FATAL, c, b, i,
- "invalid bkey format: %s", err);
+static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
+ struct btree *b, struct bset *i,
+ unsigned offset, unsigned sectors,
+ int write, bool have_retry, bool *saw_error)
+{
+ unsigned version = le16_to_cpu(i->version);
+ unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+ int ret = 0;
+
+ btree_err_on(!bch2_version_compatible(version),
+ -BCH_ERR_btree_node_read_err_incompatible,
+ c, ca, b, i, NULL,
+ btree_node_unsupported_version,
+ "unsupported bset version %u.%u",
+ BCH_VERSION_MAJOR(version),
+ BCH_VERSION_MINOR(version));
+
+ if (btree_err_on(version < c->sb.version_min,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i, NULL,
+ btree_node_bset_older_than_sb_min,
+ "bset version %u older than superblock version_min %u",
+ version, c->sb.version_min)) {
+ mutex_lock(&c->sb_lock);
+ c->disk_sb.sb->version_min = cpu_to_le16(version);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
}
- if (btree_err_on(le16_to_cpu(i->version) != BCACHE_BSET_VERSION,
- BTREE_ERR_FIXABLE, c, b, i,
- "unsupported bset version")) {
- i->version = cpu_to_le16(BCACHE_BSET_VERSION);
- i->u64s = 0;
- return 0;
+ if (btree_err_on(BCH_VERSION_MAJOR(version) >
+ BCH_VERSION_MAJOR(c->sb.version),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i, NULL,
+ btree_node_bset_newer_than_sb,
+ "bset version %u newer than superblock version %u",
+ version, c->sb.version)) {
+ mutex_lock(&c->sb_lock);
+ c->disk_sb.sb->version = cpu_to_le16(version);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
}
- if (btree_err_on(b->written + sectors > c->opts.btree_node_size,
- BTREE_ERR_FIXABLE, c, b, i,
- "bset past end of btree node")) {
+ btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
+ -BCH_ERR_btree_node_read_err_incompatible,
+ c, ca, b, i, NULL,
+ btree_node_unsupported_version,
+ "BSET_SEPARATE_WHITEOUTS no longer supported");
+
+ if (!write &&
+ btree_err_on(offset + sectors > (ptr_written ?: btree_sectors(c)),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i, NULL,
+ bset_past_end_of_btree_node,
+ "bset past end of btree node (offset %u len %u but written %zu)",
+ offset, sectors, ptr_written ?: btree_sectors(c)))
i->u64s = 0;
- return 0;
- }
- btree_err_on(b->written && !i->u64s,
- BTREE_ERR_FIXABLE, c, b, i,
+ btree_err_on(offset && !i->u64s,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i, NULL,
+ bset_empty,
"empty bset");
- if (!BSET_SEPARATE_WHITEOUTS(i)) {
- seen_non_whiteout = true;
- *whiteout_u64s = 0;
- }
+ btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i, NULL,
+ bset_wrong_sector_offset,
+ "bset at wrong sector offset");
- for (k = i->start;
- k != vstruct_last(i);) {
- struct bkey_s_c u;
- struct bkey tmp;
- const char *invalid;
+ if (!offset) {
+ struct btree_node *bn =
+ container_of(i, struct btree_node, keys);
+ /* These indicate that we read the wrong btree node: */
- if (btree_err_on(!k->u64s,
- BTREE_ERR_FIXABLE, c, b, i,
- "KEY_U64s 0: %zu bytes of metadata lost",
- vstruct_end(i) - (void *) k)) {
- i->u64s = cpu_to_le16((u64 *) k - i->_data);
- break;
- }
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bch_btree_ptr_v2 *bp =
+ &bkey_i_to_btree_ptr_v2(&b->key)->v;
- if (btree_err_on(bkey_next(k) > vstruct_last(i),
- BTREE_ERR_FIXABLE, c, b, i,
- "key extends past end of bset")) {
- i->u64s = cpu_to_le16((u64 *) k - i->_data);
- break;
+ /* XXX endianness */
+ btree_err_on(bp->seq != bn->keys.seq,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL, NULL,
+ bset_bad_seq,
+ "incorrect sequence number (wrong btree node)");
}
- if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
- BTREE_ERR_FIXABLE, c, b, i,
- "invalid bkey format %u", k->format)) {
- i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
- memmove_u64s_down(k, bkey_next(k),
- (u64 *) vstruct_end(i) - (u64 *) k);
- continue;
+ btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, i, NULL,
+ btree_node_bad_btree,
+ "incorrect btree id");
+
+ btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, i, NULL,
+ btree_node_bad_level,
+ "incorrect level");
+
+ if (!write)
+ compat_btree_node(b->c.level, b->c.btree_id, version,
+ BSET_BIG_ENDIAN(i), write, bn);
+
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bch_btree_ptr_v2 *bp =
+ &bkey_i_to_btree_ptr_v2(&b->key)->v;
+
+ if (BTREE_PTR_RANGE_UPDATED(bp)) {
+ b->data->min_key = bp->min_key;
+ b->data->max_key = b->key.k.p;
+ }
+
+ btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL, NULL,
+ btree_node_bad_min_key,
+ "incorrect min_key: got %s should be %s",
+ (printbuf_reset(&buf1),
+ bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
+ (printbuf_reset(&buf2),
+ bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf));
}
- if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
- bch2_bkey_swab(type, &b->format, k);
+ btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, i, NULL,
+ btree_node_bad_max_key,
+ "incorrect max key %s",
+ (printbuf_reset(&buf1),
+ bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));
+
+ if (write)
+ compat_btree_node(b->c.level, b->c.btree_id, version,
+ BSET_BIG_ENDIAN(i), write, bn);
+
+ btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
+ -BCH_ERR_btree_node_read_err_bad_node,
+ c, ca, b, i, NULL,
+ btree_node_bad_format,
+ "invalid bkey format: %s\n %s", buf1.buf,
+ (printbuf_reset(&buf2),
+ bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
+ printbuf_reset(&buf1);
+
+ compat_bformat(b->c.level, b->c.btree_id, version,
+ BSET_BIG_ENDIAN(i), write,
+ &bn->format);
+ }
+fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ return ret;
+}
- u = bkey_disassemble(b, k, &tmp);
+static int btree_node_bkey_val_validate(struct bch_fs *c, struct btree *b,
+ struct bkey_s_c k,
+ enum bch_validate_flags flags)
+{
+ return bch2_bkey_val_validate(c, k, (struct bkey_validate_context) {
+ .from = BKEY_VALIDATE_btree_node,
+ .level = b->c.level,
+ .btree = b->c.btree_id,
+ .flags = flags
+ });
+}
- invalid = __bch2_bkey_invalid(c, type, u) ?:
- bch2_bkey_in_btree_node(b, u) ?:
- (write ? bch2_bkey_val_invalid(c, type, u) : NULL);
- if (invalid) {
- char buf[160];
+static int bset_key_validate(struct bch_fs *c, struct btree *b,
+ struct bkey_s_c k,
+ bool updated_range,
+ enum bch_validate_flags flags)
+{
+ struct bkey_validate_context from = (struct bkey_validate_context) {
+ .from = BKEY_VALIDATE_btree_node,
+ .level = b->c.level,
+ .btree = b->c.btree_id,
+ .flags = flags,
+ };
+ return __bch2_bkey_validate(c, k, from) ?:
+ (!updated_range ? bch2_bkey_in_btree_node(c, b, k, from) : 0) ?:
+ (flags & BCH_VALIDATE_write ? btree_node_bkey_val_validate(c, b, k, flags) : 0);
+}
- bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
- btree_err(BTREE_ERR_FIXABLE, c, b, i,
- "invalid bkey:\n%s\n%s", invalid, buf);
+static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
+ struct bset *i, struct bkey_packed *k)
+{
+ if (bkey_p_next(k) > vstruct_last(i))
+ return false;
- i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
- memmove_u64s_down(k, bkey_next(k),
- (u64 *) vstruct_end(i) - (u64 *) k);
- continue;
+ if (k->format > KEY_FORMAT_CURRENT)
+ return false;
+
+ if (!bkeyp_u64s_valid(&b->format, k))
+ return false;
+
+ struct bkey tmp;
+ struct bkey_s u = __bkey_disassemble(b, k, &tmp);
+ return !__bch2_bkey_validate(c, u.s_c,
+ (struct bkey_validate_context) {
+ .from = BKEY_VALIDATE_btree_node,
+ .level = b->c.level,
+ .btree = b->c.btree_id,
+ .flags = BCH_VALIDATE_silent
+ });
+}
+
+static inline int btree_node_read_bkey_cmp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
+{
+ return bch2_bkey_cmp_packed(b, l, r)
+ ?: (int) bkey_deleted(r) - (int) bkey_deleted(l);
+}
+
+static int validate_bset_keys(struct bch_fs *c, struct btree *b,
+ struct bset *i, int write,
+ bool have_retry, bool *saw_error)
+{
+ unsigned version = le16_to_cpu(i->version);
+ struct bkey_packed *k, *prev = NULL;
+ struct printbuf buf = PRINTBUF;
+ bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
+ int ret = 0;
+
+ for (k = i->start;
+ k != vstruct_last(i);) {
+ struct bkey_s u;
+ struct bkey tmp;
+ unsigned next_good_key;
+
+ if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i, k,
+ btree_node_bkey_past_bset_end,
+ "key extends past end of bset")) {
+ i->u64s = cpu_to_le16((u64 *) k - i->_data);
+ break;
}
- /*
- * with the separate whiteouts thing (used for extents), the
- * second set of keys actually can have whiteouts too, so we
- * can't solely go off bkey_whiteout()...
- */
+ if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i, k,
+ btree_node_bkey_bad_format,
+ "invalid bkey format %u", k->format))
+ goto drop_this_key;
+
+ if (btree_err_on(!bkeyp_u64s_valid(&b->format, k),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i, k,
+ btree_node_bkey_bad_u64s,
+ "bad k->u64s %u (min %u max %zu)", k->u64s,
+ bkeyp_key_u64s(&b->format, k),
+ U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k)))
+ goto drop_this_key;
+
+ if (!write)
+ bch2_bkey_compat(b->c.level, b->c.btree_id, version,
+ BSET_BIG_ENDIAN(i), write,
+ &b->format, k);
+
+ u = __bkey_disassemble(b, k, &tmp);
+
+ ret = bset_key_validate(c, b, u.s_c, updated_range, write);
+ if (ret == -BCH_ERR_fsck_delete_bkey)
+ goto drop_this_key;
+ if (ret)
+ goto fsck_err;
- if (!seen_non_whiteout &&
- (!bkey_whiteout(k) ||
- (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) {
- *whiteout_u64s = k->_data - i->_data;
- seen_non_whiteout = true;
- } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
- btree_err(BTREE_ERR_FATAL, c, b, i,
- "keys out of order: %llu:%llu > %llu:%llu",
- prev_pos.inode,
- prev_pos.offset,
- u.k->p.inode,
- bkey_start_offset(u.k));
- /* XXX: repair this */
+ if (write)
+ bch2_bkey_compat(b->c.level, b->c.btree_id, version,
+ BSET_BIG_ENDIAN(i), write,
+ &b->format, k);
+
+ if (prev && btree_node_read_bkey_cmp(b, prev, k) >= 0) {
+ struct bkey up = bkey_unpack_key(b, prev);
+
+ printbuf_reset(&buf);
+ prt_printf(&buf, "keys out of order: ");
+ bch2_bkey_to_text(&buf, &up);
+ prt_printf(&buf, " > ");
+ bch2_bkey_to_text(&buf, u.k);
+
+ if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i, k,
+ btree_node_bkey_out_of_order,
+ "%s", buf.buf))
+ goto drop_this_key;
}
- prev_pos = u.k->p;
prev = k;
- k = bkey_next(k);
- }
+ k = bkey_p_next(k);
+ continue;
+drop_this_key:
+ next_good_key = k->u64s;
+
+ if (!next_good_key ||
+ (BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN &&
+ version >= bcachefs_metadata_version_snapshot)) {
+ /*
+ * only do scanning if bch2_bkey_compat() has nothing to
+ * do
+ */
- SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+ if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
+ for (next_good_key = 1;
+ next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
+ next_good_key++)
+ if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
+ goto got_good_key;
+ }
+
+ /*
+ * didn't find a good key, have to truncate the rest of
+ * the bset
+ */
+ next_good_key = (u64 *) vstruct_last(i) - (u64 *) k;
+ }
+got_good_key:
+ le16_add_cpu(&i->u64s, -next_good_key);
+ memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k);
+ set_btree_node_need_rewrite(b);
+ }
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
-int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry)
+int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
+ struct btree *b, bool have_retry, bool *saw_error)
{
struct btree_node_entry *bne;
- struct btree_node_iter_large *iter;
+ struct sort_iter *iter;
struct btree_node *sorted;
struct bkey_packed *k;
struct bset *i;
- bool used_mempool;
+ bool used_mempool, blacklisted;
+ bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
unsigned u64s;
- int ret, retry_read = 0, write = READ;
+ unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
+ u64 max_journal_seq = 0;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0, retry_read = 0, write = READ;
+ u64 start_time = local_clock();
- iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
- __bch2_btree_node_iter_large_init(iter, btree_node_is_extents(b));
+ b->version_ondisk = U16_MAX;
+ /* We might get called multiple times on read retry: */
+ b->written = 0;
+
+ iter = mempool_alloc(&c->fill_iter, GFP_NOFS);
+ sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2);
if (bch2_meta_read_fault("btree"))
- btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL,
+ btree_err(-BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL, NULL,
+ btree_node_fault_injected,
"dynamic fault");
btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
- BTREE_ERR_MUST_RETRY, c, b, NULL,
- "bad magic");
-
- btree_err_on(!b->data->keys.seq,
- BTREE_ERR_MUST_RETRY, c, b, NULL,
- "bad btree header");
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL, NULL,
+ btree_node_bad_magic,
+ "bad magic: want %llx, got %llx",
+ bset_magic(c), le64_to_cpu(b->data->magic));
+
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bch_btree_ptr_v2 *bp =
+ &bkey_i_to_btree_ptr_v2(&b->key)->v;
+
+ bch2_bpos_to_text(&buf, b->data->min_key);
+ prt_str(&buf, "-");
+ bch2_bpos_to_text(&buf, b->data->max_key);
+
+ btree_err_on(b->data->keys.seq != bp->seq,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL, NULL,
+ btree_node_bad_seq,
+ "got wrong btree node: got\n%s",
+ (printbuf_reset(&buf),
+ bch2_btree_node_header_to_text(&buf, b->data),
+ buf.buf));
+ } else {
+ btree_err_on(!b->data->keys.seq,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL, NULL,
+ btree_node_bad_seq,
+ "bad btree header: seq 0\n%s",
+ (printbuf_reset(&buf),
+ bch2_btree_node_header_to_text(&buf, b->data),
+ buf.buf));
+ }
- while (b->written < c->opts.btree_node_size) {
- unsigned sectors, whiteout_u64s = 0;
- struct nonce nonce;
- struct bch_csum csum;
+ while (b->written < (ptr_written ?: btree_sectors(c))) {
+ unsigned sectors;
bool first = !b->written;
- if (!b->written) {
+ if (first) {
+ bne = NULL;
i = &b->data->keys;
-
- btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
- BTREE_ERR_WANT_RETRY, c, b, i,
- "unknown checksum type");
-
- nonce = btree_nonce(i, b->written << 9);
- csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
-
- btree_err_on(bch2_crc_cmp(csum, b->data->csum),
- BTREE_ERR_WANT_RETRY, c, b, i,
- "invalid checksum");
-
- bset_encrypt(c, i, b->written << 9);
-
- sectors = vstruct_sectors(b->data, c->block_bits);
-
- btree_node_set_format(b, b->data->format);
} else {
bne = write_block(b);
i = &bne->keys;
if (i->seq != b->data->keys.seq)
break;
+ }
- btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
- BTREE_ERR_WANT_RETRY, c, b, i,
- "unknown checksum type");
-
- nonce = btree_nonce(i, b->written << 9);
- csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+ struct nonce nonce = btree_nonce(i, b->written << 9);
+ bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i));
+
+ btree_err_on(!good_csum_type,
+ bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))
+ ? -BCH_ERR_btree_node_read_err_must_retry
+ : -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i, NULL,
+ bset_unknown_csum,
+ "unknown checksum type %llu", BSET_CSUM_TYPE(i));
+
+ if (first) {
+ if (good_csum_type) {
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
+ bool csum_bad = bch2_crc_cmp(b->data->csum, csum);
+ if (csum_bad)
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+
+ btree_err_on(csum_bad,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i, NULL,
+ bset_bad_csum,
+ "%s",
+ (printbuf_reset(&buf),
+ bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum),
+ buf.buf));
+
+ ret = bset_encrypt(c, i, b->written << 9);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "decrypting btree node: %s", bch2_err_str(ret)))
+ goto fsck_err;
+ }
- btree_err_on(bch2_crc_cmp(csum, bne->csum),
- BTREE_ERR_WANT_RETRY, c, b, i,
- "invalid checksum");
+ btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
+ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
+ -BCH_ERR_btree_node_read_err_incompatible,
+ c, NULL, b, NULL, NULL,
+ btree_node_unsupported_version,
+ "btree node does not have NEW_EXTENT_OVERWRITE set");
- bset_encrypt(c, i, b->written << 9);
+ sectors = vstruct_sectors(b->data, c->block_bits);
+ } else {
+ if (good_csum_type) {
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+ bool csum_bad = bch2_crc_cmp(bne->csum, csum);
+ if (ca && csum_bad)
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+
+ btree_err_on(csum_bad,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i, NULL,
+ bset_bad_csum,
+ "%s",
+ (printbuf_reset(&buf),
+ bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum),
+ buf.buf));
+
+ ret = bset_encrypt(c, i, b->written << 9);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "decrypting btree node: %s", bch2_err_str(ret)))
+ goto fsck_err;
+ }
sectors = vstruct_sectors(bne, c->block_bits);
}
- ret = validate_bset(c, b, i, sectors, &whiteout_u64s,
- READ, have_retry);
+ b->version_ondisk = min(b->version_ondisk,
+ le16_to_cpu(i->version));
+
+ ret = validate_bset(c, ca, b, i, b->written, sectors,
+ READ, have_retry, saw_error);
if (ret)
goto fsck_err;
- b->written += sectors;
+ if (!b->written)
+ btree_node_set_format(b, b->data->format);
- ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);
- if (ret < 0) {
- btree_err(BTREE_ERR_FATAL, c, b, i,
- "insufficient memory");
- goto err;
- }
+ ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error);
+ if (ret)
+ goto fsck_err;
- if (ret) {
- btree_err_on(first,
- BTREE_ERR_FIXABLE, c, b, i,
- "first btree node bset has blacklisted journal seq");
- if (!first)
- continue;
- }
+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+
+ blacklisted = bch2_journal_seq_is_blacklisted(c,
+ le64_to_cpu(i->journal_seq),
+ true);
+
+ btree_err_on(blacklisted && first,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i, NULL,
+ bset_blacklisted_journal_seq,
+ "first btree node bset has blacklisted journal seq (%llu)",
+ le64_to_cpu(i->journal_seq));
+
+ btree_err_on(blacklisted && ptr_written,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i, NULL,
+ first_bset_blacklisted_journal_seq,
+ "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
+ le64_to_cpu(i->journal_seq),
+ b->written, b->written + sectors, ptr_written);
+
+ b->written += sectors;
+
+ if (blacklisted && !first)
+ continue;
- bch2_btree_node_iter_large_push(iter, b,
- i->start,
- vstruct_idx(i, whiteout_u64s));
+ sort_iter_add(iter,
+ vstruct_idx(i, 0),
+ vstruct_last(i));
- bch2_btree_node_iter_large_push(iter, b,
- vstruct_idx(i, whiteout_u64s),
- vstruct_last(i));
+ max_journal_seq = max(max_journal_seq, le64_to_cpu(i->journal_seq));
}
- for (bne = write_block(b);
- bset_byte_offset(b, bne) < btree_bytes(c);
- bne = (void *) bne + block_bytes(c))
- btree_err_on(bne->keys.seq == b->data->keys.seq,
- BTREE_ERR_WANT_RETRY, c, b, NULL,
- "found bset signature after last bset");
+ if (ptr_written) {
+ btree_err_on(b->written < ptr_written,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, NULL, NULL,
+ btree_node_data_missing,
+ "btree node data missing: expected %u sectors, found %u",
+ ptr_written, b->written);
+ } else {
+ for (bne = write_block(b);
+ bset_byte_offset(b, bne) < btree_buf_bytes(b);
+ bne = (void *) bne + block_bytes(c))
+ btree_err_on(bne->keys.seq == b->data->keys.seq &&
+ !bch2_journal_seq_is_blacklisted(c,
+ le64_to_cpu(bne->keys.journal_seq),
+ true),
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, NULL, NULL,
+ btree_node_bset_after_end,
+ "found bset signature after last bset");
+ }
- sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool);
+ sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
sorted->keys.u64s = 0;
set_btree_bset(b, b->set, &b->data->keys);
- b->nr = btree_node_is_extents(b)
- ? bch2_extent_sort_fix_overlapping(c, &sorted->keys, b, iter)
- : bch2_key_sort_fix_overlapping(&sorted->keys, b, iter);
+ b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
+ memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0,
+ btree_buf_bytes(b) -
+ sizeof(struct btree_node) -
+ b->nr.live_u64s * sizeof(u64));
u64s = le16_to_cpu(sorted->keys.u64s);
*sorted = *b->data;
@@ -1286,54 +1236,74 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
swap(sorted, b->data);
set_btree_bset(b, b->set, &b->data->keys);
b->nsets = 1;
+ b->data->keys.journal_seq = cpu_to_le64(max_journal_seq);
BUG_ON(b->nr.live_u64s != u64s);
- btree_bounce_free(c, btree_page_order(c), used_mempool, sorted);
+ btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
+
+ if (updated_range)
+ bch2_btree_node_drop_keys_outside_node(b);
i = &b->data->keys;
for (k = i->start; k != vstruct_last(i);) {
- enum bkey_type type = btree_node_type(b);
struct bkey tmp;
- struct bkey_s_c u = bkey_disassemble(b, k, &tmp);
- const char *invalid = bch2_bkey_val_invalid(c, type, u);
-
- if (invalid ||
- (inject_invalid_keys(c) &&
- !bversion_cmp(u.k->version, MAX_VERSION))) {
- char buf[160];
-
- bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
- btree_err(BTREE_ERR_FIXABLE, c, b, i,
- "invalid bkey %s: %s", buf, invalid);
+ struct bkey_s u = __bkey_disassemble(b, k, &tmp);
+ ret = btree_node_bkey_val_validate(c, b, u.s_c, READ);
+ if (ret == -BCH_ERR_fsck_delete_bkey ||
+ (bch2_inject_invalid_keys &&
+ !bversion_cmp(u.k->bversion, MAX_VERSION))) {
btree_keys_account_key_drop(&b->nr, 0, k);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
- memmove_u64s_down(k, bkey_next(k),
+ memmove_u64s_down(k, bkey_p_next(k),
(u64 *) vstruct_end(i) - (u64 *) k);
set_btree_bset_end(b, b->set);
+ set_btree_node_need_rewrite(b);
continue;
}
+ if (ret)
+ goto fsck_err;
+
+ if (u.k->type == KEY_TYPE_btree_ptr_v2) {
+ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);
- k = bkey_next(k);
+ bp.v->mem_ptr = 0;
+ }
+
+ k = bkey_p_next(k);
}
bch2_bset_build_aux_tree(b, b->set, false);
- set_needs_whiteout(btree_bset_first(b));
+ set_needs_whiteout(btree_bset_first(b), true);
btree_node_reset_sib_u64s(b);
+
+ rcu_read_lock();
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+ struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
+
+ if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw)
+ set_btree_node_need_rewrite(b);
+ }
+ rcu_read_unlock();
+
+ if (!ptr_written)
+ set_btree_node_need_rewrite(b);
out:
mempool_free(iter, &c->fill_iter);
+ printbuf_exit(&buf);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
return retry_read;
-err:
fsck_err:
- if (ret == BTREE_RETRY_READ) {
+ if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
+ ret == -BCH_ERR_btree_node_read_err_must_retry) {
retry_read = 1;
} else {
- bch2_inconsistent_error(c);
set_btree_node_read_error(b);
+ bch2_btree_lost_data(c, b->c.btree_id);
}
goto out;
}
@@ -1343,23 +1313,24 @@ static void btree_node_read_work(struct work_struct *work)
struct btree_read_bio *rb =
container_of(work, struct btree_read_bio, work);
struct bch_fs *c = rb->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
- struct btree *b = rb->bio.bi_private;
+ struct bch_dev *ca = rb->have_ioref ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
+ struct btree *b = rb->b;
struct bio *bio = &rb->bio;
- struct bch_devs_mask avoid;
+ struct bch_io_failures failed = { .nr = 0 };
+ struct printbuf buf = PRINTBUF;
+ bool saw_error = false;
+ bool retry = false;
bool can_retry;
- memset(&avoid, 0, sizeof(avoid));
-
goto start;
while (1) {
+ retry = true;
bch_info(c, "retrying read");
- ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
- rb->have_ioref = bch2_dev_get_ioref(ca, READ);
- bio_reset(bio);
- bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
+ ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ);
+ rb->have_ioref = ca != NULL;
+ bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
- bio->bi_iter.bi_size = btree_bytes(c);
+ bio->bi_iter.bi_size = btree_buf_bytes(b);
if (rb->have_ioref) {
bio_set_dev(bio, ca->disk_sb.bdev);
@@ -1368,26 +1339,58 @@ static void btree_node_read_work(struct work_struct *work)
bio->bi_status = BLK_STS_REMOVED;
}
start:
- bch2_dev_io_err_on(bio->bi_status, ca, "btree read");
+ printbuf_reset(&buf);
+ bch2_btree_pos_to_text(&buf, c, b);
+ bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read,
+ "btree read error %s for %s",
+ bch2_blk_status_to_str(bio->bi_status), buf.buf);
if (rb->have_ioref)
percpu_ref_put(&ca->io_ref);
rb->have_ioref = false;
- __set_bit(rb->pick.ptr.dev, avoid.d);
- can_retry = bch2_btree_pick_ptr(c, b, &avoid, &rb->pick) > 0;
+ bch2_mark_io_failure(&failed, &rb->pick);
+
+ can_retry = bch2_bkey_pick_read_device(c,
+ bkey_i_to_s_c(&b->key),
+ &failed, &rb->pick) > 0;
if (!bio->bi_status &&
- !bch2_btree_node_read_done(c, b, can_retry))
+ !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) {
+ if (retry)
+ bch_info(c, "retry success");
break;
+ }
+
+ saw_error = true;
if (!can_retry) {
set_btree_node_read_error(b);
+ bch2_btree_lost_data(c, b->c.btree_id);
break;
}
}
- bch2_time_stats_update(&c->times[BCH_TIME_btree_read], rb->start_time);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
+ rb->start_time);
bio_put(&rb->bio);
+
+ if ((saw_error ||
+ btree_node_need_rewrite(b)) &&
+ !btree_node_read_error(b) &&
+ c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
+ if (saw_error) {
+ printbuf_reset(&buf);
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_str(&buf, " ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s",
+ __func__, buf.buf);
+ }
+
+ bch2_btree_node_rewrite_async(c, b);
+ }
+
+ printbuf_exit(&buf);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
}
@@ -1399,58 +1402,349 @@ static void btree_node_read_endio(struct bio *bio)
struct bch_fs *c = rb->c;
if (rb->have_ioref) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+ struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
+
bch2_latency_acct(ca, rb->start_time, READ);
}
- queue_work(system_unbound_wq, &rb->work);
+ queue_work(c->btree_read_complete_wq, &rb->work);
}
-void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
+struct btree_node_read_all {
+ struct closure cl;
+ struct bch_fs *c;
+ struct btree *b;
+ unsigned nr;
+ void *buf[BCH_REPLICAS_MAX];
+ struct bio *bio[BCH_REPLICAS_MAX];
+ blk_status_t err[BCH_REPLICAS_MAX];
+};
+
+static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
+{
+ struct btree_node *bn = data;
+ struct btree_node_entry *bne;
+ unsigned offset = 0;
+
+ if (le64_to_cpu(bn->magic) != bset_magic(c))
+ return 0;
+
+ while (offset < btree_sectors(c)) {
+ if (!offset) {
+ offset += vstruct_sectors(bn, c->block_bits);
+ } else {
+ bne = data + (offset << 9);
+ if (bne->keys.seq != bn->keys.seq)
+ break;
+ offset += vstruct_sectors(bne, c->block_bits);
+ }
+ }
+
+ return offset;
+}
+
+static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
+{
+ struct btree_node *bn = data;
+ struct btree_node_entry *bne;
+
+ if (!offset)
+ return false;
+
+ while (offset < btree_sectors(c)) {
+ bne = data + (offset << 9);
+ if (bne->keys.seq == bn->keys.seq)
+ return true;
+ offset++;
+ }
+
+ return false;
+}
+
+static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
+{
+ closure_type(ra, struct btree_node_read_all, cl);
+ struct bch_fs *c = ra->c;
+ struct btree *b = ra->b;
+ struct printbuf buf = PRINTBUF;
+ bool dump_bset_maps = false;
+ bool have_retry = false;
+ int ret = 0, best = -1, write = READ;
+ unsigned i, written = 0, written2 = 0;
+ __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
+ ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
+ bool _saw_error = false, *saw_error = &_saw_error;
+
+ for (i = 0; i < ra->nr; i++) {
+ struct btree_node *bn = ra->buf[i];
+
+ if (ra->err[i])
+ continue;
+
+ if (le64_to_cpu(bn->magic) != bset_magic(c) ||
+ (seq && seq != bn->keys.seq))
+ continue;
+
+ if (best < 0) {
+ best = i;
+ written = btree_node_sectors_written(c, bn);
+ continue;
+ }
+
+ written2 = btree_node_sectors_written(c, ra->buf[i]);
+ if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, NULL, NULL,
+ btree_node_replicas_sectors_written_mismatch,
+ "btree node sectors written mismatch: %u != %u",
+ written, written2) ||
+ btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, NULL, NULL,
+ btree_node_bset_after_end,
+ "found bset signature after last bset") ||
+ btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, NULL, NULL,
+ btree_node_replicas_data_mismatch,
+ "btree node replicas content mismatch"))
+ dump_bset_maps = true;
+
+ if (written2 > written) {
+ written = written2;
+ best = i;
+ }
+ }
+fsck_err:
+ if (dump_bset_maps) {
+ for (i = 0; i < ra->nr; i++) {
+ struct btree_node *bn = ra->buf[i];
+ struct btree_node_entry *bne = NULL;
+ unsigned offset = 0, sectors;
+ bool gap = false;
+
+ if (ra->err[i])
+ continue;
+
+ printbuf_reset(&buf);
+
+ while (offset < btree_sectors(c)) {
+ if (!offset) {
+ sectors = vstruct_sectors(bn, c->block_bits);
+ } else {
+ bne = ra->buf[i] + (offset << 9);
+ if (bne->keys.seq != bn->keys.seq)
+ break;
+ sectors = vstruct_sectors(bne, c->block_bits);
+ }
+
+ prt_printf(&buf, " %u-%u", offset, offset + sectors);
+ if (bne && bch2_journal_seq_is_blacklisted(c,
+ le64_to_cpu(bne->keys.journal_seq), false))
+ prt_printf(&buf, "*");
+ offset += sectors;
+ }
+
+ while (offset < btree_sectors(c)) {
+ bne = ra->buf[i] + (offset << 9);
+ if (bne->keys.seq == bn->keys.seq) {
+ if (!gap)
+ prt_printf(&buf, " GAP");
+ gap = true;
+
+ sectors = vstruct_sectors(bne, c->block_bits);
+ prt_printf(&buf, " %u-%u", offset, offset + sectors);
+ if (bch2_journal_seq_is_blacklisted(c,
+ le64_to_cpu(bne->keys.journal_seq), false))
+ prt_printf(&buf, "*");
+ }
+ offset++;
+ }
+
+ bch_err(c, "replica %u:%s", i, buf.buf);
+ }
+ }
+
+ if (best >= 0) {
+ memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
+ ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
+ } else {
+ ret = -1;
+ }
+
+ if (ret) {
+ set_btree_node_read_error(b);
+ bch2_btree_lost_data(c, b->c.btree_id);
+ } else if (*saw_error)
+ bch2_btree_node_rewrite_async(c, b);
+
+ for (i = 0; i < ra->nr; i++) {
+ mempool_free(ra->buf[i], &c->btree_bounce_pool);
+ bio_put(ra->bio[i]);
+ }
+
+ closure_debug_destroy(&ra->cl);
+ kfree(ra);
+ printbuf_exit(&buf);
+
+ clear_btree_node_read_in_flight(b);
+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+}
+
+static void btree_node_read_all_replicas_endio(struct bio *bio)
+{
+ struct btree_read_bio *rb =
+ container_of(bio, struct btree_read_bio, bio);
+ struct bch_fs *c = rb->c;
+ struct btree_node_read_all *ra = rb->ra;
+
+ if (rb->have_ioref) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
+
+ bch2_latency_acct(ca, rb->start_time, READ);
+ }
+
+ ra->err[rb->idx] = bio->bi_status;
+ closure_put(&ra->cl);
+}
+
+/*
+ * XXX This allocates multiple times from the same mempools, and can deadlock
+ * under sufficient memory pressure (but is only a debug path)
+ */
+static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync)
+{
+ struct bkey_s_c k = bkey_i_to_s_c(&b->key);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded pick;
+ struct btree_node_read_all *ra;
+ unsigned i;
+
+ ra = kzalloc(sizeof(*ra), GFP_NOFS);
+ if (!ra)
+ return -BCH_ERR_ENOMEM_btree_node_read_all_replicas;
+
+ closure_init(&ra->cl, NULL);
+ ra->c = c;
+ ra->b = b;
+ ra->nr = bch2_bkey_nr_ptrs(k);
+
+ for (i = 0; i < ra->nr; i++) {
+ ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
+ ra->bio[i] = bio_alloc_bioset(NULL,
+ buf_pages(ra->buf[i], btree_buf_bytes(b)),
+ REQ_OP_READ|REQ_SYNC|REQ_META,
+ GFP_NOFS,
+ &c->btree_bio);
+ }
+
+ i = 0;
+ bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ struct btree_read_bio *rb =
+ container_of(ra->bio[i], struct btree_read_bio, bio);
+ rb->c = c;
+ rb->b = b;
+ rb->ra = ra;
+ rb->start_time = local_clock();
+ rb->have_ioref = ca != NULL;
+ rb->idx = i;
+ rb->pick = pick;
+ rb->bio.bi_iter.bi_sector = pick.ptr.offset;
+ rb->bio.bi_end_io = btree_node_read_all_replicas_endio;
+ bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b));
+
+ if (rb->have_ioref) {
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
+ bio_sectors(&rb->bio));
+ bio_set_dev(&rb->bio, ca->disk_sb.bdev);
+
+ closure_get(&ra->cl);
+ submit_bio(&rb->bio);
+ } else {
+ ra->err[i] = BLK_STS_REMOVED;
+ }
+
+ i++;
+ }
+
+ if (sync) {
+ closure_sync(&ra->cl);
+ btree_node_read_all_replicas_done(&ra->cl.work);
+ } else {
+ continue_at(&ra->cl, btree_node_read_all_replicas_done,
+ c->btree_read_complete_wq);
+ }
+
+ return 0;
+}
+
+void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
bool sync)
{
- struct extent_pick_ptr pick;
+ struct bch_fs *c = trans->c;
+ struct extent_ptr_decoded pick;
struct btree_read_bio *rb;
struct bch_dev *ca;
struct bio *bio;
int ret;
- trace_btree_read(c, b);
+ trace_and_count(c, btree_node_read, trans, b);
+
+ if (bch2_verify_all_btree_replicas &&
+ !btree_node_read_all_replicas(c, b, sync))
+ return;
+
+ ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
+ NULL, &pick);
+
+ if (ret <= 0) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "btree node read error: no device to read from\n at ");
+ bch2_btree_pos_to_text(&buf, c, b);
+ bch_err_ratelimited(c, "%s", buf.buf);
+
+ if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
+ bch2_fatal_error(c);
- ret = bch2_btree_pick_ptr(c, b, NULL, &pick);
- if (bch2_fs_fatal_err_on(ret <= 0, c,
- "btree node read error: no device to read from")) {
set_btree_node_read_error(b);
+ bch2_btree_lost_data(c, b->c.btree_id);
+ clear_btree_node_read_in_flight(b);
+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+ printbuf_exit(&buf);
return;
}
- ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
- bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio);
+ bio = bio_alloc_bioset(NULL,
+ buf_pages(b->data, btree_buf_bytes(b)),
+ REQ_OP_READ|REQ_SYNC|REQ_META,
+ GFP_NOFS,
+ &c->btree_bio);
rb = container_of(bio, struct btree_read_bio, bio);
rb->c = c;
+ rb->b = b;
+ rb->ra = NULL;
rb->start_time = local_clock();
- rb->have_ioref = bch2_dev_get_ioref(ca, READ);
+ rb->have_ioref = ca != NULL;
rb->pick = pick;
INIT_WORK(&rb->work, btree_node_read_work);
- bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
bio->bi_iter.bi_sector = pick.ptr.offset;
- bio->bi_iter.bi_size = btree_bytes(c);
bio->bi_end_io = btree_node_read_endio;
- bio->bi_private = b;
- bch2_bio_map(bio, b->data);
-
- set_btree_node_read_in_flight(b);
+ bch2_bio_map(bio, b->data, btree_buf_bytes(b));
if (rb->have_ioref) {
- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE],
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
bio_sectors(bio));
bio_set_dev(bio, ca->disk_sb.bdev);
if (sync) {
submit_bio_wait(bio);
-
- bio->bi_private = b;
+ bch2_latency_acct(ca, rb->start_time, READ);
btree_node_read_work(&rb->work);
} else {
submit_bio(bio);
@@ -1461,14 +1755,14 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
if (sync)
btree_node_read_work(&rb->work);
else
- queue_work(system_unbound_wq, &rb->work);
-
+ queue_work(c->btree_read_complete_wq, &rb->work);
}
}
-int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
- const struct bkey_i *k, unsigned level)
+static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
+ const struct bkey_i *k, unsigned level)
{
+ struct bch_fs *c = trans->c;
struct closure cl;
struct btree *b;
int ret;
@@ -1476,139 +1770,115 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
closure_init_stack(&cl);
do {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
closure_sync(&cl);
} while (ret);
- b = bch2_btree_node_mem_alloc(c);
- bch2_btree_cache_cannibalize_unlock(c);
+ b = bch2_btree_node_mem_alloc(trans, level != 0);
+ bch2_btree_cache_cannibalize_unlock(trans);
BUG_ON(IS_ERR(b));
bkey_copy(&b->key, k);
BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
- bch2_btree_node_read(c, b, true);
+ set_btree_node_read_in_flight(b);
- if (btree_node_read_error(b)) {
- bch2_btree_node_hash_remove(&c->btree_cache, b);
+ /* we can't pass the trans to read_done() for fsck errors, so it must be unlocked */
+ bch2_trans_unlock(trans);
+ bch2_btree_node_read(trans, b, true);
+ if (btree_node_read_error(b)) {
mutex_lock(&c->btree_cache.lock);
- list_move(&b->list, &c->btree_cache.freeable);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
- ret = -EIO;
+ ret = -BCH_ERR_btree_node_read_error;
goto err;
}
bch2_btree_set_root_for_read(c, b);
err:
- six_unlock_write(&b->lock);
- six_unlock_intent(&b->lock);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
return ret;
}
-void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
- struct btree_write *w)
+int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
+ const struct bkey_i *k, unsigned level)
{
- unsigned long old, new, v = READ_ONCE(b->will_make_reachable);
+ return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
+}
+static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
+ struct btree_write *w)
+{
+ unsigned long old, new;
+
+ old = READ_ONCE(b->will_make_reachable);
do {
- old = new = v;
+ new = old;
if (!(old & 1))
break;
new &= ~1UL;
- } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old);
+ } while (!try_cmpxchg(&b->will_make_reachable, &old, new));
if (old & 1)
closure_put(&((struct btree_update *) new)->cl);
bch2_journal_pin_drop(&c->journal, &w->journal);
- closure_wake_up(&w->wait);
}
-static void btree_node_write_done(struct bch_fs *c, struct btree *b)
+static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
{
struct btree_write *w = btree_prev_write(b);
+ unsigned long old, new;
+ unsigned type = 0;
bch2_btree_complete_write(c, b, w);
- btree_node_io_unlock(b);
-}
-
-static void bch2_btree_node_write_error(struct bch_fs *c,
- struct btree_write_bio *wbio)
-{
- struct btree *b = wbio->wbio.bio.bi_private;
- __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
- struct bkey_i_extent *new_key;
- struct bkey_s_extent e;
- struct bch_extent_ptr *ptr;
- struct btree_iter iter;
- int ret;
- __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
- BTREE_MAX_DEPTH,
- b->level, BTREE_ITER_NODES);
-retry:
- ret = bch2_btree_iter_traverse(&iter);
- if (ret)
- goto err;
-
- /* has node been freed? */
- if (iter.l[b->level].b != b) {
- /* node has been freed: */
- BUG_ON(!btree_node_dying(b));
- goto out;
- }
-
- BUG_ON(!btree_node_hashed(b));
-
- bkey_copy(&tmp.k, &b->key);
-
- new_key = bkey_i_to_extent(&tmp.k);
- e = extent_i_to_s(new_key);
- extent_for_each_ptr_backwards(e, ptr)
- if (bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev))
- bch2_extent_drop_ptr(e, ptr);
-
- if (!bch2_extent_nr_ptrs(e.c))
- goto err;
+ old = READ_ONCE(b->flags);
+ do {
+ new = old;
+
+ if ((old & (1U << BTREE_NODE_dirty)) &&
+ (old & (1U << BTREE_NODE_need_write)) &&
+ !(old & (1U << BTREE_NODE_never_write)) &&
+ !(old & (1U << BTREE_NODE_write_blocked)) &&
+ !(old & (1U << BTREE_NODE_will_make_reachable))) {
+ new &= ~(1U << BTREE_NODE_dirty);
+ new &= ~(1U << BTREE_NODE_need_write);
+ new |= (1U << BTREE_NODE_write_in_flight);
+ new |= (1U << BTREE_NODE_write_in_flight_inner);
+ new |= (1U << BTREE_NODE_just_written);
+ new ^= (1U << BTREE_NODE_write_idx);
+
+ type = new & BTREE_WRITE_TYPE_MASK;
+ new &= ~BTREE_WRITE_TYPE_MASK;
+ } else {
+ new &= ~(1U << BTREE_NODE_write_in_flight);
+ new &= ~(1U << BTREE_NODE_write_in_flight_inner);
+ }
+ } while (!try_cmpxchg(&b->flags, &old, new));
- ret = bch2_btree_node_update_key(c, &iter, b, new_key);
- if (ret == -EINTR)
- goto retry;
- if (ret)
- goto err;
-out:
- bch2_btree_iter_unlock(&iter);
- bio_put(&wbio->wbio.bio);
- btree_node_write_done(c, b);
- return;
-err:
- set_btree_node_noevict(b);
- bch2_fs_fatal_error(c, "fatal error writing btree node");
- goto out;
+ if (new & (1U << BTREE_NODE_write_in_flight))
+ __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type);
+ else
+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}
-void bch2_btree_write_error_work(struct work_struct *work)
+static void btree_node_write_done(struct bch_fs *c, struct btree *b)
{
- struct bch_fs *c = container_of(work, struct bch_fs,
- btree_write_error_work);
- struct bio *bio;
-
- while (1) {
- spin_lock_irq(&c->btree_write_error_lock);
- bio = bio_list_pop(&c->btree_write_error_list);
- spin_unlock_irq(&c->btree_write_error_lock);
+ struct btree_trans *trans = bch2_trans_get(c);
- if (!bio)
- break;
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- bch2_btree_node_write_error(c,
- container_of(bio, struct btree_write_bio, wbio.bio));
- }
+ /* we don't need transaction context anymore after we got the lock. */
+ bch2_trans_put(trans);
+ __btree_node_write_done(c, b);
+ six_unlock_read(&b->c.lock);
}
static void btree_node_write_work(struct work_struct *work)
@@ -1617,25 +1887,45 @@ static void btree_node_write_work(struct work_struct *work)
container_of(work, struct btree_write_bio, work);
struct bch_fs *c = wbio->wbio.c;
struct btree *b = wbio->wbio.bio.bi_private;
+ int ret = 0;
btree_bounce_free(c,
- wbio->wbio.order,
+ wbio->data_bytes,
wbio->wbio.used_mempool,
wbio->data);
- if (wbio->wbio.failed.nr) {
- unsigned long flags;
+ bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
+ bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
- spin_lock_irqsave(&c->btree_write_error_lock, flags);
- bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio);
- spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
-
- queue_work(c->wq, &c->btree_write_error_work);
- return;
+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
+ ret = -BCH_ERR_btree_node_write_all_failed;
+ goto err;
}
+ if (wbio->wbio.first_btree_write) {
+ if (wbio->wbio.failed.nr) {
+
+ }
+ } else {
+ ret = bch2_trans_do(c,
+ bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
+ BCH_WATERMARK_interior_updates|
+ BCH_TRANS_COMMIT_journal_reclaim|
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_check_rw,
+ !wbio->wbio.failed.nr));
+ if (ret)
+ goto err;
+ }
+out:
bio_put(&wbio->wbio.bio);
btree_node_write_done(c, b);
+ return;
+err:
+ set_btree_node_noevict(b);
+ bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
+ "writing btree node: %s", bch2_err_str(ret));
+ goto out;
}
static void btree_node_write_endio(struct bio *bio)
@@ -1643,15 +1933,19 @@ static void btree_node_write_endio(struct bio *bio)
struct bch_write_bio *wbio = to_wbio(bio);
struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
struct bch_write_bio *orig = parent ?: wbio;
+ struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio);
struct bch_fs *c = wbio->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
+ struct btree *b = wbio->bio.bi_private;
+ struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL;
unsigned long flags;
if (wbio->have_ioref)
bch2_latency_acct(ca, wbio->submit_time, WRITE);
- if (bio->bi_status == BLK_STS_REMOVED ||
- bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
+ if (!ca ||
+ bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ "btree write error: %s",
+ bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("btree")) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
bch2_dev_list_add_dev(&orig->failed, wbio->dev);
@@ -1664,53 +1958,75 @@ static void btree_node_write_endio(struct bio *bio)
if (parent) {
bio_put(bio);
bio_endio(&parent->bio);
- } else {
- struct btree_write_bio *wb =
- container_of(orig, struct btree_write_bio, wbio);
-
- INIT_WORK(&wb->work, btree_node_write_work);
- queue_work(system_unbound_wq, &wb->work);
+ return;
}
+
+ clear_btree_node_write_in_flight_inner(b);
+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
+ INIT_WORK(&wb->work, btree_node_write_work);
+ queue_work(c->btree_io_complete_wq, &wb->work);
}
static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
struct bset *i, unsigned sectors)
{
- const struct bch_extent_ptr *ptr;
- unsigned whiteout_u64s = 0;
- int ret;
-
- extent_for_each_ptr(bkey_i_to_s_c_extent(&b->key), ptr)
- break;
+ bool saw_error;
+
+ int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key),
+ (struct bkey_validate_context) {
+ .from = BKEY_VALIDATE_btree_node,
+ .level = b->c.level + 1,
+ .btree = b->c.btree_id,
+ .flags = BCH_VALIDATE_write,
+ });
+ if (ret) {
+ bch2_fs_inconsistent(c, "invalid btree node key before write");
+ return ret;
+ }
- ret = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE, false);
- if (ret)
+ ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
+ validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
+ if (ret) {
bch2_inconsistent_error(c);
+ dump_stack();
+ }
return ret;
}
-void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
- enum six_lock_type lock_type_held)
+static void btree_write_submit(struct work_struct *work)
+{
+ struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
+ BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+
+ bkey_copy(&tmp.k, &wbio->key);
+
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr)
+ ptr->offset += wbio->sector_offset;
+
+ bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree,
+ &tmp.k, false);
+}
+
+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
{
struct btree_write_bio *wbio;
- struct bset_tree *t;
struct bset *i;
struct btree_node *bn = NULL;
struct btree_node_entry *bne = NULL;
- BKEY_PADDED(key) k;
- struct bkey_s_extent e;
- struct bch_extent_ptr *ptr;
- struct sort_iter sort_iter;
+ struct sort_iter_stack sort_iter;
struct nonce nonce;
- unsigned bytes_to_write, sectors_to_write, order, bytes, u64s;
+ unsigned bytes_to_write, sectors_to_write, bytes, u64s;
u64 seq = 0;
bool used_mempool;
unsigned long old, new;
+ bool validate_before_checksum = false;
+ enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
void *data;
+ int ret;
- if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
- return;
+ if (flags & BTREE_WRITE_ALREADY_STARTED)
+ goto do_write;
/*
* We may only have a read lock on the btree node - the dirty bit is our
@@ -1719,55 +2035,60 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
* dirty bit requires a write lock, we can't race with other threads
* redirtying it:
*/
+ old = READ_ONCE(b->flags);
do {
- old = new = READ_ONCE(b->flags);
+ new = old;
if (!(old & (1 << BTREE_NODE_dirty)))
return;
+ if ((flags & BTREE_WRITE_ONLY_IF_NEED) &&
+ !(old & (1 << BTREE_NODE_need_write)))
+ return;
+
+ if (old &
+ ((1 << BTREE_NODE_never_write)|
+ (1 << BTREE_NODE_write_blocked)))
+ return;
+
if (b->written &&
- !btree_node_may_write(b))
+ (old & (1 << BTREE_NODE_will_make_reachable)))
return;
- if (old & (1 << BTREE_NODE_write_in_flight)) {
- btree_node_wait_on_io(b);
- continue;
- }
+ if (old & (1 << BTREE_NODE_write_in_flight))
+ return;
+
+ if (flags & BTREE_WRITE_ONLY_IF_NEED)
+ type = new & BTREE_WRITE_TYPE_MASK;
+ new &= ~BTREE_WRITE_TYPE_MASK;
new &= ~(1 << BTREE_NODE_dirty);
new &= ~(1 << BTREE_NODE_need_write);
new |= (1 << BTREE_NODE_write_in_flight);
+ new |= (1 << BTREE_NODE_write_in_flight_inner);
new |= (1 << BTREE_NODE_just_written);
new ^= (1 << BTREE_NODE_write_idx);
- } while (cmpxchg_acquire(&b->flags, old, new) != old);
+ } while (!try_cmpxchg_acquire(&b->flags, &old, new));
+
+ if (new & (1U << BTREE_NODE_need_write))
+ return;
+do_write:
+ BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));
+
+ atomic_long_dec(&c->btree_cache.nr_dirty);
BUG_ON(btree_node_fake(b));
- BUG_ON(!list_empty(&b->write_blocked));
BUG_ON((b->will_make_reachable != 0) != !b->written);
- BUG_ON(b->written >= c->opts.btree_node_size);
- BUG_ON(b->written & (c->opts.block_size - 1));
+ BUG_ON(b->written >= btree_sectors(c));
+ BUG_ON(b->written & (block_sectors(c) - 1));
BUG_ON(bset_written(b, btree_bset_last(b)));
BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
- /*
- * We can't block on six_lock_write() here; another thread might be
- * trying to get a journal reservation with read locks held, and getting
- * a journal reservation might be blocked on flushing the journal and
- * doing btree writes:
- */
- if (lock_type_held == SIX_LOCK_intent &&
- six_trylock_write(&b->lock)) {
- __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN);
- six_unlock_write(&b->lock);
- } else {
- __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK);
- }
+ bch2_sort_whiteouts(c, b);
- BUG_ON(b->uncompacted_whiteout_u64s);
-
- sort_iter_init(&sort_iter, b);
+ sort_iter_stack_init(&sort_iter, b);
bytes = !b->written
? sizeof(struct btree_node)
@@ -1782,14 +2103,21 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
continue;
bytes += le16_to_cpu(i->u64s) * sizeof(u64);
- sort_iter_add(&sort_iter,
+ sort_iter_add(&sort_iter.iter,
btree_bkey_first(b, t),
btree_bkey_last(b, t));
seq = max(seq, le64_to_cpu(i->journal_seq));
}
- order = get_order(bytes);
- data = btree_bounce_alloc(c, order, &used_mempool);
+ BUG_ON(b->written && !seq);
+
+ /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
+ bytes += 8;
+
+ /* buffer must be a multiple of the block size */
+ bytes = round_up(bytes, block_bytes(c));
+
+ data = btree_bounce_alloc(c, bytes, &used_mempool);
if (!b->written) {
bn = data;
@@ -1804,27 +2132,19 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
i->journal_seq = cpu_to_le64(seq);
i->u64s = 0;
- if (!btree_node_is_extents(b)) {
- sort_iter_add(&sort_iter,
- unwritten_whiteouts_start(c, b),
- unwritten_whiteouts_end(c, b));
- SET_BSET_SEPARATE_WHITEOUTS(i, false);
- } else {
- memcpy_u64s(i->start,
- unwritten_whiteouts_start(c, b),
- b->whiteout_u64s);
- i->u64s = cpu_to_le16(b->whiteout_u64s);
- SET_BSET_SEPARATE_WHITEOUTS(i, true);
- }
+ sort_iter_add(&sort_iter.iter,
+ unwritten_whiteouts_start(b),
+ unwritten_whiteouts_end(b));
+ SET_BSET_SEPARATE_WHITEOUTS(i, false);
+
+ u64s = bch2_sort_keys_keep_unwritten_whiteouts(i->start, &sort_iter.iter);
+ le16_add_cpu(&i->u64s, u64s);
b->whiteout_u64s = 0;
- u64s = btree_node_is_extents(b)
- ? sort_extents(vstruct_last(i), &sort_iter, false)
- : sort_keys(i->start, &sort_iter, false);
- le16_add_cpu(&i->u64s, u64s);
+ BUG_ON(!b->written && i->u64s != b->data->keys.u64s);
- clear_needs_whiteout(i);
+ set_needs_whiteout(i, false);
/* do we have data to write? */
if (b->written && !i->u64s)
@@ -1833,22 +2153,37 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
bytes_to_write = vstruct_end(i) - data;
sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
+ if (!b->written &&
+ b->key.k.type == KEY_TYPE_btree_ptr_v2)
+ BUG_ON(btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)) != sectors_to_write);
+
memset(data + bytes_to_write, 0,
(sectors_to_write << 9) - bytes_to_write);
- BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size);
+ BUG_ON(b->written + sectors_to_write > btree_sectors(c));
BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
BUG_ON(i->seq != b->data->keys.seq);
- i->version = cpu_to_le16(BCACHE_BSET_VERSION);
+ i->version = cpu_to_le16(c->sb.version);
+ SET_BSET_OFFSET(i, b->written);
SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));
+ if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
+ validate_before_checksum = true;
+
+ /* validate_bset will be modifying: */
+ if (le16_to_cpu(i->version) < bcachefs_metadata_version_current)
+ validate_before_checksum = true;
+
/* if we're going to be encrypting, check metadata validity first: */
- if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
+ if (validate_before_checksum &&
validate_bset_for_write(c, b, i, sectors_to_write))
goto err;
- bset_encrypt(c, i, b->written << 9);
+ ret = bset_encrypt(c, i, b->written << 9);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "encrypting btree node: %s", bch2_err_str(ret)))
+ goto err;
nonce = btree_nonce(i, b->written << 9);
@@ -1858,7 +2193,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
/* if we're not encrypting, check metadata after checksumming: */
- if (!bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
+ if (!validate_before_checksum &&
validate_bset_for_write(c, b, i, sectors_to_write))
goto err;
@@ -1872,6 +2207,11 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
* reflect that those writes were done and the data flushed from the
* journal:
*
+ * Also on journal error, the pending write may have updates that were
+ * never journalled (interior nodes, see btree_update_nodes_written()) -
+ * it's critical that we don't do the write in that case otherwise we
+ * will have updates visible that weren't in the journal:
+ *
* Make sure to update b->written so bch2_btree_init_next() doesn't
* break:
*/
@@ -1879,52 +2219,46 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
c->opts.nochanges)
goto err;
- trace_btree_write(b, bytes_to_write, sectors_to_write);
+ trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write);
- wbio = container_of(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->btree_bio),
+ wbio = container_of(bio_alloc_bioset(NULL,
+ buf_pages(data, sectors_to_write << 9),
+ REQ_OP_WRITE|REQ_META,
+ GFP_NOFS,
+ &c->btree_bio),
struct btree_write_bio, wbio.bio);
wbio_init(&wbio->wbio.bio);
wbio->data = data;
- wbio->wbio.order = order;
+ wbio->data_bytes = bytes;
+ wbio->sector_offset = b->written;
+ wbio->wbio.c = c;
wbio->wbio.used_mempool = used_mempool;
- wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META|REQ_FUA;
- wbio->wbio.bio.bi_iter.bi_size = sectors_to_write << 9;
+ wbio->wbio.first_btree_write = !b->written;
wbio->wbio.bio.bi_end_io = btree_node_write_endio;
wbio->wbio.bio.bi_private = b;
- bch2_bio_map(&wbio->wbio.bio, data);
+ bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
- /*
- * If we're appending to a leaf node, we don't technically need FUA -
- * this write just needs to be persisted before the next journal write,
- * which will be marked FLUSH|FUA.
- *
- * Similarly if we're writing a new btree root - the pointer is going to
- * be in the next journal entry.
- *
- * But if we're writing a new btree node (that isn't a root) or
- * appending to a non leaf btree node, we need either FUA or a flush
- * when we write the parent with the new pointer. FUA is cheaper than a
- * flush, and writes appending to leaf nodes aren't blocking anything so
- * just make all btree node writes FUA to keep things sane.
- */
+ bkey_copy(&wbio->key, &b->key);
- bkey_copy(&k.key, &b->key);
- e = bkey_i_to_s_extent(&k.key);
+ b->written += sectors_to_write;
- extent_for_each_ptr(e, ptr)
- ptr->offset += b->written;
+ if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
+ bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
+ cpu_to_le16(b->written);
- b->written += sectors_to_write;
+ atomic64_inc(&c->btree_write_stats[type].nr);
+ atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
- bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key);
+ INIT_WORK(&wbio->work, btree_write_submit);
+ queue_work(c->btree_write_submit_wq, &wbio->work);
return;
err:
set_btree_node_noevict(b);
b->written += sectors_to_write;
nowrite:
- btree_bounce_free(c, order, used_mempool, data);
- btree_node_write_done(c, b);
+ btree_bounce_free(c, bytes, used_mempool, data);
+ __btree_node_write_done(c, b);
}
/*
@@ -1934,20 +2268,18 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
{
bool invalidated_iter = false;
struct btree_node_entry *bne;
- struct bset_tree *t;
if (!btree_node_just_written(b))
return false;
BUG_ON(b->whiteout_u64s);
- BUG_ON(b->uncompacted_whiteout_u64s);
clear_btree_node_just_written(b);
/*
- * Note: immediately after write, bset_unwritten()/bset_written() don't
- * work - the amount of data we had to write after compaction might have
- * been smaller than the offset of the last bset.
+ * Note: immediately after write, bset_written() doesn't work - the
+			bch_err_ratelimited(c, "%s: rewriting btree node due to error\n  %s",
+ * smaller than the offset of the last bset.
*
* However, we know that all bsets have been written here, as long as
* we're still holding the write lock:
@@ -1958,14 +2290,14 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
* single bset:
*/
if (b->nsets > 1) {
- btree_node_sort(c, b, NULL, 0, b->nsets, true);
+ btree_node_sort(c, b, 0, b->nsets);
invalidated_iter = true;
} else {
- invalidated_iter = bch2_drop_whiteouts(b);
+ invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL);
}
for_each_bset(b, t)
- set_needs_whiteout(bset(b, t));
+ set_needs_whiteout(bset(b, t), true);
bch2_btree_verify(c, b);
@@ -1977,7 +2309,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
bne = want_new_bset(c, b);
if (bne)
- bch2_bset_init_next(c, b, bne);
+ bch2_bset_init_next(b, bne);
bch2_btree_build_aux_trees(b);
@@ -1988,104 +2320,81 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
* Use this one if the node is intent locked:
*/
void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
- enum six_lock_type lock_type_held)
+ enum six_lock_type lock_type_held,
+ unsigned flags)
{
- BUG_ON(lock_type_held == SIX_LOCK_write);
-
if (lock_type_held == SIX_LOCK_intent ||
- six_lock_tryupgrade(&b->lock)) {
- __bch2_btree_node_write(c, b, SIX_LOCK_intent);
+ (lock_type_held == SIX_LOCK_read &&
+ six_lock_tryupgrade(&b->c.lock))) {
+ __bch2_btree_node_write(c, b, flags);
/* don't cycle lock unnecessarily: */
if (btree_node_just_written(b) &&
- six_trylock_write(&b->lock)) {
+ six_trylock_write(&b->c.lock)) {
bch2_btree_post_write_cleanup(c, b);
- six_unlock_write(&b->lock);
+ six_unlock_write(&b->c.lock);
}
if (lock_type_held == SIX_LOCK_read)
- six_lock_downgrade(&b->lock);
+ six_lock_downgrade(&b->c.lock);
} else {
- __bch2_btree_node_write(c, b, SIX_LOCK_read);
+ __bch2_btree_node_write(c, b, flags);
+ if (lock_type_held == SIX_LOCK_write &&
+ btree_node_just_written(b))
+ bch2_btree_post_write_cleanup(c, b);
}
}
-static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
+static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
{
struct bucket_table *tbl;
struct rhash_head *pos;
struct btree *b;
unsigned i;
+ bool ret = false;
restart:
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos)
if (test_bit(flag, &b->flags)) {
rcu_read_unlock();
wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
+ ret = true;
goto restart;
-
}
rcu_read_unlock();
-}
-void bch2_btree_flush_all_reads(struct bch_fs *c)
-{
- __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
+ return ret;
}
-void bch2_btree_flush_all_writes(struct bch_fs *c)
+bool bch2_btree_flush_all_reads(struct bch_fs *c)
{
- __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
+ return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
}
-void bch2_btree_verify_flushed(struct bch_fs *c)
+bool bch2_btree_flush_all_writes(struct bch_fs *c)
{
- struct bucket_table *tbl;
- struct rhash_head *pos;
- struct btree *b;
- unsigned i;
-
- rcu_read_lock();
- for_each_cached_btree(b, c, tbl, i, pos) {
- unsigned long flags = READ_ONCE(b->flags);
-
- BUG_ON((flags & (1 << BTREE_NODE_dirty)) ||
- (flags & (1 << BTREE_NODE_write_in_flight)));
- }
- rcu_read_unlock();
+ return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
}
-ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
+static const char * const bch2_btree_write_types[] = {
+#define x(t, n) [n] = #t,
+ BCH_BTREE_WRITE_TYPES()
+ NULL
+};
+
+void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
{
- char *out = buf, *end = buf + PAGE_SIZE;
- struct bucket_table *tbl;
- struct rhash_head *pos;
- struct btree *b;
- unsigned i;
+ printbuf_tabstop_push(out, 20);
+ printbuf_tabstop_push(out, 10);
- rcu_read_lock();
- for_each_cached_btree(b, c, tbl, i, pos) {
- unsigned long flags = READ_ONCE(b->flags);
- unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0;
-
- if (//!(flags & (1 << BTREE_NODE_dirty)) &&
- !b->writes[0].wait.list.first &&
- !b->writes[1].wait.list.first &&
- !(b->will_make_reachable & 1))
- continue;
+ prt_printf(out, "\tnr\tsize\n");
- out += scnprintf(out, end - out, "%p d %u l %u w %u b %u r %u:%lu c %u p %u\n",
- b,
- (flags & (1 << BTREE_NODE_dirty)) != 0,
- b->level,
- b->written,
- !list_empty_careful(&b->write_blocked),
- b->will_make_reachable != 0,
- b->will_make_reachable & 1,
- b->writes[ idx].wait.list.first != NULL,
- b->writes[!idx].wait.list.first != NULL);
- }
- rcu_read_unlock();
+ for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) {
+ u64 nr = atomic64_read(&c->btree_write_stats[i].nr);
+ u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes);
- return out - buf;
+ prt_printf(out, "%s:\t%llu\t", bch2_btree_write_types[i], nr);
+ prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0);
+ prt_newline(out);
+ }
}
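
The new __bch2_btree_node_write() and __btree_node_write_done() above transition the node's state bits (dirty, need_write, write_in_flight, write_idx, ...) in a single flags word via a try_cmpxchg() retry loop. A minimal sketch of that pattern, using only generic kernel primitives; the function and bit names below are illustrative and not part of this patch:

#include <linux/atomic.h>
#include <linux/compiler.h>

/*
 * Sketch of the flag-transition pattern used above: snapshot the flags word,
 * derive the desired new value, and retry if another thread modified the word
 * in the meantime.  try_cmpxchg_acquire() refreshes 'old' on failure, so each
 * retry starts from the current value.
 */
static void mark_write_started(unsigned long *flags, unsigned dirty_bit,
			       unsigned write_in_flight_bit)
{
	unsigned long old = READ_ONCE(*flags), new;

	do {
		new = old;
		new &= ~(1UL << dirty_bit);		/* no longer dirty */
		new |= 1UL << write_in_flight_bit;	/* write has started */
	} while (!try_cmpxchg_acquire(flags, &old, new));
}
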
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index fa154642..9b01ca3d 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -1,196 +1,223 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_IO_H
#define _BCACHEFS_BTREE_IO_H
+#include "bkey_methods.h"
#include "bset.h"
+#include "btree_locking.h"
+#include "checksum.h"
#include "extents.h"
-#include "io_types.h"
+#include "io_write_types.h"
struct bch_fs;
struct btree_write;
struct btree;
struct btree_iter;
+struct btree_node_read_all;
+
+static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
+{
+ if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
+ atomic_long_inc(&c->btree_cache.nr_dirty);
+}
+
+static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
+{
+ if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
+ atomic_long_dec(&c->btree_cache.nr_dirty);
+}
+
+static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k)
+{
+ return k.k->type == KEY_TYPE_btree_ptr_v2
+ ? le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors_written)
+ : 0;
+}
struct btree_read_bio {
struct bch_fs *c;
+ struct btree *b;
+ struct btree_node_read_all *ra;
u64 start_time;
unsigned have_ioref:1;
- struct extent_pick_ptr pick;
+ unsigned idx:7;
+ struct extent_ptr_decoded pick;
struct work_struct work;
struct bio bio;
};
struct btree_write_bio {
- void *data;
struct work_struct work;
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+ void *data;
+ unsigned data_bytes;
+ unsigned sector_offset;
struct bch_write_bio wbio;
};
-static inline void btree_node_io_unlock(struct btree *b)
-{
- EBUG_ON(!btree_node_write_in_flight(b));
- clear_btree_node_write_in_flight(b);
- wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
-}
-
-static inline void btree_node_io_lock(struct btree *b)
-{
- wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
- TASK_UNINTERRUPTIBLE);
-}
-
-static inline void btree_node_wait_on_io(struct btree *b)
-{
- wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
- TASK_UNINTERRUPTIBLE);
-}
-
-static inline bool btree_node_may_write(struct btree *b)
-{
- return list_empty_careful(&b->write_blocked) &&
- !b->will_make_reachable;
-}
+void bch2_btree_node_io_unlock(struct btree *);
+void bch2_btree_node_io_lock(struct btree *);
+void __bch2_btree_node_wait_on_read(struct btree *);
+void __bch2_btree_node_wait_on_write(struct btree *);
+void bch2_btree_node_wait_on_read(struct btree *);
+void bch2_btree_node_wait_on_write(struct btree *);
enum compact_mode {
COMPACT_LAZY,
- COMPACT_WRITTEN,
- COMPACT_WRITTEN_NO_WRITE_LOCK,
+ COMPACT_ALL,
};
-bool __bch2_compact_whiteouts(struct bch_fs *, struct btree *, enum compact_mode);
+bool bch2_compact_whiteouts(struct bch_fs *, struct btree *,
+ enum compact_mode);
-static inline unsigned should_compact_bset_lazy(struct btree *b, struct bset_tree *t)
+static inline bool should_compact_bset_lazy(struct btree *b,
+ struct bset_tree *t)
{
- unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
- unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set];
+ unsigned total_u64s = bset_u64s(t);
+ unsigned dead_u64s = bset_dead_u64s(b, t);
- return dead_u64s > 128 && dead_u64s * 3 > bset_u64s;
+ return dead_u64s > 64 && dead_u64s * 3 > total_u64s;
}
static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b)
{
- struct bset_tree *t;
-
for_each_bset(b, t)
if (should_compact_bset_lazy(b, t))
- return __bch2_compact_whiteouts(c, b, COMPACT_LAZY);
+ return bch2_compact_whiteouts(c, b, COMPACT_LAZY);
return false;
}
-void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
+static inline struct nonce btree_nonce(struct bset *i, unsigned offset)
+{
+ return (struct nonce) {{
+ [0] = cpu_to_le32(offset),
+ [1] = ((__le32 *) &i->seq)[0],
+ [2] = ((__le32 *) &i->seq)[1],
+ [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
+ }};
+}
-void bch2_btree_build_aux_trees(struct btree *);
-void bch2_btree_init_next(struct bch_fs *, struct btree *,
- struct btree_iter *);
+static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
+{
+ struct nonce nonce = btree_nonce(i, offset);
+ int ret;
-int bch2_btree_node_read_done(struct bch_fs *, struct btree *, bool);
-void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
-int bch2_btree_root_read(struct bch_fs *, enum btree_id,
- const struct bkey_i *, unsigned);
+ if (!offset) {
+ struct btree_node *bn = container_of(i, struct btree_node, keys);
+ unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
-void bch2_btree_complete_write(struct bch_fs *, struct btree *,
- struct btree_write *);
-void bch2_btree_write_error_work(struct work_struct *);
+ ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
+ &bn->flags, bytes);
+ if (ret)
+ return ret;
-void __bch2_btree_node_write(struct bch_fs *, struct btree *,
- enum six_lock_type);
-bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
+ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
+ }
-void bch2_btree_node_write(struct bch_fs *, struct btree *,
- enum six_lock_type);
-
-/*
- * btree_node_dirty() can be cleared with only a read lock,
- * and for bch2_btree_node_write_cond() we want to set need_write iff it's
- * still dirty:
- */
-static inline void set_btree_node_need_write_if_dirty(struct btree *b)
-{
- unsigned long old, new, v = READ_ONCE(b->flags);
+ return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
+ vstruct_end(i) - (void *) i->_data);
+}
- do {
- old = new = v;
+void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
- if (!(old & (1 << BTREE_NODE_dirty)))
- return;
+void bch2_btree_node_drop_keys_outside_node(struct btree *);
- new |= (1 << BTREE_NODE_need_write);
- } while ((v = cmpxchg(&b->flags, old, new)) != old);
-}
+void bch2_btree_build_aux_trees(struct btree *);
+void bch2_btree_init_next(struct btree_trans *, struct btree *);
-#define bch2_btree_node_write_cond(_c, _b, cond) \
-do { \
- while ((_b)->written && btree_node_dirty(_b) && (cond)) { \
- if (!btree_node_may_write(_b)) { \
- set_btree_node_need_write_if_dirty(_b); \
- break; \
- } \
- \
- if (!btree_node_write_in_flight(_b)) { \
- bch2_btree_node_write(_c, _b, SIX_LOCK_read); \
- break; \
- } \
- \
- six_unlock_read(&(_b)->lock); \
- btree_node_wait_on_io(_b); \
- btree_node_lock_type(c, b, SIX_LOCK_read); \
- } \
-} while (0)
-
-void bch2_btree_flush_all_reads(struct bch_fs *);
-void bch2_btree_flush_all_writes(struct bch_fs *);
-void bch2_btree_verify_flushed(struct bch_fs *);
-ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *);
-
-/* Sorting */
-
-struct btree_node_iter_large {
- u8 is_extents;
- u16 used;
-
- struct btree_node_iter_set data[MAX_BSETS];
-};
+int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
+ struct btree *, bool, bool *);
+void bch2_btree_node_read(struct btree_trans *, struct btree *, bool);
+int bch2_btree_root_read(struct bch_fs *, enum btree_id,
+ const struct bkey_i *, unsigned);
-static inline void
-__bch2_btree_node_iter_large_init(struct btree_node_iter_large *iter,
- bool is_extents)
-{
- iter->used = 0;
- iter->is_extents = is_extents;
-}
+bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
-void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *,
- struct btree *);
+enum btree_write_flags {
+ __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS,
+ __BTREE_WRITE_ALREADY_STARTED,
+};
+#define BTREE_WRITE_ONLY_IF_NEED BIT(__BTREE_WRITE_ONLY_IF_NEED)
+#define BTREE_WRITE_ALREADY_STARTED BIT(__BTREE_WRITE_ALREADY_STARTED)
-void bch2_btree_node_iter_large_push(struct btree_node_iter_large *,
- struct btree *,
- const struct bkey_packed *,
- const struct bkey_packed *);
+void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
+void bch2_btree_node_write(struct bch_fs *, struct btree *,
+ enum six_lock_type, unsigned);
-static inline bool bch2_btree_node_iter_large_end(struct btree_node_iter_large *iter)
+static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
+ enum six_lock_type lock_held)
{
- return !iter->used;
+ bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
}
-static inline struct bkey_packed *
-bch2_btree_node_iter_large_peek_all(struct btree_node_iter_large *iter,
- struct btree *b)
+bool bch2_btree_flush_all_reads(struct bch_fs *);
+bool bch2_btree_flush_all_writes(struct bch_fs *);
+
+static inline void compat_bformat(unsigned level, enum btree_id btree_id,
+ unsigned version, unsigned big_endian,
+ int write, struct bkey_format *f)
{
- return bch2_btree_node_iter_large_end(iter)
- ? NULL
- : __btree_node_offset_to_key(b, iter->data->k);
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_id == BTREE_ID_inodes) {
+ swap(f->bits_per_field[BKEY_FIELD_INODE],
+ f->bits_per_field[BKEY_FIELD_OFFSET]);
+ swap(f->field_offset[BKEY_FIELD_INODE],
+ f->field_offset[BKEY_FIELD_OFFSET]);
+ }
+
+ if (version < bcachefs_metadata_version_snapshot &&
+ (level || btree_type_has_snapshots(btree_id))) {
+ u64 max_packed =
+ ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
+
+ f->field_offset[BKEY_FIELD_SNAPSHOT] = write
+ ? 0
+ : cpu_to_le64(U32_MAX - max_packed);
+ }
}
-static inline struct bkey_packed *
-bch2_btree_node_iter_large_next_all(struct btree_node_iter_large *iter,
- struct btree *b)
+static inline void compat_bpos(unsigned level, enum btree_id btree_id,
+ unsigned version, unsigned big_endian,
+ int write, struct bpos *p)
{
- struct bkey_packed *ret = bch2_btree_node_iter_large_peek_all(iter, b);
+ if (big_endian != CPU_BIG_ENDIAN)
+ bch2_bpos_swab(p);
- if (ret)
- bch2_btree_node_iter_large_advance(iter, b);
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_id == BTREE_ID_inodes)
+ swap(p->inode, p->offset);
+}
- return ret;
+static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
+ unsigned version, unsigned big_endian,
+ int write,
+ struct btree_node *bn)
+{
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_id_is_extents(btree_id) &&
+ !bpos_eq(bn->min_key, POS_MIN) &&
+ write)
+ bn->min_key = bpos_nosnap_predecessor(bn->min_key);
+
+ if (version < bcachefs_metadata_version_snapshot &&
+ write)
+ bn->max_key.snapshot = 0;
+
+ compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key);
+ compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key);
+
+ if (version < bcachefs_metadata_version_snapshot &&
+ !write)
+ bn->max_key.snapshot = U32_MAX;
+
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_id_is_extents(btree_id) &&
+ !bpos_eq(bn->min_key, POS_MIN) &&
+ !write)
+ bn->min_key = bpos_nosnap_successor(bn->min_key);
}
+void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *);
+
#endif /* _BCACHEFS_BTREE_IO_H */
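
For context on the int-returning bset_encrypt() added above: the cipher is a stream cipher (note the CHACHA_BLOCK_SIZE nonce advance), so encrypting and decrypting a bset are the same keyed XOR driven by btree_nonce(), and both the read and write paths now have to check the result. A hedged usage sketch, assuming the surrounding bcachefs types and the signatures shown in this header; decrypt_first_bset() is a hypothetical helper, not part of the patch:

/*
 * Hypothetical helper (not in this patch): decrypt the first bset of a btree
 * node after reading it from disk.  The first bset sits at byte offset 0, so
 * bset_encrypt() should also cover the btree_node header fields from 'flags'
 * up to 'keys', as shown in the header above.
 */
static int decrypt_first_bset(struct bch_fs *c, struct btree *b)
{
	struct bset *i = &b->data->keys;
	int ret = bset_encrypt(c, i, 0);

	if (ret)
		bch_err(c, "error decrypting btree node: %s", bch2_err_str(ret));
	return ret;
}
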
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index a52ec12e..9c54891c 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -1,480 +1,438 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "bkey_methods.h"
+#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_iter.h"
+#include "btree_journal_iter.h"
+#include "btree_key_cache.h"
#include "btree_locking.h"
+#include "btree_update.h"
#include "debug.h"
+#include "error.h"
#include "extents.h"
+#include "journal.h"
+#include "journal_io.h"
+#include "replicas.h"
+#include "snapshot.h"
+#include "trace.h"
+#include <linux/random.h>
#include <linux/prefetch.h>
-#include <trace/events/bcachefs.h>
-static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *,
- struct btree_iter_level *,
- struct bkey *);
+static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
+static inline void btree_path_list_add(struct btree_trans *,
+ btree_path_idx_t, btree_path_idx_t);
-#define BTREE_ITER_NOT_END ((struct btree *) 1)
-
-static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
+static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
{
- return l < BTREE_MAX_DEPTH &&
- iter->l[l].b &&
- iter->l[l].b != BTREE_ITER_NOT_END;
+#ifdef TRACK_PATH_ALLOCATED
+ return iter->ip_allocated;
+#else
+ return 0;
+#endif
}
-/* Btree node locking: */
+static btree_path_idx_t btree_path_alloc(struct btree_trans *, btree_path_idx_t);
+static void bch2_trans_srcu_lock(struct btree_trans *);
-/*
- * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
- * succeed:
- */
-void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
+static inline int __btree_path_cmp(const struct btree_path *l,
+ enum btree_id r_btree_id,
+ bool r_cached,
+ struct bpos r_pos,
+ unsigned r_level)
{
- struct btree_iter *linked;
-
- EBUG_ON(iter->l[b->level].b != b);
- EBUG_ON(iter->lock_seq[b->level] + 1 != b->lock.state.seq);
-
- for_each_btree_iter_with_node(iter, b, linked)
- linked->lock_seq[b->level] += 2;
-
- six_unlock_write(&b->lock);
-}
-
-void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
-{
- struct bch_fs *c = iter->c;
- struct btree_iter *linked;
- unsigned readers = 0;
-
- EBUG_ON(btree_node_read_locked(iter, b->level));
-
- for_each_linked_btree_iter(iter, linked)
- if (linked->l[b->level].b == b &&
- btree_node_read_locked(linked, b->level))
- readers++;
-
/*
- * Must drop our read locks before calling six_lock_write() -
- * six_unlock() won't do wakeups until the reader count
- * goes to 0, and it's safe because we have the node intent
- * locked:
+ * Must match lock ordering as defined by __bch2_btree_node_lock:
*/
- atomic64_sub(__SIX_VAL(read_lock, readers),
- &b->lock.state.counter);
- btree_node_lock_type(c, b, SIX_LOCK_write);
- atomic64_add(__SIX_VAL(read_lock, readers),
- &b->lock.state.counter);
+ return cmp_int(l->btree_id, r_btree_id) ?:
+ cmp_int((int) l->cached, (int) r_cached) ?:
+ bpos_cmp(l->pos, r_pos) ?:
+ -cmp_int(l->level, r_level);
}
-/*
- * Lock a btree node if we already have it locked on one of our linked
- * iterators:
- */
-static inline bool btree_node_lock_increment(struct btree_iter *iter,
- struct btree *b, unsigned level,
- enum btree_node_locked_type want)
+static inline int btree_path_cmp(const struct btree_path *l,
+ const struct btree_path *r)
{
- struct btree_iter *linked;
+ return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
+}
- for_each_linked_btree_iter(iter, linked)
- if (linked->l[level].b == b &&
- btree_node_locked_type(linked, level) >= want) {
- six_lock_increment(&b->lock, want);
- return true;
- }
+static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
+{
+ /* Are we iterating over keys in all snapshots? */
+ if (iter->flags & BTREE_ITER_all_snapshots) {
+ p = bpos_successor(p);
+ } else {
+ p = bpos_nosnap_successor(p);
+ p.snapshot = iter->snapshot;
+ }
- return false;
+ return p;
}
-bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
+static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
{
- struct btree *b = btree_iter_node(iter, level);
- int want = __btree_lock_want(iter, level);
+ /* Are we iterating over keys in all snapshots? */
+ if (iter->flags & BTREE_ITER_all_snapshots) {
+ p = bpos_predecessor(p);
+ } else {
+ p = bpos_nosnap_predecessor(p);
+ p.snapshot = iter->snapshot;
+ }
- if (!b || b == BTREE_ITER_NOT_END)
- return false;
+ return p;
+}
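
A rough standalone model of the successor logic above, using toy types (nothing below is bcachefs code): with snapshot filtering, only inode:offset advance and the snapshot field stays pinned to the iterator's snapshot; without filtering, the snapshot field is the least-significant component and advances first.

struct tpos { unsigned long long inode, offset; unsigned snapshot; };

/* all-snapshots iteration: snapshot advances first, carrying upward */
static struct tpos tpos_successor(struct tpos p)
{
	if (++p.snapshot)
		return p;
	if (++p.offset)
		return p;
	p.inode++;
	return p;
}

/* filtered iteration: advance inode:offset, pin snapshot to the iterator's */
static struct tpos tpos_nosnap_successor(struct tpos p, unsigned iter_snapshot)
{
	if (!++p.offset)
		p.inode++;
	p.snapshot = iter_snapshot;
	return p;
}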
- if (race_fault())
- return false;
+static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
+{
+ struct bpos pos = iter->pos;
- if (!six_relock_type(&b->lock, want, iter->lock_seq[level]) &&
- !(iter->lock_seq[level] >> 1 == b->lock.state.seq >> 1 &&
- btree_node_lock_increment(iter, b, level, want)))
- return false;
+ if ((iter->flags & BTREE_ITER_is_extents) &&
+ !bkey_eq(pos, POS_MAX))
+ pos = bkey_successor(iter, pos);
+ return pos;
+}
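
As context for the successor above: extent keys are indexed by their end position, and an extent ending exactly at pos does not cover pos, so the lookup for the covering extent starts at the successor of pos. A toy, linear-search illustration follows, assuming a list sorted by end position; the type and helper below are invented for the sketch.

struct toy_extent { unsigned long long start, end; };	/* key position == end */

static const struct toy_extent *
toy_extent_covering(const struct toy_extent *v, unsigned nr, unsigned long long pos)
{
	unsigned long long search = pos + 1;		/* successor of pos */

	for (unsigned i = 0; i < nr; i++)
		if (v[i].end >= search)			/* first key at/after the search key */
			return v[i].start <= pos ? &v[i] : NULL;

	return NULL;
}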
- mark_btree_node_locked(iter, level, want);
- return true;
+static inline bool btree_path_pos_before_node(struct btree_path *path,
+ struct btree *b)
+{
+ return bpos_lt(path->pos, b->data->min_key);
}
-static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
+static inline bool btree_path_pos_after_node(struct btree_path *path,
+ struct btree *b)
{
- struct btree *b = iter->l[level].b;
+ return bpos_gt(path->pos, b->key.k.p);
+}
- EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);
+static inline bool btree_path_pos_in_node(struct btree_path *path,
+ struct btree *b)
+{
+ return path->btree_id == b->c.btree_id &&
+ !btree_path_pos_before_node(path, b) &&
+ !btree_path_pos_after_node(path, b);
+}
- if (!is_btree_node(iter, level))
- return false;
+/* Btree iterator: */
- if (btree_node_intent_locked(iter, level))
- return true;
+#ifdef CONFIG_BCACHEFS_DEBUG
- if (race_fault())
- return false;
+static void bch2_btree_path_verify_cached(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ struct bkey_cached *ck;
+ bool locked = btree_node_locked(path, 0);
- if (btree_node_locked(iter, level)
- ? six_lock_tryupgrade(&b->lock)
- : six_relock_type(&b->lock, SIX_LOCK_intent, iter->lock_seq[level]))
- goto success;
+ if (!bch2_btree_node_relock(trans, path, 0))
+ return;
- if (iter->lock_seq[level] >> 1 == b->lock.state.seq >> 1 &&
- btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) {
- btree_node_unlock(iter, level);
- goto success;
- }
+ ck = (void *) path->l[0].b;
+ BUG_ON(ck->key.btree_id != path->btree_id ||
+ !bkey_eq(ck->key.pos, path->pos));
- return false;
-success:
- mark_btree_node_intent_locked(iter, level);
- return true;
+ if (!locked)
+ btree_node_unlock(trans, path, 0);
}
-static inline bool btree_iter_get_locks(struct btree_iter *iter,
- bool upgrade)
+static void bch2_btree_path_verify_level(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
{
- unsigned l = iter->level;
- int fail_idx = -1;
-
- do {
- if (!btree_iter_node(iter, l))
- break;
+ struct btree_path_level *l;
+ struct btree_node_iter tmp;
+ bool locked;
+ struct bkey_packed *p, *k;
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+ struct printbuf buf3 = PRINTBUF;
+ const char *msg;
- if (!(upgrade
- ? bch2_btree_node_upgrade(iter, l)
- : bch2_btree_node_relock(iter, l))) {
- fail_idx = l;
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
- }
+ if (!bch2_debug_check_iterators)
+ return;
- l++;
- } while (l < iter->locks_want);
+ l = &path->l[level];
+ tmp = l->iter;
+ locked = btree_node_locked(path, level);
- /*
- * When we fail to get a lock, we have to ensure that any child nodes
- * can't be relocked so bch2_btree_iter_traverse has to walk back up to
- * the node that we failed to relock:
- */
- while (fail_idx >= 0) {
- btree_node_unlock(iter, fail_idx);
- iter->l[fail_idx].b = BTREE_ITER_NOT_END;
- --fail_idx;
+ if (path->cached) {
+ if (!level)
+ bch2_btree_path_verify_cached(trans, path);
+ return;
}
- if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
- iter->uptodate = BTREE_ITER_NEED_PEEK;
+ if (!btree_path_node(path, level))
+ return;
- bch2_btree_iter_verify_locks(iter);
- return iter->uptodate < BTREE_ITER_NEED_RELOCK;
-}
+ if (!bch2_btree_node_relock_notrace(trans, path, level))
+ return;
-/* Slowpath: */
-bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
- unsigned level,
- struct btree_iter *iter,
- enum six_lock_type type,
- bool may_drop_locks)
-{
- struct bch_fs *c = iter->c;
- struct btree_iter *linked;
- bool ret = true;
+ BUG_ON(!btree_path_pos_in_node(path, l->b));
- /* Can't have children locked before ancestors: */
- EBUG_ON(iter->nodes_locked && level > __ffs(iter->nodes_locked));
+ bch2_btree_node_iter_verify(&l->iter, l->b);
/*
- * Can't hold any read locks while we block taking an intent lock - see
- * below for reasoning, and we should have already dropped any read
- * locks in the current iterator
+ * For interior nodes, the iterator will have skipped past deleted keys:
*/
- EBUG_ON(type == SIX_LOCK_intent &&
- iter->nodes_locked != iter->nodes_intent_locked);
-
- if (btree_node_lock_increment(iter, b, level, type))
- return true;
+ p = level
+ ? bch2_btree_node_iter_prev(&tmp, l->b)
+ : bch2_btree_node_iter_prev_all(&tmp, l->b);
+ k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
- /*
- * Must lock btree nodes in key order - this case happens when locking
- * the prev sibling in btree node merging:
- */
- if (iter->nodes_locked &&
- __ffs(iter->nodes_locked) <= level &&
- __btree_iter_cmp(iter->btree_id, pos, iter))
- return false;
+ if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
+ msg = "before";
+ goto err;
+ }
- for_each_linked_btree_iter(iter, linked) {
- if (!linked->nodes_locked)
- continue;
+ if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
+ msg = "after";
+ goto err;
+ }
- /* We have to lock btree nodes in key order: */
- if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
- ret = false;
+ if (!locked)
+ btree_node_unlock(trans, path, level);
+ return;
+err:
+ bch2_bpos_to_text(&buf1, path->pos);
- /*
- * Can't block taking an intent lock if we have _any_ nodes read
- * locked:
- *
- * - Our read lock blocks another thread with an intent lock on
- * the same node from getting a write lock, and thus from
- * dropping its intent lock
- *
- * - And the other thread may have multiple nodes intent locked:
- * both the node we want to intent lock, and the node we
- * already have read locked - deadlock:
- */
- if (type == SIX_LOCK_intent &&
- linked->nodes_locked != linked->nodes_intent_locked) {
- if (may_drop_locks) {
- linked->locks_want = max_t(unsigned,
- linked->locks_want,
- __fls(linked->nodes_locked) + 1);
- btree_iter_get_locks(linked, true);
- }
- ret = false;
- }
+ if (p) {
+ struct bkey uk = bkey_unpack_key(l->b, p);
- /*
- * Interior nodes must be locked before their descendants: if
- * another iterator has possible descendants locked of the node
- * we're about to lock, it must have the ancestors locked too:
- */
- if (linked->btree_id == iter->btree_id &&
- level > __fls(linked->nodes_locked)) {
- if (may_drop_locks) {
- linked->locks_want = max_t(unsigned,
- linked->locks_want,
- iter->locks_want);
- btree_iter_get_locks(linked, true);
- }
- ret = false;
- }
+ bch2_bkey_to_text(&buf2, &uk);
+ } else {
+ prt_printf(&buf2, "(none)");
}
- if (ret)
- __btree_node_lock_type(c, b, type);
- else
- trans_restart();
+ if (k) {
+ struct bkey uk = bkey_unpack_key(l->b, k);
- return ret;
-}
+ bch2_bkey_to_text(&buf3, &uk);
+ } else {
+ prt_printf(&buf3, "(none)");
+ }
-/* Btree iterator locking: */
+ panic("path should be %s key at level %u:\n"
+ "path pos %s\n"
+ "prev key %s\n"
+ "cur key %s\n",
+ msg, level, buf1.buf, buf2.buf, buf3.buf);
+}
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_iter_verify_locks(struct btree_iter *iter)
+static void bch2_btree_path_verify(struct btree_trans *trans,
+ struct btree_path *path)
{
- unsigned l;
+ struct bch_fs *c = trans->c;
- for (l = 0; btree_iter_node(iter, l); l++) {
- if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
- !btree_node_locked(iter, l))
- continue;
+ for (unsigned i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
+ if (!path->l[i].b) {
+ BUG_ON(!path->cached &&
+ bch2_btree_id_root(c, path->btree_id)->b->c.level > i);
+ break;
+ }
- BUG_ON(btree_lock_want(iter, l) !=
- btree_node_locked_type(iter, l));
+ bch2_btree_path_verify_level(trans, path, i);
}
+
+ bch2_btree_path_verify_locks(path);
}
-#endif
-__flatten
-static bool __bch2_btree_iter_relock(struct btree_iter *iter)
+void bch2_trans_verify_paths(struct btree_trans *trans)
{
- return iter->uptodate >= BTREE_ITER_NEED_RELOCK
- ? btree_iter_get_locks(iter, false)
- : true;
+ struct btree_path *path;
+ unsigned iter;
+
+ trans_for_each_path(trans, path, iter)
+ bch2_btree_path_verify(trans, path);
}
-bool bch2_btree_iter_relock(struct btree_iter *iter)
+static void bch2_btree_iter_verify(struct btree_iter *iter)
{
- struct btree_iter *linked;
- bool ret = true;
+ struct btree_trans *trans = iter->trans;
- for_each_btree_iter(iter, linked)
- ret &= __bch2_btree_iter_relock(linked);
+ BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached);
- return ret;
+ BUG_ON((iter->flags & BTREE_ITER_is_extents) &&
+ (iter->flags & BTREE_ITER_all_snapshots));
+
+ BUG_ON(!(iter->flags & BTREE_ITER_snapshot_field) &&
+ (iter->flags & BTREE_ITER_all_snapshots) &&
+ !btree_type_has_snapshot_field(iter->btree_id));
+
+ if (iter->update_path)
+ bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
+ bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
}
-bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
- unsigned new_locks_want)
+static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
{
- struct btree_iter *linked;
+ BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) &&
+ !iter->pos.snapshot);
- EBUG_ON(iter->locks_want >= new_locks_want);
+ BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) &&
+ iter->pos.snapshot != iter->snapshot);
- iter->locks_want = new_locks_want;
+ BUG_ON(iter->flags & BTREE_ITER_all_snapshots ? !bpos_eq(iter->pos, iter->k.p) :
+ !(iter->flags & BTREE_ITER_is_extents) ? !bkey_eq(iter->pos, iter->k.p) :
+ (bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
+ bkey_gt(iter->pos, iter->k.p)));
+}
- if (btree_iter_get_locks(iter, true))
- return true;
+static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
+{
+ struct btree_trans *trans = iter->trans;
+ struct btree_iter copy;
+ struct bkey_s_c prev;
+ int ret = 0;
- /*
- * Ancestor nodes must be locked before child nodes, so set locks_want
- * on iterators that might lock ancestors before us to avoid getting
- * -EINTR later:
- */
- for_each_linked_btree_iter(iter, linked)
- if (linked->btree_id == iter->btree_id &&
- btree_iter_cmp(linked, iter) <= 0 &&
- linked->locks_want < new_locks_want) {
- linked->locks_want = new_locks_want;
- btree_iter_get_locks(linked, true);
- }
+ if (!bch2_debug_check_iterators)
+ return 0;
- return false;
-}
+ if (!(iter->flags & BTREE_ITER_filter_snapshots))
+ return 0;
-bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter,
- unsigned new_locks_want)
-{
- unsigned l = iter->level;
+ if (bkey_err(k) || !k.k)
+ return 0;
- EBUG_ON(iter->locks_want >= new_locks_want);
+ BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
+ iter->snapshot,
+ k.k->p.snapshot));
- iter->locks_want = new_locks_want;
+ bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
+ BTREE_ITER_nopreserve|
+ BTREE_ITER_all_snapshots);
+ prev = bch2_btree_iter_prev(&copy);
+ if (!prev.k)
+ goto out;
- do {
- if (!btree_iter_node(iter, l))
- break;
+ ret = bkey_err(prev);
+ if (ret)
+ goto out;
- if (!bch2_btree_node_upgrade(iter, l)) {
- iter->locks_want = l;
- return false;
- }
+ if (bkey_eq(prev.k->p, k.k->p) &&
+ bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
+ prev.k->p.snapshot) > 0) {
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
- l++;
- } while (l < iter->locks_want);
+ bch2_bkey_to_text(&buf1, k.k);
+ bch2_bkey_to_text(&buf2, prev.k);
- return true;
+ panic("iter snap %u\n"
+ "k %s\n"
+ "prev %s\n",
+ iter->snapshot,
+ buf1.buf, buf2.buf);
+ }
+out:
+ bch2_trans_iter_exit(trans, &copy);
+ return ret;
}
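
The assertion above depends on snapshot ancestry: with snapshot filtering, any key the iterator returns must carry the iterator's snapshot ID or one of its ancestors. A toy ancestor walk over a parent[] array, purely illustrative and not the bcachefs snapshot representation:

#include <stdbool.h>

/* parent[id] is id's parent snapshot; parent[root] == 0 */
static bool toy_snapshot_is_ancestor(const unsigned *parent, unsigned id,
				     unsigned ancestor)
{
	while (id && id != ancestor)
		id = parent[id];

	return id == ancestor;
}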
-void __bch2_btree_iter_downgrade(struct btree_iter *iter,
- unsigned downgrade_to)
+void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
+ struct bpos pos)
{
- struct btree_iter *linked;
- unsigned l;
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
- /*
- * We downgrade linked iterators as well because btree_iter_upgrade
- * might have had to modify locks_want on linked iterators due to lock
- * ordering:
- */
- for_each_btree_iter(iter, linked) {
- unsigned new_locks_want = downgrade_to ?:
- (linked->flags & BTREE_ITER_INTENT ? 1 : 0);
+ struct btree_path *path;
+ struct trans_for_each_path_inorder_iter iter;
+ struct printbuf buf = PRINTBUF;
- if (linked->locks_want <= new_locks_want)
- continue;
+ btree_trans_sort_paths(trans);
- linked->locks_want = new_locks_want;
+ trans_for_each_path_inorder(trans, path, iter) {
+ if (path->btree_id != id ||
+ !btree_node_locked(path, 0) ||
+ !path->should_be_locked)
+ continue;
- while (linked->nodes_locked &&
- (l = __fls(linked->nodes_locked)) >= linked->locks_want) {
- if (l > linked->level) {
- btree_node_unlock(linked, l);
- } else {
- if (btree_node_intent_locked(linked, l)) {
- six_lock_downgrade(&linked->l[l].b->lock);
- linked->nodes_intent_locked ^= 1 << l;
- }
- break;
- }
+ if (!path->cached) {
+ if (bkey_ge(pos, path->l[0].b->data->min_key) &&
+ bkey_le(pos, path->l[0].b->key.k.p))
+ return;
+ } else {
+ if (bkey_eq(pos, path->pos))
+ return;
}
-
- bch2_btree_iter_verify_locks(linked);
}
-}
-int bch2_btree_iter_unlock(struct btree_iter *iter)
-{
- struct btree_iter *linked;
-
- for_each_btree_iter(iter, linked)
- __bch2_btree_iter_unlock(linked);
+ bch2_dump_trans_paths_updates(trans);
+ bch2_bpos_to_text(&buf, pos);
- return iter->flags & BTREE_ITER_ERROR ? -EIO : 0;
+ panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf);
}
-/* Btree iterator: */
+#else
-#ifdef CONFIG_BCACHEFS_DEBUG
+static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
+ struct btree_path *path, unsigned l) {}
+static inline void bch2_btree_path_verify(struct btree_trans *trans,
+ struct btree_path *path) {}
+static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
+static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
+static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
-static void __bch2_btree_iter_verify(struct btree_iter *iter,
- struct btree *b)
-{
- struct btree_iter_level *l = &iter->l[b->level];
- struct btree_node_iter tmp = l->iter;
- struct bkey_packed *k;
+#endif
- bch2_btree_node_iter_verify(&l->iter, b);
+/* Btree path: fixups after btree updates */
- /*
- * For interior nodes, the iterator will have skipped past
- * deleted keys:
- */
- k = b->level
- ? bch2_btree_node_iter_prev(&tmp, b)
- : bch2_btree_node_iter_prev_all(&tmp, b);
- if (k && btree_iter_pos_cmp_packed(b, &iter->pos, k,
- iter->flags & BTREE_ITER_IS_EXTENTS)) {
- char buf[100];
- struct bkey uk = bkey_unpack_key(b, k);
+static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
+ struct btree *b,
+ struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ struct btree_node_iter_set *set;
- bch2_bkey_to_text(buf, sizeof(buf), &uk);
- panic("prev key should be before after pos:\n%s\n%llu:%llu\n",
- buf, iter->pos.inode, iter->pos.offset);
- }
+ btree_node_iter_for_each(iter, set)
+ if (set->end == t->end_offset) {
+ set->k = __btree_node_key_to_offset(b, k);
+ bch2_btree_node_iter_sort(iter, b);
+ return;
+ }
- k = bch2_btree_node_iter_peek_all(&l->iter, b);
- if (k && !btree_iter_pos_cmp_packed(b, &iter->pos, k,
- iter->flags & BTREE_ITER_IS_EXTENTS)) {
- char buf[100];
- struct bkey uk = bkey_unpack_key(b, k);
+ bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
+}
- bch2_bkey_to_text(buf, sizeof(buf), &uk);
- panic("next key should be before iter pos:\n%llu:%llu\n%s\n",
- iter->pos.inode, iter->pos.offset, buf);
- }
+static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
+ struct btree *b,
+ struct bkey_packed *where)
+{
+ struct btree_path_level *l = &path->l[b->c.level];
- if (iter->uptodate == BTREE_ITER_UPTODATE &&
- (iter->flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES) {
- BUG_ON(!bkey_whiteout(&iter->k) &&
- bch2_btree_node_iter_end(&l->iter));
- }
+ if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
+ return;
+
+ if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
+ bch2_btree_node_iter_advance(&l->iter, l->b);
}
-void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b)
+void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
+ struct btree *b,
+ struct bkey_packed *where)
{
- struct btree_iter *linked;
+ struct btree_path *path;
+ unsigned i;
- for_each_btree_iter_with_node(iter, b, linked)
- __bch2_btree_iter_verify(linked, b);
+ trans_for_each_path_with_node(trans, b, path, i) {
+ __bch2_btree_path_fix_key_modified(path, b, where);
+ bch2_btree_path_verify_level(trans, path, b->c.level);
+ }
}
-#endif
-
-static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
- struct btree *b,
- struct btree_node_iter *node_iter,
- struct bset_tree *t,
- struct bkey_packed *where,
- unsigned clobber_u64s,
- unsigned new_u64s)
+static void __bch2_btree_node_iter_fix(struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter *node_iter,
+ struct bset_tree *t,
+ struct bkey_packed *where,
+ unsigned clobber_u64s,
+ unsigned new_u64s)
{
const struct bkey_packed *end = btree_bkey_last(b, t);
struct btree_node_iter_set *set;
unsigned offset = __btree_node_key_to_offset(b, where);
int shift = new_u64s - clobber_u64s;
- unsigned old_end = (int) __btree_node_key_to_offset(b, end) - shift;
+ unsigned old_end = t->end_offset - shift;
+ unsigned orig_iter_pos = node_iter->data[0].k;
+ bool iter_current_key_modified =
+ orig_iter_pos >= offset &&
+ orig_iter_pos <= offset + clobber_u64s;
btree_node_iter_for_each(node_iter, set)
if (set->end == old_end)
@@ -482,525 +440,693 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
/* didn't find the bset in the iterator - might have to readd it: */
if (new_u64s &&
- btree_iter_pos_cmp_packed(b, &iter->pos, where,
- iter->flags & BTREE_ITER_IS_EXTENTS)) {
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
-
+ bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
bch2_btree_node_iter_push(node_iter, b, where, end);
-
- if (!b->level &&
- node_iter == &iter->l[0].iter)
- bkey_disassemble(b,
- bch2_btree_node_iter_peek_all(node_iter, b),
- &iter->k);
+ goto fixup_done;
+ } else {
+ /* Iterator is after key that changed */
+ return;
}
- return;
found:
- set->end = (int) set->end + shift;
+ set->end = t->end_offset;
/* Iterator hasn't gotten to the key that changed yet: */
if (set->k < offset)
return;
if (new_u64s &&
- btree_iter_pos_cmp_packed(b, &iter->pos, where,
- iter->flags & BTREE_ITER_IS_EXTENTS)) {
+ bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
set->k = offset;
} else if (set->k < offset + clobber_u64s) {
set->k = offset + new_u64s;
if (set->k == set->end)
bch2_btree_node_iter_set_drop(node_iter, set);
} else {
+ /* Iterator is after key that changed */
set->k = (int) set->k + shift;
- goto iter_current_key_not_modified;
+ return;
}
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
-
bch2_btree_node_iter_sort(node_iter, b);
- if (!b->level && node_iter == &iter->l[0].iter)
- __btree_iter_peek_all(iter, &iter->l[0], &iter->k);
-iter_current_key_not_modified:
+fixup_done:
+ if (node_iter->data[0].k != orig_iter_pos)
+ iter_current_key_modified = true;
/*
- * Interior nodes are special because iterators for interior nodes don't
- * obey the usual invariants regarding the iterator position:
- *
- * We may have whiteouts that compare greater than the iterator
- * position, and logically should be in the iterator, but that we
- * skipped past to find the first live key greater than the iterator
- * position. This becomes an issue when we insert a new key that is
- * greater than the current iterator position, but smaller than the
- * whiteouts we've already skipped past - this happens in the course of
- * a btree split.
- *
- * We have to rewind the iterator past to before those whiteouts here,
- * else bkey_node_iter_prev() is not going to work and who knows what
- * else would happen. And we have to do it manually, because here we've
- * already done the insert and the iterator is currently inconsistent:
- *
- * We've got multiple competing invariants, here - we have to be careful
- * about rewinding iterators for interior nodes, because they should
- * always point to the key for the child node the btree iterator points
- * to.
+ * When a new key is added, and the node iterator now points to that
+ * key, the iterator might have skipped past deleted keys that should
+ * come after the key the iterator now points to. We have to rewind to
+ * before those deleted keys - otherwise
+ * bch2_btree_node_iter_prev_all() breaks:
*/
- if (b->level && new_u64s && !bkey_deleted(where) &&
- btree_iter_pos_cmp_packed(b, &iter->pos, where,
- iter->flags & BTREE_ITER_IS_EXTENTS)) {
- struct bset_tree *t;
- struct bkey_packed *k;
+ if (!bch2_btree_node_iter_end(node_iter) &&
+ iter_current_key_modified &&
+ b->c.level) {
+ struct bkey_packed *k, *k2, *p;
+
+ k = bch2_btree_node_iter_peek_all(node_iter, b);
for_each_bset(b, t) {
- if (bch2_bkey_to_bset(b, where) == t)
+ bool set_pos = false;
+
+ if (node_iter->data[0].end == t->end_offset)
continue;
- k = bch2_bkey_prev_all(b, t,
- bch2_btree_node_iter_bset_pos(node_iter, b, t));
- if (k &&
- __btree_node_iter_cmp(node_iter, b,
- k, where) > 0) {
- struct btree_node_iter_set *set;
- unsigned offset =
- __btree_node_key_to_offset(b, bkey_next(k));
-
- btree_node_iter_for_each(node_iter, set)
- if (set->k == offset) {
- set->k = __btree_node_key_to_offset(b, k);
- bch2_btree_node_iter_sort(node_iter, b);
- goto next_bset;
- }
-
- bch2_btree_node_iter_push(node_iter, b, k,
- btree_bkey_last(b, t));
+ k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
+
+ while ((p = bch2_bkey_prev_all(b, t, k2)) &&
+ bkey_iter_cmp(b, k, p) < 0) {
+ k2 = p;
+ set_pos = true;
}
-next_bset:
- t = t;
+
+ if (set_pos)
+ btree_node_iter_set_set_pos(node_iter,
+ b, t, k2);
}
}
}
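
The core of the bookkeeping above, reduced to a standalone sketch (it ignores the re-point-at-the-new-key case and the final re-sort): node iterator positions are measured in u64s from the start of the bset, so positions before the edit are untouched, positions inside the clobbered range are moved just past the replacement, and positions after it shift by the size difference.

static unsigned iter_pos_fixup(unsigned k, unsigned offset,
			       unsigned clobber_u64s, unsigned new_u64s)
{
	int shift = (int) new_u64s - (int) clobber_u64s;

	if (k < offset)				/* before the edit: unchanged */
		return k;
	if (k < offset + clobber_u64s)		/* inside the clobbered range */
		return offset + new_u64s;	/* just past the replacement */
	return (unsigned) ((int) k + shift);	/* after the edit: shifted */
}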
-void bch2_btree_node_iter_fix(struct btree_iter *iter,
- struct btree *b,
- struct btree_node_iter *node_iter,
- struct bset_tree *t,
- struct bkey_packed *where,
- unsigned clobber_u64s,
- unsigned new_u64s)
+void bch2_btree_node_iter_fix(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter *node_iter,
+ struct bkey_packed *where,
+ unsigned clobber_u64s,
+ unsigned new_u64s)
{
- struct btree_iter *linked;
+ struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
+ struct btree_path *linked;
+ unsigned i;
- if (node_iter != &iter->l[b->level].iter)
- __bch2_btree_node_iter_fix(iter, b, node_iter, t,
- where, clobber_u64s, new_u64s);
+ if (node_iter != &path->l[b->c.level].iter) {
+ __bch2_btree_node_iter_fix(path, b, node_iter, t,
+ where, clobber_u64s, new_u64s);
- for_each_btree_iter_with_node(iter, b, linked)
- __bch2_btree_node_iter_fix(linked, b,
- &linked->l[b->level].iter, t,
- where, clobber_u64s, new_u64s);
+ if (bch2_debug_check_iterators)
+ bch2_btree_node_iter_verify(node_iter, b);
+ }
- /* interior node iterators are... special... */
- if (!b->level)
- bch2_btree_iter_verify(iter, b);
+ trans_for_each_path_with_node(trans, b, linked, i) {
+ __bch2_btree_node_iter_fix(linked, b,
+ &linked->l[b->c.level].iter, t,
+ where, clobber_u64s, new_u64s);
+ bch2_btree_path_verify_level(trans, linked, b->c.level);
+ }
}
-static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
- struct btree_iter_level *l,
+/* Btree path level: pointer to a particular btree node and node iter */
+
+static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
+ struct btree_path_level *l,
struct bkey *u,
struct bkey_packed *k)
{
- struct bkey_s_c ret;
-
if (unlikely(!k)) {
/*
* signal to bch2_btree_iter_peek_slot() that we're currently at
* a hole
*/
- u->type = KEY_TYPE_DELETED;
+ u->type = KEY_TYPE_deleted;
return bkey_s_c_null;
}
- ret = bkey_disassemble(l->b, k, u);
-
- if (debug_check_bkeys(iter->c))
- bch2_bkey_debugcheck(iter->c, l->b, ret);
-
- return ret;
+ return bkey_disassemble(l->b, k, u);
}
-/* peek_all() doesn't skip deleted keys */
-static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter,
- struct btree_iter_level *l,
- struct bkey *u)
+static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
+ struct btree_path_level *l,
+ struct bkey *u)
{
- return __btree_iter_unpack(iter, l, u,
+ return __btree_iter_unpack(c, l, u,
bch2_btree_node_iter_peek_all(&l->iter, l->b));
}
-static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter,
- struct btree_iter_level *l)
+static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_path_level *l,
+ struct bkey *u)
{
- return __btree_iter_unpack(iter, l, &iter->k,
+ struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
bch2_btree_node_iter_peek(&l->iter, l->b));
+
+ path->pos = k.k ? k.k->p : l->b->key.k.p;
+ trans->paths_sorted = false;
+ bch2_btree_path_verify_level(trans, path, l - path->l);
+ return k;
}
-static inline void __btree_iter_advance(struct btree_iter_level *l)
+static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_path_level *l,
+ struct bkey *u)
{
- bch2_btree_node_iter_advance(&l->iter, l->b);
+ struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
+ bch2_btree_node_iter_prev(&l->iter, l->b));
+
+ path->pos = k.k ? k.k->p : l->b->data->min_key;
+ trans->paths_sorted = false;
+ bch2_btree_path_verify_level(trans, path, l - path->l);
+ return k;
}
-/*
- * Verify that iterator for parent node points to child node:
- */
-static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
+static inline bool btree_path_advance_to_pos(struct btree_path *path,
+ struct btree_path_level *l,
+ int max_advance)
{
- struct btree_iter_level *l;
- unsigned plevel;
- bool parent_locked;
struct bkey_packed *k;
+ int nr_advanced = 0;
- if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
- return;
-
- plevel = b->level + 1;
- if (!btree_iter_node(iter, plevel))
- return;
-
- parent_locked = btree_node_locked(iter, plevel);
-
- if (!bch2_btree_node_relock(iter, plevel))
- return;
-
- l = &iter->l[plevel];
- k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
- if (!k ||
- bkey_deleted(k) ||
- bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
- char buf[100];
- struct bkey uk = bkey_unpack_key(b, k);
+ while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
+ bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
+ if (max_advance > 0 && nr_advanced >= max_advance)
+ return false;
- bch2_bkey_to_text(buf, sizeof(buf), &uk);
- panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n",
- buf, b->key.k.p.inode, b->key.k.p.offset);
+ bch2_btree_node_iter_advance(&l->iter, l->b);
+ nr_advanced++;
}
- if (!parent_locked)
- btree_node_unlock(iter, b->level + 1);
+ return true;
}
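
The helper above implements a small heuristic: advance linearly only if the target is close, otherwise tell the caller to re-seek. Reduced to a toy cursor (not bcachefs code):

#include <stdbool.h>

static bool toy_advance_to(unsigned *cursor, unsigned target, int max_advance)
{
	int nr_advanced = 0;

	while (*cursor < target) {
		if (max_advance > 0 && nr_advanced >= max_advance)
			return false;		/* too far away: caller should re-seek */

		(*cursor)++;
		nr_advanced++;
	}

	return true;
}

__bch2_btree_path_set_pos() later in this file uses a budget of 8 before falling back to bch2_btree_node_iter_init().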
-/* Returns true if @k is after iterator position @pos */
-static inline bool btree_iter_pos_cmp(struct btree_iter *iter,
- const struct bkey *k)
+static inline void __btree_path_level_init(struct btree_path *path,
+ unsigned level)
{
- int cmp = bkey_cmp(k->p, iter->pos);
+ struct btree_path_level *l = &path->l[level];
- return cmp > 0 ||
- (cmp == 0 &&
- !(iter->flags & BTREE_ITER_IS_EXTENTS) && !bkey_deleted(k));
-}
+ bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
-static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
- struct btree *b)
-{
- return !btree_iter_pos_cmp(iter, &b->key.k) &&
- bkey_cmp(b->key.k.p, POS_MAX);
-}
-
-static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
- struct btree *b)
-{
- return iter->btree_id == b->btree_id &&
- bkey_cmp(iter->pos, b->data->min_key) >= 0 &&
- !btree_iter_pos_after_node(iter, b);
+ /*
+ * Iterators to interior nodes should always be pointed at the first non
+ * whiteout:
+ */
+ if (level)
+ bch2_btree_node_iter_peek(&l->iter, l->b);
}
-static inline void __btree_iter_init(struct btree_iter *iter,
- struct btree *b)
+void bch2_btree_path_level_init(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
{
- struct btree_iter_level *l = &iter->l[b->level];
+ BUG_ON(path->cached);
- bch2_btree_node_iter_init(&l->iter, b, iter->pos,
- iter->flags & BTREE_ITER_IS_EXTENTS,
- btree_node_is_extents(b));
+ EBUG_ON(!btree_path_pos_in_node(path, b));
- /* Skip to first non whiteout: */
- if (b->level)
- bch2_btree_node_iter_peek(&l->iter, b);
-
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+ path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
+ path->l[b->c.level].b = b;
+ __btree_path_level_init(path, b->c.level);
}
-static inline void btree_iter_node_set(struct btree_iter *iter,
- struct btree *b)
+/* Btree path: fixups after btree node updates: */
+
+static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
{
- btree_iter_verify_new_node(iter, b);
+ struct bch_fs *c = trans->c;
+
+ trans_for_each_update(trans, i)
+ if (!i->cached &&
+ i->level == b->c.level &&
+ i->btree_id == b->c.btree_id &&
+ bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
+ bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
+ i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v;
- EBUG_ON(!btree_iter_pos_in_node(iter, b));
- EBUG_ON(b->lock.state.seq & 1);
+ if (unlikely(trans->journal_replay_not_finished)) {
+ struct bkey_i *j_k =
+ bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
+ i->k->k.p);
- iter->lock_seq[b->level] = b->lock.state.seq;
- iter->l[b->level].b = b;
- __btree_iter_init(iter, b);
+ if (j_k) {
+ i->old_k = j_k->k;
+ i->old_v = &j_k->v;
+ }
+ }
+ }
}
/*
* A btree node is being replaced - update the iterator to point to the new
* node:
*/
-void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
+void bch2_trans_node_add(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
{
- enum btree_node_locked_type t;
- struct btree_iter *linked;
+ struct btree_path *prev;
- for_each_btree_iter(iter, linked)
- if (btree_iter_pos_in_node(linked, b)) {
- /*
- * bch2_btree_iter_node_drop() has already been called -
- * the old node we're replacing has already been
- * unlocked and the pointer invalidated
- */
- BUG_ON(btree_node_locked(linked, b->level));
+ BUG_ON(!btree_path_pos_in_node(path, b));
+
+ while ((prev = prev_btree_path(trans, path)) &&
+ btree_path_pos_in_node(prev, b))
+ path = prev;
+
+ for (;
+ path && btree_path_pos_in_node(path, b);
+ path = next_btree_path(trans, path))
+ if (path->uptodate == BTREE_ITER_UPTODATE && !path->cached) {
+ enum btree_node_locked_type t =
+ btree_lock_want(path, b->c.level);
- t = btree_lock_want(linked, b->level);
if (t != BTREE_NODE_UNLOCKED) {
- six_lock_increment(&b->lock, t);
- mark_btree_node_locked(linked, b->level, t);
+ btree_node_unlock(trans, path, b->c.level);
+ six_lock_increment(&b->c.lock, (enum six_lock_type) t);
+ mark_btree_node_locked(trans, path, b->c.level, t);
}
- btree_iter_node_set(linked, b);
+ bch2_btree_path_level_init(trans, path, b);
}
- six_unlock_intent(&b->lock);
-}
-
-void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
-{
- struct btree_iter *linked;
- unsigned level = b->level;
-
- for_each_btree_iter(iter, linked)
- if (linked->l[level].b == b) {
- btree_node_unlock(linked, level);
- linked->l[level].b = BTREE_ITER_NOT_END;
- }
+ bch2_trans_revalidate_updates_in_node(trans, b);
}
/*
* A btree node has been modified in such a way as to invalidate iterators - fix
* them:
*/
-void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
+void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
{
- struct btree_iter *linked;
+ struct btree_path *path;
+ unsigned i;
+
+ trans_for_each_path_with_node(trans, b, path, i)
+ __btree_path_level_init(path, b->c.level);
- for_each_btree_iter_with_node(iter, b, linked)
- __btree_iter_init(linked, b);
+ bch2_trans_revalidate_updates_in_node(trans, b);
}
-static inline int btree_iter_lock_root(struct btree_iter *iter,
- unsigned depth_want)
+/* Btree path: traverse, set_pos: */
+
+static inline int btree_path_lock_root(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned depth_want,
+ unsigned long trace_ip)
{
- struct bch_fs *c = iter->c;
- struct btree *b;
+ struct bch_fs *c = trans->c;
+ struct btree_root *r = bch2_btree_id_root(c, path->btree_id);
enum six_lock_type lock_type;
unsigned i;
+ int ret;
- EBUG_ON(iter->nodes_locked);
+ EBUG_ON(path->nodes_locked);
while (1) {
- b = READ_ONCE(c->btree_roots[iter->btree_id].b);
- iter->level = READ_ONCE(b->level);
+ struct btree *b = READ_ONCE(r->b);
+ if (unlikely(!b)) {
+ BUG_ON(!r->error);
+ return r->error;
+ }
+
+ path->level = READ_ONCE(b->c.level);
- if (unlikely(iter->level < depth_want)) {
+ if (unlikely(path->level < depth_want)) {
/*
* the root is at a lower depth than the depth we want:
* got to the end of the btree, or we're walking nodes
* greater than some depth and there are no nodes >=
* that depth
*/
- iter->level = depth_want;
- iter->l[iter->level].b = NULL;
- return 0;
+ path->level = depth_want;
+ for (i = path->level; i < BTREE_MAX_DEPTH; i++)
+ path->l[i].b = NULL;
+ return 1;
}
- lock_type = __btree_lock_want(iter, iter->level);
- if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
- iter, lock_type, true)))
- return -EINTR;
+ lock_type = __btree_lock_want(path, path->level);
+ ret = btree_node_lock(trans, path, &b->c,
+ path->level, lock_type, trace_ip);
+ if (unlikely(ret)) {
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ret;
+ BUG();
+ }
- if (likely(b == c->btree_roots[iter->btree_id].b &&
- b->level == iter->level &&
+ if (likely(b == READ_ONCE(r->b) &&
+ b->c.level == path->level &&
!race_fault())) {
- for (i = 0; i < iter->level; i++)
- iter->l[i].b = BTREE_ITER_NOT_END;
- iter->l[iter->level].b = b;
-
- mark_btree_node_locked(iter, iter->level, lock_type);
- btree_iter_node_set(iter, b);
+ for (i = 0; i < path->level; i++)
+ path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
+ path->l[path->level].b = b;
+ for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
+ path->l[i].b = NULL;
+
+ mark_btree_node_locked(trans, path, path->level,
+ (enum btree_node_locked_type) lock_type);
+ bch2_btree_path_level_init(trans, path, b);
return 0;
-
}
- six_unlock_type(&b->lock, lock_type);
+ six_unlock_type(&b->c.lock, lock_type);
}
}
noinline
-static void btree_iter_prefetch(struct btree_iter *iter)
+static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
{
- struct btree_iter_level *l = &iter->l[iter->level];
+ struct bch_fs *c = trans->c;
+ struct btree_path_level *l = path_l(path);
struct btree_node_iter node_iter = l->iter;
struct bkey_packed *k;
- BKEY_PADDED(k) tmp;
- unsigned nr = test_bit(BCH_FS_STARTED, &iter->c->flags)
- ? (iter->level > 1 ? 0 : 2)
- : (iter->level > 1 ? 1 : 16);
- bool was_locked = btree_node_locked(iter, iter->level);
-
- while (nr) {
- if (!bch2_btree_node_relock(iter, iter->level))
- return;
+ struct bkey_buf tmp;
+ unsigned nr = test_bit(BCH_FS_started, &c->flags)
+ ? (path->level > 1 ? 0 : 2)
+ : (path->level > 1 ? 1 : 16);
+ bool was_locked = btree_node_locked(path, path->level);
+ int ret = 0;
+
+ bch2_bkey_buf_init(&tmp);
+
+ while (nr-- && !ret) {
+ if (!bch2_btree_node_relock(trans, path, path->level))
+ break;
bch2_btree_node_iter_advance(&node_iter, l->b);
k = bch2_btree_node_iter_peek(&node_iter, l->b);
if (!k)
break;
- bch2_bkey_unpack(l->b, &tmp.k, k);
- bch2_btree_node_prefetch(iter->c, iter, &tmp.k,
- iter->level - 1);
+ bch2_bkey_buf_unpack(&tmp, c, l->b, k);
+ ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
+ path->level - 1);
}
if (!was_locked)
- btree_node_unlock(iter, iter->level);
+ btree_node_unlock(trans, path, path->level);
+
+ bch2_bkey_buf_exit(&tmp, c);
+ return ret;
}
-static inline int btree_iter_down(struct btree_iter *iter)
+static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
+ struct btree_and_journal_iter *jiter)
{
- struct btree_iter_level *l = &iter->l[iter->level];
- struct btree *b;
- unsigned level = iter->level - 1;
- enum six_lock_type lock_type = __btree_lock_want(iter, level);
- BKEY_PADDED(k) tmp;
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+ struct bkey_buf tmp;
+ unsigned nr = test_bit(BCH_FS_started, &c->flags)
+ ? (path->level > 1 ? 0 : 2)
+ : (path->level > 1 ? 1 : 16);
+ bool was_locked = btree_node_locked(path, path->level);
+ int ret = 0;
- BUG_ON(!btree_node_locked(iter, iter->level));
+ bch2_bkey_buf_init(&tmp);
- bch2_bkey_unpack(l->b, &tmp.k,
- bch2_btree_node_iter_peek(&l->iter, l->b));
+ jiter->fail_if_too_many_whiteouts = true;
- b = bch2_btree_node_get(iter->c, iter, &tmp.k, level, lock_type, true);
- if (unlikely(IS_ERR(b)))
- return PTR_ERR(b);
+ while (nr-- && !ret) {
+ if (!bch2_btree_node_relock(trans, path, path->level))
+ break;
- mark_btree_node_locked(iter, level, lock_type);
- btree_iter_node_set(iter, b);
+ bch2_btree_and_journal_iter_advance(jiter);
+ k = bch2_btree_and_journal_iter_peek(jiter);
+ if (!k.k)
+ break;
- if (iter->flags & BTREE_ITER_PREFETCH)
- btree_iter_prefetch(iter);
+ bch2_bkey_buf_reassemble(&tmp, c, k);
+ ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
+ path->level - 1);
+ }
- iter->level = level;
+ if (!was_locked)
+ btree_node_unlock(trans, path, path->level);
- return 0;
+ bch2_bkey_buf_exit(&tmp, c);
+ return ret;
}
-static void btree_iter_up(struct btree_iter *iter)
+static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned plevel, struct btree *b)
{
- btree_node_unlock(iter, iter->level++);
+ struct btree_path_level *l = &path->l[plevel];
+ bool locked = btree_node_locked(path, plevel);
+ struct bkey_packed *k;
+ struct bch_btree_ptr_v2 *bp;
+
+ if (!bch2_btree_node_relock(trans, path, plevel))
+ return;
+
+ k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
+ BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
+
+ bp = (void *) bkeyp_val(&l->b->format, k);
+ bp->mem_ptr = (unsigned long)b;
+
+ if (!locked)
+ btree_node_unlock(trans, path, plevel);
}
-int __must_check __bch2_btree_iter_traverse(struct btree_iter *);
+static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned flags,
+ struct bkey_buf *out)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path_level *l = path_l(path);
+ struct btree_and_journal_iter jiter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);
+
+ k = bch2_btree_and_journal_iter_peek(&jiter);
+ if (!k.k) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "node not found at pos ");
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_str(&buf, " at btree ");
+ bch2_btree_pos_to_text(&buf, c, l->b);
+
+ ret = bch2_fs_topology_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ goto err;
+ }
-static int btree_iter_traverse_error(struct btree_iter *iter, int ret)
+ bch2_bkey_buf_reassemble(out, c, k);
+
+ if ((flags & BTREE_ITER_prefetch) &&
+ c->opts.btree_node_prefetch)
+ ret = btree_path_prefetch_j(trans, path, &jiter);
+
+err:
+ bch2_btree_and_journal_iter_exit(&jiter);
+ return ret;
+}
+
+static __always_inline int btree_path_down(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned flags,
+ unsigned long trace_ip)
{
- struct bch_fs *c = iter->c;
- struct btree_iter *linked, *sorted_iters, **i;
+ struct bch_fs *c = trans->c;
+ struct btree_path_level *l = path_l(path);
+ struct btree *b;
+ unsigned level = path->level - 1;
+ enum six_lock_type lock_type = __btree_lock_want(path, level);
+ struct bkey_buf tmp;
+ int ret;
+
+ EBUG_ON(!btree_node_locked(path, path->level));
+
+ bch2_bkey_buf_init(&tmp);
+
+ if (unlikely(trans->journal_replay_not_finished)) {
+ ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
+ if (ret)
+ goto err;
+ } else {
+ struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
+ if (!k) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "node not found at pos ");
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_str(&buf, " within parent node ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key));
+
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ ret = -BCH_ERR_btree_need_topology_repair;
+ goto err;
+ }
+
+ bch2_bkey_buf_unpack(&tmp, c, l->b, k);
+
+ if ((flags & BTREE_ITER_prefetch) &&
+ c->opts.btree_node_prefetch) {
+ ret = btree_path_prefetch(trans, path);
+ if (ret)
+ goto err;
+ }
+ }
+
+ b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (unlikely(ret))
+ goto err;
+
+ if (likely(!trans->journal_replay_not_finished &&
+ tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
+ unlikely(b != btree_node_mem_ptr(tmp.k)))
+ btree_node_mem_ptr_set(trans, path, level + 1, b);
+
+ if (btree_node_read_locked(path, level + 1))
+ btree_node_unlock(trans, path, level + 1);
+
+ mark_btree_node_locked(trans, path, level,
+ (enum btree_node_locked_type) lock_type);
+ path->level = level;
+ bch2_btree_path_level_init(trans, path, b);
+
+ bch2_btree_path_verify_locks(path);
+err:
+ bch2_bkey_buf_exit(&tmp, c);
+ return ret;
+}
+
+static int bch2_btree_path_traverse_all(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
+ unsigned long trace_ip = _RET_IP_;
+ unsigned i;
+ int ret = 0;
+
+ if (trans->in_traverse_all)
+ return -BCH_ERR_transaction_restart_in_traverse_all;
+
+ trans->in_traverse_all = true;
retry_all:
- bch2_btree_iter_unlock(iter);
+ trans->restarted = 0;
+ trans->last_restarted_ip = 0;
- if (ret != -ENOMEM && ret != -EINTR)
- goto io_error;
+ trans_for_each_path(trans, path, i)
+ path->should_be_locked = false;
- if (ret == -ENOMEM) {
+ btree_trans_sort_paths(trans);
+
+ bch2_trans_unlock(trans);
+ cond_resched();
+ trans_set_locked(trans, false);
+
+ if (unlikely(trans->memory_allocation_failure)) {
struct closure cl;
closure_init_stack(&cl);
do {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
closure_sync(&cl);
} while (ret);
}
+ /* Now, redo traversals in correct order: */
+ i = 0;
+ while (i < trans->nr_sorted) {
+ btree_path_idx_t idx = trans->sorted[i];
+
+ /*
+ * Traversing a path can cause another path to be added at about
+ * the same position:
+ */
+ if (trans->paths[idx].uptodate) {
+ __btree_path_get(trans, &trans->paths[idx], false);
+ ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_);
+ __btree_path_put(trans, &trans->paths[idx], false);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret, ENOMEM))
+ goto retry_all;
+ if (ret)
+ goto err;
+ } else {
+ i++;
+ }
+ }
+
/*
- * Linked iters are normally a circular singly linked list - break cycle
- * while we sort them:
+ * We used to assert that all paths had been traversed here
+ * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
+ * path->should_be_locked is not set yet, we might have unlocked and
+ * then failed to relock a path - that's fine.
*/
- linked = iter->next;
- iter->next = NULL;
- sorted_iters = NULL;
-
- while (linked) {
- iter = linked;
- linked = linked->next;
+err:
+ bch2_btree_cache_cannibalize_unlock(trans);
- i = &sorted_iters;
- while (*i && btree_iter_cmp(iter, *i) > 0)
- i = &(*i)->next;
+ trans->in_traverse_all = false;
- iter->next = *i;
- *i = iter;
- }
+ trace_and_count(c, trans_traverse_all, trans, trace_ip);
+ return ret;
+}
- /* Make list circular again: */
- iter = sorted_iters;
- while (iter->next)
- iter = iter->next;
- iter->next = sorted_iters;
+static inline bool btree_path_check_pos_in_node(struct btree_path *path,
+ unsigned l, int check_pos)
+{
+ if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
+ return false;
+ if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
+ return false;
+ return true;
+}
- /* Now, redo traversals in correct order: */
+static inline bool btree_path_good_node(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned l, int check_pos)
+{
+ return is_btree_node(path, l) &&
+ bch2_btree_node_relock(trans, path, l) &&
+ btree_path_check_pos_in_node(path, l, check_pos);
+}
- iter = sorted_iters;
- do {
-retry:
- ret = __bch2_btree_iter_traverse(iter);
- if (unlikely(ret)) {
- if (ret == -EINTR)
- goto retry;
- goto retry_all;
- }
+static void btree_path_set_level_down(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_level)
+{
+ unsigned l;
- iter = iter->next;
- } while (iter != sorted_iters);
+ path->level = new_level;
- ret = btree_iter_linked(iter) ? -EINTR : 0;
-out:
- bch2_btree_cache_cannibalize_unlock(c);
- return ret;
-io_error:
- BUG_ON(ret != -EIO);
+ for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
+ if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
+ btree_node_unlock(trans, path, l);
- iter->flags |= BTREE_ITER_ERROR;
- iter->l[iter->level].b = BTREE_ITER_NOT_END;
- goto out;
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ bch2_btree_path_verify(trans, path);
}
-static unsigned btree_iter_up_until_locked(struct btree_iter *iter,
- bool check_pos)
+static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
+ struct btree_path *path,
+ int check_pos)
{
- unsigned l = iter->level;
+ unsigned i, l = path->level;
+again:
+ while (btree_path_node(path, l) &&
+ !btree_path_good_node(trans, path, l, check_pos))
+ __btree_path_set_level_up(trans, path, l++);
- while (btree_iter_node(iter, l) &&
- !(is_btree_node(iter, l) &&
- bch2_btree_node_relock(iter, l) &&
- (!check_pos ||
- btree_iter_pos_in_node(iter, iter->l[l].b)))) {
- btree_node_unlock(iter, l);
- iter->l[l].b = BTREE_ITER_NOT_END;
- l++;
- }
+ /* If we need intent locks, take them too: */
+ for (i = l + 1;
+ i < path->locks_want && btree_path_node(path, i);
+ i++)
+ if (!bch2_btree_node_relock(trans, path, i)) {
+ while (l <= i)
+ __btree_path_set_level_up(trans, path, l++);
+ goto again;
+ }
return l;
}
+static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
+ struct btree_path *path,
+ int check_pos)
+{
+ return likely(btree_node_locked(path, path->level) &&
+ btree_path_check_pos_in_node(path, path->level, check_pos))
+ ? path->level
+ : __btree_path_up_until_good_node(trans, path, check_pos);
+}
+
/*
* This is the main state machine for walking down the btree - walks down to a
* specified depth
@@ -1008,855 +1134,2500 @@ static unsigned btree_iter_up_until_locked(struct btree_iter *iter,
* Returns 0 on success, -EIO on error (error reading in a btree node).
*
* On error, caller (peek_node()/peek_key()) must return NULL; the error is
- * stashed in the iterator and returned from bch2_btree_iter_unlock().
+ * stashed in the iterator and returned from bch2_trans_exit().
*/
-int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
+int bch2_btree_path_traverse_one(struct btree_trans *trans,
+ btree_path_idx_t path_idx,
+ unsigned flags,
+ unsigned long trace_ip)
{
- unsigned depth_want = iter->level;
-
- if (unlikely(iter->level >= BTREE_MAX_DEPTH))
- return 0;
+ struct btree_path *path = &trans->paths[path_idx];
+ unsigned depth_want = path->level;
+ int ret = -((int) trans->restarted);
- if (__bch2_btree_iter_relock(iter))
- return 0;
+ if (unlikely(ret))
+ goto out;
- iter->flags &= ~BTREE_ITER_AT_END_OF_LEAF;
+ if (unlikely(!trans->srcu_held))
+ bch2_trans_srcu_lock(trans);
- /*
- * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos
- * here unnecessary
- */
- iter->level = btree_iter_up_until_locked(iter, true);
+ trace_btree_path_traverse_start(trans, path);
/*
- * If we've got a btree node locked (i.e. we aren't about to relock the
- * root) - advance its node iterator if necessary:
- *
- * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary
+ * Ensure we obey path->should_be_locked: if it's set, we can't unlock
+ * and re-traverse the path without a transaction restart:
*/
- if (btree_iter_node(iter, iter->level)) {
- struct btree_iter_level *l = &iter->l[iter->level];
- struct bkey_s_c k;
- struct bkey u;
+ if (path->should_be_locked) {
+ ret = bch2_btree_path_relock(trans, path, trace_ip);
+ goto out;
+ }
- while ((k = __btree_iter_peek_all(iter, l, &u)).k &&
- !btree_iter_pos_cmp(iter, k.k))
- __btree_iter_advance(l);
+ if (path->cached) {
+ ret = bch2_btree_path_traverse_cached(trans, path, flags);
+ goto out;
}
+ path = &trans->paths[path_idx];
+
+ if (unlikely(path->level >= BTREE_MAX_DEPTH))
+ goto out_uptodate;
+
+ path->level = btree_path_up_until_good_node(trans, path, 0);
+ unsigned max_level = path->level;
+
+ EBUG_ON(btree_path_node(path, path->level) &&
+ !btree_node_locked(path, path->level));
+
/*
- * Note: iter->nodes[iter->level] may be temporarily NULL here - that
+ * Note: path->nodes[path->level] may be temporarily NULL here - that
* would indicate to other code that we got to the end of the btree,
* here it indicates that relocking the root failed - it's critical that
- * btree_iter_lock_root() comes next and that it can't fail
+ * btree_path_lock_root() comes next and that it can't fail
*/
- while (iter->level > depth_want) {
- int ret = btree_iter_node(iter, iter->level)
- ? btree_iter_down(iter)
- : btree_iter_lock_root(iter, depth_want);
+ while (path->level > depth_want) {
+ ret = btree_path_node(path, path->level)
+ ? btree_path_down(trans, path, flags, trace_ip)
+ : btree_path_lock_root(trans, path, depth_want, trace_ip);
if (unlikely(ret)) {
- iter->level = depth_want;
- iter->l[iter->level].b = BTREE_ITER_NOT_END;
- return ret;
+ if (ret == 1) {
+ /*
+ * No nodes at this level - got to the end of
+ * the btree:
+ */
+ ret = 0;
+ goto out;
+ }
+
+ __bch2_btree_path_unlock(trans, path);
+ path->level = depth_want;
+ path->l[path->level].b = ERR_PTR(ret);
+ goto out;
}
}
- iter->uptodate = BTREE_ITER_NEED_PEEK;
- bch2_btree_iter_verify_locks(iter);
- return 0;
+ if (unlikely(max_level > path->level)) {
+ struct btree_path *linked;
+ unsigned iter;
+
+ trans_for_each_path_with_node(trans, path_l(path)->b, linked, iter)
+ for (unsigned j = path->level + 1; j < max_level; j++)
+ linked->l[j] = path->l[j];
+ }
+
+out_uptodate:
+ path->uptodate = BTREE_ITER_UPTODATE;
+ trace_btree_path_traverse_end(trans, path);
+out:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
+ panic("ret %s (%i) trans->restarted %s (%i)\n",
+ bch2_err_str(ret), ret,
+ bch2_err_str(trans->restarted), trans->restarted);
+ bch2_btree_path_verify(trans, path);
+ return ret;
}
-int __must_check bch2_btree_iter_traverse(struct btree_iter *iter)
+static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
+ struct btree_path *src)
{
- int ret;
+ unsigned i, offset = offsetof(struct btree_path, pos);
- ret = __bch2_btree_iter_traverse(iter);
- if (unlikely(ret))
- ret = btree_iter_traverse_error(iter, ret);
+ memcpy((void *) dst + offset,
+ (void *) src + offset,
+ sizeof(struct btree_path) - offset);
- BUG_ON(ret == -EINTR && !btree_iter_linked(iter));
+ for (i = 0; i < BTREE_MAX_DEPTH; i++) {
+ unsigned t = btree_node_locked_type(dst, i);
- return ret;
+ if (t != BTREE_NODE_UNLOCKED)
+ six_lock_increment(&dst->l[i].b->c.lock, t);
+ }
+}
+
+static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src,
+ bool intent, unsigned long ip)
+{
+ btree_path_idx_t new = btree_path_alloc(trans, src);
+ btree_path_copy(trans, trans->paths + new, trans->paths + src);
+ __btree_path_get(trans, trans->paths + new, intent);
+#ifdef TRACK_PATH_ALLOCATED
+ trans->paths[new].ip_allocated = ip;
+#endif
+ return new;
+}
+
+__flatten
+btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans,
+ btree_path_idx_t path, bool intent, unsigned long ip)
+{
+ struct btree_path *old = trans->paths + path;
+ __btree_path_put(trans, trans->paths + path, intent);
+ path = btree_path_clone(trans, path, intent, ip);
+ trace_btree_path_clone(trans, old, trans->paths + path);
+ trans->paths[path].preserve = false;
+ return path;
+}
+
+btree_path_idx_t __must_check
+__bch2_btree_path_set_pos(struct btree_trans *trans,
+ btree_path_idx_t path_idx, struct bpos new_pos,
+ bool intent, unsigned long ip)
+{
+ int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos);
+
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+ EBUG_ON(!trans->paths[path_idx].ref);
+
+ trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos);
+
+ path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip);
+
+ struct btree_path *path = trans->paths + path_idx;
+ path->pos = new_pos;
+ trans->paths_sorted = false;
+
+ if (unlikely(path->cached)) {
+ btree_node_unlock(trans, path, 0);
+ path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ goto out;
+ }
+
+ unsigned level = btree_path_up_until_good_node(trans, path, cmp);
+
+ if (btree_path_node(path, level)) {
+ struct btree_path_level *l = &path->l[level];
+
+ BUG_ON(!btree_node_locked(path, level));
+ /*
+ * We might have to skip over many keys, or just a few: try
+ * advancing the node iterator, and if we have to skip over too
+ * many keys just reinit it (or if we're rewinding, since that
+ * is expensive).
+ */
+ if (cmp < 0 ||
+ !btree_path_advance_to_pos(path, l, 8))
+ bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
+
+ /*
+ * Iterators to interior nodes should always be pointed at the first non
+ * whiteout:
+ */
+ if (unlikely(level))
+ bch2_btree_node_iter_peek(&l->iter, l->b);
+ }
+
+ if (unlikely(level != path->level)) {
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ __bch2_btree_path_unlock(trans, path);
+ }
+out:
+ bch2_btree_path_verify(trans, path);
+ return path_idx;
+}
+
+/* Btree path: main interface: */
+
+static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
+{
+ struct btree_path *sib;
+
+ sib = prev_btree_path(trans, path);
+ if (sib && !btree_path_cmp(sib, path))
+ return sib;
+
+ sib = next_btree_path(trans, path);
+ if (sib && !btree_path_cmp(sib, path))
+ return sib;
+
+ return NULL;
+}
+
+static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
+{
+ struct btree_path *sib;
+
+ sib = prev_btree_path(trans, path);
+ if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
+ return sib;
+
+ sib = next_btree_path(trans, path);
+ if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
+ return sib;
+
+ return NULL;
+}
+
+static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t path)
+{
+ __bch2_btree_path_unlock(trans, trans->paths + path);
+ btree_path_list_remove(trans, trans->paths + path);
+ __clear_bit(path, trans->paths_allocated);
+}
+
+static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_path *path)
+{
+ unsigned l = path->level;
+
+ do {
+ if (!btree_path_node(path, l))
+ break;
+
+ if (!is_btree_node(path, l))
+ return false;
+
+ if (path->l[l].lock_seq != path->l[l].b->c.lock.seq)
+ return false;
+
+ l++;
+ } while (l < path->locks_want);
+
+ return true;
+}
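
The check above is optimistic, sequence-based relocking: each level remembers the lock sequence it saw, and relocking remains possible only if no writer has changed it since. In toy form (the struct below is a stand-in for the sketch, not the six lock):

#include <stdbool.h>

struct toy_lock { unsigned seq; };	/* write locking bumps seq */

static bool toy_can_relock(unsigned saved_seq, const struct toy_lock *lock)
{
	return saved_seq == lock->seq;
}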
+
+void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent)
+{
+ struct btree_path *path = trans->paths + path_idx, *dup;
+
+ if (!__btree_path_put(trans, path, intent))
+ return;
+
+ dup = path->preserve
+ ? have_path_at_pos(trans, path)
+ : have_node_at_pos(trans, path);
+
+ trace_btree_path_free(trans, path_idx, dup);
+
+ if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
+ return;
+
+ if (path->should_be_locked && !trans->restarted) {
+ if (!dup)
+ return;
+
+ if (!(trans->locked
+ ? bch2_btree_path_relock_norestart(trans, dup)
+ : bch2_btree_path_can_relock(trans, dup)))
+ return;
+ }
+
+ if (dup) {
+ dup->preserve |= path->preserve;
+ dup->should_be_locked |= path->should_be_locked;
+ }
+
+ __bch2_path_free(trans, path_idx);
+}
+
+static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path,
+ bool intent)
+{
+ if (!__btree_path_put(trans, trans->paths + path, intent))
+ return;
+
+ __bch2_path_free(trans, path);
+}
+
+void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
+{
+ panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
+ trans->restart_count, restart_count,
+ (void *) trans->last_begin_ip);
+}
+
+static void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct printbuf buf = PRINTBUF;
+ bch2_prt_backtrace(&buf, &trans->last_restarted_trace);
+ panic("in transaction restart: %s, last restarted by\n%s",
+ bch2_err_str(trans->restarted),
+ buf.buf);
+#else
+ panic("in transaction restart: %s, last restarted by %pS\n",
+ bch2_err_str(trans->restarted),
+ (void *) trans->last_restarted_ip);
+#endif
+}
+
+void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *trans)
+{
+ if (trans->restarted)
+ bch2_trans_in_restart_error(trans);
+
+ if (!trans->locked)
+ panic("trans should be locked, unlocked by %pS\n",
+ (void *) trans->last_unlock_ip);
+
+ BUG();
+}
+
+noinline __cold
+void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
+{
+ prt_printf(buf, "%u transaction updates for %s journal seq %llu\n",
+ trans->nr_updates, trans->fn, trans->journal_res.seq);
+ printbuf_indent_add(buf, 2);
+
+ trans_for_each_update(trans, i) {
+ struct bkey_s_c old = { &i->old_k, i->old_v };
+
+ prt_str(buf, "update: btree=");
+ bch2_btree_id_to_text(buf, i->btree_id);
+ prt_printf(buf, " cached=%u %pS\n",
+ i->cached,
+ (void *) i->ip_allocated);
+
+ prt_printf(buf, " old ");
+ bch2_bkey_val_to_text(buf, trans->c, old);
+ prt_newline(buf);
+
+ prt_printf(buf, " new ");
+ bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
+ prt_newline(buf);
+ }
+
+ for (struct jset_entry *e = trans->journal_entries;
+ e != btree_trans_journal_entries_top(trans);
+ e = vstruct_next(e))
+ bch2_journal_entry_to_text(buf, trans->c, e);
+
+ printbuf_indent_sub(buf, 2);
}
-static inline void bch2_btree_iter_checks(struct btree_iter *iter,
- enum btree_iter_type type)
+noinline __cold
+void bch2_dump_trans_updates(struct btree_trans *trans)
{
- EBUG_ON(iter->btree_id >= BTREE_ID_NR);
- EBUG_ON((iter->flags & BTREE_ITER_TYPE) != type);
- EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
- (iter->btree_id == BTREE_ID_EXTENTS &&
- type != BTREE_ITER_NODES));
+ struct printbuf buf = PRINTBUF;
- bch2_btree_iter_verify_locks(iter);
+ bch2_trans_updates_to_text(&buf, trans);
+ bch2_print_str(trans->c, buf.buf);
+ printbuf_exit(&buf);
+}
+
+static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
+{
+ struct btree_path *path = trans->paths + path_idx;
+
+ prt_printf(out, "path: idx %3u ref %u:%u %c %c %c ",
+ path_idx, path->ref, path->intent_ref,
+ path->preserve ? 'P' : ' ',
+ path->should_be_locked ? 'S' : ' ',
+ path->cached ? 'C' : 'B');
+ bch2_btree_id_level_to_text(out, path->btree_id, path->level);
+ prt_str(out, " pos ");
+ bch2_bpos_to_text(out, path->pos);
+
+ if (!path->cached && btree_node_locked(path, path->level)) {
+ prt_char(out, ' ');
+ struct btree *b = path_l(path)->b;
+ bch2_bpos_to_text(out, b->data->min_key);
+ prt_char(out, '-');
+ bch2_bpos_to_text(out, b->key.k.p);
+ }
+
+#ifdef TRACK_PATH_ALLOCATED
+ prt_printf(out, " %pS", (void *) path->ip_allocated);
+#endif
+}
+
+static const char *btree_node_locked_str(enum btree_node_locked_type t)
+{
+ switch (t) {
+ case BTREE_NODE_UNLOCKED:
+ return "unlocked";
+ case BTREE_NODE_READ_LOCKED:
+ return "read";
+ case BTREE_NODE_INTENT_LOCKED:
+ return "intent";
+ case BTREE_NODE_WRITE_LOCKED:
+ return "write";
+ default:
+ return NULL;
+ }
+}
+
+void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
+{
+ bch2_btree_path_to_text_short(out, trans, path_idx);
+
+ struct btree_path *path = trans->paths + path_idx;
+
+ prt_printf(out, " uptodate %u locks_want %u", path->uptodate, path->locks_want);
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+ for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
+ prt_printf(out, "l=%u locks %s seq %u node ", l,
+ btree_node_locked_str(btree_node_locked_type(path, l)),
+ path->l[l].lock_seq);
+
+ int ret = PTR_ERR_OR_ZERO(path->l[l].b);
+ if (ret)
+ prt_str(out, bch2_err_str(ret));
+ else
+ prt_printf(out, "%px", path->l[l].b);
+ prt_newline(out);
+ }
+ printbuf_indent_sub(out, 2);
+}
+
+static noinline __cold
+void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
+ bool nosort)
+{
+ struct trans_for_each_path_inorder_iter iter;
+
+ if (!nosort)
+ btree_trans_sort_paths(trans);
+
+ trans_for_each_path_idx_inorder(trans, iter) {
+ bch2_btree_path_to_text_short(out, trans, iter.path_idx);
+ prt_newline(out);
+ }
+}
+
+noinline __cold
+void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
+{
+ __bch2_trans_paths_to_text(out, trans, false);
+}
+
+static noinline __cold
+void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
+{
+ struct printbuf buf = PRINTBUF;
+
+ __bch2_trans_paths_to_text(&buf, trans, nosort);
+ bch2_trans_updates_to_text(&buf, trans);
+
+ bch2_print_str(trans->c, buf.buf);
+ printbuf_exit(&buf);
+}
+
+noinline __cold
+void bch2_dump_trans_paths_updates(struct btree_trans *trans)
+{
+ __bch2_dump_trans_paths_updates(trans, false);
+}
+
+noinline __cold
+static void bch2_trans_update_max_paths(struct btree_trans *trans)
+{
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
+ struct printbuf buf = PRINTBUF;
+ size_t nr = bitmap_weight(trans->paths_allocated, trans->nr_paths);
+
+ bch2_trans_paths_to_text(&buf, trans);
+
+ if (!buf.allocation_failure) {
+ mutex_lock(&s->lock);
+ if (nr > s->nr_max_paths) {
+ s->nr_max_paths = nr;
+ swap(s->max_paths_text, buf.buf);
+ }
+ mutex_unlock(&s->lock);
+ }
+
+ printbuf_exit(&buf);
+
+ trans->nr_paths_max = nr;
+}
+
+noinline __cold
+int __bch2_btree_trans_too_many_iters(struct btree_trans *trans)
+{
+ if (trace_trans_restart_too_many_iters_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_trans_paths_to_text(&buf, trans);
+ trace_trans_restart_too_many_iters(trans, _THIS_IP_, buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ count_event(trans->c, trans_restart_too_many_iters);
+
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
+}
+
+static noinline void btree_path_overflow(struct btree_trans *trans)
+{
+ bch2_dump_trans_paths_updates(trans);
+ bch_err(trans->c, "trans path overflow");
+}
+
+static noinline void btree_paths_realloc(struct btree_trans *trans)
+{
+ unsigned nr = trans->nr_paths * 2;
+
+ void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
+ sizeof(struct btree_trans_paths) +
+ nr * sizeof(struct btree_path) +
+ nr * sizeof(btree_path_idx_t) + 8 +
+ nr * sizeof(struct btree_insert_entry), GFP_KERNEL|__GFP_NOFAIL);
+
+ unsigned long *paths_allocated = p;
+ memcpy(paths_allocated, trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long));
+ p += BITS_TO_LONGS(nr) * sizeof(unsigned long);
+
+ p += sizeof(struct btree_trans_paths);
+ struct btree_path *paths = p;
+ *trans_paths_nr(paths) = nr;
+ memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path));
+ p += nr * sizeof(struct btree_path);
+
+ btree_path_idx_t *sorted = p;
+ memcpy(sorted, trans->sorted, trans->nr_sorted * sizeof(btree_path_idx_t));
+ p += nr * sizeof(btree_path_idx_t) + 8;
+
+ struct btree_insert_entry *updates = p;
+ memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_insert_entry));
+
+ unsigned long *old = trans->paths_allocated;
+
+ rcu_assign_pointer(trans->paths_allocated, paths_allocated);
+ rcu_assign_pointer(trans->paths, paths);
+ rcu_assign_pointer(trans->sorted, sorted);
+ rcu_assign_pointer(trans->updates, updates);
+
+ trans->nr_paths = nr;
+
+ if (old != trans->_paths_allocated)
+ kfree_rcu_mightsleep(old);
+}
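+
+/*
+ * Note on the above: the allocation bitmap, the struct btree_trans_paths
+ * header, paths[], sorted[] and updates[] all live in a single kvzalloc()
+ * buffer, so doubling nr_paths costs one allocation, and the old buffer is
+ * freed with kfree_rcu_mightsleep() because concurrent readers may still be
+ * walking the old arrays under RCU (hence the rcu_assign_pointer() calls
+ * above).
+ */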
+
+static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans,
+ btree_path_idx_t pos)
+{
+ btree_path_idx_t idx = find_first_zero_bit(trans->paths_allocated, trans->nr_paths);
+
+ if (unlikely(idx == trans->nr_paths)) {
+ if (trans->nr_paths == BTREE_ITER_MAX) {
+ btree_path_overflow(trans);
+ return 0;
+ }
+
+ btree_paths_realloc(trans);
+ }
+
+ /*
+ * Do this before marking the new path as allocated, since it won't be
+ * initialized yet:
+ */
+ if (unlikely(idx > trans->nr_paths_max))
+ bch2_trans_update_max_paths(trans);
+
+ __set_bit(idx, trans->paths_allocated);
+
+ struct btree_path *path = &trans->paths[idx];
+ path->ref = 0;
+ path->intent_ref = 0;
+ path->nodes_locked = 0;
+
+ btree_path_list_add(trans, pos, idx);
+ trans->paths_sorted = false;
+ return idx;
+}
+
+btree_path_idx_t bch2_path_get(struct btree_trans *trans,
+ enum btree_id btree_id, struct bpos pos,
+ unsigned locks_want, unsigned level,
+ unsigned flags, unsigned long ip)
+{
+ struct btree_path *path;
+ bool cached = flags & BTREE_ITER_cached;
+ bool intent = flags & BTREE_ITER_intent;
+ struct trans_for_each_path_inorder_iter iter;
+ btree_path_idx_t path_pos = 0, path_idx;
+
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+ bch2_trans_verify_locks(trans);
+
+ btree_trans_sort_paths(trans);
+
+ trans_for_each_path_inorder(trans, path, iter) {
+ if (__btree_path_cmp(path,
+ btree_id,
+ cached,
+ pos,
+ level) > 0)
+ break;
+
+ path_pos = iter.path_idx;
+ }
+
+ if (path_pos &&
+ trans->paths[path_pos].cached == cached &&
+ trans->paths[path_pos].btree_id == btree_id &&
+ trans->paths[path_pos].level == level) {
+ trace_btree_path_get(trans, trans->paths + path_pos, &pos);
+
+ __btree_path_get(trans, trans->paths + path_pos, intent);
+ path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
+ path = trans->paths + path_idx;
+ } else {
+ path_idx = btree_path_alloc(trans, path_pos);
+ path = trans->paths + path_idx;
+
+ __btree_path_get(trans, path, intent);
+ path->pos = pos;
+ path->btree_id = btree_id;
+ path->cached = cached;
+ path->uptodate = BTREE_ITER_NEED_TRAVERSE;
+ path->should_be_locked = false;
+ path->level = level;
+ path->locks_want = locks_want;
+ path->nodes_locked = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(path->l); i++)
+ path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+#ifdef TRACK_PATH_ALLOCATED
+ path->ip_allocated = ip;
+#endif
+ trans->paths_sorted = false;
+
+ trace_btree_path_alloc(trans, path);
+ }
+
+ if (!(flags & BTREE_ITER_nopreserve))
+ path->preserve = true;
+
+ if (path->intent_ref)
+ locks_want = max(locks_want, level + 1);
+
+ /*
+ * If the path has locks_want greater than requested, we don't downgrade
+ * it here - on transaction restart because btree node split needs to
+ * upgrade locks, we might be putting/getting the iterator again.
+ * Downgrading iterators only happens via bch2_trans_downgrade(), after
+ * a successful transaction commit.
+ */
+
+ locks_want = min(locks_want, BTREE_MAX_DEPTH);
+ if (locks_want > path->locks_want)
+ bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);
+
+ return path_idx;
+}
+
+btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *trans,
+ enum btree_id btree_id,
+ unsigned level,
+ struct bpos pos)
+{
+ btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level,
+ BTREE_ITER_nopreserve|
+ BTREE_ITER_intent, _RET_IP_);
+ path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_);
+
+ struct btree_path *path = trans->paths + path_idx;
+ bch2_btree_path_downgrade(trans, path);
+ __bch2_btree_path_unlock(trans, path);
+ return path_idx;
+}
+
+struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
+{
+ struct btree_path_level *l = path_l(path);
+ struct bkey_packed *_k;
+ struct bkey_s_c k;
+
+ if (unlikely(!l->b))
+ return bkey_s_c_null;
+
+ EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
+ EBUG_ON(!btree_node_locked(path, path->level));
+
+ if (!path->cached) {
+ _k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
+ k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
+
+ EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos));
+
+ if (!k.k || !bpos_eq(path->pos, k.k->p))
+ goto hole;
+ } else {
+ struct bkey_cached *ck = (void *) path->l[0].b;
+ if (!ck)
+ return bkey_s_c_null;
+
+ EBUG_ON(path->btree_id != ck->key.btree_id ||
+ !bkey_eq(path->pos, ck->key.pos));
+
+ *u = ck->k->k;
+ k = bkey_i_to_s_c(ck->k);
+ }
+
+ return k;
+hole:
+ bkey_init(u);
+ u->p = path->pos;
+ return (struct bkey_s_c) { u, NULL };
+}
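+
+/*
+ * Note the hole case above: when nothing exists at path->pos, a zero-size
+ * deleted key at path->pos is synthesized in *u and returned with a NULL
+ * value, so callers always get back a usable struct bkey describing the slot.
+ */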
+
+void bch2_set_btree_iter_dontneed(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+
+ if (!iter->path || trans->restarted)
+ return;
+
+ struct btree_path *path = btree_iter_path(trans, iter);
+ path->preserve = false;
+ if (path->ref == 1)
+ path->should_be_locked = false;
+}
+/* Btree iterators: */
+
+int __must_check
+__bch2_btree_iter_traverse(struct btree_iter *iter)
+{
+ return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
+}
+
+int __must_check
+bch2_btree_iter_traverse(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+ int ret;
+
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+
+ iter->path = bch2_btree_path_set_pos(trans, iter->path,
+ btree_iter_search_key(iter),
+ iter->flags & BTREE_ITER_intent,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
+ if (ret)
+ return ret;
+
+ struct btree_path *path = btree_iter_path(trans, iter);
+ if (btree_path_node(path, path->level))
+ btree_path_set_should_be_locked(trans, path);
+ return 0;
}
/* Iterate across nodes (leaf and interior nodes) */
struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
{
- struct btree *b;
+ struct btree_trans *trans = iter->trans;
+ struct btree *b = NULL;
int ret;
- bch2_btree_iter_checks(iter, BTREE_ITER_NODES);
-
- if (iter->uptodate == BTREE_ITER_UPTODATE)
- return iter->l[iter->level].b;
+ EBUG_ON(trans->paths[iter->path].cached);
+ bch2_btree_iter_verify(iter);
- ret = bch2_btree_iter_traverse(iter);
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
- return NULL;
+ goto err;
- b = btree_iter_node(iter, iter->level);
+ struct btree_path *path = btree_iter_path(trans, iter);
+ b = btree_path_node(path, path->level);
if (!b)
- return NULL;
+ goto out;
- BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
+ BUG_ON(bpos_lt(b->key.k.p, iter->pos));
- iter->pos = b->key.k.p;
- iter->uptodate = BTREE_ITER_UPTODATE;
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos = b->key.k.p;
+
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
+ iter->flags & BTREE_ITER_intent,
+ btree_iter_ip_allocated(iter));
+ btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
+out:
+ bch2_btree_iter_verify_entry_exit(iter);
+ bch2_btree_iter_verify(iter);
return b;
+err:
+ b = ERR_PTR(ret);
+ goto out;
}
-struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
+/* Only kept for -tools */
+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
{
struct btree *b;
- int ret;
- bch2_btree_iter_checks(iter, BTREE_ITER_NODES);
+ while (b = bch2_btree_iter_peek_node(iter),
+ bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
+ bch2_trans_begin(iter->trans);
- /* already got to end? */
- if (!btree_iter_node(iter, iter->level))
- return NULL;
+ return b;
+}
- btree_iter_up(iter);
+struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+ struct btree *b = NULL;
+ int ret;
- if (!bch2_btree_node_relock(iter, iter->level))
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
+ EBUG_ON(trans->paths[iter->path].cached);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+ bch2_btree_iter_verify(iter);
- ret = bch2_btree_iter_traverse(iter);
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
+ goto err;
+
+ struct btree_path *path = btree_iter_path(trans, iter);
+
+ /* already at end? */
+ if (!btree_path_node(path, path->level))
return NULL;
/* got to end? */
- b = btree_iter_node(iter, iter->level);
- if (!b)
+ if (!btree_path_node(path, path->level + 1)) {
+ btree_path_set_level_up(trans, path);
return NULL;
+ }
+
+ if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
+ __bch2_btree_path_unlock(trans, path);
+ path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
+ path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+ goto err;
+ }
+
+ b = btree_path_node(path, path->level + 1);
+
+ if (bpos_eq(iter->pos, b->key.k.p)) {
+ __btree_path_set_level_up(trans, path, path->level++);
+ } else {
+ if (btree_lock_want(path, path->level + 1) == BTREE_NODE_UNLOCKED)
+ btree_node_unlock(trans, path, path->level + 1);
- if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
/*
* Haven't gotten to the end of the parent node: go back down to
* the next child node
*/
+ iter->path = bch2_btree_path_set_pos(trans, iter->path,
+ bpos_successor(iter->pos),
+ iter->flags & BTREE_ITER_intent,
+ btree_iter_ip_allocated(iter));
- /*
- * We don't really want to be unlocking here except we can't
- * directly tell btree_iter_traverse() "traverse to this level"
- * except by setting iter->level, so we have to unlock so we
- * don't screw up our lock invariants:
- */
- if (btree_node_read_locked(iter, iter->level))
- btree_node_unlock(iter, iter->level);
-
- /* ick: */
- iter->pos = iter->btree_id == BTREE_ID_INODES
- ? btree_type_successor(iter->btree_id, iter->pos)
- : bkey_successor(iter->pos);
- iter->level = depth;
+ path = btree_iter_path(trans, iter);
+ btree_path_set_level_down(trans, path, iter->min_depth);
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
- ret = bch2_btree_iter_traverse(iter);
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
- return NULL;
+ goto err;
- b = iter->l[iter->level].b;
+ path = btree_iter_path(trans, iter);
+ b = path->l[path->level].b;
}
- iter->pos = b->key.k.p;
- iter->uptodate = BTREE_ITER_UPTODATE;
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos = b->key.k.p;
+
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
+ iter->flags & BTREE_ITER_intent,
+ btree_iter_ip_allocated(iter));
+ btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
+ EBUG_ON(btree_iter_path(trans, iter)->uptodate);
+out:
+ bch2_btree_iter_verify_entry_exit(iter);
+ bch2_btree_iter_verify(iter);
return b;
+err:
+ b = ERR_PTR(ret);
+ goto out;
}
/* Iterate across keys (in leaf nodes only) */
-void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos)
+inline bool bch2_btree_iter_advance(struct btree_iter *iter)
{
- struct btree_iter_level *l = &iter->l[0];
- struct bkey_packed *k;
+ struct bpos pos = iter->k.p;
+ bool ret = !(iter->flags & BTREE_ITER_all_snapshots
+ ? bpos_eq(pos, SPOS_MAX)
+ : bkey_eq(pos, SPOS_MAX));
- EBUG_ON(iter->level != 0);
- EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
- EBUG_ON(!btree_node_locked(iter, 0));
- EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0);
+ if (ret && !(iter->flags & BTREE_ITER_is_extents))
+ pos = bkey_successor(iter, pos);
+ bch2_btree_iter_set_pos(iter, pos);
+ return ret;
+}
- iter->pos = new_pos;
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
+{
+ struct bpos pos = bkey_start_pos(&iter->k);
+ bool ret = !(iter->flags & BTREE_ITER_all_snapshots
+ ? bpos_eq(pos, POS_MIN)
+ : bkey_eq(pos, POS_MIN));
- while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
- !btree_iter_pos_cmp_packed(l->b, &iter->pos, k,
- iter->flags & BTREE_ITER_IS_EXTENTS))
- __btree_iter_advance(l);
+ if (ret && !(iter->flags & BTREE_ITER_is_extents))
+ pos = bkey_predecessor(iter, pos);
+ bch2_btree_iter_set_pos(iter, pos);
+ return ret;
+}
- if (!k && btree_iter_pos_after_node(iter, l->b)) {
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
- iter->flags |= BTREE_ITER_AT_END_OF_LEAF;
- }
+static noinline
+void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c *k)
+{
+ struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key;
+
+ trans_for_each_update(trans, i)
+ if (!i->key_cache_already_flushed &&
+ i->btree_id == iter->btree_id &&
+ bpos_le(i->k->k.p, iter->pos) &&
+ bpos_ge(i->k->k.p, k->k ? k->k->p : end)) {
+ iter->k = i->k->k;
+ *k = bkey_i_to_s_c(i->k);
+ }
}
-void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
-{
- int cmp = bkey_cmp(new_pos, iter->pos);
- unsigned level;
+static noinline
+void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c *k)
+{
+ struct btree_path *path = btree_iter_path(trans, iter);
+ struct bpos end = path_l(path)->b->key.k.p;
+
+ trans_for_each_update(trans, i)
+ if (!i->key_cache_already_flushed &&
+ i->btree_id == iter->btree_id &&
+ bpos_ge(i->k->k.p, path->pos) &&
+ bpos_le(i->k->k.p, k->k ? k->k->p : end)) {
+ iter->k = i->k->k;
+ *k = bkey_i_to_s_c(i->k);
+ }
+}
- if (!cmp)
- return;
+static noinline
+void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c *k)
+{
+ trans_for_each_update(trans, i)
+ if (!i->key_cache_already_flushed &&
+ i->btree_id == iter->btree_id &&
+ bpos_eq(i->k->k.p, iter->pos)) {
+ iter->k = i->k->k;
+ *k = bkey_i_to_s_c(i->k);
+ }
+}
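+
+/*
+ * The three helpers above overlay this transaction's uncommitted updates on
+ * top of whatever the btree itself returns; illustrative example:
+ *
+ *	btree:			A@1		B@3
+ *	pending updates:	X@2		B'@3
+ *
+ * A forward peek from pos 2 sees X@2 even though it isn't in the btree yet,
+ * and a peek that lands on B@3 returns the pending B'@3 instead.
+ */
+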
- iter->pos = new_pos;
+static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos end_pos)
+{
+ struct btree_path *path = btree_iter_path(trans, iter);
- level = btree_iter_up_until_locked(iter, true);
+ return bch2_journal_keys_peek_max(trans->c, iter->btree_id,
+ path->level,
+ path->pos,
+ end_pos,
+ &iter->journal_idx);
+}
- if (btree_iter_node(iter, level)) {
- unsigned nr_advanced = 0;
- struct btree_iter_level *l = &iter->l[level];
- struct bkey_s_c k;
- struct bkey u;
+static noinline
+struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
+ struct btree_iter *iter)
+{
+ struct btree_path *path = btree_iter_path(trans, iter);
+ struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos);
- /*
- * We might have to skip over many keys, or just a few: try
- * advancing the node iterator, and if we have to skip over too
- * many keys just reinit it (or if we're rewinding, since that
- * is expensive).
- */
- if (cmp > 0) {
- while ((k = __btree_iter_peek_all(iter, l, &u)).k &&
- !btree_iter_pos_cmp(iter, k.k)) {
- if (nr_advanced > 8)
- goto reinit_node;
-
- __btree_iter_advance(l);
- nr_advanced++;
- }
- } else {
-reinit_node:
- __btree_iter_init(iter, iter->l[level].b);
- }
+ if (k) {
+ iter->k = k->k;
+ return bkey_i_to_s_c(k);
+ } else {
+ return bkey_s_c_null;
+ }
+}
- /* Don't leave it locked if we're not supposed to: */
- if (btree_lock_want(iter, level) == BTREE_NODE_UNLOCKED)
- btree_node_unlock(iter, level);
+static noinline
+struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct btree_path *path = btree_iter_path(trans, iter);
+ struct bkey_i *next_journal =
+ bch2_btree_journal_peek(trans, iter,
+ k.k ? k.k->p : path_l(path)->b->key.k.p);
+
+ if (next_journal) {
+ iter->k = next_journal->k;
+ k = bkey_i_to_s_c(next_journal);
}
- if (level != iter->level)
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
- else
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+ return k;
}
-static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
+static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos end_pos)
{
- struct btree_iter_level *l = &iter->l[0];
- struct bkey_s_c ret = { .k = &iter->k };
+ struct btree_path *path = btree_iter_path(trans, iter);
- if (!bkey_deleted(&iter->k)) {
- EBUG_ON(bch2_btree_node_iter_end(&l->iter));
- ret.v = bkeyp_val(&l->b->format,
- __bch2_btree_node_iter_peek_all(&l->iter, l->b));
+ return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id,
+ path->level,
+ path->pos,
+ end_pos,
+ &iter->journal_idx);
+}
+
+static noinline
+struct bkey_s_c btree_trans_peek_prev_journal(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct btree_path *path = btree_iter_path(trans, iter);
+ struct bkey_i *next_journal =
+ bch2_btree_journal_peek_prev(trans, iter,
+ k.k ? k.k->p : path_l(path)->b->key.k.p);
+
+ if (next_journal) {
+ iter->k = next_journal->k;
+ k = bkey_i_to_s_c(next_journal);
}
- if (debug_check_bkeys(iter->c) &&
- !bkey_deleted(ret.k))
- bch2_bkey_debugcheck(iter->c, l->b, ret);
- return ret;
+ return k;
}
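+
+/*
+ * Similarly, when BTREE_ITER_with_journal is set (journal replay hasn't
+ * finished yet), keys that so far exist only in the journal are overlaid on
+ * top of the btree; the search is bounded by the current candidate key or the
+ * end of the leaf so the journal scan stays cheap.
+ */
+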
-struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+/*
+ * Checks btree key cache for key at iter->pos and returns it if present, or
+ * bkey_s_c_null:
+ */
+static noinline
+struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
{
- struct btree_iter_level *l = &iter->l[0];
+ struct btree_trans *trans = iter->trans;
+ struct bch_fs *c = trans->c;
+ struct bkey u;
struct bkey_s_c k;
int ret;
- bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
- if (iter->uptodate == BTREE_ITER_UPTODATE)
- return btree_iter_peek_uptodate(iter);
-
- while (1) {
- ret = bch2_btree_iter_traverse(iter);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
+ if ((iter->flags & BTREE_ITER_key_cache_fill) &&
+ bpos_eq(iter->pos, pos))
+ return bkey_s_c_null;
- k = __btree_iter_peek(iter, l);
- if (likely(k.k))
- break;
+ if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
+ return bkey_s_c_null;
- /* got to the end of the leaf, iterator needs to be traversed: */
- iter->pos = l->b->key.k.p;
- iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
+ if (!iter->key_cache_path)
+ iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
+ iter->flags & BTREE_ITER_intent, 0,
+ iter->flags|BTREE_ITER_cached|
+ BTREE_ITER_cached_nofill,
+ _THIS_IP_);
- if (!bkey_cmp(iter->pos, POS_MAX))
- return bkey_s_c_null;
+ iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
+ iter->flags & BTREE_ITER_intent,
+ btree_iter_ip_allocated(iter));
- iter->pos = btree_type_successor(iter->btree_id, iter->pos);
- }
+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
+ iter->flags|BTREE_ITER_cached) ?:
+ bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_);
+ if (unlikely(ret))
+ return bkey_s_c_err(ret);
- /*
- * iter->pos should always be equal to the key we just
- * returned - except extents can straddle iter->pos:
- */
- if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
- bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
- iter->pos = bkey_start_pos(k.k);
+ btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
- iter->uptodate = BTREE_ITER_UPTODATE;
+ k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
+ if (k.k && !bkey_err(k)) {
+ iter->k = u;
+ k.k = &iter->k;
+ }
return k;
}
-static noinline
-struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter)
+static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
{
- struct btree_iter_level *l = &iter->l[0];
+ struct btree_trans *trans = iter->trans;
+ struct bkey_s_c k, k2;
+ int ret;
- iter->pos = l->b->key.k.p;
- iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
+ EBUG_ON(btree_iter_path(trans, iter)->cached);
+ bch2_btree_iter_verify(iter);
- if (!bkey_cmp(iter->pos, POS_MAX))
- return bkey_s_c_null;
+ while (1) {
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
+ iter->flags & BTREE_ITER_intent,
+ btree_iter_ip_allocated(iter));
- iter->pos = btree_type_successor(iter->btree_id, iter->pos);
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ if (unlikely(ret)) {
+ /* ensure that iter->k is consistent with iter->pos: */
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ k = bkey_s_c_err(ret);
+ goto out;
+ }
- return bch2_btree_iter_peek(iter);
-}
+ struct btree_path *path = btree_iter_path(trans, iter);
+ struct btree_path_level *l = path_l(path);
-struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
-{
- struct btree_iter_level *l = &iter->l[0];
- struct bkey_packed *p;
- struct bkey_s_c k;
+ if (unlikely(!l->b)) {
+ /* No btree nodes at requested level: */
+ bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ k = bkey_s_c_null;
+ goto out;
+ }
- bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
+ btree_path_set_should_be_locked(trans, path);
- if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
- k = bch2_btree_iter_peek(iter);
- if (IS_ERR_OR_NULL(k.k))
- return k;
- }
+ k = btree_path_level_peek_all(trans->c, l, &iter->k);
- do {
- __btree_iter_advance(l);
- p = bch2_btree_node_iter_peek_all(&l->iter, l->b);
- if (unlikely(!p))
- return bch2_btree_iter_peek_next_leaf(iter);
- } while (bkey_whiteout(p));
+ if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
+ k.k &&
+ (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
+ k = k2;
+ ret = bkey_err(k);
+ if (ret) {
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ goto out;
+ }
+ }
- k = __btree_iter_unpack(iter, l, &iter->k, p);
+ if (unlikely(iter->flags & BTREE_ITER_with_journal))
+ k = btree_trans_peek_journal(trans, iter, k);
+
+ if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
+ trans->nr_updates))
+ bch2_btree_trans_peek_updates(trans, iter, &k);
+
+ if (k.k && bkey_deleted(k.k)) {
+ /*
+ * If we've got a whiteout, and it's after the search
+ * key, advance the search key to the whiteout instead
+ * of just after the whiteout - it might be a btree
+ * whiteout, with a real key at the same position, since
+ * in the btree deleted keys sort before non deleted.
+ */
+ search_key = !bpos_eq(search_key, k.k->p)
+ ? k.k->p
+ : bpos_successor(k.k->p);
+ continue;
+ }
+
+ if (likely(k.k)) {
+ break;
+ } else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) {
+ /* Advance to next leaf node: */
+ search_key = bpos_successor(l->b->key.k.p);
+ } else {
+ /* End of btree: */
+ bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ k = bkey_s_c_null;
+ goto out;
+ }
+ }
+out:
+ bch2_btree_iter_verify(iter);
- EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) < 0);
- iter->pos = bkey_start_pos(k.k);
return k;
}
-struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
+/**
+ * bch2_btree_iter_peek_max() - returns first key greater than or equal to
+ * iterator's current position
+ * @iter: iterator to peek from
+ * @end: search limit: returns keys less than or equal to @end
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
+ */
+struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos end)
{
- struct btree_iter_level *l = &iter->l[0];
- struct bkey_packed *p;
+ struct btree_trans *trans = iter->trans;
+ struct bpos search_key = btree_iter_search_key(iter);
struct bkey_s_c k;
+ struct bpos iter_pos = iter->pos;
int ret;
- bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+ bch2_btree_iter_verify_entry_exit(iter);
+ EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
- if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
- k = bch2_btree_iter_peek(iter);
- if (IS_ERR(k.k))
- return k;
+ if (iter->update_path) {
+ bch2_path_put_nokeep(trans, iter->update_path,
+ iter->flags & BTREE_ITER_intent);
+ iter->update_path = 0;
}
while (1) {
- p = bch2_btree_node_iter_prev(&l->iter, l->b);
- if (likely(p))
- break;
+ k = __bch2_btree_iter_peek(iter, search_key);
+ if (unlikely(!k.k))
+ goto end;
+ if (unlikely(bkey_err(k)))
+ goto out_no_locked;
- iter->pos = l->b->data->min_key;
- if (!bkey_cmp(iter->pos, POS_MIN))
- return bkey_s_c_null;
+ if (iter->flags & BTREE_ITER_filter_snapshots) {
+ /*
+ * We need to check against @end before FILTER_SNAPSHOTS because
+ * if we get to a different inode than requested we might be
+ * seeing keys for a different snapshot tree that will all be
+ * filtered out.
+ *
+ * But we can't do the full check here, because bkey_start_pos()
+ * isn't monotonically increasing before FILTER_SNAPSHOTS, and
+ * that's what we check against in extents mode:
+ */
+ if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
+ ? bkey_gt(k.k->p, end)
+ : k.k->p.inode > end.inode))
+ goto end;
+
+ if (iter->update_path &&
+ !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
+ bch2_path_put_nokeep(trans, iter->update_path,
+ iter->flags & BTREE_ITER_intent);
+ iter->update_path = 0;
+ }
- bch2_btree_iter_set_pos(iter,
- btree_type_predecessor(iter->btree_id, iter->pos));
+ if ((iter->flags & BTREE_ITER_intent) &&
+ !(iter->flags & BTREE_ITER_is_extents) &&
+ !iter->update_path) {
+ struct bpos pos = k.k->p;
- ret = bch2_btree_iter_traverse(iter);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
+ if (pos.snapshot < iter->snapshot) {
+ search_key = bpos_successor(k.k->p);
+ continue;
+ }
- p = bch2_btree_node_iter_peek(&l->iter, l->b);
- if (p)
- break;
+ pos.snapshot = iter->snapshot;
+
+ /*
+ * advance, same as on exit for iter->path, but only up
+ * to snapshot
+ */
+ __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
+ iter->update_path = iter->path;
+
+ iter->update_path = bch2_btree_path_set_pos(trans,
+ iter->update_path, pos,
+ iter->flags & BTREE_ITER_intent,
+ _THIS_IP_);
+ ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
+ }
+
+ /*
+ * We can never have a key in a leaf node at POS_MAX, so
+ * we don't have to check these successor() calls:
+ */
+ if (!bch2_snapshot_is_ancestor(trans->c,
+ iter->snapshot,
+ k.k->p.snapshot)) {
+ search_key = bpos_successor(k.k->p);
+ continue;
+ }
+
+ if (bkey_whiteout(k.k)) {
+ search_key = bkey_successor(iter, k.k->p);
+ continue;
+ }
+ }
+
+ /*
+ * iter->pos should be monotonically increasing, and always be
+ * equal to the key we just returned - except extents can
+ * straddle iter->pos:
+ */
+ if (!(iter->flags & BTREE_ITER_is_extents))
+ iter_pos = k.k->p;
+ else
+ iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
+
+ if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(iter_pos, end) :
+ iter->flags & BTREE_ITER_is_extents ? bkey_ge(iter_pos, end) :
+ bkey_gt(iter_pos, end)))
+ goto end;
+
+ break;
+ }
+
+ iter->pos = iter_pos;
+
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
+ iter->flags & BTREE_ITER_intent,
+ btree_iter_ip_allocated(iter));
+
+ btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
+out_no_locked:
+ if (iter->update_path) {
+ ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_);
+ if (unlikely(ret))
+ k = bkey_s_c_err(ret);
+ else
+ btree_path_set_should_be_locked(trans, trans->paths + iter->update_path);
}
- k = __btree_iter_unpack(iter, l, &iter->k, p);
+ if (!(iter->flags & BTREE_ITER_all_snapshots))
+ iter->pos.snapshot = iter->snapshot;
+
+ ret = bch2_btree_iter_verify_ret(iter, k);
+ if (unlikely(ret)) {
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ k = bkey_s_c_err(ret);
+ }
- EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);
+ bch2_btree_iter_verify_entry_exit(iter);
- iter->pos = bkey_start_pos(k.k);
- iter->uptodate = BTREE_ITER_UPTODATE;
return k;
+end:
+ bch2_btree_iter_set_pos(iter, end);
+ k = bkey_s_c_null;
+ goto out_no_locked;
}
-static inline struct bkey_s_c
-__bch2_btree_iter_peek_slot(struct btree_iter *iter)
+/**
+ * bch2_btree_iter_next() - returns first key greater than iterator's current
+ * position
+ * @iter: iterator to peek from
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
+ */
+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
{
- struct btree_iter_level *l = &iter->l[0];
- struct bkey_s_c k;
- struct bkey n;
- int ret;
+ if (!bch2_btree_iter_advance(iter))
+ return bkey_s_c_null;
-recheck:
- while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
- bkey_deleted(k.k) &&
- bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0)
- __btree_iter_advance(l);
+ return bch2_btree_iter_peek(iter);
+}
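+
+/*
+ * Minimal usage sketch of the peek/advance pattern (iterator setup, error
+ * handling and transaction restarts omitted; process_key() is a placeholder):
+ *
+ *	struct bkey_s_c k;
+ *
+ *	for (k = bch2_btree_iter_peek(&iter);
+ *	     k.k && !bkey_err(k);
+ *	     k = bch2_btree_iter_next(&iter))
+ *		process_key(k);
+ */
+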
- /*
- * If we got to the end of the node, check if we need to traverse to the
- * next node:
- */
- if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
- ret = bch2_btree_iter_traverse(iter);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
+static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key)
+{
+ struct btree_trans *trans = iter->trans;
+ struct bkey_s_c k, k2;
- goto recheck;
+ bch2_btree_iter_verify(iter);
+
+ while (1) {
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
+ iter->flags & BTREE_ITER_intent,
+ btree_iter_ip_allocated(iter));
+
+ int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ if (unlikely(ret)) {
+ /* ensure that iter->k is consistent with iter->pos: */
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ k = bkey_s_c_err(ret);
+ break;
+ }
+
+ struct btree_path *path = btree_iter_path(trans, iter);
+ struct btree_path_level *l = path_l(path);
+
+ if (unlikely(!l->b)) {
+ /* No btree nodes at requested level: */
+ bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ k = bkey_s_c_null;
+ break;
+ }
+
+ btree_path_set_should_be_locked(trans, path);
+
+ k = btree_path_level_peek_all(trans->c, l, &iter->k);
+ if (!k.k || bpos_gt(k.k->p, search_key)) {
+ k = btree_path_level_prev(trans, path, l, &iter->k);
+
+ BUG_ON(k.k && bpos_gt(k.k->p, search_key));
+ }
+
+ if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
+ k.k &&
+ (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
+ k = k2;
+ if (bkey_err(k2)) {
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ break;
+ }
+ }
+
+ if (unlikely(iter->flags & BTREE_ITER_with_journal))
+ k = btree_trans_peek_prev_journal(trans, iter, k);
+
+ if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
+ trans->nr_updates))
+ bch2_btree_trans_peek_prev_updates(trans, iter, &k);
+
+ if (likely(k.k && !bkey_deleted(k.k))) {
+ break;
+ } else if (k.k) {
+ search_key = bpos_predecessor(k.k->p);
+ } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
+ /* Advance to previous leaf node: */
+ search_key = bpos_predecessor(path->l[0].b->data->min_key);
+ } else {
+ /* Start of btree: */
+ bch2_btree_iter_set_pos(iter, POS_MIN);
+ k = bkey_s_c_null;
+ break;
+ }
}
- if (k.k &&
- !bkey_whiteout(k.k) &&
- bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
- EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0);
- EBUG_ON(bkey_deleted(k.k));
- iter->uptodate = BTREE_ITER_UPTODATE;
- return k;
+ bch2_btree_iter_verify(iter);
+ return k;
+}
+
+/**
+ * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to
+ * iterator's current position
+ * @iter: iterator to peek from
+ * @end: search limit: returns keys greater than or equal to @end
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
+ */
+struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end)
+{
+ if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) &&
+ !bkey_eq(iter->pos, POS_MAX)) {
+ /*
+ * bkey_start_pos(), for extents, is not monotonically
+ * increasing until after filtering for snapshots:
+ *
+ * Thus, for extents we need to search forward until we find a
+ * real visible extent - easiest to just use peek_slot() (which
+ * internally uses peek() for extents)
+ */
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+ if (bkey_err(k))
+ return k;
+
+ if (!bkey_deleted(k.k) &&
+ (!(iter->flags & BTREE_ITER_is_extents) ||
+ bkey_lt(bkey_start_pos(k.k), iter->pos)))
+ return k;
}
- /* hole */
- bkey_init(&n);
- n.p = iter->pos;
+ struct btree_trans *trans = iter->trans;
+ struct bpos search_key = iter->pos;
+ struct bkey_s_c k;
+ btree_path_idx_t saved_path = 0;
- if (iter->flags & BTREE_ITER_IS_EXTENTS) {
- if (n.p.offset == KEY_OFFSET_MAX) {
- if (n.p.inode == KEY_INODE_MAX)
- return bkey_s_c_null;
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+ bch2_btree_iter_verify_entry_exit(iter);
+ EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN));
- iter->pos = bkey_successor(iter->pos);
- goto recheck;
- }
+ while (1) {
+ k = __bch2_btree_iter_peek_prev(iter, search_key);
+ if (unlikely(!k.k))
+ goto end;
+ if (unlikely(bkey_err(k)))
+ goto out_no_locked;
+
+ if (iter->flags & BTREE_ITER_filter_snapshots) {
+ struct btree_path *s = saved_path ? trans->paths + saved_path : NULL;
+ if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) {
+ /*
+ * If we have a saved candidate, and we're past
+ * the last possible snapshot overwrite, return
+ * it:
+ */
+ bch2_path_put_nokeep(trans, iter->path,
+ iter->flags & BTREE_ITER_intent);
+ iter->path = saved_path;
+ saved_path = 0;
+ k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
+ break;
+ }
+
+ /*
+ * We need to check against @end before FILTER_SNAPSHOTS because
+ * if we get to a different inode than requested we might be
+ * seeing keys for a different snapshot tree that will all be
+ * filtered out.
+ */
+ if (unlikely(bkey_lt(k.k->p, end)))
+ goto end;
+
+ if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) {
+ search_key = bpos_predecessor(k.k->p);
+ continue;
+ }
+
+ if (k.k->p.snapshot != iter->snapshot) {
+ /*
+ * We have a key visible in iter->snapshot, but it
+ * might have overwrites: save it and keep searching.
+ * Unless it's a whiteout - then drop our previous
+ * saved candidate:
+ */
+ if (saved_path) {
+ bch2_path_put_nokeep(trans, saved_path,
+ iter->flags & BTREE_ITER_intent);
+ saved_path = 0;
+ }
- if (k.k && bkey_whiteout(k.k)) {
- struct btree_node_iter node_iter = l->iter;
+ if (!bkey_whiteout(k.k)) {
+ saved_path = btree_path_clone(trans, iter->path,
+ iter->flags & BTREE_ITER_intent,
+ _THIS_IP_);
+ trace_btree_path_save_pos(trans,
+ trans->paths + iter->path,
+ trans->paths + saved_path);
+ }
- k = __btree_iter_unpack(iter, l, &iter->k,
- bch2_btree_node_iter_peek(&node_iter, l->b));
+ search_key = bpos_predecessor(k.k->p);
+ continue;
+ }
+
+ if (bkey_whiteout(k.k)) {
+ search_key = bkey_predecessor(iter, k.k->p);
+ search_key.snapshot = U32_MAX;
+ continue;
+ }
}
- if (!k.k)
- k.k = &l->b->key.k;
+ EBUG_ON(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(k.k->p, iter->pos) :
+ iter->flags & BTREE_ITER_is_extents ? bkey_ge(bkey_start_pos(k.k), iter->pos) :
+ bkey_gt(k.k->p, iter->pos));
- bch2_key_resize(&n,
- min_t(u64, KEY_SIZE_MAX,
- (k.k->p.inode == n.p.inode
- ? bkey_start_offset(k.k)
- : KEY_OFFSET_MAX) -
- n.p.offset));
+ if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_lt(k.k->p, end) :
+ iter->flags & BTREE_ITER_is_extents ? bkey_le(k.k->p, end) :
+ bkey_lt(k.k->p, end)))
+ goto end;
- EBUG_ON(!n.size);
+ break;
}
- iter->k = n;
- iter->uptodate = BTREE_ITER_UPTODATE;
- return (struct bkey_s_c) { &iter->k, NULL };
+ /* Extents can straddle iter->pos: */
+ iter->pos = bpos_min(iter->pos, k.k->p);
+
+ if (iter->flags & BTREE_ITER_filter_snapshots)
+ iter->pos.snapshot = iter->snapshot;
+out_no_locked:
+ if (saved_path)
+ bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent);
+
+ bch2_btree_iter_verify_entry_exit(iter);
+ bch2_btree_iter_verify(iter);
+ return k;
+end:
+ bch2_btree_iter_set_pos(iter, end);
+ k = bkey_s_c_null;
+ goto out_no_locked;
+}
+
+/**
+ * bch2_btree_iter_prev() - returns first key less than iterator's current
+ * position
+ * @iter: iterator to peek from
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
+ */
+struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
+{
+ if (!bch2_btree_iter_rewind(iter))
+ return bkey_s_c_null;
+
+ return bch2_btree_iter_peek_prev(iter);
}
struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
{
+ struct btree_trans *trans = iter->trans;
+ struct bpos search_key;
+ struct bkey_s_c k;
int ret;
- bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+ bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify_entry_exit(iter);
+ EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
+
+ /* extents can't span inode numbers: */
+ if ((iter->flags & BTREE_ITER_is_extents) &&
+ unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
+ if (iter->pos.inode == KEY_INODE_MAX)
+ return bkey_s_c_null;
+
+ bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
+ }
+
+ search_key = btree_iter_search_key(iter);
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
+ iter->flags & BTREE_ITER_intent,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
+
+ if ((iter->flags & BTREE_ITER_cached) ||
+ !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
+ k = bkey_s_c_null;
+
+ if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
+ trans->nr_updates)) {
+ bch2_btree_trans_peek_slot_updates(trans, iter, &k);
+ if (k.k)
+ goto out;
+ }
+
+ if (unlikely(iter->flags & BTREE_ITER_with_journal) &&
+ (k = btree_trans_peek_slot_journal(trans, iter)).k)
+ goto out;
+
+ if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
+ (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
+ if (!bkey_err(k))
+ iter->k = *k.k;
+ /* We're not returning a key from iter->path: */
+ goto out_no_locked;
+ }
+
+ k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k);
+ if (unlikely(!k.k))
+ goto out_no_locked;
+ } else {
+ struct bpos next;
+ struct bpos end = iter->pos;
+
+ if (iter->flags & BTREE_ITER_is_extents)
+ end.offset = U64_MAX;
+
+ EBUG_ON(btree_iter_path(trans, iter)->level);
+
+ if (iter->flags & BTREE_ITER_intent) {
+ struct btree_iter iter2;
+
+ bch2_trans_copy_iter(&iter2, iter);
+ k = bch2_btree_iter_peek_max(&iter2, end);
+
+ if (k.k && !bkey_err(k)) {
+ swap(iter->key_cache_path, iter2.key_cache_path);
+ iter->k = iter2.k;
+ k.k = &iter->k;
+ }
+ bch2_trans_iter_exit(trans, &iter2);
+ } else {
+ struct bpos pos = iter->pos;
+
+ k = bch2_btree_iter_peek_max(iter, end);
+ if (unlikely(bkey_err(k)))
+ bch2_btree_iter_set_pos(iter, pos);
+ else
+ iter->pos = pos;
+ }
- if (iter->uptodate == BTREE_ITER_UPTODATE)
- return btree_iter_peek_uptodate(iter);
+ if (unlikely(bkey_err(k)))
+ goto out_no_locked;
- ret = bch2_btree_iter_traverse(iter);
+ next = k.k ? bkey_start_pos(k.k) : POS_MAX;
+
+ if (bkey_lt(iter->pos, next)) {
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos;
+
+ if (iter->flags & BTREE_ITER_is_extents) {
+ bch2_key_resize(&iter->k,
+ min_t(u64, KEY_SIZE_MAX,
+ (next.inode == iter->pos.inode
+ ? next.offset
+ : KEY_OFFSET_MAX) -
+ iter->pos.offset));
+ EBUG_ON(!iter->k.size);
+ }
+
+ k = (struct bkey_s_c) { &iter->k, NULL };
+ }
+ }
+out:
+ btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
+out_no_locked:
+ bch2_btree_iter_verify_entry_exit(iter);
+ bch2_btree_iter_verify(iter);
+ ret = bch2_btree_iter_verify_ret(iter, k);
if (unlikely(ret))
return bkey_s_c_err(ret);
- return __bch2_btree_iter_peek_slot(iter);
+ return k;
}
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
{
- bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS);
+ if (!bch2_btree_iter_advance(iter))
+ return bkey_s_c_null;
- iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
+ return bch2_btree_iter_peek_slot(iter);
+}
- if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
- /*
- * XXX: when we just need to relock we should be able to avoid
- * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
- * for that to work
- */
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
+{
+ if (!bch2_btree_iter_rewind(iter))
+ return bkey_s_c_null;
- return bch2_btree_iter_peek_slot(iter);
- }
+ return bch2_btree_iter_peek_slot(iter);
+}
- if (!bkey_deleted(&iter->k))
- __btree_iter_advance(&iter->l[0]);
+/* Obsolete, but still used by rust wrapper in -tools */
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
+{
+ struct bkey_s_c k;
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+ while (btree_trans_too_many_iters(iter->trans) ||
+ (k = bch2_btree_iter_peek_type(iter, iter->flags),
+ bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
+ bch2_trans_begin(iter->trans);
- return __bch2_btree_iter_peek_slot(iter);
+ return k;
}
-void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c,
- enum btree_id btree_id, struct bpos pos,
- unsigned locks_want, unsigned depth,
- unsigned flags)
+/* new transactional stuff: */
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
{
+ struct btree_path *path;
unsigned i;
- EBUG_ON(depth >= BTREE_MAX_DEPTH);
- EBUG_ON(locks_want > BTREE_MAX_DEPTH);
+ BUG_ON(trans->nr_sorted != bitmap_weight(trans->paths_allocated, trans->nr_paths) - 1);
- iter->c = c;
- iter->pos = pos;
- bkey_init(&iter->k);
- iter->k.p = pos;
- iter->flags = flags;
- iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
- iter->btree_id = btree_id;
- iter->level = depth;
- iter->locks_want = locks_want;
- iter->nodes_locked = 0;
- iter->nodes_intent_locked = 0;
- for (i = 0; i < ARRAY_SIZE(iter->l); i++)
- iter->l[i].b = NULL;
- iter->l[iter->level].b = BTREE_ITER_NOT_END;
- iter->next = iter;
+ trans_for_each_path(trans, path, i) {
+ BUG_ON(path->sorted_idx >= trans->nr_sorted);
+ BUG_ON(trans->sorted[path->sorted_idx] != i);
+ }
+
+ for (i = 0; i < trans->nr_sorted; i++) {
+ unsigned idx = trans->sorted[i];
- prefetch(c->btree_roots[btree_id].b);
+ BUG_ON(!test_bit(idx, trans->paths_allocated));
+ BUG_ON(trans->paths[idx].sorted_idx != i);
+ }
}
-void bch2_btree_iter_unlink(struct btree_iter *iter)
+static void btree_trans_verify_sorted(struct btree_trans *trans)
{
- struct btree_iter *linked;
-
- __bch2_btree_iter_unlock(iter);
+ struct btree_path *path, *prev = NULL;
+ struct trans_for_each_path_inorder_iter iter;
- if (!btree_iter_linked(iter))
+ if (!bch2_debug_check_iterators)
return;
- for_each_linked_btree_iter(iter, linked)
- if (linked->next == iter) {
- linked->next = iter->next;
- iter->next = iter;
- return;
+ trans_for_each_path_inorder(trans, path, iter) {
+ if (prev && btree_path_cmp(prev, path) > 0) {
+ __bch2_dump_trans_paths_updates(trans, true);
+ panic("trans paths out of order!\n");
}
+ prev = path;
+ }
+}
+#else
+static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {}
+static inline void btree_trans_verify_sorted(struct btree_trans *trans) {}
+#endif
- BUG();
+void __bch2_btree_trans_sort_paths(struct btree_trans *trans)
+{
+ int i, l = 0, r = trans->nr_sorted, inc = 1;
+ bool swapped;
+
+ btree_trans_verify_sorted_refs(trans);
+
+ if (trans->paths_sorted)
+ goto out;
+
+ /*
+ * Cocktail shaker sort: this is efficient because iterators will be
+ * mostly sorted.
+ */
+ do {
+ swapped = false;
+
+ for (i = inc > 0 ? l : r - 2;
+ i + 1 < r && i >= l;
+ i += inc) {
+ if (btree_path_cmp(trans->paths + trans->sorted[i],
+ trans->paths + trans->sorted[i + 1]) > 0) {
+ swap(trans->sorted[i], trans->sorted[i + 1]);
+ trans->paths[trans->sorted[i]].sorted_idx = i;
+ trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1;
+ swapped = true;
+ }
+ }
+
+ if (inc > 0)
+ --r;
+ else
+ l++;
+ inc = -inc;
+ } while (swapped);
+
+ trans->paths_sorted = true;
+out:
+ btree_trans_verify_sorted(trans);
}
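+
+/*
+ * The same cocktail shaker sort in isolation, on a plain int array (minimal
+ * illustrative sketch): bubble passes alternate direction, so an array that
+ * is already mostly sorted settles after only a pass or two:
+ *
+ *	static void shaker_sort(int *a, int n)
+ *	{
+ *		int l = 0, r = n, inc = 1;
+ *		bool swapped;
+ *
+ *		do {
+ *			swapped = false;
+ *
+ *			for (int i = inc > 0 ? l : r - 2;
+ *			     i + 1 < r && i >= l;
+ *			     i += inc)
+ *				if (a[i] > a[i + 1]) {
+ *					int t = a[i];
+ *					a[i] = a[i + 1];
+ *					a[i + 1] = t;
+ *					swapped = true;
+ *				}
+ *
+ *			if (inc > 0)
+ *				--r;
+ *			else
+ *				l++;
+ *			inc = -inc;
+ *		} while (swapped);
+ *	}
+ */
+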
-void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
+static inline void btree_path_list_remove(struct btree_trans *trans,
+ struct btree_path *path)
{
- BUG_ON(btree_iter_linked(new));
+ EBUG_ON(path->sorted_idx >= trans->nr_sorted);
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ trans->nr_sorted--;
+ memmove_u64s_down_small(trans->sorted + path->sorted_idx,
+ trans->sorted + path->sorted_idx + 1,
+ DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
+ sizeof(u64) / sizeof(btree_path_idx_t)));
+#else
+ array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
+#endif
+ for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
+ trans->paths[trans->sorted[i]].sorted_idx = i;
+}
- new->next = iter->next;
- iter->next = new;
+static inline void btree_path_list_add(struct btree_trans *trans,
+ btree_path_idx_t pos,
+ btree_path_idx_t path_idx)
+{
+ struct btree_path *path = trans->paths + path_idx;
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
- unsigned nr_iters = 0;
+ path->sorted_idx = pos ? trans->paths[pos].sorted_idx + 1 : trans->nr_sorted;
+
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,
+ trans->sorted + path->sorted_idx,
+ DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
+ sizeof(u64) / sizeof(btree_path_idx_t)));
+ trans->nr_sorted++;
+ trans->sorted[path->sorted_idx] = path_idx;
+#else
+ array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path_idx);
+#endif
- for_each_btree_iter(new, iter)
- if (iter->btree_id == new->btree_id)
- nr_iters++;
+ for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
+ trans->paths[trans->sorted[i]].sorted_idx = i;
- BUG_ON(nr_iters > SIX_LOCK_MAX_RECURSE);
- }
+ btree_trans_verify_sorted_refs(trans);
}
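+
+/*
+ * Note on the memmove_u64s_*_small() calls above: sorted[] holds
+ * btree_path_idx_t entries smaller than a u64, but those helpers copy whole
+ * u64 words, so the element count is converted with DIV_ROUND_UP(). The copy
+ * may therefore run a few bytes past the last live entry, which is why
+ * btree_paths_realloc() sizes sorted[] with 8 bytes of slack.
+ */
+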
-void bch2_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
+void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
{
- unsigned i;
+ if (iter->update_path)
+ bch2_path_put_nokeep(trans, iter->update_path,
+ iter->flags & BTREE_ITER_intent);
+ if (iter->path)
+ bch2_path_put(trans, iter->path,
+ iter->flags & BTREE_ITER_intent);
+ if (iter->key_cache_path)
+ bch2_path_put(trans, iter->key_cache_path,
+ iter->flags & BTREE_ITER_intent);
+ iter->path = 0;
+ iter->update_path = 0;
+ iter->key_cache_path = 0;
+ iter->trans = NULL;
+}
+
+void bch2_trans_iter_init_outlined(struct btree_trans *trans,
+ struct btree_iter *iter,
+ enum btree_id btree_id, struct bpos pos,
+ unsigned flags)
+{
+ bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
+ bch2_btree_iter_flags(trans, btree_id, flags),
+ _RET_IP_);
+}
+
+void bch2_trans_node_iter_init(struct btree_trans *trans,
+ struct btree_iter *iter,
+ enum btree_id btree_id,
+ struct bpos pos,
+ unsigned locks_want,
+ unsigned depth,
+ unsigned flags)
+{
+ flags |= BTREE_ITER_not_extents;
+ flags |= BTREE_ITER_snapshot_field;
+ flags |= BTREE_ITER_all_snapshots;
+
+ bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
+ __bch2_btree_iter_flags(trans, btree_id, flags),
+ _RET_IP_);
- __bch2_btree_iter_unlock(dst);
- memcpy(dst, src, offsetof(struct btree_iter, next));
+ iter->min_depth = depth;
- for (i = 0; i < BTREE_MAX_DEPTH; i++)
- if (btree_node_locked(dst, i))
- six_lock_increment(&dst->l[i].b->lock,
- __btree_lock_want(dst, i));
+ struct btree_path *path = btree_iter_path(trans, iter);
+ BUG_ON(path->locks_want < min(locks_want, BTREE_MAX_DEPTH));
+ BUG_ON(path->level != depth);
+ BUG_ON(iter->min_depth != depth);
}
-/* new transactional stuff: */
+void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
+{
+ struct btree_trans *trans = src->trans;
+
+ *dst = *src;
+#ifdef TRACK_PATH_ALLOCATED
+ dst->ip_allocated = _RET_IP_;
+#endif
+ if (src->path)
+ __btree_path_get(trans, trans->paths + src->path, src->flags & BTREE_ITER_intent);
+ if (src->update_path)
+ __btree_path_get(trans, trans->paths + src->update_path, src->flags & BTREE_ITER_intent);
+ dst->key_cache_path = 0;
+}
-static void btree_trans_verify(struct btree_trans *trans)
+void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{
- unsigned i;
+ struct bch_fs *c = trans->c;
+ unsigned new_top = trans->mem_top + size;
+ unsigned old_bytes = trans->mem_bytes;
+ unsigned new_bytes = roundup_pow_of_two(new_top);
+ int ret;
+ void *new_mem;
+ void *p;
+
+ WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
+
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
+ s->max_mem = max(s->max_mem, new_bytes);
+
+ if (trans->used_mempool) {
+ if (trans->mem_bytes >= new_bytes)
+ goto out_change_top;
+
+ /* No more space in the mempool item, need to allocate a new buffer */
+ new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
+ if (unlikely(!new_mem)) {
+ bch2_trans_unlock(trans);
+
+ new_mem = kmalloc(new_bytes, GFP_KERNEL);
+ if (!new_mem)
+ return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+
+ ret = bch2_trans_relock(trans);
+ if (ret) {
+ kfree(new_mem);
+ return ERR_PTR(ret);
+ }
+ }
+ memcpy(new_mem, trans->mem, trans->mem_top);
+ trans->used_mempool = false;
+ mempool_free(trans->mem, &c->btree_trans_mem_pool);
+ goto out_new_mem;
+ }
- for (i = 0; i < trans->nr_iters; i++) {
- struct btree_iter *iter = &trans->iters[i];
+ new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
+ if (unlikely(!new_mem)) {
+ bch2_trans_unlock(trans);
+
+ new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
+ if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
+ new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
+ new_bytes = BTREE_TRANS_MEM_MAX;
+ memcpy(new_mem, trans->mem, trans->mem_top);
+ trans->used_mempool = true;
+ kfree(trans->mem);
+ }
- BUG_ON(btree_iter_linked(iter) !=
- ((trans->iters_linked & (1 << i)) &&
- !is_power_of_2(trans->iters_linked)));
+ if (!new_mem)
+ return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+
+ trans->mem = new_mem;
+ trans->mem_bytes = new_bytes;
+
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ return ERR_PTR(ret);
}
+out_new_mem:
+ trans->mem = new_mem;
+ trans->mem_bytes = new_bytes;
+
+ if (old_bytes) {
+ trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
+ }
+out_change_top:
+ p = trans->mem + trans->mem_top;
+ trans->mem_top += size;
+ memset(p, 0, size);
+ return p;
}
-void bch2_trans_iter_free(struct btree_trans *trans,
- struct btree_iter *iter)
+static inline void check_srcu_held_too_long(struct btree_trans *trans)
{
- unsigned idx;
+ WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
+ "btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
+ (jiffies - trans->srcu_lock_time) / HZ);
+}
- for (idx = 0; idx < trans->nr_iters; idx++)
- if (&trans->iters[idx] == iter)
- goto found;
- BUG();
-found:
- BUG_ON(!(trans->iters_linked & (1U << idx)));
+void bch2_trans_srcu_unlock(struct btree_trans *trans)
+{
+ if (trans->srcu_held) {
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
+ unsigned i;
+
+ trans_for_each_path(trans, path, i)
+ if (path->cached && !btree_node_locked(path, 0))
+ path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
- trans->iters_live &= ~(1U << idx);
- trans->iters_linked &= ~(1U << idx);
- bch2_btree_iter_unlink(iter);
+ check_srcu_held_too_long(trans);
+ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+ trans->srcu_held = false;
+ }
}
-static int btree_trans_realloc_iters(struct btree_trans *trans)
+static void bch2_trans_srcu_lock(struct btree_trans *trans)
{
- struct btree_iter *new_iters;
+ if (!trans->srcu_held) {
+ trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
+ trans->srcu_lock_time = jiffies;
+ trans->srcu_held = true;
+ }
+}
+
+/**
+ * bch2_trans_begin() - reset a transaction after an interrupted attempt
+ * @trans: transaction to reset
+ *
+ * Returns: current restart counter, to be used with trans_was_restarted()
+ *
+ * While iterating over or updating nodes, an attempt to lock a btree node
+ * may return BCH_ERR_transaction_restart when the trylock fails. When this
+ * occurs, bch2_trans_begin() should be called and the transaction retried.
+ */
+u32 bch2_trans_begin(struct btree_trans *trans)
+{
+ struct btree_path *path;
unsigned i;
+ u64 now;
- bch2_trans_unlock(trans);
+ bch2_trans_reset_updates(trans);
- new_iters = kmalloc(sizeof(struct btree_iter) * BTREE_ITER_MAX,
- GFP_NOFS);
- if (!new_iters)
- return -ENOMEM;
+ trans->restart_count++;
+ trans->mem_top = 0;
+ trans->journal_entries = NULL;
+
+ trans_for_each_path(trans, path, i) {
+ path->should_be_locked = false;
+
+ /*
+ * If the transaction wasn't restarted, we're presuming to be
+ * doing something new: don't keep iterators except the ones that
+ * are in use - except for the subvolumes btree:
+ */
+ if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
+ path->preserve = false;
- memcpy(new_iters, trans->iters,
- sizeof(struct btree_iter) * trans->nr_iters);
- trans->iters = new_iters;
+ /*
+ * XXX: we probably shouldn't be doing this if the transaction
+ * was restarted, but currently we still overflow transaction
+ * iterators if we do that
+ */
+ if (!path->ref && !path->preserve)
+ __bch2_path_free(trans, i);
+ else
+ path->preserve = false;
+ }
- for (i = 0; i < trans->nr_iters; i++)
- trans->iters[i].next = &trans->iters[i];
+ now = local_clock();
- if (trans->iters_linked) {
- unsigned first_linked = __ffs(trans->iters_linked);
+ if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) &&
+ time_after64(now, trans->last_begin_time + 10))
+ __bch2_time_stats_update(&btree_trans_stats(trans)->duration,
+ trans->last_begin_time, now);
- for (i = first_linked + 1; i < trans->nr_iters; i++)
- if (trans->iters_linked & (1 << i))
- bch2_btree_iter_link(&trans->iters[first_linked],
- &trans->iters[i]);
+ if (!trans->restarted &&
+ (need_resched() ||
+ time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) {
+ bch2_trans_unlock(trans);
+ cond_resched();
+ now = local_clock();
}
+ trans->last_begin_time = now;
- btree_trans_verify(trans);
+ if (unlikely(trans->srcu_held &&
+ time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
+ bch2_trans_srcu_unlock(trans);
- if (trans->iters_live) {
- trans_restart();
- return -EINTR;
+ trans->last_begin_ip = _RET_IP_;
+
+ trans_set_locked(trans, false);
+
+ if (trans->restarted) {
+ bch2_btree_path_traverse_all(trans);
+ trans->notrace_relock_fail = false;
}
- return 0;
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+ return trans->restart_count;
}
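+
+/*
+ * A minimal usage sketch of the retry protocol described above (illustrative
+ * only; do_something() is a hypothetical transaction body):
+ *
+ *	int ret;
+ *	do {
+ *		bch2_trans_begin(trans);
+ *		ret = do_something(trans);
+ *	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
+ *
+ * The lockrestart_do() macro in btree_iter.h wraps this same pattern.
+ */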
-int bch2_trans_preload_iters(struct btree_trans *trans)
+const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR] = { "(unknown)" };
+
+unsigned bch2_trans_get_fn_idx(const char *fn)
{
- if (trans->iters != trans->iters_onstack)
- return 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
+ if (!bch2_btree_transaction_fns[i] ||
+ bch2_btree_transaction_fns[i] == fn) {
+ bch2_btree_transaction_fns[i] = fn;
+ return i;
+ }
- return btree_trans_realloc_iters(trans);
+ pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
+ return 0;
}
-static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
- unsigned btree_id,
- unsigned flags, u64 iter_id)
+struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
+ __acquires(&c->btree_trans_barrier)
{
- struct btree_iter *iter;
- int idx;
+ struct btree_trans *trans;
- BUG_ON(trans->nr_iters > BTREE_ITER_MAX);
+ if (IS_ENABLED(__KERNEL__)) {
+ trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
+ if (trans) {
+ memset(trans, 0, offsetof(struct btree_trans, list));
+ goto got_trans;
+ }
+ }
- for (idx = 0; idx < trans->nr_iters; idx++)
- if (trans->iter_ids[idx] == iter_id)
- goto found;
- idx = -1;
-found:
- if (idx < 0) {
- idx = ffz(trans->iters_linked);
- if (idx < trans->nr_iters)
- goto got_slot;
+ trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
+ memset(trans, 0, sizeof(*trans));
- BUG_ON(trans->nr_iters == BTREE_ITER_MAX);
+ seqmutex_lock(&c->btree_trans_lock);
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
+ struct btree_trans *pos;
+ pid_t pid = current->pid;
- if (trans->iters == trans->iters_onstack &&
- trans->nr_iters == ARRAY_SIZE(trans->iters_onstack)) {
- int ret = btree_trans_realloc_iters(trans);
- if (ret)
- return ERR_PTR(ret);
+ trans->locking_wait.task = current;
+
+ list_for_each_entry(pos, &c->btree_trans_list, list) {
+ struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task);
+ /*
+ * We'd much prefer to be stricter here and completely
+ * disallow multiple btree_trans in the same thread -
+ * but the data move path calls bch2_write when we
+ * already have a btree_trans initialized.
+ */
+ BUG_ON(pos_task &&
+ pid == pos_task->pid &&
+ pos->locked);
}
+ }
- idx = trans->nr_iters++;
-got_slot:
- trans->iter_ids[idx] = iter_id;
- iter = &trans->iters[idx];
+ list_add(&trans->list, &c->btree_trans_list);
+ seqmutex_unlock(&c->btree_trans_lock);
+got_trans:
+ trans->c = c;
+ trans->last_begin_time = local_clock();
+ trans->fn_idx = fn_idx;
+ trans->locking_wait.task = current;
+ trans->journal_replay_not_finished =
+ unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)) &&
+ atomic_inc_not_zero(&c->journal_keys.ref);
+ trans->nr_paths = ARRAY_SIZE(trans->_paths);
+ trans->paths_allocated = trans->_paths_allocated;
+ trans->sorted = trans->_sorted;
+ trans->paths = trans->_paths;
+ trans->updates = trans->_updates;
- bch2_btree_iter_init(iter, trans->c, btree_id, POS_MIN, flags);
- } else {
- iter = &trans->iters[idx];
+ *trans_paths_nr(trans->paths) = BTREE_ITER_INITIAL;
- BUG_ON(iter->btree_id != btree_id);
- BUG_ON((iter->flags ^ flags) &
- (BTREE_ITER_SLOTS|BTREE_ITER_IS_EXTENTS));
+ trans->paths_allocated[0] = 1;
- iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
- iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
- }
+ static struct lock_class_key lockdep_key;
+ lockdep_init_map(&trans->dep_map, "bcachefs_btree", &lockdep_key, 0);
- BUG_ON(trans->iters_live & (1 << idx));
- trans->iters_live |= 1 << idx;
+ if (fn_idx < BCH_TRANSACTIONS_NR) {
+ trans->fn = bch2_btree_transaction_fns[fn_idx];
- if (trans->iters_linked &&
- !(trans->iters_linked & (1 << idx)))
- bch2_btree_iter_link(&trans->iters[__ffs(trans->iters_linked)],
- iter);
+ struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx];
- trans->iters_linked |= 1 << idx;
+ if (s->max_mem) {
+ unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
- btree_trans_verify(trans);
+ trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
+ if (likely(trans->mem))
+ trans->mem_bytes = expected_mem_bytes;
+ }
- return iter;
-}
+ trans->nr_paths_max = s->nr_max_paths;
+ trans->journal_entries_size = s->journal_entries_size;
+ }
-struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
- enum btree_id btree_id,
- struct bpos pos, unsigned flags,
- u64 iter_id)
-{
- struct btree_iter *iter =
- __btree_trans_get_iter(trans, btree_id, flags, iter_id);
+ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+ trans->srcu_lock_time = jiffies;
+ trans->srcu_held = true;
+ trans_set_locked(trans, false);
- if (!IS_ERR(iter))
- bch2_btree_iter_set_pos(iter, pos);
- return iter;
+ closure_init_stack_release(&trans->ref);
+ return trans;
}
-struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
- struct btree_iter *src,
- u64 iter_id)
+static void check_btree_paths_leaked(struct btree_trans *trans)
{
- struct btree_iter *iter =
- __btree_trans_get_iter(trans, src->btree_id,
- src->flags, iter_id);
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
+ unsigned i;
- if (!IS_ERR(iter))
- bch2_btree_iter_copy(iter, src);
- return iter;
+ trans_for_each_path(trans, path, i)
+ if (path->ref)
+ goto leaked;
+ return;
+leaked:
+ bch_err(c, "btree paths leaked from %s!", trans->fn);
+ trans_for_each_path(trans, path, i)
+ if (path->ref)
+ printk(KERN_ERR " btree %s %pS\n",
+ bch2_btree_id_str(path->btree_id),
+ (void *) path->ip_allocated);
+ /* Be noisy about this: */
+ bch2_fatal_error(c);
+#endif
}
-void *bch2_trans_kmalloc(struct btree_trans *trans,
- size_t size)
+void bch2_trans_put(struct btree_trans *trans)
+ __releases(&c->btree_trans_barrier)
{
- void *ret;
+ struct bch_fs *c = trans->c;
- if (trans->mem_top + size > trans->mem_bytes) {
- size_t old_bytes = trans->mem_bytes;
- size_t new_bytes = roundup_pow_of_two(trans->mem_top + size);
- void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
+ if (trans->restarted)
+ bch2_trans_in_restart_error(trans);
- if (!new_mem)
- return ERR_PTR(-ENOMEM);
+ bch2_trans_unlock(trans);
- trans->mem = new_mem;
- trans->mem_bytes = new_bytes;
+ trans_for_each_update(trans, i)
+ __btree_path_put(trans, trans->paths + i->path, true);
+ trans->nr_updates = 0;
- if (old_bytes) {
- trans_restart();
- return ERR_PTR(-EINTR);
- }
+ check_btree_paths_leaked(trans);
+
+ if (trans->srcu_held) {
+ check_srcu_held_too_long(trans);
+ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
}
- ret = trans->mem + trans->mem_top;
- trans->mem_top += size;
+ if (unlikely(trans->journal_replay_not_finished))
+ bch2_journal_keys_put(c);
+
+ /*
+ * trans->ref protects trans->locking_wait.task and the btree_paths
+ * array; both are used by the cycle detector
+ */
+ closure_return_sync(&trans->ref);
+ trans->locking_wait.task = NULL;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ darray_exit(&trans->last_restarted_trace);
+#endif
+
+ unsigned long *paths_allocated = trans->paths_allocated;
+ trans->paths_allocated = NULL;
+ trans->paths = NULL;
+
+ if (paths_allocated != trans->_paths_allocated)
+ kvfree_rcu_mightsleep(paths_allocated);
+
+ if (trans->used_mempool)
+ mempool_free(trans->mem, &c->btree_trans_mem_pool);
+ else
+ kfree(trans->mem);
+
+ /* Userspace doesn't have a real percpu implementation: */
+ if (IS_ENABLED(__KERNEL__))
+ trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
+
+ if (trans) {
+ seqmutex_lock(&c->btree_trans_lock);
+ list_del(&trans->list);
+ seqmutex_unlock(&c->btree_trans_lock);
+
+ mempool_free(trans, &c->btree_trans_pool);
+ }
+}
+
+bool bch2_current_has_btree_trans(struct bch_fs *c)
+{
+ seqmutex_lock(&c->btree_trans_lock);
+ struct btree_trans *trans;
+ bool ret = false;
+ list_for_each_entry(trans, &c->btree_trans_list, list)
+ if (trans->locking_wait.task == current &&
+ trans->locked) {
+ ret = true;
+ break;
+ }
+ seqmutex_unlock(&c->btree_trans_lock);
return ret;
}
-int bch2_trans_unlock(struct btree_trans *trans)
+static void __maybe_unused
+bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
+ struct btree_bkey_cached_common *b)
{
- unsigned iters = trans->iters_linked;
- int ret = 0;
+ struct six_lock_count c = six_lock_counts(&b->lock);
+ struct task_struct *owner;
+ pid_t pid;
+
+ rcu_read_lock();
+ owner = READ_ONCE(b->lock.owner);
+ pid = owner ? owner->pid : 0;
+ rcu_read_unlock();
- while (iters) {
- unsigned idx = __ffs(iters);
- struct btree_iter *iter = &trans->iters[idx];
+ prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b');
+ bch2_btree_id_to_text(out, b->btree_id);
+ prt_printf(out, " l=%u:", b->level);
+ bch2_bpos_to_text(out, btree_node_pos(b));
- if (iter->flags & BTREE_ITER_ERROR)
- ret = -EIO;
+ prt_printf(out, "\t locks %u:%u:%u held by pid %u",
+ c.n[0], c.n[1], c.n[2], pid);
+}
- __bch2_btree_iter_unlock(iter);
- iters ^= 1 << idx;
+void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
+{
+ struct btree_bkey_cached_common *b;
+ static char lock_types[] = { 'r', 'i', 'w' };
+ struct task_struct *task = READ_ONCE(trans->locking_wait.task);
+ unsigned l, idx;
+
+ /* before rcu_read_lock(): */
+ bch2_printbuf_make_room(out, 4096);
+
+ if (!out->nr_tabstops) {
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 32);
}
- return ret;
+ prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn);
+
+ /* trans->paths is rcu protected vs. freeing */
+ rcu_read_lock();
+ out->atomic++;
+
+ struct btree_path *paths = rcu_dereference(trans->paths);
+ if (!paths)
+ goto out;
+
+ unsigned long *paths_allocated = trans_paths_allocated(paths);
+
+ trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), idx, 1) {
+ struct btree_path *path = paths + idx;
+ if (!path->nodes_locked)
+ continue;
+
+ prt_printf(out, " path %u %c ",
+ idx,
+ path->cached ? 'c' : 'b');
+ bch2_btree_id_to_text(out, path->btree_id);
+ prt_printf(out, " l=%u:", path->level);
+ bch2_bpos_to_text(out, path->pos);
+ prt_newline(out);
+
+ for (l = 0; l < BTREE_MAX_DEPTH; l++) {
+ if (btree_node_locked(path, l) &&
+ !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
+ prt_printf(out, " %c l=%u ",
+ lock_types[btree_node_locked_type(path, l)], l);
+ bch2_btree_bkey_cached_common_to_text(out, b);
+ prt_newline(out);
+ }
+ }
+ }
+
+ b = READ_ONCE(trans->locking);
+ if (b) {
+ prt_printf(out, " blocked for %lluus on\n",
+ div_u64(local_clock() - trans->locking_wait.start_time, 1000));
+ prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]);
+ bch2_btree_bkey_cached_common_to_text(out, b);
+ prt_newline(out);
+ }
+out:
+ --out->atomic;
+ rcu_read_unlock();
}
-void __bch2_trans_begin(struct btree_trans *trans)
+void bch2_fs_btree_iter_exit(struct bch_fs *c)
{
- unsigned idx;
+ struct btree_transaction_stats *s;
+ struct btree_trans *trans;
+ int cpu;
- btree_trans_verify(trans);
+ if (c->btree_trans_bufs)
+ for_each_possible_cpu(cpu) {
+ struct btree_trans *trans =
+ per_cpu_ptr(c->btree_trans_bufs, cpu)->trans;
- /*
- * On transaction restart, the transaction isn't required to allocate
- * all the same iterators it on the last iteration:
- *
- * Unlink any iterators it didn't use this iteration, assuming it got
- * further (allocated an iter with a higher idx) than where the iter
- * was originally allocated:
- */
- while (trans->iters_linked &&
- trans->iters_live &&
- (idx = __fls(trans->iters_linked)) >
- __fls(trans->iters_live)) {
- trans->iters_linked ^= 1 << idx;
- bch2_btree_iter_unlink(&trans->iters[idx]);
- }
+ if (trans) {
+ seqmutex_lock(&c->btree_trans_lock);
+ list_del(&trans->list);
+ seqmutex_unlock(&c->btree_trans_lock);
+ }
+ kfree(trans);
+ }
+ free_percpu(c->btree_trans_bufs);
- trans->iters_live = 0;
- trans->nr_updates = 0;
- trans->mem_top = 0;
+ trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
+ if (trans)
+ panic("%s leaked btree_trans\n", trans->fn);
- btree_trans_verify(trans);
+ for (s = c->btree_transaction_stats;
+ s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
+ s++) {
+ kfree(s->max_paths_text);
+ bch2_time_stats_exit(&s->lock_hold_times);
+ }
+
+ if (c->btree_trans_barrier_initialized) {
+ synchronize_srcu_expedited(&c->btree_trans_barrier);
+ cleanup_srcu_struct(&c->btree_trans_barrier);
+ }
+ mempool_exit(&c->btree_trans_mem_pool);
+ mempool_exit(&c->btree_trans_pool);
}
-void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c)
+void bch2_fs_btree_iter_init_early(struct bch_fs *c)
{
- trans->c = c;
- trans->nr_restarts = 0;
- trans->nr_iters = 0;
- trans->iters_live = 0;
- trans->iters_linked = 0;
- trans->nr_updates = 0;
- trans->mem_top = 0;
- trans->mem_bytes = 0;
- trans->mem = NULL;
- trans->iters = trans->iters_onstack;
+ struct btree_transaction_stats *s;
+
+ for (s = c->btree_transaction_stats;
+ s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
+ s++) {
+ bch2_time_stats_init(&s->duration);
+ bch2_time_stats_init(&s->lock_hold_times);
+ mutex_init(&s->lock);
+ }
+
+ INIT_LIST_HEAD(&c->btree_trans_list);
+ seqmutex_init(&c->btree_trans_lock);
}
-int bch2_trans_exit(struct btree_trans *trans)
+int bch2_fs_btree_iter_init(struct bch_fs *c)
{
- int ret = bch2_trans_unlock(trans);
+ int ret;
+
+ c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf);
+ if (!c->btree_trans_bufs)
+ return -ENOMEM;
+
+ ret = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1,
+ sizeof(struct btree_trans)) ?:
+ mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
+ BTREE_TRANS_MEM_MAX) ?:
+ init_srcu_struct(&c->btree_trans_barrier);
+ if (ret)
+ return ret;
+
+ /*
+ * static annotation (hackily done) for lock ordering of reclaim vs.
+ * btree node locks:
+ */
+#ifdef CONFIG_LOCKDEP
+ fs_reclaim_acquire(GFP_KERNEL);
+ struct btree_trans *trans = bch2_trans_get(c);
+ trans_set_locked(trans, false);
+ bch2_trans_put(trans);
+ fs_reclaim_release(GFP_KERNEL);
+#endif
+
+ c->btree_trans_barrier_initialized = true;
+ return 0;
- kfree(trans->mem);
- if (trans->iters != trans->iters_onstack)
- kfree(trans->iters);
- trans->mem = (void *) 0x1;
- trans->iters = (void *) 0x1;
- return ret;
}
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index d046ad71..3477fc8c 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -1,336 +1,941 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_ITER_H
#define _BCACHEFS_BTREE_ITER_H
-#include <linux/dynamic_fault.h>
-
+#include "bset.h"
#include "btree_types.h"
+#include "trace.h"
+
+void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
+void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
+void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
+void bch2_dump_trans_updates(struct btree_trans *);
+void bch2_dump_trans_paths_updates(struct btree_trans *);
+
+static inline int __bkey_err(const struct bkey *k)
+{
+ return PTR_ERR_OR_ZERO(k);
+}
+
+#define bkey_err(_k) __bkey_err((_k).k)
+
+static inline void __btree_path_get(struct btree_trans *trans, struct btree_path *path, bool intent)
+{
+ unsigned idx = path - trans->paths;
+
+ EBUG_ON(idx >= trans->nr_paths);
+ EBUG_ON(!test_bit(idx, trans->paths_allocated));
+ if (unlikely(path->ref == U8_MAX)) {
+ bch2_dump_trans_paths_updates(trans);
+ panic("path %u refcount overflow\n", idx);
+ }
+
+ path->ref++;
+ path->intent_ref += intent;
+ trace_btree_path_get_ll(trans, path);
+}
-static inline void btree_iter_set_dirty(struct btree_iter *iter,
- enum btree_iter_uptodate u)
+static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
{
- iter->uptodate = max_t(unsigned, iter->uptodate, u);
+ EBUG_ON(path - trans->paths >= trans->nr_paths);
+ EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated));
+ EBUG_ON(!path->ref);
+ EBUG_ON(!path->intent_ref && intent);
+
+ trace_btree_path_put_ll(trans, path);
+ path->intent_ref -= intent;
+ return --path->ref == 0;
}
-static inline struct btree *btree_iter_node(struct btree_iter *iter,
+static inline void btree_path_set_dirty(struct btree_path *path,
+ enum btree_path_uptodate u)
+{
+ path->uptodate = max_t(unsigned, path->uptodate, u);
+}
+
+static inline struct btree *btree_path_node(struct btree_path *path,
unsigned level)
{
- return level < BTREE_MAX_DEPTH ? iter->l[level].b : NULL;
+ return level < BTREE_MAX_DEPTH ? path->l[level].b : NULL;
+}
+
+static inline bool btree_node_lock_seq_matches(const struct btree_path *path,
+ const struct btree *b, unsigned level)
+{
+ return path->l[level].lock_seq == six_lock_seq(&b->c.lock);
}
-static inline struct btree *btree_node_parent(struct btree_iter *iter,
+static inline struct btree *btree_node_parent(struct btree_path *path,
struct btree *b)
{
- return btree_iter_node(iter, b->level + 1);
+ return btree_path_node(path, b->c.level + 1);
}
-static inline bool btree_iter_linked(const struct btree_iter *iter)
+/* Iterate over paths within a transaction: */
+
+void __bch2_btree_trans_sort_paths(struct btree_trans *);
+
+static inline void btree_trans_sort_paths(struct btree_trans *trans)
{
- return iter->next != iter;
+ if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+ trans->paths_sorted)
+ return;
+ __bch2_btree_trans_sort_paths(trans);
}
-static inline bool __iter_has_node(const struct btree_iter *iter,
- const struct btree *b)
+static inline unsigned long *trans_paths_nr(struct btree_path *paths)
+{
+ return &container_of(paths, struct btree_trans_paths, paths[0])->nr_paths;
+}
+
+static inline unsigned long *trans_paths_allocated(struct btree_path *paths)
{
+ unsigned long *v = trans_paths_nr(paths);
+ return v - BITS_TO_LONGS(*v);
+}
+
+#define trans_for_each_path_idx_from(_paths_allocated, _nr, _idx, _start)\
+ for (_idx = _start; \
+ (_idx = find_next_bit(_paths_allocated, _nr, _idx)) < _nr; \
+ _idx++)
+
+static inline struct btree_path *
+__trans_next_path(struct btree_trans *trans, unsigned *idx)
+{
+ unsigned long *w = trans->paths_allocated + *idx / BITS_PER_LONG;
/*
- * We don't compare the low bits of the lock sequence numbers because
- * @iter might have taken a write lock on @b, and we don't want to skip
- * the linked iterator if the sequence numbers were equal before taking
- * that write lock. The lock sequence number is incremented by taking
- * and releasing write locks and is even when unlocked:
+ * Open coded find_next_bit(), because
+ * - this is a fast path, we can't afford the function call
+ * - we know that nr_paths is a multiple of BITS_PER_LONG
*/
+ while (*idx < trans->nr_paths) {
+ unsigned long v = *w >> (*idx & (BITS_PER_LONG - 1));
+ if (v) {
+ *idx += __ffs(v);
+ return trans->paths + *idx;
+ }
+
+ *idx += BITS_PER_LONG;
+ *idx &= ~(BITS_PER_LONG - 1);
+ w++;
+ }
- return iter->l[b->level].b == b &&
- iter->lock_seq[b->level] >> 1 == b->lock.state.seq >> 1;
+ return NULL;
}
-static inline struct btree_iter *
-__next_linked_iter(struct btree_iter *iter, struct btree_iter *linked)
+/*
+ * This version is intended to be safe for use on a btree_trans that is owned by
+ * another thread, for bch2_btree_trans_to_text().
+ */
+#define trans_for_each_path_from(_trans, _path, _idx, _start) \
+ for (_idx = _start; \
+ (_path = __trans_next_path((_trans), &_idx)); \
+ _idx++)
+
+#define trans_for_each_path(_trans, _path, _idx) \
+ trans_for_each_path_from(_trans, _path, _idx, 1)
+
+static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
{
- return linked->next != iter ? linked->next : NULL;
+ unsigned idx = path ? path->sorted_idx + 1 : 0;
+
+ EBUG_ON(idx > trans->nr_sorted);
+
+ return idx < trans->nr_sorted
+ ? trans->paths + trans->sorted[idx]
+ : NULL;
}
-static inline struct btree_iter *
-__next_iter_with_node(struct btree_iter *iter, struct btree *b,
- struct btree_iter *linked)
+static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path)
{
- while (linked && !__iter_has_node(linked, b))
- linked = __next_linked_iter(iter, linked);
+ unsigned idx = path ? path->sorted_idx : trans->nr_sorted;
- return linked;
+ return idx
+ ? trans->paths + trans->sorted[idx - 1]
+ : NULL;
}
-/**
- * for_each_btree_iter - iterate over all iterators linked with @_iter,
- * including @_iter
- */
-#define for_each_btree_iter(_iter, _linked) \
- for ((_linked) = (_iter); (_linked); \
- (_linked) = __next_linked_iter(_iter, _linked))
+#define trans_for_each_path_idx_inorder(_trans, _iter) \
+ for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \
+ (_iter.path_idx = trans->sorted[_iter.sorted_idx], \
+ _iter.sorted_idx < (_trans)->nr_sorted); \
+ _iter.sorted_idx++)
+
+struct trans_for_each_path_inorder_iter {
+ btree_path_idx_t sorted_idx;
+ btree_path_idx_t path_idx;
+};
+
+#define trans_for_each_path_inorder(_trans, _path, _iter) \
+ for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \
+ (_iter.path_idx = trans->sorted[_iter.sorted_idx], \
+ _path = (_trans)->paths + _iter.path_idx, \
+ _iter.sorted_idx < (_trans)->nr_sorted); \
+ _iter.sorted_idx++)
+
+#define trans_for_each_path_inorder_reverse(_trans, _path, _i) \
+ for (_i = trans->nr_sorted - 1; \
+ ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) >= 0;\
+ --_i)
+
+static inline bool __path_has_node(const struct btree_path *path,
+ const struct btree *b)
+{
+ return path->l[b->c.level].b == b &&
+ btree_node_lock_seq_matches(path, b, b->c.level);
+}
-/**
- * for_each_btree_iter_with_node - iterate over all iterators linked with @_iter
- * that also point to @_b
- *
- * @_b is assumed to be locked by @_iter
- *
- * Filters out iterators that don't have a valid btree_node iterator for @_b -
- * i.e. iterators for which bch2_btree_node_relock() would not succeed.
- */
-#define for_each_btree_iter_with_node(_iter, _b, _linked) \
- for ((_linked) = (_iter); \
- ((_linked) = __next_iter_with_node(_iter, _b, _linked)); \
- (_linked) = __next_linked_iter(_iter, _linked))
+static inline struct btree_path *
+__trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
+ unsigned *idx)
+{
+ struct btree_path *path;
-/**
- * for_each_linked_btree_iter - iterate over all iterators linked with @_iter,
- * _not_ including @_iter
+ while ((path = __trans_next_path(trans, idx)) &&
+ !__path_has_node(path, b))
+ (*idx)++;
+
+ return path;
+}
+
+#define trans_for_each_path_with_node(_trans, _b, _path, _iter) \
+ for (_iter = 1; \
+ (_path = __trans_next_path_with_node((_trans), (_b), &_iter));\
+ _iter++)
+
+btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *, btree_path_idx_t,
+ bool, unsigned long);
+
+static inline btree_path_idx_t __must_check
+bch2_btree_path_make_mut(struct btree_trans *trans,
+ btree_path_idx_t path, bool intent,
+ unsigned long ip)
+{
+ if (trans->paths[path].ref > 1 ||
+ trans->paths[path].preserve)
+ path = __bch2_btree_path_make_mut(trans, path, intent, ip);
+ trans->paths[path].should_be_locked = false;
+ return path;
+}
+
+btree_path_idx_t __must_check
+__bch2_btree_path_set_pos(struct btree_trans *, btree_path_idx_t,
+ struct bpos, bool, unsigned long);
+
+static inline btree_path_idx_t __must_check
+bch2_btree_path_set_pos(struct btree_trans *trans,
+ btree_path_idx_t path, struct bpos new_pos,
+ bool intent, unsigned long ip)
+{
+ return !bpos_eq(new_pos, trans->paths[path].pos)
+ ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip)
+ : path;
+}
+
+int __must_check bch2_btree_path_traverse_one(struct btree_trans *,
+ btree_path_idx_t,
+ unsigned, unsigned long);
+
+static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *);
+
+static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
+ btree_path_idx_t path, unsigned flags)
+{
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+
+ if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK)
+ return 0;
+
+ return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_);
+}
+
+btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
+ unsigned, unsigned, unsigned, unsigned long);
+btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *, enum btree_id,
+ unsigned, struct bpos);
+
+struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
+
+/*
+ * bch2_btree_path_peek_slot() for a cached iterator might return a key in a
+ * different snapshot:
*/
-#define for_each_linked_btree_iter(_iter, _linked) \
- for ((_linked) = (_iter)->next; \
- (_linked) != (_iter); \
- (_linked) = (_linked)->next)
+static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u)
+{
+ struct bkey_s_c k = bch2_btree_path_peek_slot(path, u);
+
+ if (k.k && bpos_eq(path->pos, k.k->p))
+ return k;
+
+ bkey_init(u);
+ u->p = path->pos;
+ return (struct bkey_s_c) { u, NULL };
+}
+
+struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *,
+ struct btree_iter *, struct bpos);
+
+void bch2_btree_path_level_init(struct btree_trans *, struct btree_path *, struct btree *);
+
+int __bch2_trans_mutex_lock(struct btree_trans *, struct mutex *);
+
+static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock)
+{
+ return mutex_trylock(lock)
+ ? 0
+ : __bch2_trans_mutex_lock(trans, lock);
+}
#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_iter_verify(struct btree_iter *, struct btree *);
-void bch2_btree_iter_verify_locks(struct btree_iter *);
+void bch2_trans_verify_paths(struct btree_trans *);
+void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos);
#else
-static inline void bch2_btree_iter_verify(struct btree_iter *iter,
- struct btree *b) {}
-static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
+static inline void bch2_trans_verify_paths(struct btree_trans *trans) {}
+static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
+ struct bpos pos) {}
#endif
-void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
- struct btree_node_iter *, struct bset_tree *,
- struct bkey_packed *, unsigned, unsigned);
+void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
+ struct btree *, struct bkey_packed *);
+void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *,
+ struct btree *, struct btree_node_iter *,
+ struct bkey_packed *, unsigned, unsigned);
+
+int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
-int bch2_btree_iter_unlock(struct btree_iter *);
+void bch2_path_put(struct btree_trans *, btree_path_idx_t, bool);
-bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned);
-bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned);
+int bch2_trans_relock(struct btree_trans *);
+int bch2_trans_relock_notrace(struct btree_trans *);
+void bch2_trans_unlock(struct btree_trans *);
+void bch2_trans_unlock_long(struct btree_trans *);
-static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter,
- unsigned new_locks_want,
- bool may_drop_locks)
+static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count)
{
- new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
+ return restart_count != trans->restart_count
+ ? -BCH_ERR_transaction_restart_nested
+ : 0;
+}
+
+void __noreturn bch2_trans_restart_error(struct btree_trans *, u32);
- return iter->locks_want < new_locks_want
- ? (may_drop_locks
- ? __bch2_btree_iter_upgrade(iter, new_locks_want)
- : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want))
- : iter->uptodate <= BTREE_ITER_NEED_PEEK;
+static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans,
+ u32 restart_count)
+{
+ if (trans_was_restarted(trans, restart_count))
+ bch2_trans_restart_error(trans, restart_count);
}
-void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned);
+void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *);
-static inline void bch2_btree_iter_downgrade(struct btree_iter *iter)
+static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *trans)
{
- if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 1 : 0)
- __bch2_btree_iter_downgrade(iter, 0);
+ if (trans->restarted || !trans->locked)
+ bch2_trans_unlocked_or_in_restart_error(trans);
}
-void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *);
-void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *);
+__always_inline
+static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip)
+{
+ BUG_ON(err <= 0);
+ BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart));
+
+ trans->restarted = err;
+ trans->last_restarted_ip = ip;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ darray_exit(&trans->last_restarted_trace);
+ bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT);
+#endif
+ return -err;
+}
-void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *);
+__always_inline
+static int btree_trans_restart(struct btree_trans *trans, int err)
+{
+ return btree_trans_restart_ip(trans, err, _THIS_IP_);
+}
+
+bool bch2_btree_node_upgrade(struct btree_trans *,
+ struct btree_path *, unsigned);
+
+void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned);
+
+static inline void bch2_btree_path_downgrade(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ unsigned new_locks_want = path->level + !!path->intent_ref;
+ if (path->locks_want > new_locks_want)
+ __bch2_btree_path_downgrade(trans, path, new_locks_want);
+}
+
+void bch2_trans_downgrade(struct btree_trans *);
+
+void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *);
+void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
+
+int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter);
int __must_check bch2_btree_iter_traverse(struct btree_iter *);
struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
-struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned);
+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *);
+struct btree *bch2_btree_iter_next_node(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *, struct bpos);
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
+
+static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+{
+ return bch2_btree_iter_peek_max(iter, SPOS_MAX);
+}
+
+struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos);
+
+static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
+{
+ return bch2_btree_iter_peek_prev_min(iter, POS_MIN);
+}
+
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *);
-void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos);
-void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos);
+bool bch2_btree_iter_advance(struct btree_iter *);
+bool bch2_btree_iter_rewind(struct btree_iter *);
-void __bch2_btree_iter_init(struct btree_iter *, struct bch_fs *,
- enum btree_id, struct bpos,
- unsigned , unsigned, unsigned);
+static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+{
+ iter->k.type = KEY_TYPE_deleted;
+ iter->k.p.inode = iter->pos.inode = new_pos.inode;
+ iter->k.p.offset = iter->pos.offset = new_pos.offset;
+ iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot;
+ iter->k.size = 0;
+}
-static inline void bch2_btree_iter_init(struct btree_iter *iter,
- struct bch_fs *c, enum btree_id btree_id,
- struct bpos pos, unsigned flags)
+static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
{
- __bch2_btree_iter_init(iter, c, btree_id, pos,
- flags & BTREE_ITER_INTENT ? 1 : 0, 0,
- (btree_id == BTREE_ID_EXTENTS
- ? BTREE_ITER_IS_EXTENTS : 0)|flags);
+ struct btree_trans *trans = iter->trans;
+
+ if (unlikely(iter->update_path))
+ bch2_path_put(trans, iter->update_path,
+ iter->flags & BTREE_ITER_intent);
+ iter->update_path = 0;
+
+ if (!(iter->flags & BTREE_ITER_all_snapshots))
+ new_pos.snapshot = iter->snapshot;
+
+ __bch2_btree_iter_set_pos(iter, new_pos);
}
-void bch2_btree_iter_link(struct btree_iter *, struct btree_iter *);
-void bch2_btree_iter_unlink(struct btree_iter *);
-void bch2_btree_iter_copy(struct btree_iter *, struct btree_iter *);
+static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter)
+{
+ BUG_ON(!(iter->flags & BTREE_ITER_is_extents));
+ iter->pos = bkey_start_pos(&iter->k);
+}
-static inline struct bpos btree_type_successor(enum btree_id id,
- struct bpos pos)
+static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot)
{
- if (id == BTREE_ID_INODES) {
- pos.inode++;
- pos.offset = 0;
- } else if (id != BTREE_ID_EXTENTS) {
- pos = bkey_successor(pos);
- }
+ struct bpos pos = iter->pos;
- return pos;
+ iter->snapshot = snapshot;
+ pos.snapshot = snapshot;
+ bch2_btree_iter_set_pos(iter, pos);
}
-static inline struct bpos btree_type_predecessor(enum btree_id id,
- struct bpos pos)
+void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);
+
+static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
+ unsigned btree_id,
+ unsigned flags)
{
- if (id == BTREE_ID_INODES) {
- --pos.inode;
- pos.offset = 0;
- } else /* if (id != BTREE_ID_EXTENTS) */ {
- pos = bkey_predecessor(pos);
- }
+ if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) &&
+ btree_id_is_extents(btree_id))
+ flags |= BTREE_ITER_is_extents;
+
+ if (!(flags & BTREE_ITER_snapshot_field) &&
+ !btree_type_has_snapshot_field(btree_id))
+ flags &= ~BTREE_ITER_all_snapshots;
- return pos;
+ if (!(flags & BTREE_ITER_all_snapshots) &&
+ btree_type_has_snapshots(btree_id))
+ flags |= BTREE_ITER_filter_snapshots;
+
+ if (trans->journal_replay_not_finished)
+ flags |= BTREE_ITER_with_journal;
+
+ return flags;
}
-static inline int __btree_iter_cmp(enum btree_id id,
- struct bpos pos,
- const struct btree_iter *r)
+static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans,
+ unsigned btree_id,
+ unsigned flags)
{
- if (id != r->btree_id)
- return id < r->btree_id ? -1 : 1;
- return bkey_cmp(pos, r->pos);
+ if (!btree_id_cached(trans->c, btree_id)) {
+ flags &= ~BTREE_ITER_cached;
+ flags &= ~BTREE_ITER_with_key_cache;
+ } else if (!(flags & BTREE_ITER_cached))
+ flags |= BTREE_ITER_with_key_cache;
+
+ return __bch2_btree_iter_flags(trans, btree_id, flags);
}
-static inline int btree_iter_cmp(const struct btree_iter *l,
- const struct btree_iter *r)
+static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned locks_want,
+ unsigned depth,
+ unsigned flags,
+ unsigned long ip)
{
- return __btree_iter_cmp(l->btree_id, l->pos, r);
+ iter->trans = trans;
+ iter->update_path = 0;
+ iter->key_cache_path = 0;
+ iter->btree_id = btree_id;
+ iter->min_depth = 0;
+ iter->flags = flags;
+ iter->snapshot = pos.snapshot;
+ iter->pos = pos;
+ iter->k = POS_KEY(pos);
+ iter->journal_idx = 0;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ iter->ip_allocated = ip;
+#endif
+ iter->path = bch2_path_get(trans, btree_id, iter->pos,
+ locks_want, depth, flags, ip);
}
-/*
- * Unlocks before scheduling
- * Note: does not revalidate iterator
- */
-static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter)
+void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *,
+ enum btree_id, struct bpos, unsigned);
+
+static inline void bch2_trans_iter_init(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags)
{
- if (need_resched()) {
- bch2_btree_iter_unlock(iter);
- schedule();
- } else if (race_fault()) {
- bch2_btree_iter_unlock(iter);
- }
+ if (__builtin_constant_p(btree_id) &&
+ __builtin_constant_p(flags))
+ bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
+ bch2_btree_iter_flags(trans, btree_id, flags),
+ _THIS_IP_);
+ else
+ bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags);
}
-#define __for_each_btree_node(_iter, _c, _btree_id, _start, \
- _locks_want, _depth, _flags, _b) \
- for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), _start, \
- _locks_want, _depth, \
- _flags|BTREE_ITER_NODES), \
- _b = bch2_btree_iter_peek_node(_iter); \
- (_b); \
- (_b) = bch2_btree_iter_next_node(_iter, _depth))
+void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *,
+ enum btree_id, struct bpos,
+ unsigned, unsigned, unsigned);
+void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *);
-#define for_each_btree_node(_iter, _c, _btree_id, _start, _flags, _b) \
- __for_each_btree_node(_iter, _c, _btree_id, _start, 0, 0, _flags, _b)
+void bch2_set_btree_iter_dontneed(struct btree_iter *);
-static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
- unsigned flags)
+void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
+
+/**
+ * bch2_trans_kmalloc - allocate memory for use by the current transaction
+ *
+ * Must be called after bch2_trans_begin(), which on second and subsequent
+ * calls frees all memory allocated in this transaction
+ */
+static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{
- return flags & BTREE_ITER_SLOTS
- ? bch2_btree_iter_peek_slot(iter)
- : bch2_btree_iter_peek(iter);
+ size = roundup(size, 8);
+
+ if (likely(trans->mem_top + size <= trans->mem_bytes)) {
+ void *p = trans->mem + trans->mem_top;
+
+ trans->mem_top += size;
+ memset(p, 0, size);
+ return p;
+ } else {
+ return __bch2_trans_kmalloc(trans, size);
+ }
}
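+
+/*
+ * A minimal usage sketch (illustrative only; the bkey_i allocation is just an
+ * example):
+ *
+ *	struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
+ *	if (IS_ERR(k))
+ *		return PTR_ERR(k);
+ *
+ * The memory is owned by the transaction: it is reclaimed on the next
+ * bch2_trans_begin(), so callers never free it explicitly.
+ */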
-static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter,
- unsigned flags)
+static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size)
{
- bch2_btree_iter_cond_resched(iter);
+ size = round_up(size, 8);
+
+ if (likely(trans->mem_top + size <= trans->mem_bytes)) {
+ void *p = trans->mem + trans->mem_top;
- return flags & BTREE_ITER_SLOTS
- ? bch2_btree_iter_next_slot(iter)
- : bch2_btree_iter_next(iter);
+ trans->mem_top += size;
+ return p;
+ } else {
+ return __bch2_trans_kmalloc(trans, size);
+ }
}
-#define for_each_btree_key(_iter, _c, _btree_id, _start, _flags, _k) \
- for (bch2_btree_iter_init((_iter), (_c), (_btree_id), \
- (_start), (_flags)), \
- (_k) = __bch2_btree_iter_peek(_iter, _flags); \
- !IS_ERR_OR_NULL((_k).k); \
- (_k) = __bch2_btree_iter_next(_iter, _flags))
+static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags, unsigned type)
+{
+ struct bkey_s_c k;
+
+ bch2_trans_iter_init(trans, iter, btree_id, pos, flags);
+ k = bch2_btree_iter_peek_slot(iter);
-#define for_each_btree_key_continue(_iter, _flags, _k) \
- for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \
- !IS_ERR_OR_NULL((_k).k); \
- (_k) = __bch2_btree_iter_next(_iter, _flags))
+ if (!bkey_err(k) && type && k.k->type != type)
+ k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch);
+ if (unlikely(bkey_err(k)))
+ bch2_trans_iter_exit(trans, iter);
+ return k;
+}
-static inline int btree_iter_err(struct bkey_s_c k)
+static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags)
{
- return PTR_ERR_OR_ZERO(k.k);
+ return __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags, 0);
}
-/* new multiple iterator interface: */
+#define bch2_bkey_get_iter_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\
+ bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \
+ _btree_id, _pos, _flags, KEY_TYPE_##_type))
-int bch2_trans_preload_iters(struct btree_trans *);
-void bch2_trans_iter_free(struct btree_trans *,
- struct btree_iter *);
+static inline void __bkey_val_copy(void *dst_v, unsigned dst_size, struct bkey_s_c src_k)
+{
+ unsigned b = min_t(unsigned, dst_size, bkey_val_bytes(src_k.k));
+ memcpy(dst_v, src_k.v, b);
+ if (unlikely(b < dst_size))
+ memset(dst_v + b, 0, dst_size - b);
+}
-struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id,
- struct bpos, unsigned, u64);
-struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *,
- struct btree_iter *, u64);
+#define bkey_val_copy(_dst_v, _src_k) \
+do { \
+ BUILD_BUG_ON(!__typecheck(*_dst_v, *_src_k.v)); \
+ __bkey_val_copy(_dst_v, sizeof(*_dst_v), _src_k.s_c); \
+} while (0)
-static __always_inline u64 __btree_iter_id(void)
+static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags, unsigned type,
+ unsigned val_size, void *val)
{
- u64 ret = 0;
+ struct btree_iter iter;
+ struct bkey_s_c k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type);
+ int ret = bkey_err(k);
+ if (!ret) {
+ __bkey_val_copy(val, val_size, k);
+ bch2_trans_iter_exit(trans, &iter);
+ }
- ret <<= 32;
- ret |= _RET_IP_ & U32_MAX;
- ret <<= 32;
- ret |= _THIS_IP_ & U32_MAX;
return ret;
}
-static __always_inline struct btree_iter *
-bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id,
- struct bpos pos, unsigned flags)
+#define bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, _type, _val)\
+ __bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, \
+ KEY_TYPE_##_type, sizeof(*_val), _val)
+
+void bch2_trans_srcu_unlock(struct btree_trans *);
+
+u32 bch2_trans_begin(struct btree_trans *);
+
+#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \
+ _locks_want, _depth, _flags, _b, _do) \
+({ \
+ bch2_trans_begin((_trans)); \
+ \
+ struct btree_iter _iter; \
+ bch2_trans_node_iter_init((_trans), &_iter, (_btree_id), \
+ _start, _locks_want, _depth, _flags); \
+ int _ret3 = 0; \
+ do { \
+ _ret3 = lockrestart_do((_trans), ({ \
+ struct btree *_b = bch2_btree_iter_peek_node(&_iter); \
+ if (!_b) \
+ break; \
+ \
+ PTR_ERR_OR_ZERO(_b) ?: (_do); \
+ })) ?: \
+ lockrestart_do((_trans), \
+ PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(&_iter))); \
+ } while (!_ret3); \
+ \
+ bch2_trans_iter_exit((_trans), &(_iter)); \
+ _ret3; \
+})
+
+#define for_each_btree_node(_trans, _iter, _btree_id, _start, \
+ _flags, _b, _do) \
+ __for_each_btree_node(_trans, _iter, _btree_id, _start, \
+ 0, 0, _flags, _b, _do)
+
+static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter,
+ unsigned flags)
+{
+ return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) :
+ bch2_btree_iter_peek_prev(iter);
+}
+
+static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
+ unsigned flags)
{
- return __bch2_trans_get_iter(trans, btree_id, pos, flags,
- __btree_iter_id());
+ return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) :
+ bch2_btree_iter_peek(iter);
}
-static __always_inline struct btree_iter *
-bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src)
+static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_iter *iter,
+ struct bpos end,
+ unsigned flags)
{
+ if (!(flags & BTREE_ITER_slots))
+ return bch2_btree_iter_peek_max(iter, end);
- return __bch2_trans_copy_iter(trans, src, __btree_iter_id());
+ if (bkey_gt(iter->pos, end))
+ return bkey_s_c_null;
+
+ return bch2_btree_iter_peek_slot(iter);
}
-void __bch2_trans_begin(struct btree_trans *);
+int __bch2_btree_trans_too_many_iters(struct btree_trans *);
-void *bch2_trans_kmalloc(struct btree_trans *, size_t);
-int bch2_trans_unlock(struct btree_trans *);
-void bch2_trans_init(struct btree_trans *, struct bch_fs *);
-int bch2_trans_exit(struct btree_trans *);
+static inline int btree_trans_too_many_iters(struct btree_trans *trans)
+{
+ if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8)
+ return __bch2_btree_trans_too_many_iters(trans);
-#ifdef TRACE_TRANSACTION_RESTARTS
-#define bch2_trans_begin(_trans) \
-do { \
- if (is_power_of_2((_trans)->nr_restarts) && \
- (_trans)->nr_restarts >= 8) \
- pr_info("nr restarts: %zu", (_trans)->nr_restarts); \
+ return 0;
+}
+
+/*
+ * We use goto instead of a loop so that break/continue work correctly when
+ * this is used inside for_each_btree_key2()
+ */
+#define lockrestart_do(_trans, _do) \
+({ \
+ __label__ transaction_restart; \
+ u32 _restart_count; \
+ int _ret2; \
+transaction_restart: \
+ _restart_count = bch2_trans_begin(_trans); \
+ _ret2 = (_do); \
\
- (_trans)->nr_restarts++; \
- __bch2_trans_begin(_trans); \
-} while (0)
-#else
-#define bch2_trans_begin(_trans) __bch2_trans_begin(_trans)
-#endif
+ if (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)) \
+ goto transaction_restart; \
+ \
+ if (!_ret2) \
+ bch2_trans_verify_not_restarted(_trans, _restart_count);\
+ _ret2; \
+})
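+
+/*
+ * Minimal usage sketch (illustrative; do_update() is a hypothetical helper
+ * that may return a transaction restart error):
+ *
+ *	ret = lockrestart_do(trans, do_update(trans, &pos));
+ *
+ * On a transaction restart the body is re-run from the top; any other error
+ * is returned to the caller.
+ */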
-#ifdef TRACE_TRANSACTION_RESTARTS_ALL
-#define trans_restart(...) pr_info("transaction restart" __VA_ARGS__)
-#else
-#define trans_restart(...) no_printk("transaction restart" __VA_ARGS__)
-#endif
+/*
+ * nested_lockrestart_do(), nested_commit_do():
+ *
+ * These are like lockrestart_do() and commit_do(), with two differences:
+ *
+ * - We don't call bch2_trans_begin() unless we had a transaction restart
+ * - We return -BCH_ERR_transaction_restart_nested if we succeeded after a
+ * transaction restart
+ */
+#define nested_lockrestart_do(_trans, _do) \
+({ \
+ u32 _restart_count, _orig_restart_count; \
+ int _ret2; \
+ \
+ _restart_count = _orig_restart_count = (_trans)->restart_count; \
+ \
+ while (bch2_err_matches(_ret2 = (_do), BCH_ERR_transaction_restart))\
+ _restart_count = bch2_trans_begin(_trans); \
+ \
+ if (!_ret2) \
+ bch2_trans_verify_not_restarted(_trans, _restart_count);\
+ \
+ _ret2 ?: trans_was_restarted(_trans, _restart_count); \
+})
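+
+/*
+ * Minimal usage sketch (illustrative; do_work() is hypothetical). If the body
+ * only succeeded after a restart, the caller gets
+ * -BCH_ERR_transaction_restart_nested so the enclosing context knows a
+ * restart happened:
+ *
+ *	ret = nested_lockrestart_do(trans, do_work(trans));
+ */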
+
+#define for_each_btree_key_max_continue(_trans, _iter, \
+ _end, _flags, _k, _do) \
+({ \
+ struct bkey_s_c _k; \
+ int _ret3 = 0; \
+ \
+ do { \
+ _ret3 = lockrestart_do(_trans, ({ \
+ (_k) = bch2_btree_iter_peek_max_type(&(_iter), \
+ _end, (_flags)); \
+ if (!(_k).k) \
+ break; \
+ \
+ bkey_err(_k) ?: (_do); \
+ })); \
+ } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \
+ \
+ bch2_trans_iter_exit((_trans), &(_iter)); \
+ _ret3; \
+})
+
+#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \
+ for_each_btree_key_max_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do)
+
+#define for_each_btree_key_max(_trans, _iter, _btree_id, \
+ _start, _end, _flags, _k, _do) \
+({ \
+ bch2_trans_begin(trans); \
+ \
+ struct btree_iter _iter; \
+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ \
+ for_each_btree_key_max_continue(_trans, _iter, _end, _flags, _k, _do);\
+})
+
+#define for_each_btree_key(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _do) \
+ for_each_btree_key_max(_trans, _iter, _btree_id, _start, \
+ SPOS_MAX, _flags, _k, _do)
+
+#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _do) \
+({ \
+ struct btree_iter _iter; \
+ struct bkey_s_c _k; \
+ int _ret3 = 0; \
+ \
+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ \
+ do { \
+ _ret3 = lockrestart_do(_trans, ({ \
+ (_k) = bch2_btree_iter_peek_prev_type(&(_iter), \
+ (_flags)); \
+ if (!(_k).k) \
+ break; \
+ \
+ bkey_err(_k) ?: (_do); \
+ })); \
+ } while (!_ret3 && bch2_btree_iter_rewind(&(_iter))); \
+ \
+ bch2_trans_iter_exit((_trans), &(_iter)); \
+ _ret3; \
+})
+
+#define for_each_btree_key_commit(_trans, _iter, _btree_id, \
+ _start, _iter_flags, _k, \
+ _disk_res, _journal_seq, _commit_flags,\
+ _do) \
+ for_each_btree_key(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
+ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
+ (_journal_seq), (_commit_flags)))
+
+#define for_each_btree_key_reverse_commit(_trans, _iter, _btree_id, \
+ _start, _iter_flags, _k, \
+ _disk_res, _journal_seq, _commit_flags,\
+ _do) \
+ for_each_btree_key_reverse(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
+ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
+ (_journal_seq), (_commit_flags)))
+
+#define for_each_btree_key_max_commit(_trans, _iter, _btree_id, \
+ _start, _end, _iter_flags, _k, \
+ _disk_res, _journal_seq, _commit_flags,\
+ _do) \
+ for_each_btree_key_max(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\
+ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
+ (_journal_seq), (_commit_flags)))
+
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
+
+#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \
+ _start, _end, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags),\
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
+
+#define for_each_btree_key_max_continue_norestart(_iter, _end, _flags, _k, _ret)\
+ for (; \
+ (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
+
+#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _ret) \
+ for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\
+ SPOS_MAX, _flags, _k, _ret)
+
+#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags), \
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_rewind(&(_iter)))
+
+#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
+ for_each_btree_key_max_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
+
+/*
+ * This should not be used in a fastpath without first trying _do in
+ * nonblocking mode - it will cause excessive transaction restarts and
+ * can potentially livelock:
+ */
+#define drop_locks_do(_trans, _do) \
+({ \
+ bch2_trans_unlock(_trans); \
+ (_do) ?: bch2_trans_relock(_trans); \
+})
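+
+/*
+ * Minimal sketch of the intended pattern (illustrative: try a nonblocking
+ * allocation first, then retry with locks dropped), which the helpers below
+ * wrap:
+ *
+ *	p = kmalloc(size, GFP_NOWAIT|__GFP_NOWARN);
+ *	if (!p)
+ *		ret = drop_locks_do(trans, ((p = kmalloc(size, GFP_KERNEL)), 0));
+ */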
+
+#define allocate_dropping_locks_errcode(_trans, _do) \
+({ \
+ gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
+ int _ret = _do; \
+ \
+ if (bch2_err_matches(_ret, ENOMEM)) { \
+ _gfp = GFP_KERNEL; \
+ _ret = drop_locks_do(_trans, _do); \
+ } \
+ _ret; \
+})
+
+#define allocate_dropping_locks(_trans, _ret, _do) \
+({ \
+ gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
+ typeof(_do) _p = _do; \
+ \
+ _ret = 0; \
+ if (unlikely(!_p)) { \
+ _gfp = GFP_KERNEL; \
+ _ret = drop_locks_do(_trans, ((_p = _do), 0)); \
+ } \
+ _p; \
+})
+
+#define bch2_trans_run(_c, _do) \
+({ \
+ struct btree_trans *trans = bch2_trans_get(_c); \
+ int _ret = (_do); \
+ bch2_trans_put(trans); \
+ _ret; \
+})
+
+#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do))
+
+struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
+void bch2_trans_put(struct btree_trans *);
+
+bool bch2_current_has_btree_trans(struct bch_fs *);
+
+extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
+unsigned bch2_trans_get_fn_idx(const char *);
+
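+/*
+ * bch2_trans_get() caches a per-callsite function index so each transaction
+ * can be attributed to the function that created it (see
+ * bch2_btree_transaction_fns[] and bch2_btree_trans_to_text()):
+ */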
+#define bch2_trans_get(_c) \
+({ \
+ static unsigned trans_fn_idx; \
+ \
+ if (unlikely(!trans_fn_idx)) \
+ trans_fn_idx = bch2_trans_get_fn_idx(__func__); \
+ __bch2_trans_get(_c, trans_fn_idx); \
+})
+
+void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
+
+void bch2_fs_btree_iter_exit(struct bch_fs *);
+void bch2_fs_btree_iter_init_early(struct bch_fs *);
+int bch2_fs_btree_iter_init(struct bch_fs *);
#endif /* _BCACHEFS_BTREE_ITER_H */
diff --git a/libbcachefs/btree_journal_iter.c b/libbcachefs/btree_journal_iter.c
new file mode 100644
index 00000000..de3db161
--- /dev/null
+++ b/libbcachefs/btree_journal_iter.c
@@ -0,0 +1,806 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "bset.h"
+#include "btree_cache.h"
+#include "btree_journal_iter.h"
+#include "journal_io.h"
+
+#include <linux/sort.h>
+
+/*
+ * For managing keys we read from the journal: until journal replay has
+ * finished, normal btree lookups need to be able to find and return keys
+ * from the journal where
+ * they overwrite what's in the btree, so we have a special iterator and
+ * operations for the regular btree iter code to use:
+ */
+
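+/*
+ * journal_keys is a gap buffer: the unused slots live at keys->gap rather
+ * than at the end of the array, so a logical index (idx) and an array
+ * position (pos) differ for entries past the gap.  E.g. with size 8, nr 6,
+ * gap 2: idx 0-1 map to pos 0-1 and idx 2-5 map to pos 4-7.
+ */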
+static inline size_t pos_to_idx(struct journal_keys *keys, size_t pos)
+{
+ size_t gap_size = keys->size - keys->nr;
+
+ BUG_ON(pos >= keys->gap && pos < keys->gap + gap_size);
+
+ if (pos >= keys->gap)
+ pos -= gap_size;
+ return pos;
+}
+
+static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
+{
+ size_t gap_size = keys->size - keys->nr;
+
+ if (idx >= keys->gap)
+ idx += gap_size;
+ return idx;
+}
+
+static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
+{
+ return keys->data + idx_to_pos(keys, idx);
+}
+
+static size_t __bch2_journal_key_search(struct journal_keys *keys,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
+{
+ size_t l = 0, r = keys->nr, m;
+
+ while (l < r) {
+ m = l + ((r - l) >> 1);
+ if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
+ l = m + 1;
+ else
+ r = m;
+ }
+
+ BUG_ON(l < keys->nr &&
+ __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
+
+ BUG_ON(l &&
+ __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
+
+ return l;
+}
+
+static size_t bch2_journal_key_search(struct journal_keys *keys,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
+{
+ return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
+}
+
+/* Returns first non-overwritten key >= search key: */
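+/*
+ * @idx is a cursor cached by the caller; if it turns out to be stale and we
+ * have to walk too far from it, we fall back to a fresh binary search (the
+ * iters == 10 checks below).
+ */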
+struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos,
+ struct bpos end_pos, size_t *idx)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ unsigned iters = 0;
+ struct journal_key *k;
+
+ BUG_ON(*idx > keys->nr);
+search:
+ if (!*idx)
+ *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
+
+ while (*idx &&
+ __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
+ --(*idx);
+ iters++;
+ if (iters == 10) {
+ *idx = 0;
+ goto search;
+ }
+ }
+
+ struct bkey_i *ret = NULL;
+ rcu_read_lock(); /* for overwritten_ranges */
+
+ while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
+ if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
+ break;
+
+ if (k->overwritten) {
+ if (k->overwritten_range)
+ *idx = rcu_dereference(k->overwritten_range)->end;
+ else
+ *idx += 1;
+ continue;
+ }
+
+ if (__journal_key_cmp(btree_id, level, pos, k) <= 0) {
+ ret = k->k;
+ break;
+ }
+
+ (*idx)++;
+ iters++;
+ if (iters == 10) {
+ *idx = 0;
+ rcu_read_unlock();
+ goto search;
+ }
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
+struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos,
+ struct bpos end_pos, size_t *idx)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ unsigned iters = 0;
+ struct journal_key *k;
+
+ BUG_ON(*idx > keys->nr);
+search:
+ if (!*idx)
+ *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
+
+ while (*idx &&
+ __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
+ (*idx)++;
+ iters++;
+ if (iters == 10) {
+ *idx = 0;
+ goto search;
+ }
+ }
+
+ struct bkey_i *ret = NULL;
+ rcu_read_lock(); /* for overwritten_ranges */
+
+ while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
+ if (__journal_key_cmp(btree_id, level, end_pos, k) > 0)
+ break;
+
+ if (k->overwritten) {
+ if (k->overwritten_range)
+ *idx = rcu_dereference(k->overwritten_range)->start - 1;
+ else
+ *idx -= 1;
+ continue;
+ }
+
+ if (__journal_key_cmp(btree_id, level, pos, k) >= 0) {
+ ret = k->k;
+ break;
+ }
+
+ --(*idx);
+ iters++;
+ if (iters == 10) {
+			*idx = 0;
+			rcu_read_unlock();
+			goto search;
+ }
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos)
+{
+ size_t idx = 0;
+
+ return bch2_journal_keys_peek_max(c, btree_id, level, pos, pos, &idx);
+}
+
+static void journal_iter_verify(struct journal_iter *iter)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct journal_keys *keys = iter->keys;
+ size_t gap_size = keys->size - keys->nr;
+
+ BUG_ON(iter->idx >= keys->gap &&
+ iter->idx < keys->gap + gap_size);
+
+ if (iter->idx < keys->size) {
+ struct journal_key *k = keys->data + iter->idx;
+
+ int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k);
+ BUG_ON(cmp > 0);
+ }
+#endif
+}
+
+static void journal_iters_fix(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ /* The key we just inserted is immediately before the gap: */
+ size_t gap_end = keys->gap + (keys->size - keys->nr);
+ struct journal_key *new_key = &keys->data[keys->gap - 1];
+ struct journal_iter *iter;
+
+ /*
+ * If an iterator points one after the key we just inserted, decrement
+ * the iterator so it points at the key we just inserted - if the
+ * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
+ * handle that:
+ */
+ list_for_each_entry(iter, &c->journal_iters, list) {
+ journal_iter_verify(iter);
+ if (iter->idx == gap_end &&
+ new_key->btree_id == iter->btree_id &&
+ new_key->level == iter->level)
+ iter->idx = keys->gap - 1;
+ journal_iter_verify(iter);
+ }
+}
+
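+/*
+ * Iterator positions are raw array positions, not logical indices: when the
+ * gap in the gap buffer moves, adjust any live iterators so they keep
+ * pointing at the same key:
+ */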
+static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ struct journal_iter *iter;
+ size_t gap_size = keys->size - keys->nr;
+
+ list_for_each_entry(iter, &c->journal_iters, list) {
+ if (iter->idx > old_gap)
+ iter->idx -= gap_size;
+ if (iter->idx >= new_gap)
+ iter->idx += gap_size;
+ }
+}
+
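+/*
+ * Insert @k into the journal keys array, taking ownership of it (hence
+ * "_take"); any existing key at the same btree/level/pos is replaced:
+ */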
+int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
+ unsigned level, struct bkey_i *k)
+{
+ struct journal_key n = {
+ .btree_id = id,
+ .level = level,
+ .k = k,
+ .allocated = true,
+ /*
+ * Ensure these keys are done last by journal replay, to unblock
+ * journal reclaim:
+ */
+ .journal_seq = U32_MAX,
+ };
+ struct journal_keys *keys = &c->journal_keys;
+ size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
+
+ BUG_ON(test_bit(BCH_FS_rw, &c->flags));
+
+ if (idx < keys->size &&
+ journal_key_cmp(&n, &keys->data[idx]) == 0) {
+ if (keys->data[idx].allocated)
+ kfree(keys->data[idx].k);
+ keys->data[idx] = n;
+ return 0;
+ }
+
+ if (idx > keys->gap)
+ idx -= keys->size - keys->nr;
+
+ size_t old_gap = keys->gap;
+
+ if (keys->nr == keys->size) {
+ journal_iters_move_gap(c, old_gap, keys->size);
+ old_gap = keys->size;
+
+ struct journal_keys new_keys = {
+ .nr = keys->nr,
+ .size = max_t(size_t, keys->size, 8) * 2,
+ };
+
+ new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL);
+ if (!new_keys.data) {
+ bch_err(c, "%s: error allocating new key array (size %zu)",
+ __func__, new_keys.size);
+ return -BCH_ERR_ENOMEM_journal_key_insert;
+ }
+
+ /* Since @keys was full, there was no gap: */
+ memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr);
+ kvfree(keys->data);
+ keys->data = new_keys.data;
+ keys->nr = new_keys.nr;
+ keys->size = new_keys.size;
+
+ /* And now the gap is at the end: */
+ keys->gap = keys->nr;
+ }
+
+ journal_iters_move_gap(c, old_gap, idx);
+
+ move_gap(keys, idx);
+
+ keys->nr++;
+ keys->data[keys->gap++] = n;
+
+ journal_iters_fix(c);
+
+ return 0;
+}
+
+/*
+ * Can only be used from the recovery thread while we're still RO - can't be
+ * used once we've got RW, as journal_keys is at that point used by multiple
+ * threads:
+ */
+int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
+ unsigned level, struct bkey_i *k)
+{
+ struct bkey_i *n;
+ int ret;
+
+ n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
+ if (!n)
+ return -BCH_ERR_ENOMEM_journal_key_insert;
+
+ bkey_copy(n, k);
+ ret = bch2_journal_key_insert_take(c, id, level, n);
+ if (ret)
+ kfree(n);
+ return ret;
+}
+
+int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
+ unsigned level, struct bpos pos)
+{
+ struct bkey_i whiteout;
+
+ bkey_init(&whiteout.k);
+ whiteout.k.p = pos;
+
+ return bch2_journal_key_insert(c, id, level, &whiteout);
+}
+
+bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
+ unsigned level, struct bpos pos)
+{
+ struct journal_keys *keys = &trans->c->journal_keys;
+ size_t idx = bch2_journal_key_search(keys, btree, level, pos);
+
+ if (!trans->journal_replay_not_finished)
+ return false;
+
+ return (idx < keys->size &&
+ keys->data[idx].btree_id == btree &&
+ keys->data[idx].level == level &&
+ bpos_eq(keys->data[idx].k->k.p, pos) &&
+ bkey_deleted(&keys->data[idx].k->k));
+}
+
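+/*
+ * Mark the key at array position @pos as overwritten, and merge it with any
+ * adjacent overwritten keys into a journal_key_range_overwritten so iteration
+ * can skip the whole run at once:
+ */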
+static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos)
+{
+ struct journal_key *k = keys->data + pos;
+ size_t idx = pos_to_idx(keys, pos);
+
+ k->overwritten = true;
+
+ struct journal_key *prev = idx > 0 ? keys->data + idx_to_pos(keys, idx - 1) : NULL;
+ struct journal_key *next = idx + 1 < keys->nr ? keys->data + idx_to_pos(keys, idx + 1) : NULL;
+
+ bool prev_overwritten = prev && prev->overwritten;
+ bool next_overwritten = next && next->overwritten;
+
+ struct journal_key_range_overwritten *prev_range =
+ prev_overwritten ? prev->overwritten_range : NULL;
+ struct journal_key_range_overwritten *next_range =
+ next_overwritten ? next->overwritten_range : NULL;
+
+ BUG_ON(prev_range && prev_range->end != idx);
+ BUG_ON(next_range && next_range->start != idx + 1);
+
+ if (prev_range && next_range) {
+ prev_range->end = next_range->end;
+
+ keys->data[pos].overwritten_range = prev_range;
+ for (size_t i = next_range->start; i < next_range->end; i++) {
+ struct journal_key *ip = keys->data + idx_to_pos(keys, i);
+ BUG_ON(ip->overwritten_range != next_range);
+ ip->overwritten_range = prev_range;
+ }
+
+ kfree_rcu_mightsleep(next_range);
+ } else if (prev_range) {
+ prev_range->end++;
+ k->overwritten_range = prev_range;
+ if (next_overwritten) {
+ prev_range->end++;
+ next->overwritten_range = prev_range;
+ }
+ } else if (next_range) {
+ next_range->start--;
+ k->overwritten_range = next_range;
+ if (prev_overwritten) {
+ next_range->start--;
+ prev->overwritten_range = next_range;
+ }
+ } else if (prev_overwritten || next_overwritten) {
+ struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL);
+ if (!r)
+ return;
+
+ r->start = idx - (size_t) prev_overwritten;
+ r->end = idx + 1 + (size_t) next_overwritten;
+
+ rcu_assign_pointer(k->overwritten_range, r);
+ if (prev_overwritten)
+ prev->overwritten_range = r;
+ if (next_overwritten)
+ next->overwritten_range = r;
+ }
+}
+
+void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
+ unsigned level, struct bpos pos)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ size_t idx = bch2_journal_key_search(keys, btree, level, pos);
+
+ if (idx < keys->size &&
+ keys->data[idx].btree_id == btree &&
+ keys->data[idx].level == level &&
+ bpos_eq(keys->data[idx].k->k.p, pos) &&
+ !keys->data[idx].overwritten) {
+ mutex_lock(&keys->overwrite_lock);
+ __bch2_journal_key_overwritten(keys, idx);
+ mutex_unlock(&keys->overwrite_lock);
+ }
+}
+
+static void bch2_journal_iter_advance(struct journal_iter *iter)
+{
+ if (iter->idx < iter->keys->size) {
+ iter->idx++;
+ if (iter->idx == iter->keys->gap)
+ iter->idx += iter->keys->size - iter->keys->nr;
+ }
+}
+
+static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
+{
+ struct bkey_s_c ret = bkey_s_c_null;
+
+ journal_iter_verify(iter);
+
+ rcu_read_lock();
+ while (iter->idx < iter->keys->size) {
+ struct journal_key *k = iter->keys->data + iter->idx;
+
+ int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k);
+ if (cmp < 0)
+ break;
+ BUG_ON(cmp);
+
+ if (!k->overwritten) {
+ ret = bkey_i_to_s_c(k->k);
+ break;
+ }
+
+ if (k->overwritten_range)
+ iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end);
+ else
+ bch2_journal_iter_advance(iter);
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static void bch2_journal_iter_exit(struct journal_iter *iter)
+{
+ list_del(&iter->list);
+}
+
+static void bch2_journal_iter_init(struct bch_fs *c,
+ struct journal_iter *iter,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
+{
+ iter->btree_id = id;
+ iter->level = level;
+ iter->keys = &c->journal_keys;
+ iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
+
+ journal_iter_verify(iter);
+}
+
+static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
+{
+ return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
+ iter->b, &iter->unpacked);
+}
+
+static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
+{
+ bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
+}
+
+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
+{
+ if (bpos_eq(iter->pos, SPOS_MAX))
+ iter->at_end = true;
+ else
+ iter->pos = bpos_successor(iter->pos);
+}
+
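+/*
+ * Prefetch the btree nodes pointed to by the next few keys the iterator will
+ * return; how many depends on the level and on whether the filesystem has
+ * finished starting up:
+ */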
+static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
+{
+ struct btree_and_journal_iter iter = *_iter;
+ struct bch_fs *c = iter.trans->c;
+ unsigned level = iter.journal.level;
+ struct bkey_buf tmp;
+ unsigned nr = test_bit(BCH_FS_started, &c->flags)
+ ? (level > 1 ? 0 : 2)
+ : (level > 1 ? 1 : 16);
+
+ iter.prefetch = false;
+ iter.fail_if_too_many_whiteouts = true;
+ bch2_bkey_buf_init(&tmp);
+
+ while (nr--) {
+ bch2_btree_and_journal_iter_advance(&iter);
+ struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
+ if (!k.k)
+ break;
+
+ bch2_bkey_buf_reassemble(&tmp, c, k);
+ bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1);
+ }
+
+ bch2_bkey_buf_exit(&tmp, c);
+}
+
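+/*
+ * Return the next key >= iter->pos from either the btree node or the journal,
+ * whichever compares lower; on a tie the journal key wins, since it is the
+ * newer version:
+ */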
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
+{
+ struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;
+ size_t iters = 0;
+
+ if (iter->prefetch && iter->journal.level)
+ btree_and_journal_iter_prefetch(iter);
+again:
+ if (iter->at_end)
+ return bkey_s_c_null;
+
+ iters++;
+
+ if (iters > 20 && iter->fail_if_too_many_whiteouts)
+ return bkey_s_c_null;
+
+ while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
+ bpos_lt(btree_k.k->p, iter->pos))
+ bch2_journal_iter_advance_btree(iter);
+
+ if (iter->trans->journal_replay_not_finished)
+ while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
+ bpos_lt(journal_k.k->p, iter->pos))
+ bch2_journal_iter_advance(&iter->journal);
+
+ ret = journal_k.k &&
+ (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
+ ? journal_k
+ : btree_k;
+
+ if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
+ ret = bkey_s_c_null;
+
+ if (ret.k) {
+ iter->pos = ret.k->p;
+ if (bkey_deleted(ret.k)) {
+ bch2_btree_and_journal_iter_advance(iter);
+ goto again;
+ }
+ } else {
+ iter->pos = SPOS_MAX;
+ iter->at_end = true;
+ }
+
+ return ret;
+}
+
+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
+{
+ bch2_journal_iter_exit(&iter->journal);
+}
+
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
+ struct btree_and_journal_iter *iter,
+ struct btree *b,
+ struct btree_node_iter node_iter,
+ struct bpos pos)
+{
+ memset(iter, 0, sizeof(*iter));
+
+ iter->trans = trans;
+ iter->b = b;
+ iter->node_iter = node_iter;
+ iter->pos = b->data->min_key;
+ iter->at_end = false;
+ INIT_LIST_HEAD(&iter->journal.list);
+
+ if (trans->journal_replay_not_finished) {
+ bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
+ if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
+ list_add(&iter->journal.list, &trans->c->journal_iters);
+ }
+}
+
+/*
+ * This version is used by btree_gc before the filesystem has gone RW and
+ * multithreaded, so it uses the journal_iters list:
+ */
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
+ struct btree_and_journal_iter *iter,
+ struct btree *b)
+{
+ struct btree_node_iter node_iter;
+
+ bch2_btree_node_iter_init_from_start(&node_iter, b);
+ __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
+}
+
+/* sort and dedup all keys in the journal: */
+
+/*
+ * When keys compare equal, oldest compares first:
+ */
+static int journal_sort_key_cmp(const void *_l, const void *_r)
+{
+ const struct journal_key *l = _l;
+ const struct journal_key *r = _r;
+
+ return journal_key_cmp(l, r) ?:
+ cmp_int(l->journal_seq, r->journal_seq) ?:
+ cmp_int(l->journal_offset, r->journal_offset);
+}
+
+void bch2_journal_keys_put(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+
+ BUG_ON(atomic_read(&keys->ref) <= 0);
+
+ if (!atomic_dec_and_test(&keys->ref))
+ return;
+
+ move_gap(keys, keys->nr);
+
+ darray_for_each(*keys, i) {
+ if (i->overwritten_range &&
+ (i == &darray_last(*keys) ||
+ i->overwritten_range != i[1].overwritten_range))
+ kfree(i->overwritten_range);
+
+ if (i->allocated)
+ kfree(i->k);
+ }
+
+ kvfree(keys->data);
+ keys->data = NULL;
+ keys->nr = keys->gap = keys->size = 0;
+
+ struct journal_replay **i;
+ struct genradix_iter iter;
+
+ genradix_for_each(&c->journal_entries, iter, i)
+ kvfree(*i);
+ genradix_free(&c->journal_entries);
+}
+
+static void __journal_keys_sort(struct journal_keys *keys)
+{
+ sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);
+
+ cond_resched();
+
+ struct journal_key *dst = keys->data;
+
+ darray_for_each(*keys, src) {
+ /*
+ * We don't accumulate accounting keys here because we have to
+ * compare each individual accounting key against the version in
+ * the btree during replay:
+ */
+ if (src->k->k.type != KEY_TYPE_accounting &&
+ src + 1 < &darray_top(*keys) &&
+ !journal_key_cmp(src, src + 1))
+ continue;
+
+ *dst++ = *src;
+ }
+
+ keys->nr = dst - keys->data;
+}
+
+int bch2_journal_keys_sort(struct bch_fs *c)
+{
+ struct genradix_iter iter;
+ struct journal_replay *i, **_i;
+ struct journal_keys *keys = &c->journal_keys;
+ size_t nr_read = 0;
+
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (journal_replay_ignore(i))
+ continue;
+
+ cond_resched();
+
+ for_each_jset_key(k, entry, &i->j) {
+ struct journal_key n = (struct journal_key) {
+ .btree_id = entry->btree_id,
+ .level = entry->level,
+ .k = k,
+ .journal_seq = le64_to_cpu(i->j.seq),
+ .journal_offset = k->_data - i->j._data,
+ };
+
+ if (darray_push(keys, n)) {
+ __journal_keys_sort(keys);
+
+ if (keys->nr * 8 > keys->size * 7) {
+ bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
+ keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
+ return -BCH_ERR_ENOMEM_journal_keys_sort;
+ }
+
+ BUG_ON(darray_push(keys, n));
+ }
+
+ nr_read++;
+ }
+ }
+
+ __journal_keys_sort(keys);
+ keys->gap = keys->nr;
+
+ bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
+ return 0;
+}
+
+void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
+ unsigned level_min, unsigned level_max,
+ struct bpos start, struct bpos end)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ size_t dst = 0;
+
+ move_gap(keys, keys->nr);
+
+ darray_for_each(*keys, i)
+ if (!(i->btree_id == btree &&
+ i->level >= level_min &&
+ i->level <= level_max &&
+ bpos_ge(i->k->k.p, start) &&
+ bpos_le(i->k->k.p, end)))
+ keys->data[dst++] = *i;
+ keys->nr = keys->gap = dst;
+}
+
+void bch2_journal_keys_dump(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ struct printbuf buf = PRINTBUF;
+
+ pr_info("%zu keys:", keys->nr);
+
+ move_gap(keys, keys->nr);
+
+ darray_for_each(*keys, i) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "btree=");
+ bch2_btree_id_to_text(&buf, i->btree_id);
+ prt_printf(&buf, " l=%u ", i->level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
+ pr_err("%s", buf.buf);
+ }
+ printbuf_exit(&buf);
+}
+
+void bch2_fs_journal_keys_init(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+
+ atomic_set(&keys->ref, 1);
+ keys->initial_ref_held = true;
+ mutex_init(&keys->overwrite_lock);
+}
diff --git a/libbcachefs/btree_journal_iter.h b/libbcachefs/btree_journal_iter.h
new file mode 100644
index 00000000..2a308291
--- /dev/null
+++ b/libbcachefs/btree_journal_iter.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H
+#define _BCACHEFS_BTREE_JOURNAL_ITER_H
+
+#include "bkey.h"
+
+struct journal_iter {
+ struct list_head list;
+ enum btree_id btree_id;
+ unsigned level;
+ size_t idx;
+ struct journal_keys *keys;
+};
+
+/*
+ * Iterate over keys in the btree, with keys from the journal overlaid on top:
+ */
+
+struct btree_and_journal_iter {
+ struct btree_trans *trans;
+ struct btree *b;
+ struct btree_node_iter node_iter;
+ struct bkey unpacked;
+
+ struct journal_iter journal;
+ struct bpos pos;
+ bool at_end;
+ bool prefetch;
+ bool fail_if_too_many_whiteouts;
+};
+
+static inline int __journal_key_btree_cmp(enum btree_id l_btree_id,
+ unsigned l_level,
+ const struct journal_key *r)
+{
+ return -cmp_int(l_level, r->level) ?:
+ cmp_int(l_btree_id, r->btree_id);
+}
+
+static inline int __journal_key_cmp(enum btree_id l_btree_id,
+ unsigned l_level,
+ struct bpos l_pos,
+ const struct journal_key *r)
+{
+ return __journal_key_btree_cmp(l_btree_id, l_level, r) ?:
+ bpos_cmp(l_pos, r->k->k.p);
+}
+
+static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
+{
+ return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
+}
+
+struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos, struct bpos, size_t *);
+struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos, struct bpos, size_t *);
+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos);
+
+int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *,
+ struct btree_and_journal_iter *);
+
+int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
+ unsigned, struct bkey_i *);
+int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
+ unsigned, struct bkey_i *);
+int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos);
+bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, struct bpos);
+void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos);
+
+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
+
+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
+ struct btree_and_journal_iter *, struct btree *,
+ struct btree_node_iter, struct bpos);
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
+ struct btree_and_journal_iter *, struct btree *);
+
+void bch2_journal_keys_put(struct bch_fs *);
+
+static inline void bch2_journal_keys_put_initial(struct bch_fs *c)
+{
+ if (c->journal_keys.initial_ref_held)
+ bch2_journal_keys_put(c);
+ c->journal_keys.initial_ref_held = false;
+}
+
+int bch2_journal_keys_sort(struct bch_fs *);
+
+void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id,
+ unsigned, unsigned,
+ struct bpos, struct bpos);
+
+void bch2_journal_keys_dump(struct bch_fs *);
+
+void bch2_fs_journal_keys_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
diff --git a/libbcachefs/btree_journal_iter_types.h b/libbcachefs/btree_journal_iter_types.h
new file mode 100644
index 00000000..8b773823
--- /dev/null
+++ b/libbcachefs/btree_journal_iter_types.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
+#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
+
+struct journal_key_range_overwritten {
+ size_t start, end;
+};
+
+struct journal_key {
+ u64 journal_seq;
+ u32 journal_offset;
+ enum btree_id btree_id:8;
+ unsigned level:8;
+ bool allocated;
+ bool overwritten;
+ struct journal_key_range_overwritten __rcu *
+ overwritten_range;
+ struct bkey_i *k;
+};
+
+struct journal_keys {
+ /* must match layout in darray_types.h */
+ size_t nr, size;
+ struct journal_key *data;
+ /*
+ * Gap buffer: instead of all the empty space in the array being at the
+ * end of the buffer - from @nr to @size - the empty space is at @gap.
+ * This means that sequential insertions are O(n) instead of O(n^2).
+ */
+ size_t gap;
+ atomic_t ref;
+ bool initial_ref_held;
+ struct mutex overwrite_lock;
+};
+
+#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
new file mode 100644
index 00000000..3bd40ea0
--- /dev/null
+++ b/libbcachefs/btree_key_cache.c
@@ -0,0 +1,820 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "btree_key_cache.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "errcode.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "trace.h"
+
+#include <linux/sched/mm.h>
+
+static inline bool btree_uses_pcpu_readers(enum btree_id id)
+{
+ return id == BTREE_ID_subvolumes;
+}
+
+static struct kmem_cache *bch2_key_cache;
+
+static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const struct bkey_cached *ck = obj;
+ const struct bkey_cached_key *key = arg->key;
+
+ return ck->key.btree_id != key->btree_id ||
+ !bpos_eq(ck->key.pos, key->pos);
+}
+
+static const struct rhashtable_params bch2_btree_key_cache_params = {
+ .head_offset = offsetof(struct bkey_cached, hash),
+ .key_offset = offsetof(struct bkey_cached, key),
+ .key_len = sizeof(struct bkey_cached_key),
+ .obj_cmpfn = bch2_btree_key_cache_cmp_fn,
+ .automatic_shrinking = true,
+};
+
+static inline void btree_path_cached_set(struct btree_trans *trans, struct btree_path *path,
+ struct bkey_cached *ck,
+ enum btree_node_locked_type lock_held)
+{
+ path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
+ path->l[0].b = (void *) ck;
+ mark_btree_node_locked(trans, path, 0, lock_held);
+}
+
+__flatten
+inline struct bkey_cached *
+bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
+{
+ struct bkey_cached_key key = {
+ .btree_id = btree_id,
+ .pos = pos,
+ };
+
+ return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
+ bch2_btree_key_cache_params);
+}
+
+static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
+{
+ if (!six_trylock_intent(&ck->c.lock))
+ return false;
+
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ six_unlock_intent(&ck->c.lock);
+ return false;
+ }
+
+ if (!six_trylock_write(&ck->c.lock)) {
+ six_unlock_intent(&ck->c.lock);
+ return false;
+ }
+
+ return true;
+}
+
+static bool bkey_cached_evict(struct btree_key_cache *c,
+ struct bkey_cached *ck)
+{
+ bool ret = !rhashtable_remove_fast(&c->table, &ck->hash,
+ bch2_btree_key_cache_params);
+ if (ret) {
+ memset(&ck->key, ~0, sizeof(ck->key));
+ atomic_long_dec(&c->nr_keys);
+ }
+
+ return ret;
+}
+
+static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu)
+{
+ struct bch_fs *c = container_of(pending->srcu, struct bch_fs, btree_trans_barrier);
+ struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu);
+
+ this_cpu_dec(*c->btree_key_cache.nr_pending);
+ kmem_cache_free(bch2_key_cache, ck);
+}
+
+static void bkey_cached_free(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
+{
+ kfree(ck->k);
+ ck->k = NULL;
+ ck->u64s = 0;
+
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
+
+ bool pcpu_readers = ck->c.lock.readers != NULL;
+ rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu);
+ this_cpu_inc(*bc->nr_pending);
+}
+
+static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
+{
+ gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
+
+ struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp);
+ if (unlikely(!ck))
+ return NULL;
+ ck->k = kmalloc(key_u64s * sizeof(u64), gfp);
+ if (unlikely(!ck->k)) {
+ kmem_cache_free(bch2_key_cache, ck);
+ return NULL;
+ }
+ ck->u64s = key_u64s;
+ return ck;
+}
+
+static struct bkey_cached *
+bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_key_cache *bc = &c->btree_key_cache;
+ bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
+ int ret;
+
+ struct bkey_cached *ck = container_of_or_null(
+ rcu_pending_dequeue(&bc->pending[pcpu_readers]),
+ struct bkey_cached, rcu);
+ if (ck)
+ goto lock;
+
+ ck = allocate_dropping_locks(trans, ret,
+ __bkey_cached_alloc(key_u64s, _gfp));
+ if (ret) {
+ if (ck)
+ kfree(ck->k);
+ kmem_cache_free(bch2_key_cache, ck);
+ return ERR_PTR(ret);
+ }
+
+ if (ck) {
+ bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
+ ck->c.cached = true;
+ goto lock;
+ }
+
+ ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]),
+ struct bkey_cached, rcu);
+	if (ck)
+		goto lock;
+	return NULL;
+lock:
+ six_lock_intent(&ck->c.lock, NULL, NULL);
+ six_lock_write(&ck->c.lock, NULL, NULL);
+ return ck;
+}
+
+static struct bkey_cached *
+bkey_cached_reuse(struct btree_key_cache *c)
+{
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ struct bkey_cached *ck;
+ unsigned i;
+
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(c->table.tbl, &c->table);
+ for (i = 0; i < tbl->size; i++)
+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+ bkey_cached_lock_for_evict(ck)) {
+ if (bkey_cached_evict(c, ck))
+ goto out;
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
+ }
+ }
+ ck = NULL;
+out:
+ rcu_read_unlock();
+ return ck;
+}
+
+static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *path,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_key_cache *bc = &c->btree_key_cache;
+
+ /*
+ * bch2_varint_decode can read past the end of the buffer by at
+ * most 7 bytes (it won't be used):
+ */
+ unsigned key_u64s = k.k->u64s + 1;
+
+ /*
+ * Allocate some extra space so that the transaction commit path is less
+ * likely to have to reallocate, since that requires a transaction
+ * restart:
+ */
+ key_u64s = min(256U, (key_u64s * 3) / 2);
+ key_u64s = roundup_pow_of_two(key_u64s);
+
+ struct bkey_cached *ck = bkey_cached_alloc(trans, path, key_u64s);
+ int ret = PTR_ERR_OR_ZERO(ck);
+ if (ret)
+ return ret;
+
+ if (unlikely(!ck)) {
+ ck = bkey_cached_reuse(bc);
+ if (unlikely(!ck)) {
+ bch_err(c, "error allocating memory for key cache item, btree %s",
+ bch2_btree_id_str(path->btree_id));
+ return -BCH_ERR_ENOMEM_btree_key_cache_create;
+ }
+ }
+
+ ck->c.level = 0;
+ ck->c.btree_id = path->btree_id;
+ ck->key.btree_id = path->btree_id;
+ ck->key.pos = path->pos;
+ ck->flags = 1U << BKEY_CACHED_ACCESSED;
+
+ if (unlikely(key_u64s > ck->u64s)) {
+ mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
+
+ struct bkey_i *new_k = allocate_dropping_locks(trans, ret,
+ kmalloc(key_u64s * sizeof(u64), _gfp));
+ if (unlikely(!new_k)) {
+ bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
+ bch2_btree_id_str(ck->key.btree_id), key_u64s);
+ ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
+ } else if (ret) {
+ kfree(new_k);
+ goto err;
+ }
+
+ kfree(ck->k);
+ ck->k = new_k;
+ ck->u64s = key_u64s;
+ }
+
+ bkey_reassemble(ck->k, k);
+
+ ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params);
+ if (unlikely(ret)) /* raced with another fill? */
+ goto err;
+
+ atomic_long_inc(&bc->nr_keys);
+ six_unlock_write(&ck->c.lock);
+
+ enum six_lock_type lock_want = __btree_lock_want(path, 0);
+ if (lock_want == SIX_LOCK_read)
+ six_lock_downgrade(&ck->c.lock);
+ btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want);
+ path->uptodate = BTREE_ITER_UPTODATE;
+ return 0;
+err:
+ bkey_cached_free(bc, ck);
+ mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
+
+ return ret;
+}
+
+static noinline int btree_key_cache_fill(struct btree_trans *trans,
+ struct btree_path *ck_path,
+ unsigned flags)
+{
+ if (flags & BTREE_ITER_cached_nofill) {
+ ck_path->uptodate = BTREE_ITER_UPTODATE;
+ return 0;
+ }
+
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos,
+ BTREE_ITER_key_cache_fill|
+ BTREE_ITER_cached_nofill);
+ iter.flags &= ~BTREE_ITER_with_journal;
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ /* Recheck after btree lookup, before allocating: */
+ ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? -EEXIST : 0;
+ if (unlikely(ret))
+ goto out;
+
+ ret = btree_key_cache_create(trans, ck_path, k);
+ if (ret)
+ goto err;
+out:
+ /* We're not likely to need this iterator again: */
+ bch2_set_btree_iter_dontneed(&iter);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static inline int btree_path_traverse_cached_fast(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_cached *ck;
+retry:
+ ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
+ if (!ck)
+ return -ENOENT;
+
+ enum six_lock_type lock_want = __btree_lock_want(path, 0);
+
+ int ret = btree_node_lock(trans, path, (void *) ck, 0, lock_want, _THIS_IP_);
+ if (ret)
+ return ret;
+
+ if (ck->key.btree_id != path->btree_id ||
+ !bpos_eq(ck->key.pos, path->pos)) {
+ six_unlock_type(&ck->c.lock, lock_want);
+ goto retry;
+ }
+
+ if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+ set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+
+ btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want);
+ path->uptodate = BTREE_ITER_UPTODATE;
+ return 0;
+}
+
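+/*
+ * Traverse a key cache path: try the lookup fast path first; on -ENOENT fill
+ * the cache from the btree, and on -EEXIST (raced with another fill) retry
+ * the fast path:
+ */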
+int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
+ unsigned flags)
+{
+ EBUG_ON(path->level);
+
+ path->l[1].b = NULL;
+
+ int ret;
+ do {
+ ret = btree_path_traverse_cached_fast(trans, path);
+ if (unlikely(ret == -ENOENT))
+ ret = btree_key_cache_fill(trans, path, flags);
+ } while (ret == -EEXIST);
+
+ if (unlikely(ret)) {
+ path->uptodate = BTREE_ITER_NEED_TRAVERSE;
+ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ btree_node_unlock(trans, path, 0);
+ path->l[0].b = ERR_PTR(ret);
+ }
+ }
+ return ret;
+}
+
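+/*
+ * Flush a dirty key cache entry back to the btree and drop its journal pin;
+ * if @evict is set, also remove it from the key cache:
+ */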
+static int btree_key_cache_flush_pos(struct btree_trans *trans,
+ struct bkey_cached_key key,
+ u64 journal_seq,
+ unsigned commit_flags,
+ bool evict)
+{
+ struct bch_fs *c = trans->c;
+ struct journal *j = &c->journal;
+ struct btree_iter c_iter, b_iter;
+ struct bkey_cached *ck = NULL;
+ int ret;
+
+ bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
+ BTREE_ITER_slots|
+ BTREE_ITER_intent|
+ BTREE_ITER_all_snapshots);
+ bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
+ BTREE_ITER_cached|
+ BTREE_ITER_intent);
+ b_iter.flags &= ~BTREE_ITER_with_key_cache;
+
+ ret = bch2_btree_iter_traverse(&c_iter);
+ if (ret)
+ goto out;
+
+ ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b;
+ if (!ck)
+ goto out;
+
+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ if (evict)
+ goto evict;
+ goto out;
+ }
+
+ if (journal_seq && ck->journal.seq != journal_seq)
+ goto out;
+
+ trans->journal_res.seq = ck->journal.seq;
+
+ /*
+ * If we're at the end of the journal, we really want to free up space
+ * in the journal right away - we don't want to pin that old journal
+ * sequence number with a new btree node write; we want to re-journal
+ * the update
+ */
+ if (ck->journal.seq == journal_last_seq(j))
+ commit_flags |= BCH_WATERMARK_reclaim;
+
+ if (ck->journal.seq != journal_last_seq(j) ||
+ !test_bit(JOURNAL_space_low, &c->journal.flags))
+ commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
+
+ struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter);
+ ret = bkey_err(btree_k);
+ if (ret)
+ goto err;
+
+	/* Check that we're not violating cache coherency rules: */
+ BUG_ON(bkey_deleted(btree_k.k));
+
+ ret = bch2_trans_update(trans, &b_iter, ck->k,
+ BTREE_UPDATE_key_cache_reclaim|
+ BTREE_UPDATE_internal_snapshot_node|
+ BTREE_TRIGGER_norun) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc|
+ commit_flags);
+err:
+ bch2_fs_fatal_err_on(ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
+ !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
+ !bch2_journal_error(j), c,
+ "flushing key cache: %s", bch2_err_str(ret));
+ if (ret)
+ goto out;
+
+ bch2_journal_pin_drop(j, &ck->journal);
+
+ struct btree_path *path = btree_iter_path(trans, &c_iter);
+ BUG_ON(!btree_node_locked(path, 0));
+
+ if (!evict) {
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+ atomic_long_dec(&c->btree_key_cache.nr_dirty);
+ }
+ } else {
+ struct btree_path *path2;
+ unsigned i;
+evict:
+ trans_for_each_path(trans, path2, i)
+ if (path2 != path)
+ __bch2_btree_path_unlock(trans, path2);
+
+ bch2_btree_node_lock_write_nofail(trans, path, &ck->c);
+
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+ atomic_long_dec(&c->btree_key_cache.nr_dirty);
+ }
+
+ mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
+ if (bkey_cached_evict(&c->btree_key_cache, ck)) {
+ bkey_cached_free(&c->btree_key_cache, ck);
+ } else {
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
+ }
+ }
+out:
+ bch2_trans_iter_exit(trans, &b_iter);
+ bch2_trans_iter_exit(trans, &c_iter);
+ return ret;
+}
+
+int bch2_btree_key_cache_journal_flush(struct journal *j,
+ struct journal_entry_pin *pin, u64 seq)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bkey_cached *ck =
+ container_of(pin, struct bkey_cached, journal);
+ struct bkey_cached_key key;
+ struct btree_trans *trans = bch2_trans_get(c);
+ int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+ int ret = 0;
+
+ btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
+ key = ck->key;
+
+ if (ck->journal.seq != seq ||
+ !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ six_unlock_read(&ck->c.lock);
+ goto unlock;
+ }
+
+ if (ck->seq != seq) {
+ bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
+ bch2_btree_key_cache_journal_flush);
+ six_unlock_read(&ck->c.lock);
+ goto unlock;
+ }
+ six_unlock_read(&ck->c.lock);
+
+ ret = lockrestart_do(trans,
+ btree_key_cache_flush_pos(trans, key, seq,
+ BCH_TRANS_COMMIT_journal_reclaim, false));
+unlock:
+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+
+ bch2_trans_put(trans);
+ return ret;
+}
+
+bool bch2_btree_insert_key_cached(struct btree_trans *trans,
+ unsigned flags,
+ struct btree_insert_entry *insert_entry)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b;
+ struct bkey_i *insert = insert_entry->k;
+ bool kick_reclaim = false;
+
+ BUG_ON(insert->k.u64s > ck->u64s);
+
+ bkey_copy(ck->k, insert);
+
+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
+ set_bit(BKEY_CACHED_DIRTY, &ck->flags);
+ atomic_long_inc(&c->btree_key_cache.nr_dirty);
+
+ if (bch2_nr_btree_keys_need_flush(c))
+ kick_reclaim = true;
+ }
+
+ /*
+ * To minimize lock contention, we only add the journal pin here and
+ * defer pin updates to the flush callback via ->seq. Be careful not to
+ * update ->seq on nojournal commits because we don't want to update the
+ * pin to a seq that doesn't include journal updates on disk. Otherwise
+ * we risk losing the update after a crash.
+ *
+ * The only exception is if the pin is not active in the first place. We
+ * have to add the pin because journal reclaim drives key cache
+ * flushing. The flush callback will not proceed unless ->seq matches
+ * the latest pin, so make sure it starts with a consistent value.
+ */
+ if (!(insert_entry->flags & BTREE_UPDATE_nojournal) ||
+ !journal_pin_active(&ck->journal)) {
+ ck->seq = trans->journal_res.seq;
+ }
+ bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
+ &ck->journal, bch2_btree_key_cache_journal_flush);
+
+ if (kick_reclaim)
+ journal_reclaim_kick(&c->journal);
+ return true;
+}
+
+void bch2_btree_key_cache_drop(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_key_cache *bc = &c->btree_key_cache;
+ struct bkey_cached *ck = (void *) path->l[0].b;
+
+ /*
+ * We just did an update to the btree, bypassing the key cache: the key
+ * cache key is now stale and must be dropped, even if dirty:
+ */
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+ atomic_long_dec(&c->btree_key_cache.nr_dirty);
+ bch2_journal_pin_drop(&c->journal, &ck->journal);
+ }
+
+ bkey_cached_evict(bc, ck);
+ bkey_cached_free(bc, ck);
+
+ mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ path->should_be_locked = false;
+}
+
+static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct bch_fs *c = shrink->private_data;
+ struct btree_key_cache *bc = &c->btree_key_cache;
+ struct bucket_table *tbl;
+ struct bkey_cached *ck;
+ size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
+ unsigned iter, start;
+ int srcu_idx;
+
+ srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+ rcu_read_lock();
+
+ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+
+ /*
+ * Scanning is expensive while a rehash is in progress - most elements
+ * will be on the new hashtable, if it's in progress
+ *
+ * A rehash could still start while we're scanning - that's ok, we'll
+ * still see most elements.
+ */
+ if (unlikely(tbl->nest)) {
+ rcu_read_unlock();
+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+ return SHRINK_STOP;
+ }
+
+ iter = bc->shrink_iter;
+ if (iter >= tbl->size)
+ iter = 0;
+ start = iter;
+
+ do {
+ struct rhash_head *pos, *next;
+
+ pos = rht_ptr_rcu(&tbl->buckets[iter]);
+
+ while (!rht_is_a_nulls(pos)) {
+ next = rht_dereference_bucket_rcu(pos->next, tbl, iter);
+ ck = container_of(pos, struct bkey_cached, hash);
+
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ bc->skipped_dirty++;
+ } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) {
+ clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+ bc->skipped_accessed++;
+ } else if (!bkey_cached_lock_for_evict(ck)) {
+ bc->skipped_lock_fail++;
+ } else if (bkey_cached_evict(bc, ck)) {
+ bkey_cached_free(bc, ck);
+ bc->freed++;
+ freed++;
+ } else {
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
+ }
+
+ scanned++;
+ if (scanned >= nr)
+ goto out;
+
+ pos = next;
+ }
+
+ iter++;
+ if (iter >= tbl->size)
+ iter = 0;
+ } while (scanned < nr && iter != start);
+out:
+ bc->shrink_iter = iter;
+
+ rcu_read_unlock();
+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+
+ return freed;
+}
+
+static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct bch_fs *c = shrink->private_data;
+ struct btree_key_cache *bc = &c->btree_key_cache;
+ long nr = atomic_long_read(&bc->nr_keys) -
+ atomic_long_read(&bc->nr_dirty);
+
+ /*
+ * Avoid hammering our shrinker too much if it's nearly empty - the
+ * shrinker code doesn't take into account how big our cache is; if it's
+ * mostly empty but the system is under memory pressure, it causes nasty
+ * lock contention:
+ */
+ nr -= 128;
+
+ return max(0L, nr);
+}
+
+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
+{
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+ struct bucket_table *tbl;
+ struct bkey_cached *ck;
+ struct rhash_head *pos;
+ LIST_HEAD(items);
+ unsigned i;
+
+ shrinker_free(bc->shrink);
+
+ /*
+ * The loop is needed to guard against racing with rehash:
+ */
+ while (atomic_long_read(&bc->nr_keys)) {
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+ if (tbl) {
+ if (tbl->nest) {
+ /* wait for in progress rehash */
+ rcu_read_unlock();
+ mutex_lock(&bc->table.mutex);
+ mutex_unlock(&bc->table.mutex);
+ rcu_read_lock();
+ continue;
+ }
+ for (i = 0; i < tbl->size; i++)
+ while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) {
+ ck = container_of(pos, struct bkey_cached, hash);
+ BUG_ON(!bkey_cached_evict(bc, ck));
+ kfree(ck->k);
+ kmem_cache_free(bch2_key_cache, ck);
+ }
+ }
+ rcu_read_unlock();
+ }
+
+ if (atomic_long_read(&bc->nr_dirty) &&
+ !bch2_journal_error(&c->journal) &&
+ test_bit(BCH_FS_was_rw, &c->flags))
+ panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
+ atomic_long_read(&bc->nr_dirty));
+
+ if (atomic_long_read(&bc->nr_keys))
+ panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
+ atomic_long_read(&bc->nr_keys));
+
+ if (bc->table_init_done)
+ rhashtable_destroy(&bc->table);
+
+ rcu_pending_exit(&bc->pending[0]);
+ rcu_pending_exit(&bc->pending[1]);
+
+ free_percpu(bc->nr_pending);
+}
+
+void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
+{
+}
+
+int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
+{
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+ struct shrinker *shrink;
+
+ bc->nr_pending = alloc_percpu(size_t);
+ if (!bc->nr_pending)
+ return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+
+ if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) ||
+ rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free))
+ return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+
+ if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
+ return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+
+ bc->table_init_done = true;
+
+ shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
+ if (!shrink)
+ return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+ bc->shrink = shrink;
+ shrink->count_objects = bch2_btree_key_cache_count;
+ shrink->scan_objects = bch2_btree_key_cache_scan;
+ shrink->batch = 1 << 14;
+ shrink->seeks = 0;
+ shrink->private_data = c;
+ shrinker_register(shrink);
+ return 0;
+}
+
+void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc)
+{
+ printbuf_tabstop_push(out, 24);
+ printbuf_tabstop_push(out, 12);
+
+ prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys));
+ prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty));
+ prt_printf(out, "table size:\t%u\r\n", bc->table.tbl->size);
+ prt_newline(out);
+ prt_printf(out, "shrinker:\n");
+ prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free);
+ prt_printf(out, "freed:\t%lu\r\n", bc->freed);
+ prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty);
+ prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed);
+ prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail);
+ prt_newline(out);
+ prt_printf(out, "pending:\t%zu\r\n", per_cpu_sum(bc->nr_pending));
+}
+
+void bch2_btree_key_cache_exit(void)
+{
+ kmem_cache_destroy(bch2_key_cache);
+}
+
+int __init bch2_btree_key_cache_init(void)
+{
+ bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT);
+ if (!bch2_key_cache)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h
new file mode 100644
index 00000000..51d6289b
--- /dev/null
+++ b/libbcachefs/btree_key_cache.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_KEY_CACHE_H
+#define _BCACHEFS_BTREE_KEY_CACHE_H
+
+static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
+{
+ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
+ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
+ size_t max_dirty = 1024 + nr_keys / 2;
+
+ return max_t(ssize_t, 0, nr_dirty - max_dirty);
+}
+
+static inline ssize_t __bch2_btree_key_cache_must_wait(struct bch_fs *c)
+{
+ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
+ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
+ size_t max_dirty = 4096 + (nr_keys * 3) / 4;
+
+ return nr_dirty - max_dirty;
+}
+
+static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
+{
+ return __bch2_btree_key_cache_must_wait(c) > 0;
+}
+
+static inline bool bch2_btree_key_cache_wait_done(struct bch_fs *c)
+{
+ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
+ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
+ size_t max_dirty = 2048 + (nr_keys * 5) / 8;
+
+ return nr_dirty <= max_dirty;
+}
+
+int bch2_btree_key_cache_journal_flush(struct journal *,
+ struct journal_entry_pin *, u64);
+
+struct bkey_cached *
+bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
+
+int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *,
+ unsigned);
+
+bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned,
+ struct btree_insert_entry *);
+void bch2_btree_key_cache_drop(struct btree_trans *,
+ struct btree_path *);
+
+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *);
+void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *);
+int bch2_fs_btree_key_cache_init(struct btree_key_cache *);
+
+void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *);
+
+void bch2_btree_key_cache_exit(void);
+int __init bch2_btree_key_cache_init(void);
+
+#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */
diff --git a/libbcachefs/btree_key_cache_types.h b/libbcachefs/btree_key_cache_types.h
new file mode 100644
index 00000000..722f1ed1
--- /dev/null
+++ b/libbcachefs/btree_key_cache_types.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
+#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
+
+#include "rcu_pending.h"
+
+struct btree_key_cache {
+ struct rhashtable table;
+ bool table_init_done;
+
+ struct shrinker *shrink;
+ unsigned shrink_iter;
+
+ /* 0: non pcpu reader locks, 1: pcpu reader locks */
+ struct rcu_pending pending[2];
+ size_t __percpu *nr_pending;
+
+ atomic_long_t nr_keys;
+ atomic_long_t nr_dirty;
+
+ /* shrinker stats */
+ unsigned long requested_to_free;
+ unsigned long freed;
+ unsigned long skipped_dirty;
+ unsigned long skipped_accessed;
+ unsigned long skipped_lock_fail;
+};
+
+struct bkey_cached_key {
+ u32 btree_id;
+ struct bpos pos;
+} __packed __aligned(4);
+
+#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */
diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c
new file mode 100644
index 00000000..d343df9f
--- /dev/null
+++ b/libbcachefs/btree_locking.c
@@ -0,0 +1,887 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_locking.h"
+#include "btree_types.h"
+
+static struct lock_class_key bch2_btree_node_lock_key;
+
+void bch2_btree_lock_init(struct btree_bkey_cached_common *b,
+ enum six_lock_init_flags flags)
+{
+ __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags);
+ lockdep_set_notrack_class(&b->lock);
+}
+
+/* Btree node locking: */
+
+struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
+ struct btree_path *skip,
+ struct btree_bkey_cached_common *b,
+ unsigned level)
+{
+ struct btree_path *path;
+ struct six_lock_count ret;
+ unsigned i;
+
+ memset(&ret, 0, sizeof(ret));
+
+ if (IS_ERR_OR_NULL(b))
+ return ret;
+
+ trans_for_each_path(trans, path, i)
+ if (path != skip && &path->l[level].b->c == b) {
+ int t = btree_node_locked_type(path, level);
+
+ if (t != BTREE_NODE_UNLOCKED)
+ ret.n[t]++;
+ }
+
+ return ret;
+}
+
+/* unlock */
+
+void bch2_btree_node_unlock_write(struct btree_trans *trans,
+ struct btree_path *path, struct btree *b)
+{
+ bch2_btree_node_unlock_write_inlined(trans, path, b);
+}
+
+/* lock */
+
+/*
+ * @trans wants to lock @b with type @type
+ */
+struct trans_waiting_for_lock {
+ struct btree_trans *trans;
+ struct btree_bkey_cached_common *node_want;
+ enum six_lock_type lock_want;
+
+	/* for iterating over held locks: */
+ u8 path_idx;
+ u8 level;
+ u64 lock_start_time;
+};
+
+struct lock_graph {
+ struct trans_waiting_for_lock g[8];
+ unsigned nr;
+};
+
+static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
+{
+ struct trans_waiting_for_lock *i;
+
+ prt_printf(out, "Found lock cycle (%u entries):\n", g->nr);
+
+ for (i = g->g; i < g->g + g->nr; i++) {
+ struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);
+ if (!task)
+ continue;
+
+ bch2_btree_trans_to_text(out, i->trans);
+ bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1, GFP_NOWAIT);
+ }
+}
+
+static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
+{
+ struct trans_waiting_for_lock *i;
+
+ for (i = g->g; i != g->g + g->nr; i++) {
+ struct task_struct *task = i->trans->locking_wait.task;
+ if (i != g->g)
+ prt_str(out, "<- ");
+		prt_printf(out, "%u ", task ? task->pid : 0);
+ }
+ prt_newline(out);
+}
+
+static void lock_graph_up(struct lock_graph *g)
+{
+ closure_put(&g->g[--g->nr].trans->ref);
+}
+
+static noinline void lock_graph_pop_all(struct lock_graph *g)
+{
+ while (g->nr)
+ lock_graph_up(g);
+}
+
+static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
+{
+ g->g[g->nr++] = (struct trans_waiting_for_lock) {
+ .trans = trans,
+ .node_want = trans->locking,
+ .lock_want = trans->locking_wait.lock_want,
+ };
+}
+
+static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
+{
+ closure_get(&trans->ref);
+ __lock_graph_down(g, trans);
+}
+
+static bool lock_graph_remove_non_waiters(struct lock_graph *g)
+{
+ struct trans_waiting_for_lock *i;
+
+ for (i = g->g + 1; i < g->g + g->nr; i++)
+ if (i->trans->locking != i->node_want ||
+ i->trans->locking_wait.start_time != i[-1].lock_start_time) {
+ while (g->g + g->nr > i)
+ lock_graph_up(g);
+ return true;
+ }
+
+ return false;
+}
+
+static void trace_would_deadlock(struct lock_graph *g, struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+
+ count_event(c, trans_restart_would_deadlock);
+
+ if (trace_trans_restart_would_deadlock_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ buf.atomic++;
+ print_cycle(&buf, g);
+
+ trace_trans_restart_would_deadlock(trans, buf.buf);
+ printbuf_exit(&buf);
+ }
+}
+
+static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
+{
+ if (i == g->g) {
+ trace_would_deadlock(g, i->trans);
+ return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
+ } else {
+ i->trans->lock_must_abort = true;
+ wake_up_process(i->trans->locking_wait.task);
+ return 0;
+ }
+}
+
+static int btree_trans_abort_preference(struct btree_trans *trans)
+{
+ if (trans->lock_may_not_fail)
+ return 0;
+ if (trans->locking_wait.lock_want == SIX_LOCK_write)
+ return 1;
+ if (!trans->in_traverse_all)
+ return 2;
+ return 3;
+}
+
+static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
+{
+ struct trans_waiting_for_lock *i, *abort = NULL;
+ unsigned best = 0, pref;
+ int ret;
+
+ if (lock_graph_remove_non_waiters(g))
+ return 0;
+
+ /* Only checking, for debugfs: */
+ if (cycle) {
+ print_cycle(cycle, g);
+ ret = -1;
+ goto out;
+ }
+
+ for (i = g->g; i < g->g + g->nr; i++) {
+ pref = btree_trans_abort_preference(i->trans);
+ if (pref > best) {
+ abort = i;
+ best = pref;
+ }
+ }
+
+ if (unlikely(!best)) {
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+
+ prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
+
+ for (i = g->g; i < g->g + g->nr; i++) {
+ struct btree_trans *trans = i->trans;
+
+ bch2_btree_trans_to_text(&buf, trans);
+
+ prt_printf(&buf, "backtrace:\n");
+ printbuf_indent_add(&buf, 2);
+ bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+ }
+
+ bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ BUG();
+ }
+
+ ret = abort_lock(g, abort);
+out:
+ if (ret)
+ while (g->nr)
+ lock_graph_up(g);
+ return ret;
+}
+
+static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
+ struct printbuf *cycle)
+{
+ struct btree_trans *orig_trans = g->g->trans;
+ struct trans_waiting_for_lock *i;
+
+ for (i = g->g; i < g->g + g->nr; i++)
+ if (i->trans == trans) {
+ closure_put(&trans->ref);
+ return break_cycle(g, cycle);
+ }
+
+ if (g->nr == ARRAY_SIZE(g->g)) {
+ closure_put(&trans->ref);
+
+ if (orig_trans->lock_may_not_fail)
+ return 0;
+
+ while (g->nr)
+ lock_graph_up(g);
+
+ if (cycle)
+ return 0;
+
+ trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_);
+ return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
+ }
+
+ __lock_graph_down(g, trans);
+ return 0;
+}
+
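+/*
+ * With SIX_LOCK_read = 0, SIX_LOCK_intent = 1 and SIX_LOCK_write = 2, the only
+ * compatible combinations are read/read and read/intent - exactly the pairs
+ * whose lock types sum to at most 1:
+ */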
+static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
+{
+ return t1 + t2 > 1;
+}
+
+int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
+{
+ struct lock_graph g;
+ struct trans_waiting_for_lock *top;
+ struct btree_bkey_cached_common *b;
+ btree_path_idx_t path_idx;
+ int ret = 0;
+
+ g.nr = 0;
+
+ if (trans->lock_must_abort) {
+ if (cycle)
+ return -1;
+
+ trace_would_deadlock(&g, trans);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
+ }
+
+ lock_graph_down(&g, trans);
+
+ /* trans->paths is rcu protected vs. freeing */
+ rcu_read_lock();
+ if (cycle)
+ cycle->atomic++;
+next:
+ if (!g.nr)
+ goto out;
+
+ top = &g.g[g.nr - 1];
+
+ struct btree_path *paths = rcu_dereference(top->trans->paths);
+ if (!paths)
+ goto up;
+
+ unsigned long *paths_allocated = trans_paths_allocated(paths);
+
+ trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths),
+ path_idx, top->path_idx) {
+ struct btree_path *path = paths + path_idx;
+ if (!path->nodes_locked)
+ continue;
+
+ if (path_idx != top->path_idx) {
+ top->path_idx = path_idx;
+ top->level = 0;
+ top->lock_start_time = 0;
+ }
+
+ for (;
+ top->level < BTREE_MAX_DEPTH;
+ top->level++, top->lock_start_time = 0) {
+ int lock_held = btree_node_locked_type(path, top->level);
+
+ if (lock_held == BTREE_NODE_UNLOCKED)
+ continue;
+
+ b = &READ_ONCE(path->l[top->level].b)->c;
+
+ if (IS_ERR_OR_NULL(b)) {
+ /*
+ * If we get here, it means we raced with the
+ * other thread updating its btree_path
+ * structures - which means it can't be blocked
+ * waiting on a lock:
+ */
+ if (!lock_graph_remove_non_waiters(&g)) {
+ /*
+ * If lock_graph_remove_non_waiters()
+ * didn't do anything, it must be
+ * because we're being called by debugfs
+ * checking for lock cycles, which
+ * invokes us on btree_transactions that
+ * aren't actually waiting on anything.
+ * Just bail out:
+ */
+ lock_graph_pop_all(&g);
+ }
+
+ goto next;
+ }
+
+ if (list_empty_careful(&b->lock.wait_list))
+ continue;
+
+ raw_spin_lock(&b->lock.wait_lock);
+ list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) {
+ BUG_ON(b != trans->locking);
+
+ if (top->lock_start_time &&
+ time_after_eq64(top->lock_start_time, trans->locking_wait.start_time))
+ continue;
+
+ top->lock_start_time = trans->locking_wait.start_time;
+
+ /* Don't check for self deadlock: */
+ if (trans == top->trans ||
+ !lock_type_conflicts(lock_held, trans->locking_wait.lock_want))
+ continue;
+
+ closure_get(&trans->ref);
+ raw_spin_unlock(&b->lock.wait_lock);
+
+ ret = lock_graph_descend(&g, trans, cycle);
+ if (ret)
+ goto out;
+ goto next;
+ }
+ raw_spin_unlock(&b->lock.wait_lock);
+ }
+ }
+up:
+ if (g.nr > 1 && cycle)
+ print_chain(cycle, &g);
+ lock_graph_up(&g);
+ goto next;
+out:
+ if (cycle)
+ --cycle->atomic;
+ rcu_read_unlock();
+ return ret;
+}
+
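bch2_check_for_deadlock() above builds a chain of transactions, each blocked on
a lock held by the previous one; if the chain ever loops back onto a
transaction already in it, there is a deadlock and break_cycle() restarts a
victim. A minimal standalone sketch of that cycle check, with toy types and a
single waits-for edge per transaction rather than the kernel data structures:

	#include <stdbool.h>
	#include <stdio.h>

	#define NR_TRANS 4

	/* waiting_on[i] = transaction holding the lock that i wants, or -1 if i isn't blocked */
	static const int waiting_on[NR_TRANS] = { 1, 2, 0, -1 };

	static bool would_deadlock(int start)
	{
		bool seen[NR_TRANS] = { false };

		for (int i = start; i != -1; i = waiting_on[i]) {
			if (seen[i])
				return true;	/* followed an edge back into the chain: a cycle */
			seen[i] = true;
		}
		return false;
	}

	int main(void)
	{
		printf("trans 0 deadlocks: %d\n", would_deadlock(0));	/* 1: 0 -> 1 -> 2 -> 0 */
		printf("trans 3 deadlocks: %d\n", would_deadlock(3));	/* 0: not waiting on anything */
		return 0;
	}

The real code additionally has to handle several waiters per lock, transactions
changing or disappearing underneath it, and choosing which member of the cycle
to abort.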
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
+{
+ struct btree_trans *trans = p;
+
+ return bch2_check_for_deadlock(trans, NULL);
+}
+
+int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path,
+ struct btree_bkey_cached_common *b,
+ bool lock_may_not_fail)
+{
+ int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
+ int ret;
+
+ /*
+ * Must drop our read locks before calling six_lock_write() -
+ * six_unlock() won't do wakeups until the reader count
+ * goes to 0, and it's safe because we have the node intent
+ * locked:
+ */
+ six_lock_readers_add(&b->lock, -readers);
+ ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write,
+ lock_may_not_fail, _RET_IP_);
+ six_lock_readers_add(&b->lock, readers);
+
+ if (ret)
+ mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_INTENT_LOCKED);
+
+ return ret;
+}
+
+void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b)
+{
+ int ret = __btree_node_lock_write(trans, path, b, true);
+ BUG_ON(ret);
+}
+
+/* relock */
+
+static inline bool btree_path_get_locks(struct btree_trans *trans,
+ struct btree_path *path,
+ bool upgrade,
+ struct get_locks_fail *f)
+{
+ unsigned l = path->level;
+ int fail_idx = -1;
+
+ do {
+ if (!btree_path_node(path, l))
+ break;
+
+ if (!(upgrade
+ ? bch2_btree_node_upgrade(trans, path, l)
+ : bch2_btree_node_relock(trans, path, l))) {
+ fail_idx = l;
+
+ if (f) {
+ f->l = l;
+ f->b = path->l[l].b;
+ }
+ }
+
+ l++;
+ } while (l < path->locks_want);
+
+ /*
+ * When we fail to get a lock, we have to ensure that any child nodes
+ * can't be relocked so bch2_btree_path_traverse has to walk back up to
+ * the node that we failed to relock:
+ */
+ if (fail_idx >= 0) {
+ __bch2_btree_path_unlock(trans, path);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+
+ do {
+ path->l[fail_idx].b = upgrade
+ ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
+ : ERR_PTR(-BCH_ERR_no_btree_node_relock);
+ --fail_idx;
+ } while (fail_idx >= 0);
+ }
+
+ if (path->uptodate == BTREE_ITER_NEED_RELOCK)
+ path->uptodate = BTREE_ITER_UPTODATE;
+
+ return path->uptodate < BTREE_ITER_NEED_RELOCK;
+}
+
+bool __bch2_btree_node_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned level,
+ bool trace)
+{
+ struct btree *b = btree_path_node(path, level);
+ int want = __btree_lock_want(path, level);
+
+ if (race_fault())
+ goto fail;
+
+ if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
+ (btree_node_lock_seq_matches(path, b, level) &&
+ btree_node_lock_increment(trans, &b->c, level, want))) {
+ mark_btree_node_locked(trans, path, level, want);
+ return true;
+ }
+fail:
+ if (trace && !trans->notrace_relock_fail)
+ trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level);
+ return false;
+}
+
+/* upgrade */
+
+bool bch2_btree_node_upgrade(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+{
+ struct btree *b = path->l[level].b;
+ struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level);
+
+ if (!is_btree_node(path, level))
+ return false;
+
+ switch (btree_lock_want(path, level)) {
+ case BTREE_NODE_UNLOCKED:
+ BUG_ON(btree_node_locked(path, level));
+ return true;
+ case BTREE_NODE_READ_LOCKED:
+ BUG_ON(btree_node_intent_locked(path, level));
+ return bch2_btree_node_relock(trans, path, level);
+ case BTREE_NODE_INTENT_LOCKED:
+ break;
+ case BTREE_NODE_WRITE_LOCKED:
+ BUG();
+ }
+
+ if (btree_node_intent_locked(path, level))
+ return true;
+
+ if (race_fault())
+ return false;
+
+ if (btree_node_locked(path, level)) {
+ bool ret;
+
+ six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]);
+ ret = six_lock_tryupgrade(&b->c.lock);
+ six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]);
+
+ if (ret)
+ goto success;
+ } else {
+ if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
+ goto success;
+ }
+
+ /*
+ * Do we already have an intent lock via another path? If so, just bump
+ * lock count:
+ */
+ if (btree_node_lock_seq_matches(path, b, level) &&
+ btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) {
+ btree_node_unlock(trans, path, level);
+ goto success;
+ }
+
+ trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level);
+ return false;
+success:
+ mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
+ return true;
+}
+
+/* Btree path locking: */
+
+/*
+ * Only for btree_cache.c - only relocks intent locks
+ */
+int bch2_btree_path_relock_intent(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ unsigned l;
+
+ for (l = path->level;
+ l < path->locks_want && btree_path_node(path, l);
+ l++) {
+ if (!bch2_btree_node_relock(trans, path, l)) {
+ __bch2_btree_path_unlock(trans, path);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
+ }
+ }
+
+ return 0;
+}
+
+__flatten
+bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path)
+{
+ struct get_locks_fail f;
+
+ bool ret = btree_path_get_locks(trans, path, false, &f);
+ bch2_trans_verify_locks(trans);
+ return ret;
+}
+
+int __bch2_btree_path_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned long trace_ip)
+{
+ if (!bch2_btree_path_relock_norestart(trans, path)) {
+ trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
+ }
+
+ return 0;
+}
+
+bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want,
+ struct get_locks_fail *f)
+{
+ EBUG_ON(path->locks_want >= new_locks_want);
+
+ path->locks_want = new_locks_want;
+
+ bool ret = btree_path_get_locks(trans, path, true, f);
+ bch2_trans_verify_locks(trans);
+ return ret;
+}
+
+bool __bch2_btree_path_upgrade(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want,
+ struct get_locks_fail *f)
+{
+ bool ret = bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f);
+ if (ret)
+ goto out;
+
+ /*
+ * XXX: this is ugly - we'd prefer to not be mucking with other
+ * iterators in the btree_trans here.
+ *
+ * On failure to upgrade the iterator, setting iter->locks_want and
+ * calling get_locks() is sufficient to make bch2_btree_path_traverse()
+ * get the locks we want on transaction restart.
+ *
+ * But if this iterator was a clone, on transaction restart what we did
+ * to this iterator isn't going to be preserved.
+ *
+ * Possibly we could add an iterator field for the parent iterator when
+ * an iterator is a copy - for now, we'll just upgrade any other
+ * iterators with the same btree id.
+ *
+ * The code below used to be needed to ensure ancestor nodes get locked
+ * before interior nodes - now that's handled by
+ * bch2_btree_path_traverse_all().
+ */
+ if (!path->cached && !trans->in_traverse_all) {
+ struct btree_path *linked;
+ unsigned i;
+
+ trans_for_each_path(trans, linked, i)
+ if (linked != path &&
+ linked->cached == path->cached &&
+ linked->btree_id == path->btree_id &&
+ linked->locks_want < new_locks_want) {
+ linked->locks_want = new_locks_want;
+ btree_path_get_locks(trans, linked, true, NULL);
+ }
+ }
+out:
+ bch2_trans_verify_locks(trans);
+ return ret;
+}
+
+void __bch2_btree_path_downgrade(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
+{
+ unsigned l, old_locks_want = path->locks_want;
+
+ if (trans->restarted)
+ return;
+
+ EBUG_ON(path->locks_want < new_locks_want);
+
+ path->locks_want = new_locks_want;
+
+ while (path->nodes_locked &&
+ (l = btree_path_highest_level_locked(path)) >= path->locks_want) {
+ if (l > path->level) {
+ btree_node_unlock(trans, path, l);
+ } else {
+ if (btree_node_intent_locked(path, l)) {
+ six_lock_downgrade(&path->l[l].b->c.lock);
+ mark_btree_node_locked_noreset(path, l, BTREE_NODE_READ_LOCKED);
+ }
+ break;
+ }
+ }
+
+ bch2_btree_path_verify_locks(path);
+
+ trace_path_downgrade(trans, _RET_IP_, path, old_locks_want);
+}
+
+/* Btree transaction locking: */
+
+void bch2_trans_downgrade(struct btree_trans *trans)
+{
+ struct btree_path *path;
+ unsigned i;
+
+ if (trans->restarted)
+ return;
+
+ trans_for_each_path(trans, path, i)
+ if (path->ref)
+ bch2_btree_path_downgrade(trans, path);
+}
+
+static inline void __bch2_trans_unlock(struct btree_trans *trans)
+{
+ struct btree_path *path;
+ unsigned i;
+
+ trans_for_each_path(trans, path, i)
+ __bch2_btree_path_unlock(trans, path);
+}
+
+static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path,
+ struct get_locks_fail *f, bool trace)
+{
+ if (!trace)
+ goto out;
+
+ if (trace_trans_restart_relock_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq);
+ if (IS_ERR_OR_NULL(f->b)) {
+ prt_str(&buf, bch2_err_str(PTR_ERR(f->b)));
+ } else {
+ prt_printf(&buf, "%u", f->b->c.lock.seq);
+
+ struct six_lock_count c =
+ bch2_btree_node_lock_counts(trans, NULL, &f->b->c, f->l);
+ prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
+
+ c = six_lock_counts(&f->b->c.lock);
+ prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
+ }
+
+ trace_trans_restart_relock(trans, _RET_IP_, buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ count_event(trans->c, trans_restart_relock);
+out:
+ __bch2_trans_unlock(trans);
+ bch2_trans_verify_locks(trans);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+}
+
+static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
+{
+ bch2_trans_verify_locks(trans);
+
+ if (unlikely(trans->restarted))
+ return -((int) trans->restarted);
+ if (unlikely(trans->locked))
+ goto out;
+
+ struct btree_path *path;
+ unsigned i;
+
+ trans_for_each_path(trans, path, i) {
+ struct get_locks_fail f;
+
+ if (path->should_be_locked &&
+ !btree_path_get_locks(trans, path, false, &f))
+ return bch2_trans_relock_fail(trans, path, &f, trace);
+ }
+
+ trans_set_locked(trans, true);
+out:
+ bch2_trans_verify_locks(trans);
+ return 0;
+}
+
+int bch2_trans_relock(struct btree_trans *trans)
+{
+ return __bch2_trans_relock(trans, true);
+}
+
+int bch2_trans_relock_notrace(struct btree_trans *trans)
+{
+ return __bch2_trans_relock(trans, false);
+}
+
+void bch2_trans_unlock_noassert(struct btree_trans *trans)
+{
+ __bch2_trans_unlock(trans);
+
+ trans_set_unlocked(trans);
+}
+
+void bch2_trans_unlock(struct btree_trans *trans)
+{
+ __bch2_trans_unlock(trans);
+
+ trans_set_unlocked(trans);
+}
+
+void bch2_trans_unlock_long(struct btree_trans *trans)
+{
+ bch2_trans_unlock(trans);
+ bch2_trans_srcu_unlock(trans);
+}
+
+int __bch2_trans_mutex_lock(struct btree_trans *trans,
+ struct mutex *lock)
+{
+ int ret = drop_locks_do(trans, (mutex_lock(lock), 0));
+
+ if (ret)
+ mutex_unlock(lock);
+ return ret;
+}
+
+/* Debug */
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+void bch2_btree_path_verify_locks(struct btree_path *path)
+{
+ /*
+ * A path may be uptodate and yet have nothing locked if and only if
+ * there is no node at path->level, which generally means we were
+ * iterating over all nodes and got to the end of the btree
+ */
+ BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
+ btree_path_node(path, path->level) &&
+ !path->nodes_locked);
+
+ if (!path->nodes_locked)
+ return;
+
+ for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
+ int want = btree_lock_want(path, l);
+ int have = btree_node_locked_type(path, l);
+
+ BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED);
+
+ BUG_ON(is_btree_node(path, l) &&
+ (want == BTREE_NODE_UNLOCKED ||
+ have != BTREE_NODE_WRITE_LOCKED) &&
+ want != have);
+ }
+}
+
+static bool bch2_trans_locked(struct btree_trans *trans)
+{
+ struct btree_path *path;
+ unsigned i;
+
+ trans_for_each_path(trans, path, i)
+ if (path->nodes_locked)
+ return true;
+ return false;
+}
+
+void bch2_trans_verify_locks(struct btree_trans *trans)
+{
+ if (!trans->locked) {
+ BUG_ON(bch2_trans_locked(trans));
+ return;
+ }
+
+ struct btree_path *path;
+ unsigned i;
+
+ trans_for_each_path(trans, path, i)
+ bch2_btree_path_verify_locks(path);
+}
+
+#endif
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index 419d0e81..7474ab6c 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_LOCKING_H
#define _BCACHEFS_BTREE_LOCKING_H
@@ -10,186 +11,436 @@
*/
#include "btree_iter.h"
-#include "btree_io.h"
#include "six.h"
+void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags);
+
+void bch2_trans_unlock_noassert(struct btree_trans *);
+
+static inline bool is_btree_node(struct btree_path *path, unsigned l)
+{
+ return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b);
+}
+
+static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans)
+{
+ return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats)
+ ? &trans->c->btree_transaction_stats[trans->fn_idx]
+ : NULL;
+}
+
/* matches six lock types */
enum btree_node_locked_type {
BTREE_NODE_UNLOCKED = -1,
BTREE_NODE_READ_LOCKED = SIX_LOCK_read,
BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent,
+ BTREE_NODE_WRITE_LOCKED = SIX_LOCK_write,
};
-static inline int btree_node_locked_type(struct btree_iter *iter,
+static inline int btree_node_locked_type(struct btree_path *path,
unsigned level)
{
- /*
- * We're relying on the fact that if nodes_intent_locked is set
- * nodes_locked must be set as well, so that we can compute without
- * branches:
- */
- return BTREE_NODE_UNLOCKED +
- ((iter->nodes_locked >> level) & 1) +
- ((iter->nodes_intent_locked >> level) & 1);
+ return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3);
}
-static inline bool btree_node_intent_locked(struct btree_iter *iter,
- unsigned level)
+static inline bool btree_node_write_locked(struct btree_path *path, unsigned l)
{
- return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED;
+ return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED;
}
-static inline bool btree_node_read_locked(struct btree_iter *iter,
- unsigned level)
+static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l)
{
- return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED;
+ return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED;
}
-static inline bool btree_node_locked(struct btree_iter *iter, unsigned level)
+static inline bool btree_node_read_locked(struct btree_path *path, unsigned l)
{
- return iter->nodes_locked & (1 << level);
+ return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED;
}
-static inline void mark_btree_node_unlocked(struct btree_iter *iter,
- unsigned level)
+static inline bool btree_node_locked(struct btree_path *path, unsigned level)
{
- iter->nodes_locked &= ~(1 << level);
- iter->nodes_intent_locked &= ~(1 << level);
+ return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED;
}
-static inline void mark_btree_node_locked(struct btree_iter *iter,
- unsigned level,
- enum six_lock_type type)
+static inline void mark_btree_node_locked_noreset(struct btree_path *path,
+ unsigned level,
+ enum btree_node_locked_type type)
{
/* relying on this to avoid a branch */
BUILD_BUG_ON(SIX_LOCK_read != 0);
BUILD_BUG_ON(SIX_LOCK_intent != 1);
- iter->nodes_locked |= 1 << level;
- iter->nodes_intent_locked |= type << level;
+ path->nodes_locked &= ~(3U << (level << 1));
+ path->nodes_locked |= (type + 1) << (level << 1);
+}
+
+static inline void mark_btree_node_unlocked(struct btree_path *path,
+ unsigned level)
+{
+ EBUG_ON(btree_node_write_locked(path, level));
+ mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED);
}
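path->nodes_locked above packs one two-bit field per btree level: zero means
unlocked, otherwise the value is the six lock type plus one, which is what
btree_node_locked_type() decodes and mark_btree_node_locked_noreset() encodes.
A standalone sketch of just that encoding, with illustrative names rather than
the kernel helpers:

	#include <assert.h>
	#include <stdint.h>

	enum node_lock { NODE_UNLOCKED = -1, NODE_READ = 0, NODE_INTENT = 1, NODE_WRITE = 2 };

	static void set_locked(uint16_t *nodes_locked, unsigned level, enum node_lock t)
	{
		*nodes_locked &= ~(3U << (level << 1));		/* clear the level's field */
		*nodes_locked |= (t + 1) << (level << 1);	/* 0 = unlocked */
	}

	static enum node_lock get_locked(uint16_t nodes_locked, unsigned level)
	{
		return NODE_UNLOCKED + ((nodes_locked >> (level << 1)) & 3);
	}

	int main(void)
	{
		uint16_t locked = 0;

		set_locked(&locked, 3, NODE_INTENT);
		assert(get_locked(locked, 3) == NODE_INTENT);
		assert(get_locked(locked, 0) == NODE_UNLOCKED);

		set_locked(&locked, 3, NODE_UNLOCKED);
		assert(!locked);
		return 0;
	}

btree_path_lowest_level_locked() and btree_path_highest_level_locked() below
rely on the same layout: __ffs()/__fls() of the bitmap, shifted right by one,
recovers the level.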
-static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
- unsigned level)
+static inline void mark_btree_node_locked(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned level,
+ enum btree_node_locked_type type)
{
- mark_btree_node_locked(iter, level, SIX_LOCK_intent);
+ mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type);
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+ path->l[level].lock_taken_time = local_clock();
+#endif
}
-static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level)
+static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
{
- return level < iter->locks_want
+ return level < path->locks_want
? SIX_LOCK_intent
: SIX_LOCK_read;
}
static inline enum btree_node_locked_type
-btree_lock_want(struct btree_iter *iter, int level)
+btree_lock_want(struct btree_path *path, int level)
{
- if (level < iter->level)
+ if (level < path->level)
return BTREE_NODE_UNLOCKED;
- if (level < iter->locks_want)
+ if (level < path->locks_want)
return BTREE_NODE_INTENT_LOCKED;
- if (level == iter->level)
+ if (level == path->level)
return BTREE_NODE_READ_LOCKED;
return BTREE_NODE_UNLOCKED;
}
-static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
+static inline void btree_trans_lock_hold_time_update(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
{
- int lock_type = btree_node_locked_type(iter, level);
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+ __bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
+ path->l[level].lock_taken_time,
+ local_clock());
+#endif
+}
+
+/* unlock: */
+
+static inline void btree_node_unlock(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+{
+ int lock_type = btree_node_locked_type(path, level);
EBUG_ON(level >= BTREE_MAX_DEPTH);
+ EBUG_ON(lock_type == BTREE_NODE_WRITE_LOCKED);
- if (lock_type != BTREE_NODE_UNLOCKED)
- six_unlock_type(&iter->l[level].b->lock, lock_type);
- mark_btree_node_unlocked(iter, level);
+ if (lock_type != BTREE_NODE_UNLOCKED) {
+ six_unlock_type(&path->l[level].b->c.lock, lock_type);
+ btree_trans_lock_hold_time_update(trans, path, level);
+ }
+ mark_btree_node_unlocked(path, level);
}
-static inline void __bch2_btree_iter_unlock(struct btree_iter *iter)
+static inline int btree_path_lowest_level_locked(struct btree_path *path)
{
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
+ return __ffs(path->nodes_locked) >> 1;
+}
- while (iter->nodes_locked)
- btree_node_unlock(iter, __ffs(iter->nodes_locked));
+static inline int btree_path_highest_level_locked(struct btree_path *path)
+{
+ return __fls(path->nodes_locked) >> 1;
}
-static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
+static inline void __bch2_btree_path_unlock(struct btree_trans *trans,
+ struct btree_path *path)
{
- switch (type) {
- case SIX_LOCK_read:
- return BCH_TIME_btree_lock_contended_read;
- case SIX_LOCK_intent:
- return BCH_TIME_btree_lock_contended_intent;
- case SIX_LOCK_write:
- return BCH_TIME_btree_lock_contended_write;
- default:
- BUG();
- }
+ btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK);
+
+ while (path->nodes_locked)
+ btree_node_unlock(trans, path, btree_path_lowest_level_locked(path));
}
/*
- * wrapper around six locks that just traces lock contended time
+ * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
+ * succeed:
*/
-static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b,
- enum six_lock_type type)
+static inline void
+bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
+ struct btree *b)
+{
+ struct btree_path *linked;
+ unsigned i;
+
+ EBUG_ON(path->l[b->c.level].b != b);
+ EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock));
+ EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write);
+
+ mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
+
+ trans_for_each_path_with_node(trans, b, linked, i)
+ linked->l[b->c.level].lock_seq++;
+
+ six_unlock_write(&b->c.lock);
+}
+
+void bch2_btree_node_unlock_write(struct btree_trans *,
+ struct btree_path *, struct btree *);
+
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
+
+/* lock: */
+
+static inline void trans_set_locked(struct btree_trans *trans, bool try)
+{
+ if (!trans->locked) {
+ lock_acquire_exclusive(&trans->dep_map, 0, try, NULL, _THIS_IP_);
+ trans->locked = true;
+ trans->last_unlock_ip = 0;
+
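+ /*
+ * While btree locks are held, memory allocations must not recurse
+ * into filesystem reclaim (which could itself need btree locks), so
+ * force GFP_NOFS behaviour for this task:
+ */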
+ trans->pf_memalloc_nofs = (current->flags & PF_MEMALLOC_NOFS) != 0;
+ current->flags |= PF_MEMALLOC_NOFS;
+ }
+}
+
+static inline void trans_set_unlocked(struct btree_trans *trans)
{
- u64 start_time = local_clock();
+ if (trans->locked) {
+ lock_release(&trans->dep_map, _THIS_IP_);
+ trans->locked = false;
+ trans->last_unlock_ip = _RET_IP_;
+
+ if (!trans->pf_memalloc_nofs)
+ current->flags &= ~PF_MEMALLOC_NOFS;
+ }
+}
+
+static inline int __btree_node_lock_nopath(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ enum six_lock_type type,
+ bool lock_may_not_fail,
+ unsigned long ip)
+{
+ trans->lock_may_not_fail = lock_may_not_fail;
+ trans->lock_must_abort = false;
+ trans->locking = b;
+
+ int ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
+ bch2_six_check_for_deadlock, trans, ip);
+ WRITE_ONCE(trans->locking, NULL);
+ WRITE_ONCE(trans->locking_wait.start_time, 0);
+
+ if (!ret)
+ trace_btree_path_lock(trans, _THIS_IP_, b);
+ return ret;
+}
- six_lock_type(&b->lock, type);
- bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
+static inline int __must_check
+btree_node_lock_nopath(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ enum six_lock_type type,
+ unsigned long ip)
+{
+ return __btree_node_lock_nopath(trans, b, type, false, ip);
}
-static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b,
- enum six_lock_type type)
+static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ enum six_lock_type type)
{
- if (!six_trylock_type(&b->lock, type))
- __btree_node_lock_type(c, b, type);
+ int ret = __btree_node_lock_nopath(trans, b, type, true, _THIS_IP_);
+
+ BUG_ON(ret);
}
-bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
- struct btree_iter *, enum six_lock_type, bool);
+/*
+ * Lock a btree node if we already have it locked on one of our linked
+ * iterators:
+ */
+static inline bool btree_node_lock_increment(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ unsigned level,
+ enum btree_node_locked_type want)
+{
+ struct btree_path *path;
+ unsigned i;
+
+ trans_for_each_path(trans, path, i)
+ if (&path->l[level].b->c == b &&
+ btree_node_locked_type(path, level) >= want) {
+ six_lock_increment(&b->lock, (enum six_lock_type) want);
+ return true;
+ }
-static inline bool btree_node_lock(struct btree *b, struct bpos pos,
- unsigned level,
- struct btree_iter *iter,
- enum six_lock_type type,
- bool may_drop_locks)
+ return false;
+}
+
+static inline int btree_node_lock(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b,
+ unsigned level,
+ enum six_lock_type type,
+ unsigned long ip)
{
+ int ret = 0;
+
EBUG_ON(level >= BTREE_MAX_DEPTH);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+
+ if (likely(six_trylock_type(&b->lock, type)) ||
+ btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||
+ !(ret = btree_node_lock_nopath(trans, b, type, btree_path_ip_allocated(path)))) {
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+ path->l[b->level].lock_taken_time = local_clock();
+#endif
+ }
- return likely(six_trylock_type(&b->lock, type)) ||
- __bch2_btree_node_lock(b, pos, level, iter,
- type, may_drop_locks);
+ return ret;
}
-bool __bch2_btree_node_relock(struct btree_iter *, unsigned);
+int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_path *,
+ struct btree_bkey_cached_common *b, bool);
-static inline bool bch2_btree_node_relock(struct btree_iter *iter,
- unsigned level)
+static inline int __btree_node_lock_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b,
+ bool lock_may_not_fail)
{
- EBUG_ON(btree_node_locked(iter, level) &&
- btree_node_locked_type(iter, level) !=
- __btree_lock_want(iter, level));
+ EBUG_ON(&path->l[b->level].b->c != b);
+ EBUG_ON(path->l[b->level].lock_seq != six_lock_seq(&b->lock));
+ EBUG_ON(!btree_node_intent_locked(path, b->level));
+
+ /*
+ * six locks are unfair, and read locks block while a thread wants a
+ * write lock: thus, we need to tell the cycle detector we have a write
+ * lock _before_ taking the lock:
+ */
+ mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_WRITE_LOCKED);
+
+ return likely(six_trylock_write(&b->lock))
+ ? 0
+ : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail);
+}
- return likely(btree_node_locked(iter, level)) ||
- __bch2_btree_node_relock(iter, level);
+static inline int __must_check
+bch2_btree_node_lock_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b)
+{
+ return __btree_node_lock_write(trans, path, b, false);
}
-bool bch2_btree_iter_relock(struct btree_iter *);
+void bch2_btree_node_lock_write_nofail(struct btree_trans *,
+ struct btree_path *,
+ struct btree_bkey_cached_common *);
+
+/* relock: */
+
+bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *);
+int __bch2_btree_path_relock(struct btree_trans *,
+ struct btree_path *, unsigned long);
-void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
+static inline int bch2_btree_path_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned long trace_ip)
+{
+ return btree_node_locked(path, path->level)
+ ? 0
+ : __bch2_btree_path_relock(trans, path, trace_ip);
+}
-void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
+bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace);
-static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
+static inline bool bch2_btree_node_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
{
- EBUG_ON(iter->l[b->level].b != b);
- EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq);
+ EBUG_ON(btree_node_locked(path, level) &&
+ !btree_node_write_locked(path, level) &&
+ btree_node_locked_type(path, level) != __btree_lock_want(path, level));
- if (!six_trylock_write(&b->lock))
- __bch2_btree_node_lock_write(b, iter);
+ return likely(btree_node_locked(path, level)) ||
+ (!IS_ERR_OR_NULL(path->l[level].b) &&
+ __bch2_btree_node_relock(trans, path, level, true));
}
-#endif /* _BCACHEFS_BTREE_LOCKING_H */
+static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+{
+ EBUG_ON(btree_node_locked(path, level) &&
+ !btree_node_write_locked(path, level) &&
+ btree_node_locked_type(path, level) != __btree_lock_want(path, level));
+
+ return likely(btree_node_locked(path, level)) ||
+ (!IS_ERR_OR_NULL(path->l[level].b) &&
+ __bch2_btree_node_relock(trans, path, level, false));
+}
+/* upgrade */
+bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
+ struct btree_path *, unsigned,
+ struct get_locks_fail *);
+
+bool __bch2_btree_path_upgrade(struct btree_trans *,
+ struct btree_path *, unsigned,
+ struct get_locks_fail *);
+
+static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
+{
+ struct get_locks_fail f = {};
+ unsigned old_locks_want = path->locks_want;
+
+ new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
+
+ if (path->locks_want < new_locks_want
+ ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f)
+ : path->nodes_locked)
+ return 0;
+
+ trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
+ old_locks_want, new_locks_want, &f);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
+}
+
+/* misc: */
+
+static inline void btree_path_set_should_be_locked(struct btree_trans *trans, struct btree_path *path)
+{
+ EBUG_ON(!btree_node_locked(path, path->level));
+ EBUG_ON(path->uptodate);
+
+ path->should_be_locked = true;
+ trace_btree_path_should_be_locked(trans, path);
+}
+
+static inline void __btree_path_set_level_up(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned l)
+{
+ btree_node_unlock(trans, path, l);
+ path->l[l].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
+}
+
+static inline void btree_path_set_level_up(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ __btree_path_set_level_up(trans, path, path->level++);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+}
+
+/* debug */
+
+struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *,
+ struct btree_path *,
+ struct btree_bkey_cached_common *b,
+ unsigned);
+
+int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *);
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_btree_path_verify_locks(struct btree_path *);
+void bch2_trans_verify_locks(struct btree_trans *);
+#else
+static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
+static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
+#endif
+
+#endif /* _BCACHEFS_BTREE_LOCKING_H */
diff --git a/libbcachefs/btree_node_scan.c b/libbcachefs/btree_node_scan.c
new file mode 100644
index 00000000..eeafb5e7
--- /dev/null
+++ b/libbcachefs/btree_node_scan.c
@@ -0,0 +1,559 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_journal_iter.h"
+#include "btree_node_scan.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "error.h"
+#include "journal_io.h"
+#include "recovery_passes.h"
+
+#include <linux/kthread.h>
+#include <linux/sort.h>
+
+struct find_btree_nodes_worker {
+ struct closure *cl;
+ struct find_btree_nodes *f;
+ struct bch_dev *ca;
+};
+
+static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
+{
+ bch2_btree_id_level_to_text(out, n->btree_id, n->level);
+ prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ",
+ n->seq, n->journal_seq, n->cookie);
+ bch2_bpos_to_text(out, n->min_key);
+ prt_str(out, "-");
+ bch2_bpos_to_text(out, n->max_key);
+
+ if (n->range_updated)
+ prt_str(out, " range updated");
+ if (n->overwritten)
+ prt_str(out, " overwritten");
+
+ for (unsigned i = 0; i < n->nr_ptrs; i++) {
+ prt_char(out, ' ');
+ bch2_extent_ptr_to_text(out, c, n->ptrs + i);
+ }
+}
+
+static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
+{
+ printbuf_indent_add(out, 2);
+ darray_for_each(nodes, i) {
+ found_btree_node_to_text(out, c, i);
+ prt_newline(out);
+ }
+ printbuf_indent_sub(out, 2);
+}
+
+static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
+{
+ struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);
+
+ set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
+ bp->k.p = f->max_key;
+ bp->v.seq = cpu_to_le64(f->cookie);
+ bp->v.flags = 0;
+ bp->v.sectors_written = cpu_to_le16(f->sectors_written);
+ bp->v.min_key = f->min_key;
+ SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
+ memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
+}
+
+static inline u64 bkey_journal_seq(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_inode_v3:
+ return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq);
+ default:
+ return 0;
+ }
+}
+
+static bool found_btree_node_is_readable(struct btree_trans *trans,
+ struct found_btree_node *f)
+{
+ struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
+
+ found_btree_node_to_key(&tmp.k, f);
+
+ struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false);
+ bool ret = !IS_ERR_OR_NULL(b);
+ if (!ret)
+ return ret;
+
+ f->sectors_written = b->written;
+ f->journal_seq = le64_to_cpu(b->data->keys.journal_seq);
+
+ struct bkey_s_c k;
+ struct bkey unpacked;
+ struct btree_node_iter iter;
+ for_each_btree_node_key_unpack(b, k, &iter, &unpacked)
+ f->journal_seq = max(f->journal_seq, bkey_journal_seq(k));
+
+ six_unlock_read(&b->c.lock);
+
+ /*
+ * We might update this node's range; if that happens, we need the node
+ * to be re-read so the read path can trim keys that are no longer in
+ * this node
+ */
+ if (b != btree_node_root(trans->c, b))
+ bch2_btree_node_evict(trans, &tmp.k);
+ return ret;
+}
+
+static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
+{
+ const struct found_btree_node *l = _l;
+ const struct found_btree_node *r = _r;
+
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ cmp_int(l->level, r->level) ?:
+ cmp_int(l->cookie, r->cookie);
+}
+
+/*
+ * Given two found btree nodes, if their sequence numbers are equal, take the
+ * one that's readable:
+ */
+static int found_btree_node_cmp_time(const struct found_btree_node *l,
+ const struct found_btree_node *r)
+{
+ return cmp_int(l->seq, r->seq) ?:
+ cmp_int(l->journal_seq, r->journal_seq);
+}
+
+static int found_btree_node_cmp_pos(const void *_l, const void *_r)
+{
+ const struct found_btree_node *l = _l;
+ const struct found_btree_node *r = _r;
+
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ -cmp_int(l->level, r->level) ?:
+ bpos_cmp(l->min_key, r->min_key) ?:
+ -found_btree_node_cmp_time(l, r);
+}
+
+static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
+ struct bio *bio, struct btree_node *bn, u64 offset)
+{
+ struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
+
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
+ bio->bi_iter.bi_sector = offset;
+ bch2_bio_map(bio, bn, PAGE_SIZE);
+
+ submit_bio_wait(bio);
+ if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
+ "IO error in try_read_btree_node() at %llu: %s",
+ offset, bch2_blk_status_to_str(bio->bi_status)))
+ return;
+
+ if (le64_to_cpu(bn->magic) != bset_magic(c))
+ return;
+
+ if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
+ if (!c->chacha20)
+ return;
+
+ struct nonce nonce = btree_nonce(&bn->keys, 0);
+ unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
+
+ bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
+ }
+
+ if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
+ return;
+
+ if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
+ return;
+
+ if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
+ return;
+
+ rcu_read_lock();
+ struct found_btree_node n = {
+ .btree_id = BTREE_NODE_ID(bn),
+ .level = BTREE_NODE_LEVEL(bn),
+ .seq = BTREE_NODE_SEQ(bn),
+ .cookie = le64_to_cpu(bn->keys.seq),
+ .min_key = bn->min_key,
+ .max_key = bn->max_key,
+ .nr_ptrs = 1,
+ .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr,
+ .ptrs[0].offset = offset,
+ .ptrs[0].dev = ca->dev_idx,
+ .ptrs[0].gen = bucket_gen_get(ca, sector_to_bucket(ca, offset)),
+ };
+ rcu_read_unlock();
+
+ if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
+ mutex_lock(&f->lock);
+ if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
+ bch_err(c, "try_read_btree_node() can't handle endian conversion");
+ f->ret = -EINVAL;
+ goto unlock;
+ }
+
+ if (darray_push(&f->nodes, n))
+ f->ret = -ENOMEM;
+unlock:
+ mutex_unlock(&f->lock);
+ }
+}
+
+static int read_btree_nodes_worker(void *p)
+{
+ struct find_btree_nodes_worker *w = p;
+ struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
+ struct bch_dev *ca = w->ca;
+ void *buf = (void *) __get_free_page(GFP_KERNEL);
+ struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
+ unsigned long last_print = jiffies;
+
+ if (!buf || !bio) {
+ bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
+ w->f->ret = -ENOMEM;
+ goto err;
+ }
+
+ for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
+ for (unsigned bucket_offset = 0;
+ bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
+ bucket_offset += btree_sectors(c)) {
+ if (time_after(jiffies, last_print + HZ * 30)) {
+ u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
+ u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
+
+ bch_info(ca, "%s: %2u%% done", __func__,
+ (unsigned) div64_u64(cur_sector * 100, end_sector));
+ last_print = jiffies;
+ }
+
+ u64 sector = bucket * ca->mi.bucket_size + bucket_offset;
+
+ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
+ !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
+ continue;
+
+ try_read_btree_node(w->f, ca, bio, buf, sector);
+ }
+err:
+ bio_put(bio);
+ free_page((unsigned long) buf);
+ percpu_ref_put(&ca->io_ref);
+ closure_put(w->cl);
+ kfree(w);
+ return 0;
+}
+
+static int read_btree_nodes(struct find_btree_nodes *f)
+{
+ struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
+ struct closure cl;
+ int ret = 0;
+
+ closure_init_stack(&cl);
+
+ for_each_online_member(c, ca) {
+ if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
+ continue;
+
+ struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
+ struct task_struct *t;
+
+ if (!w) {
+ percpu_ref_put(&ca->io_ref);
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ percpu_ref_get(&ca->io_ref);
+ closure_get(&cl);
+ w->cl = &cl;
+ w->f = f;
+ w->ca = ca;
+
+ t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
+ ret = PTR_ERR_OR_ZERO(t);
+ if (ret) {
+ percpu_ref_put(&ca->io_ref);
+ closure_put(&cl);
+ f->ret = ret;
+ bch_err(c, "error starting kthread: %i", ret);
+ break;
+ }
+ }
+err:
+ closure_sync(&cl);
+ return f->ret ?: ret;
+}
+
+static void bubble_up(struct found_btree_node *n, struct found_btree_node *end)
+{
+ while (n + 1 < end &&
+ found_btree_node_cmp_pos(n, n + 1) > 0) {
+ swap(n[0], n[1]);
+ n++;
+ }
+}
+
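+/*
+ * Resolve overlaps between @start and the nodes sorted after it: the node with
+ * the newer sequence number keeps its range, the older one is either marked
+ * overwritten or has its range trimmed, and is re-sorted with bubble_up() if
+ * its start key moved:
+ */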
+static int handle_overwrites(struct bch_fs *c,
+ struct found_btree_node *start,
+ struct found_btree_node *end)
+{
+ struct found_btree_node *n;
+again:
+ for (n = start + 1;
+ n < end &&
+ n->btree_id == start->btree_id &&
+ n->level == start->level &&
+ bpos_lt(n->min_key, start->max_key);
+ n++) {
+ int cmp = found_btree_node_cmp_time(start, n);
+
+ if (cmp > 0) {
+ if (bpos_cmp(start->max_key, n->max_key) >= 0)
+ n->overwritten = true;
+ else {
+ n->range_updated = true;
+ n->min_key = bpos_successor(start->max_key);
+ bubble_up(n, end);
+ goto again;
+ }
+ } else if (cmp < 0) {
+ BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0);
+
+ start->max_key = bpos_predecessor(n->min_key);
+ start->range_updated = true;
+ } else if (n->level) {
+ n->overwritten = true;
+ } else {
+ if (bpos_cmp(start->max_key, n->max_key) >= 0)
+ n->overwritten = true;
+ else {
+ n->range_updated = true;
+ n->min_key = bpos_successor(start->max_key);
+ bubble_up(n, end);
+ goto again;
+ }
+ }
+ }
+
+ return 0;
+}
+
+int bch2_scan_for_btree_nodes(struct bch_fs *c)
+{
+ struct find_btree_nodes *f = &c->found_btree_nodes;
+ struct printbuf buf = PRINTBUF;
+ size_t dst;
+ int ret = 0;
+
+ if (f->nodes.nr)
+ return 0;
+
+ mutex_init(&f->lock);
+
+ ret = read_btree_nodes(f);
+ if (ret)
+ return ret;
+
+ if (!f->nodes.nr) {
+ bch_err(c, "%s: no btree nodes found", __func__);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (0 && c->opts.verbose) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "%s: nodes found:\n", __func__);
+ found_btree_nodes_to_text(&buf, c, f->nodes);
+ bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ }
+
+ sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
+
+ dst = 0;
+ darray_for_each(f->nodes, i) {
+ struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;
+
+ if (prev &&
+ prev->cookie == i->cookie) {
+ if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
+ bch_err(c, "%s: found too many replicas for btree node", __func__);
+ ret = -EINVAL;
+ goto err;
+ }
+ prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
+ } else {
+ f->nodes.data[dst++] = *i;
+ }
+ }
+ f->nodes.nr = dst;
+
+ sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
+
+ if (0 && c->opts.verbose) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
+ found_btree_nodes_to_text(&buf, c, f->nodes);
+ bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ }
+
+ dst = 0;
+ darray_for_each(f->nodes, i) {
+ if (i->overwritten)
+ continue;
+
+ ret = handle_overwrites(c, i, &darray_top(f->nodes));
+ if (ret)
+ goto err;
+
+ BUG_ON(i->overwritten);
+ f->nodes.data[dst++] = *i;
+ }
+ f->nodes.nr = dst;
+
+ if (c->opts.verbose) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
+ found_btree_nodes_to_text(&buf, c, f->nodes);
+ bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ }
+
+ eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
+err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
+{
+ const struct found_btree_node *l = _l;
+ const struct found_btree_node *r = _r;
+
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ -cmp_int(l->level, r->level) ?:
+ bpos_cmp(l->max_key, r->min_key);
+}
+
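+/*
+ * Iterate over found nodes with the same btree id and level as @_search whose
+ * key range overlaps it; relies on f->nodes being sorted in eytzinger order by
+ * found_btree_node_cmp_pos:
+ */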
+#define for_each_found_btree_node_in_range(_f, _search, _idx) \
+ for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \
+ sizeof((_f)->nodes.data[0]), \
+ found_btree_node_range_start_cmp, &_search); \
+ _idx < (_f)->nodes.nr && \
+ (_f)->nodes.data[_idx].btree_id == _search.btree_id && \
+ (_f)->nodes.data[_idx].level == _search.level && \
+ bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \
+ _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
+
+bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
+{
+ struct find_btree_nodes *f = &c->found_btree_nodes;
+
+ struct found_btree_node search = {
+ .btree_id = b->c.btree_id,
+ .level = b->c.level,
+ .min_key = b->data->min_key,
+ .max_key = b->key.k.p,
+ };
+
+ for_each_found_btree_node_in_range(f, search, idx)
+ if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
+ return true;
+ return false;
+}
+
+bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
+{
+ struct found_btree_node search = {
+ .btree_id = btree,
+ .level = 0,
+ .min_key = POS_MIN,
+ .max_key = SPOS_MAX,
+ };
+
+ for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
+ return true;
+ return false;
+}
+
+int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
+ unsigned level, struct bpos node_min, struct bpos node_max)
+{
+ if (btree_id_is_alloc(btree))
+ return 0;
+
+ struct find_btree_nodes *f = &c->found_btree_nodes;
+
+ int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ if (ret)
+ return ret;
+
+ if (c->opts.verbose) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "recovery ");
+ bch2_btree_id_level_to_text(&buf, btree, level);
+ prt_str(&buf, " ");
+ bch2_bpos_to_text(&buf, node_min);
+ prt_str(&buf, " - ");
+ bch2_bpos_to_text(&buf, node_max);
+
+ bch_info(c, "%s(): %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ struct found_btree_node search = {
+ .btree_id = btree,
+ .level = level,
+ .min_key = node_min,
+ .max_key = node_max,
+ };
+
+ for_each_found_btree_node_in_range(f, search, idx) {
+ struct found_btree_node n = f->nodes.data[idx];
+
+ n.range_updated |= bpos_lt(n.min_key, node_min);
+ n.min_key = bpos_max(n.min_key, node_min);
+
+ n.range_updated |= bpos_gt(n.max_key, node_max);
+ n.max_key = bpos_min(n.max_key, node_max);
+
+ struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
+
+ found_btree_node_to_key(&tmp.k, &n);
+
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
+ bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+
+ BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k),
+ (struct bkey_validate_context) {
+ .from = BKEY_VALIDATE_btree_node,
+ .level = level + 1,
+ .btree = btree,
+ }));
+
+ ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
+{
+ darray_exit(&f->nodes);
+}
diff --git a/libbcachefs/btree_node_scan.h b/libbcachefs/btree_node_scan.h
new file mode 100644
index 00000000..08687b20
--- /dev/null
+++ b/libbcachefs/btree_node_scan.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_NODE_SCAN_H
+#define _BCACHEFS_BTREE_NODE_SCAN_H
+
+int bch2_scan_for_btree_nodes(struct bch_fs *);
+bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *);
+bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id);
+int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos);
+void bch2_find_btree_nodes_exit(struct find_btree_nodes *);
+
+#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */
diff --git a/libbcachefs/btree_node_scan_types.h b/libbcachefs/btree_node_scan_types.h
new file mode 100644
index 00000000..b6c36c45
--- /dev/null
+++ b/libbcachefs/btree_node_scan_types.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
+#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
+
+#include "darray.h"
+
+struct found_btree_node {
+ bool range_updated:1;
+ bool overwritten:1;
+ u8 btree_id;
+ u8 level;
+ unsigned sectors_written;
+ u32 seq;
+ u64 journal_seq;
+ u64 cookie;
+
+ struct bpos min_key;
+ struct bpos max_key;
+
+ unsigned nr_ptrs;
+ struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX];
+};
+
+typedef DARRAY(struct found_btree_node) found_btree_nodes;
+
+struct find_btree_nodes {
+ int ret;
+ struct mutex lock;
+ found_btree_nodes nodes;
+};
+
+#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */
diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c
new file mode 100644
index 00000000..78d72c26
--- /dev/null
+++ b/libbcachefs/btree_trans_commit.c
@@ -0,0 +1,1117 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "btree_gc.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_journal_iter.h"
+#include "btree_key_cache.h"
+#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "disk_accounting.h"
+#include "errcode.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "replicas.h"
+#include "snapshot.h"
+
+#include <linux/prefetch.h>
+
+static const char * const trans_commit_flags_strs[] = {
+#define x(n, ...) #n,
+ BCH_TRANS_COMMIT_FLAGS()
+#undef x
+ NULL
+};
+
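trans_commit_flags_strs[] above is built with the x-macro pattern:
BCH_TRANS_COMMIT_FLAGS() expands its x() argument once per flag, here into one
string literal per flag name. A minimal standalone sketch of the same pattern,
using a made-up COLOURS() list rather than anything from the patch:

	#include <stdio.h>

	#define COLOURS()	\
		x(red)		\
		x(green)	\
		x(blue)

	enum colour {
	#define x(n)	COLOUR_##n,
		COLOURS()
	#undef x
		COLOUR_NR
	};

	static const char * const colour_strs[] = {
	#define x(n)	#n,
		COLOURS()
	#undef x
		NULL
	};

	int main(void)
	{
		for (unsigned i = 0; i < COLOUR_NR; i++)
			printf("%u: %s\n", i, colour_strs[i]);
		return 0;
	}

Defining the list once and stamping out both the enum and the strings table
keeps the two from drifting apart.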
+void bch2_trans_commit_flags_to_text(struct printbuf *out, enum bch_trans_commit_flags flags)
+{
+ enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
+
+ prt_printf(out, "watermark=%s", bch2_watermarks[watermark]);
+
+ flags >>= BCH_WATERMARK_BITS;
+ if (flags) {
+ prt_char(out, ' ');
+ bch2_prt_bitflags(out, trans_commit_flags_strs, flags);
+ }
+}
+
+static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct bch_fs *c = trans->c;
+ struct bkey u;
+ struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u);
+
+ if (unlikely(trans->journal_replay_not_finished)) {
+ struct bkey_i *j_k =
+ bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p);
+
+ if (j_k)
+ k = bkey_i_to_s_c(j_k);
+ }
+
+ u = *k.k;
+ u.needs_whiteout = i->old_k.needs_whiteout;
+
+ BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey)));
+ BUG_ON(i->old_v != k.v);
+#endif
+}
+
+static inline struct btree_path_level *insert_l(struct btree_trans *trans, struct btree_insert_entry *i)
+{
+ return (trans->paths + i->path)->l + i->level;
+}
+
+static inline bool same_leaf_as_prev(struct btree_trans *trans,
+ struct btree_insert_entry *i)
+{
+ return i != trans->updates &&
+ insert_l(trans, &i[0])->b == insert_l(trans, &i[-1])->b;
+}
+
+static inline bool same_leaf_as_next(struct btree_trans *trans,
+ struct btree_insert_entry *i)
+{
+ return i + 1 < trans->updates + trans->nr_updates &&
+ insert_l(trans, &i[0])->b == insert_l(trans, &i[1])->b;
+}
+
+inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
+{
+ struct bch_fs *c = trans->c;
+
+ if (unlikely(btree_node_just_written(b)) &&
+ bch2_btree_post_write_cleanup(c, b))
+ bch2_trans_node_reinit_iter(trans, b);
+
+ /*
+ * If the last bset has been written, or if it's gotten too big - start
+ * a new bset to insert into:
+ */
+ if (want_new_bset(c, b))
+ bch2_btree_init_next(trans, b);
+}
+
+static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
+{
+ while (--i >= trans->updates) {
+ if (same_leaf_as_prev(trans, i))
+ continue;
+
+ bch2_btree_node_unlock_write(trans, trans->paths + i->path, insert_l(trans, i)->b);
+ }
+
+ trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
+}
+
+static inline int bch2_trans_lock_write(struct btree_trans *trans)
+{
+ EBUG_ON(trans->write_locked);
+
+ trans_for_each_update(trans, i) {
+ if (same_leaf_as_prev(trans, i))
+ continue;
+
+ if (bch2_btree_node_lock_write(trans, trans->paths + i->path, &insert_l(trans, i)->b->c))
+ return trans_lock_write_fail(trans, i);
+
+ if (!i->cached)
+ bch2_btree_node_prep_for_write(trans, trans->paths + i->path, insert_l(trans, i)->b);
+ }
+
+ trans->write_locked = true;
+ return 0;
+}
+
+static inline void bch2_trans_unlock_write(struct btree_trans *trans)
+{
+ if (likely(trans->write_locked)) {
+ trans_for_each_update(trans, i)
+ if (btree_node_locked_type(trans->paths + i->path, i->level) ==
+ BTREE_NODE_WRITE_LOCKED)
+ bch2_btree_node_unlock_write_inlined(trans,
+ trans->paths + i->path, insert_l(trans, i)->b);
+ trans->write_locked = false;
+ }
+}
+
+/* Inserting into a given leaf node (last stage of insert): */
+
+/* Handle overwrites and do insert, for non extents: */
+bool bch2_btree_bset_insert_key(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter *node_iter,
+ struct bkey_i *insert)
+{
+ struct bkey_packed *k;
+ unsigned clobber_u64s = 0, new_u64s = 0;
+
+ EBUG_ON(btree_node_just_written(b));
+ EBUG_ON(bset_written(b, btree_bset_last(b)));
+ EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
+ EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
+ EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
+ EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b));
+ EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));
+
+ k = bch2_btree_node_iter_peek_all(node_iter, b);
+ if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
+ k = NULL;
+
+ /* @k is the key being overwritten/deleted, if any: */
+ EBUG_ON(k && bkey_deleted(k));
+
+ /* Deleting, but not found? nothing to do: */
+ if (bkey_deleted(&insert->k) && !k)
+ return false;
+
+ if (bkey_deleted(&insert->k)) {
+ /* Deleting: */
+ btree_account_key_drop(b, k);
+ k->type = KEY_TYPE_deleted;
+
+ if (k->needs_whiteout)
+ push_whiteout(b, insert->k.p);
+ k->needs_whiteout = false;
+
+ if (k >= btree_bset_last(b)->start) {
+ clobber_u64s = k->u64s;
+ bch2_bset_delete(b, k, clobber_u64s);
+ goto fix_iter;
+ } else {
+ bch2_btree_path_fix_key_modified(trans, b, k);
+ }
+
+ return true;
+ }
+
+ if (k) {
+ /* Overwriting: */
+ btree_account_key_drop(b, k);
+ k->type = KEY_TYPE_deleted;
+
+ insert->k.needs_whiteout = k->needs_whiteout;
+ k->needs_whiteout = false;
+
+ if (k >= btree_bset_last(b)->start) {
+ clobber_u64s = k->u64s;
+ goto overwrite;
+ } else {
+ bch2_btree_path_fix_key_modified(trans, b, k);
+ }
+ }
+
+ k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
+overwrite:
+ bch2_bset_insert(b, k, insert, clobber_u64s);
+ new_u64s = k->u64s;
+fix_iter:
+ if (clobber_u64s != new_u64s)
+ bch2_btree_node_iter_fix(trans, path, b, node_iter, k,
+ clobber_u64s, new_u64s);
+ return true;
+}
+
+static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
+ unsigned i, u64 seq)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct btree_write *w = container_of(pin, struct btree_write, journal);
+ struct btree *b = container_of(w, struct btree, writes[i]);
+ struct btree_trans *trans = bch2_trans_get(c);
+ unsigned long old, new;
+ unsigned idx = w - b->writes;
+
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+
+ old = READ_ONCE(b->flags);
+ do {
+ new = old;
+
+ if (!(old & (1 << BTREE_NODE_dirty)) ||
+ !!(old & (1 << BTREE_NODE_write_idx)) != idx ||
+ w->journal.seq != seq)
+ break;
+
+ new &= ~BTREE_WRITE_TYPE_MASK;
+ new |= BTREE_WRITE_journal_reclaim;
+ new |= 1 << BTREE_NODE_need_write;
+ } while (!try_cmpxchg(&b->flags, &old, new));
+
+ btree_node_write_if_need(c, b, SIX_LOCK_read);
+ six_unlock_read(&b->c.lock);
+
+ bch2_trans_put(trans);
+ return 0;
+}
+
+int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+{
+ return __btree_node_flush(j, pin, 0, seq);
+}
+
+int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+{
+ return __btree_node_flush(j, pin, 1, seq);
+}
+
+inline void bch2_btree_add_journal_pin(struct bch_fs *c,
+ struct btree *b, u64 seq)
+{
+ struct btree_write *w = btree_current_write(b);
+
+ bch2_journal_pin_add(&c->journal, seq, &w->journal,
+ btree_node_write_idx(b) == 0
+ ? bch2_btree_node_flush0
+ : bch2_btree_node_flush1);
+}
+
+/**
+ * bch2_btree_insert_key_leaf() - insert a key into a leaf node
+ * @trans: btree transaction object
+ * @path: path pointing to @insert's pos
+ * @insert: key to insert
+ * @journal_seq: sequence number of journal reservation
+ */
+inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
+ struct btree_path *path,
+ struct bkey_i *insert,
+ u64 journal_seq)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b = path_l(path)->b;
+ struct bset_tree *t = bset_tree_last(b);
+ struct bset *i = bset(b, t);
+ int old_u64s = bset_u64s(t);
+ int old_live_u64s = b->nr.live_u64s;
+ int live_u64s_added, u64s_added;
+
+ if (unlikely(!bch2_btree_bset_insert_key(trans, path, b,
+ &path_l(path)->iter, insert)))
+ return;
+
+ i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq)));
+
+ bch2_btree_add_journal_pin(c, b, journal_seq);
+
+ if (unlikely(!btree_node_dirty(b))) {
+ EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
+ set_btree_node_dirty_acct(c, b);
+ }
+
+ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
+ u64s_added = (int) bset_u64s(t) - old_u64s;
+
+ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
+ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
+ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
+ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
+
+ if (u64s_added > live_u64s_added &&
+ bch2_maybe_compact_whiteouts(c, b))
+ bch2_trans_node_reinit_iter(trans, b);
+}
+
+/* Cached btree updates: */
+
+/* Normal update interface: */
+
+static inline void btree_insert_entry_checks(struct btree_trans *trans,
+ struct btree_insert_entry *i)
+{
+ struct btree_path *path = trans->paths + i->path;
+
+ BUG_ON(!bpos_eq(i->k->k.p, path->pos));
+ BUG_ON(i->cached != path->cached);
+ BUG_ON(i->level != path->level);
+ BUG_ON(i->btree_id != path->btree_id);
+ EBUG_ON(!i->level &&
+ btree_type_has_snapshots(i->btree_id) &&
+ !(i->flags & BTREE_UPDATE_internal_snapshot_node) &&
+ test_bit(JOURNAL_replay_done, &trans->c->journal.flags) &&
+ i->k->k.p.snapshot &&
+ bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0);
+}
+
+static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
+ unsigned flags)
+{
+ return bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
+ trans->journal_u64s, flags);
+}
+
+#define JSET_ENTRY_LOG_U64s 4
+
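+/*
+ * Record the name of the function doing this commit (trans->fn) in the
+ * journal, for debugging:
+ */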
+static noinline void journal_transaction_name(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct journal *j = &c->journal;
+ struct jset_entry *entry =
+ bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_log, 0, 0,
+ JSET_ENTRY_LOG_U64s);
+ struct jset_entry_log *l =
+ container_of(entry, struct jset_entry_log, entry);
+
+ strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64));
+}
+
+static inline int btree_key_can_insert(struct btree_trans *trans,
+ struct btree *b, unsigned u64s)
+{
+ if (!bch2_btree_node_insert_fits(b, u64s))
+ return -BCH_ERR_btree_insert_btree_node_full;
+
+ return 0;
+}
+
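+/*
+ * Growing a key cache entry requires a blocking allocation, which we can't do
+ * while holding btree locks: drop our locks, allocate, then relock and install
+ * the new buffer:
+ */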
+static noinline int
+btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
+ struct btree_path *path, unsigned new_u64s)
+{
+ struct bkey_cached *ck = (void *) path->l[0].b;
+ struct bkey_i *new_k;
+ int ret;
+
+ bch2_trans_unlock_write(trans);
+ bch2_trans_unlock(trans);
+
+ new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
+ if (!new_k) {
+ bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
+ bch2_btree_id_str(path->btree_id), new_u64s);
+ return -BCH_ERR_ENOMEM_btree_key_cache_insert;
+ }
+
+ ret = bch2_trans_relock(trans) ?:
+ bch2_trans_lock_write(trans);
+ if (unlikely(ret)) {
+ kfree(new_k);
+ return ret;
+ }
+
+ memcpy(new_k, ck->k, ck->u64s * sizeof(u64));
+
+ trans_for_each_update(trans, i)
+ if (i->old_v == &ck->k->v)
+ i->old_v = &new_k->v;
+
+ kfree(ck->k);
+ ck->u64s = new_u64s;
+ ck->k = new_k;
+ return 0;
+}
+
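+/*
+ * Check that a key cache entry has room for the key being inserted; if not,
+ * try a nonblocking reallocation, falling back to the slowpath above:
+ */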
+static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
+ struct btree_path *path, unsigned u64s)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_cached *ck = (void *) path->l[0].b;
+ unsigned new_u64s;
+ struct bkey_i *new_k;
+ unsigned watermark = flags & BCH_WATERMARK_MASK;
+
+ EBUG_ON(path->level);
+
+ if (watermark < BCH_WATERMARK_reclaim &&
+ !test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+ bch2_btree_key_cache_must_wait(c))
+ return -BCH_ERR_btree_insert_need_journal_reclaim;
+
+ /*
+ * bch2_varint_decode can read past the end of the buffer by at most 7
+ * bytes (the extra data isn't used):
+ */
+ u64s += 1;
+
+ if (u64s <= ck->u64s)
+ return 0;
+
+ new_u64s = roundup_pow_of_two(u64s);
+ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
+ if (unlikely(!new_k))
+ return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
+
+ trans_for_each_update(trans, i)
+ if (i->old_v == &ck->k->v)
+ i->old_v = &new_k->v;
+
+ ck->u64s = new_u64s;
+ ck->k = new_k;
+ return 0;
+}
+
+/* Triggers: */
+
+static int run_one_mem_trigger(struct btree_trans *trans,
+ struct btree_insert_entry *i,
+ unsigned flags)
+{
+ verify_update_old_key(trans, i);
+
+ if (unlikely(flags & BTREE_TRIGGER_norun))
+ return 0;
+
+ struct bkey_s_c old = { &i->old_k, i->old_v };
+ struct bkey_i *new = i->k;
+ const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
+ const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
+
+ if (old_ops->trigger == new_ops->trigger)
+ return bch2_key_trigger(trans, i->btree_id, i->level,
+ old, bkey_i_to_s(new),
+ BTREE_TRIGGER_insert|BTREE_TRIGGER_overwrite|flags);
+ else
+ return bch2_key_trigger_new(trans, i->btree_id, i->level,
+ bkey_i_to_s(new), flags) ?:
+ bch2_key_trigger_old(trans, i->btree_id, i->level,
+ old, flags);
+}
+
+static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
+ bool overwrite)
+{
+ verify_update_old_key(trans, i);
+
+ if ((i->flags & BTREE_TRIGGER_norun) ||
+ !btree_node_type_has_trans_triggers(i->bkey_type))
+ return 0;
+
+ /*
+ * Transactional triggers create new btree_insert_entries, so we can't
+ * pass them a pointer to a btree_insert_entry; that memory is going to
+ * move:
+ */
+ struct bkey old_k = i->old_k;
+ struct bkey_s_c old = { &old_k, i->old_v };
+ const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
+ const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
+ unsigned flags = i->flags|BTREE_TRIGGER_transactional;
+
+ if (!i->insert_trigger_run &&
+ !i->overwrite_trigger_run &&
+ old_ops->trigger == new_ops->trigger) {
+ i->overwrite_trigger_run = true;
+ i->insert_trigger_run = true;
+ return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k),
+ BTREE_TRIGGER_insert|
+ BTREE_TRIGGER_overwrite|flags) ?: 1;
+ } else if (overwrite && !i->overwrite_trigger_run) {
+ i->overwrite_trigger_run = true;
+ return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1;
+ } else if (!overwrite && !i->insert_trigger_run) {
+ i->insert_trigger_run = true;
+ return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1;
+ } else {
+ return 0;
+ }
+}
+
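+/*
+ * Run transactional triggers for every update to a given btree, repeating
+ * until the triggers stop generating new updates:
+ */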
+static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
+ unsigned btree_id_start)
+{
+ for (int overwrite = 1; overwrite >= 0; --overwrite) {
+ bool trans_trigger_run;
+
+ /*
+ * Running triggers will append more updates to the list of updates as
+ * we're walking it:
+ */
+ do {
+ trans_trigger_run = false;
+
+ for (unsigned i = btree_id_start;
+ i < trans->nr_updates && trans->updates[i].btree_id <= btree_id;
+ i++) {
+ if (trans->updates[i].btree_id != btree_id)
+ continue;
+
+ int ret = run_one_trans_trigger(trans, trans->updates + i, overwrite);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ trans_trigger_run = true;
+ }
+ } while (trans_trigger_run);
+ }
+
+ return 0;
+}
+
+static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
+{
+ unsigned btree_id = 0, btree_id_start = 0;
+ int ret = 0;
+
+ /*
+ * For a given btree, this algorithm runs insert triggers before
+ * overwrite triggers: this is so that when extents are being moved
+ * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
+ * they are re-added.
+ */
+ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
+ if (btree_id == BTREE_ID_alloc)
+ continue;
+
+ while (btree_id_start < trans->nr_updates &&
+ trans->updates[btree_id_start].btree_id < btree_id)
+ btree_id_start++;
+
+ ret = run_btree_triggers(trans, btree_id, btree_id_start);
+ if (ret)
+ return ret;
+ }
+
+ for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
+ struct btree_insert_entry *i = trans->updates + idx;
+
+ if (i->btree_id > BTREE_ID_alloc)
+ break;
+ if (i->btree_id == BTREE_ID_alloc) {
+ ret = run_btree_triggers(trans, BTREE_ID_alloc, idx);
+ if (ret)
+ return ret;
+ break;
+ }
+ }
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ trans_for_each_update(trans, i)
+ BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
+ btree_node_type_has_trans_triggers(i->bkey_type) &&
+ (!i->insert_trigger_run || !i->overwrite_trigger_run));
+#endif
+ return 0;
+}
+
+static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
+{
+ trans_for_each_update(trans, i)
+ if (btree_node_type_has_triggers(i->bkey_type) &&
+ gc_visited(trans->c, gc_pos_btree(i->btree_id, i->level, i->k->k.p))) {
+ int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static inline int
+bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
+ struct btree_insert_entry **stopped_at,
+ unsigned long trace_ip)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_trans_commit_hook *h;
+ unsigned u64s = 0;
+ int ret = 0;
+
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+
+ if (race_fault()) {
+ trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);
+ }
+
+ /*
+ * Check that the insert will fit in the leaf node while we hold the
+ * write lock; otherwise another thread could write out the node,
+ * changing the amount of space available:
+ */
+
+ prefetch(&trans->c->journal.flags);
+
+ trans_for_each_update(trans, i) {
+ /* Multiple inserts might go to same leaf: */
+ if (!same_leaf_as_prev(trans, i))
+ u64s = 0;
+
+ u64s += i->k->k.u64s;
+ ret = !i->cached
+ ? btree_key_can_insert(trans, insert_l(trans, i)->b, u64s)
+ : btree_key_can_insert_cached(trans, flags, trans->paths + i->path, u64s);
+ if (ret) {
+ *stopped_at = i;
+ return ret;
+ }
+
+ i->k->k.needs_whiteout = false;
+ }
+
+ /*
+ * Don't get journal reservation until after we know insert will
+ * succeed:
+ */
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
+ ret = bch2_trans_journal_res_get(trans,
+ (flags & BCH_WATERMARK_MASK)|
+ JOURNAL_RES_GET_NONBLOCK);
+ if (ret)
+ return ret;
+
+ if (unlikely(trans->journal_transaction_names))
+ journal_transaction_name(trans);
+ }
+
+ /*
+ * Not allowed to fail after we've gotten our journal reservation - we
+ * have to use it:
+ */
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+ !(flags & BCH_TRANS_COMMIT_no_journal_res)) {
+ if (bch2_journal_seq_verify)
+ trans_for_each_update(trans, i)
+ i->k->k.bversion.lo = trans->journal_res.seq;
+ else if (bch2_inject_invalid_keys)
+ trans_for_each_update(trans, i)
+ i->k->k.bversion = MAX_VERSION;
+ }
+
+ h = trans->hooks;
+ while (h) {
+ ret = h->fn(trans, h);
+ if (ret)
+ return ret;
+ h = h->next;
+ }
+
+ struct jset_entry *entry = trans->journal_entries;
+
+ percpu_down_read(&c->mark_lock);
+ for (entry = trans->journal_entries;
+ entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ entry = vstruct_next(entry))
+ if (entry->type == BCH_JSET_ENTRY_write_buffer_keys &&
+ entry->start->k.type == KEY_TYPE_accounting) {
+ ret = bch2_accounting_trans_commit_hook(trans, bkey_i_to_accounting(entry->start), flags);
+ if (ret)
+ goto revert_fs_usage;
+ }
+ percpu_up_read(&c->mark_lock);
+
+ /* XXX: we only want to run this if deltas are nonzero */
+ bch2_trans_account_disk_usage_change(trans);
+
+ trans_for_each_update(trans, i)
+ if (btree_node_type_has_atomic_triggers(i->bkey_type)) {
+ ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags);
+ if (ret)
+ goto fatal_err;
+ }
+
+ if (unlikely(c->gc_pos.phase)) {
+ ret = bch2_trans_commit_run_gc_triggers(trans);
+ if (ret)
+ goto fatal_err;
+ }
+
+ trans_for_each_update(trans, i) {
+ enum bch_validate_flags invalid_flags = 0;
+
+ if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
+ invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
+
+ ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k),
+ (struct bkey_validate_context) {
+ .from = BKEY_VALIDATE_commit,
+ .level = i->level,
+ .btree = i->btree_id,
+ .flags = invalid_flags,
+ });
+ if (unlikely(ret)) {
+ bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n",
+ trans->fn, (void *) i->ip_allocated);
+ goto fatal_err;
+ }
+ btree_insert_entry_checks(trans, i);
+ }
+
+ for (struct jset_entry *i = trans->journal_entries;
+ i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ i = vstruct_next(i)) {
+ enum bch_validate_flags invalid_flags = 0;
+
+ if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
+ invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
+
+ ret = bch2_journal_entry_validate(c, NULL, i,
+ bcachefs_metadata_version_current,
+ CPU_BIG_ENDIAN, invalid_flags);
+ if (unlikely(ret)) {
+ bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n",
+ trans->fn);
+ goto fatal_err;
+ }
+ }
+
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
+ struct journal *j = &c->journal;
+ struct jset_entry *entry;
+
+ trans_for_each_update(trans, i) {
+ if (i->key_cache_already_flushed)
+ continue;
+
+ if (i->flags & BTREE_UPDATE_nojournal)
+ continue;
+
+ verify_update_old_key(trans, i);
+
+ if (trans->journal_transaction_names) {
+ entry = bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_overwrite,
+ i->btree_id, i->level,
+ i->old_k.u64s);
+ bkey_reassemble((struct bkey_i *) entry->start,
+ (struct bkey_s_c) { &i->old_k, i->old_v });
+ }
+
+ entry = bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_btree_keys,
+ i->btree_id, i->level,
+ i->k->k.u64s);
+ bkey_copy((struct bkey_i *) entry->start, i->k);
+ }
+
+ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
+ trans->journal_entries,
+ trans->journal_entries_u64s);
+
+ trans->journal_res.offset += trans->journal_entries_u64s;
+ trans->journal_res.u64s -= trans->journal_entries_u64s;
+
+ if (trans->journal_seq)
+ *trans->journal_seq = trans->journal_res.seq;
+ }
+
+ trans_for_each_update(trans, i) {
+ struct btree_path *path = trans->paths + i->path;
+
+ if (!i->cached)
+ bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq);
+ else if (!i->key_cache_already_flushed)
+ bch2_btree_insert_key_cached(trans, flags, i);
+ else
+ bch2_btree_key_cache_drop(trans, path);
+ }
+
+ return 0;
+fatal_err:
+ bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret));
+ percpu_down_read(&c->mark_lock);
+revert_fs_usage:
+ for (struct jset_entry *entry2 = trans->journal_entries;
+ entry2 != entry;
+ entry2 = vstruct_next(entry2))
+ if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys &&
+ entry2->start->k.type == KEY_TYPE_accounting)
+ bch2_accounting_trans_commit_revert(trans,
+ bkey_i_to_accounting(entry2->start), flags);
+ percpu_up_read(&c->mark_lock);
+ return ret;
+}
+
+static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
+{
+ /*
+ * Accounting keys aren't deduped in the journal: we have to compare
+ * each individual update against what's in the btree to see if it has
+ * been applied yet, and accounting updates also don't overwrite;
+ * they're deltas that accumulate.
+ */
+ trans_for_each_update(trans, i)
+ if (i->k->k.type != KEY_TYPE_accounting)
+ bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
+}
+
+static int bch2_trans_commit_journal_pin_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
+{
+ return 0;
+}
+
+/*
+ * Get journal reservation, take write locks, and attempt to do btree update(s):
+ */
+static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags,
+ struct btree_insert_entry **stopped_at,
+ unsigned long trace_ip)
+{
+ struct bch_fs *c = trans->c;
+ int ret = 0, u64s_delta = 0;
+
+ for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
+ struct btree_insert_entry *i = trans->updates + idx;
+ if (i->cached)
+ continue;
+
+ u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
+ u64s_delta -= i->old_btree_u64s;
+
+ if (!same_leaf_as_next(trans, i)) {
+ if (u64s_delta <= 0) {
+ ret = bch2_foreground_maybe_merge(trans, i->path,
+ i->level, flags);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ u64s_delta = 0;
+ }
+ }
+
+ ret = bch2_trans_lock_write(trans);
+ if (unlikely(ret))
+ return ret;
+
+ ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip);
+
+ if (!ret && unlikely(trans->journal_replay_not_finished))
+ bch2_drop_overwrites_from_journal(trans);
+
+ bch2_trans_unlock_write(trans);
+
+ if (!ret && trans->journal_pin)
+ bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
+ trans->journal_pin,
+ bch2_trans_commit_journal_pin_flush);
+
+ /*
+ * Drop journal reservation after dropping write locks, since dropping
+ * the journal reservation may kick off a journal write:
+ */
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
+ bch2_journal_res_put(&c->journal, &trans->journal_res);
+
+ return ret;
+}
+
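+/*
+ * Wait condition for key cache flushing: we're done waiting when the journal
+ * has errored or journal reclaim has made room in the key cache; otherwise,
+ * kick reclaim and keep waiting:
+ */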
+static int journal_reclaim_wait_done(struct bch_fs *c)
+{
+ int ret = bch2_journal_error(&c->journal) ?:
+ bch2_btree_key_cache_wait_done(c);
+
+ if (!ret)
+ journal_reclaim_kick(&c->journal);
+ return ret;
+}
+
+static noinline
+int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
+ struct btree_insert_entry *i,
+ int ret, unsigned long trace_ip)
+{
+ struct bch_fs *c = trans->c;
+ enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
+
+ switch (ret) {
+ case -BCH_ERR_btree_insert_btree_node_full:
+ ret = bch2_btree_split_leaf(trans, i->path, flags);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ trace_and_count(c, trans_restart_btree_node_split, trans,
+ trace_ip, trans->paths + i->path);
+ break;
+ case -BCH_ERR_btree_insert_need_mark_replicas:
+ ret = drop_locks_do(trans,
+ bch2_accounting_update_sb(trans));
+ break;
+ case -BCH_ERR_journal_res_get_blocked:
+ /*
+ * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
+ * flag
+ */
+ if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
+ watermark < BCH_WATERMARK_reclaim) {
+ ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ break;
+ }
+
+ ret = drop_locks_do(trans,
+ bch2_trans_journal_res_get(trans,
+ (flags & BCH_WATERMARK_MASK)|
+ JOURNAL_RES_GET_CHECK));
+ break;
+ case -BCH_ERR_btree_insert_need_journal_reclaim:
+ bch2_trans_unlock(trans);
+
+ trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);
+ track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], true);
+
+ wait_event_freezable(c->journal.reclaim_wait,
+ (ret = journal_reclaim_wait_done(c)));
+
+ track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], false);
+
+ if (ret < 0)
+ break;
+
+ ret = bch2_trans_relock(trans);
+ break;
+ default:
+ BUG_ON(ret >= 0);
+ break;
+ }
+
+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
+
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
+ (flags & BCH_TRANS_COMMIT_no_enospc), c,
+ "%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
+
+ return ret;
+}
+
+/*
+ * This is for updates done in the early part of fsck - btree_gc - before we've
+ * gone RW. We only add the new key to the list of keys for journal replay to
+ * do.
+ */
+static noinline int
+do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+
+ BUG_ON(current != c->recovery_task);
+
+ trans_for_each_update(trans, i) {
+ int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
+ if (ret)
+ return ret;
+ }
+
+ for (struct jset_entry *i = trans->journal_entries;
+ i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ i = vstruct_next(i))
+ if (i->type == BCH_JSET_ENTRY_btree_keys ||
+ i->type == BCH_JSET_ENTRY_write_buffer_keys) {
+ int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->start);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
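+/*
+ * Main transaction commit path: run triggers, size the journal reservation,
+ * upgrade btree paths to intent locks, then attempt the commit, retrying on
+ * recoverable errors:
+ */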
+int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
+{
+ struct btree_insert_entry *errored_at = NULL;
+ struct bch_fs *c = trans->c;
+ int ret = 0;
+
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+
+ if (!trans->nr_updates &&
+ !trans->journal_entries_u64s)
+ goto out_reset;
+
+ ret = bch2_trans_commit_run_triggers(trans);
+ if (ret)
+ goto out_reset;
+
+ if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
+ unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
+ if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags)))
+ ret = do_bch2_trans_commit_to_journal_replay(trans);
+ else
+ ret = -BCH_ERR_erofs_trans_commit;
+ goto out_reset;
+ }
+
+ EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
+
+ trans->journal_u64s = trans->journal_entries_u64s;
+ trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
+ if (trans->journal_transaction_names)
+ trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
+
+ trans_for_each_update(trans, i) {
+ struct btree_path *path = trans->paths + i->path;
+
+ EBUG_ON(!path->should_be_locked);
+
+ ret = bch2_btree_path_upgrade(trans, path, i->level + 1);
+ if (unlikely(ret))
+ goto out;
+
+ EBUG_ON(!btree_node_intent_locked(path, i->level));
+
+ if (i->key_cache_already_flushed)
+ continue;
+
+ if (i->flags & BTREE_UPDATE_nojournal)
+ continue;
+
+ /* we're going to journal the key being updated: */
+ trans->journal_u64s += jset_u64s(i->k->k.u64s);
+
+ /* and we're also going to log the overwrite: */
+ if (trans->journal_transaction_names)
+ trans->journal_u64s += jset_u64s(i->old_k.u64s);
+ }
+
+ if (trans->extra_disk_res) {
+ ret = bch2_disk_reservation_add(c, trans->disk_res,
+ trans->extra_disk_res,
+ (flags & BCH_TRANS_COMMIT_no_enospc)
+ ? BCH_DISK_RESERVATION_NOFAIL : 0);
+ if (ret)
+ goto err;
+ }
+retry:
+ errored_at = NULL;
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
+ memset(&trans->journal_res, 0, sizeof(trans->journal_res));
+ memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
+
+ ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_);
+
+ /* make sure we didn't drop or screw up locks: */
+ bch2_trans_verify_locks(trans);
+
+ if (ret)
+ goto err;
+
+ trace_and_count(c, transaction_commit, trans, _RET_IP_);
+out:
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw)))
+ bch2_write_ref_put(c, BCH_WRITE_REF_trans);
+out_reset:
+ if (!ret)
+ bch2_trans_downgrade(trans);
+ bch2_trans_reset_updates(trans);
+
+ return ret;
+err:
+ ret = bch2_trans_commit_error(trans, flags, errored_at, ret, _RET_IP_);
+ if (ret)
+ goto out;
+
+ /*
+ * We might have done another transaction commit in the error path -
+ * i.e. btree write buffer flush - which will have made use of
+ * trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is
+ * how the journal sequence number to pin is passed in - so we must
+ * restart:
+ */
+ if (flags & BCH_TRANS_COMMIT_no_journal_res) {
+ ret = -BCH_ERR_transaction_restart_nested;
+ goto out;
+ }
+
+ goto retry;
+}
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 39e2db75..baab5288 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -1,15 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_TYPES_H
#define _BCACHEFS_BTREE_TYPES_H
#include <linux/list.h>
#include <linux/rhashtable.h>
-#include "bkey_methods.h"
+#include "bbpos_types.h"
+#include "btree_key_cache_types.h"
+#include "buckets_types.h"
+#include "darray.h"
+#include "errcode.h"
#include "journal_types.h"
+#include "replicas_types.h"
#include "six.h"
struct open_bucket;
struct btree_update;
+struct btree_trans;
#define MAX_BSETS 3U
@@ -44,40 +51,35 @@ struct bset_tree {
u16 data_offset;
u16 aux_data_offset;
u16 end_offset;
-
- struct bpos max_key;
};
struct btree_write {
struct journal_entry_pin journal;
- struct closure_waitlist wait;
};
-struct btree_ob_ref {
- u8 nr;
- u8 refs[BCH_REPLICAS_MAX];
+struct btree_alloc {
+ struct open_buckets ob;
+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX);
};
-struct btree_alloc {
- struct btree_ob_ref ob;
- BKEY_PADDED(k);
+struct btree_bkey_cached_common {
+ struct six_lock lock;
+ u8 level;
+ u8 btree_id;
+ bool cached;
};
struct btree {
- /* Hottest entries first */
- struct rhash_head hash;
-
- /* Key/pointer for this btree node */
- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+ struct btree_bkey_cached_common c;
- struct six_lock lock;
+ struct rhash_head hash;
+ u64 hash_val;
unsigned long flags;
u16 written;
- u8 level;
- u8 btree_id;
u8 nsets;
u8 nr_key_bits;
+ u16 version_ondisk;
struct bkey_format format;
@@ -96,10 +98,14 @@ struct btree {
struct btree_nr_keys nr;
u16 sib_u64s[2];
u16 whiteout_u64s;
- u16 uncompacted_whiteout_u64s;
- u8 page_order;
+ u8 byte_order;
u8 unpack_fn_len;
+ struct btree_write writes[2];
+
+ /* Key/pointer for this btree node */
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+
/*
* XXX: add a delete sequence number, so when bch2_btree_node_relock()
* fails because the lock sequence number has changed - i.e. the
@@ -126,16 +132,35 @@ struct btree {
*/
unsigned long will_make_reachable;
- struct btree_ob_ref ob;
+ struct open_buckets ob;
/* lru list */
struct list_head list;
+};
- struct btree_write writes[2];
+#define BCH_BTREE_CACHE_NOT_FREED_REASONS() \
+ x(lock_intent) \
+ x(lock_write) \
+ x(dirty) \
+ x(read_in_flight) \
+ x(write_in_flight) \
+ x(noevict) \
+ x(write_blocked) \
+ x(will_make_reachable) \
+ x(access_bit)
+
+enum bch_btree_cache_not_freed_reasons {
+#define x(n) BCH_BTREE_CACHE_NOT_FREED_##n,
+ BCH_BTREE_CACHE_NOT_FREED_REASONS()
+#undef x
+ BCH_BTREE_CACHE_NOT_FREED_REASONS_NR,
+};
-#ifdef CONFIG_BCACHEFS_DEBUG
- bool *expensive_debug_checks;
-#endif
+struct btree_cache_list {
+ unsigned idx;
+ struct shrinker *shrink;
+ struct list_head list;
+ size_t nr;
};
struct btree_cache {
@@ -155,14 +180,19 @@ struct btree_cache {
* should never grow past ~2-3 nodes in practice.
*/
struct mutex lock;
- struct list_head live;
struct list_head freeable;
- struct list_head freed;
+ struct list_head freed_pcpu;
+ struct list_head freed_nonpcpu;
+ struct btree_cache_list live[2];
+
+ size_t nr_freeable;
+ size_t nr_reserve;
+ size_t nr_by_btree[BTREE_ID_NR];
+ atomic_long_t nr_dirty;
- /* Number of elements in live + freeable lists */
- unsigned used;
- unsigned reserve;
- struct shrinker shrink;
+ /* shrinker stats */
+ size_t nr_freed;
+ u64 not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR];
/*
* If we need to allocate memory for a new btree node and that
@@ -172,44 +202,163 @@ struct btree_cache {
*/
struct task_struct *alloc_lock;
struct closure_waitlist alloc_wait;
+
+ struct bbpos pinned_nodes_start;
+ struct bbpos pinned_nodes_end;
+ /* btree id mask: 0 for leaves, 1 for interior */
+ u64 pinned_nodes_mask[2];
};
struct btree_node_iter {
- u8 is_extents;
-
struct btree_node_iter_set {
u16 k, end;
} data[MAX_BSETS];
};
-enum btree_iter_type {
- BTREE_ITER_KEYS,
- BTREE_ITER_SLOTS,
- BTREE_ITER_NODES,
-};
+#define BTREE_ITER_FLAGS() \
+ x(slots) \
+ x(intent) \
+ x(prefetch) \
+ x(is_extents) \
+ x(not_extents) \
+ x(cached) \
+ x(with_key_cache) \
+ x(with_updates) \
+ x(with_journal) \
+ x(snapshot_field) \
+ x(all_snapshots) \
+ x(filter_snapshots) \
+ x(nopreserve) \
+ x(cached_nofill) \
+ x(key_cache_fill) \
+
+#define STR_HASH_FLAGS() \
+ x(must_create) \
+ x(must_replace)
+
+#define BTREE_UPDATE_FLAGS() \
+ x(internal_snapshot_node) \
+ x(nojournal) \
+ x(key_cache_reclaim)
-#define BTREE_ITER_TYPE ((1 << 2) - 1)
-#define BTREE_ITER_INTENT (1 << 2)
-#define BTREE_ITER_PREFETCH (1 << 3)
-/*
- * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
- * @pos or the first key strictly greater than @pos
- */
-#define BTREE_ITER_IS_EXTENTS (1 << 4)
/*
- * indicates we need to call bch2_btree_iter_traverse() to revalidate iterator:
+ * BTREE_TRIGGER_norun - don't run triggers at all
+ *
+ * BTREE_TRIGGER_transactional - we're running transactional triggers as part of
+ * a transaction commit: triggers may generate new updates
+ *
+ * BTREE_TRIGGER_atomic - we're running atomic triggers during a transaction
+ * commit: we have our journal reservation, we're holding btree node write
+ * locks, and we know the transaction is going to commit (returning an error
+ * here is a fatal error, causing us to go emergency read-only)
+ *
+ * BTREE_TRIGGER_gc - we're in gc/fsck: running triggers to recalculate e.g. disk usage
+ *
+ * BTREE_TRIGGER_insert - @new is entering the btree
+ * BTREE_TRIGGER_overwrite - @old is leaving the btree
+ *
+ * BTREE_TRIGGER_bucket_invalidate - signal from bucket invalidate path to alloc
+ * trigger
*/
-#define BTREE_ITER_AT_END_OF_LEAF (1 << 5)
-#define BTREE_ITER_ERROR (1 << 6)
+#define BTREE_TRIGGER_FLAGS() \
+ x(norun) \
+ x(transactional) \
+ x(atomic) \
+ x(check_repair) \
+ x(gc) \
+ x(insert) \
+ x(overwrite) \
+ x(is_root) \
+ x(bucket_invalidate)
+
+enum {
+#define x(n) BTREE_ITER_FLAG_BIT_##n,
+ BTREE_ITER_FLAGS()
+ STR_HASH_FLAGS()
+ BTREE_UPDATE_FLAGS()
+ BTREE_TRIGGER_FLAGS()
+#undef x
+};
+
+/* iter flags must fit in a u16: */
+//BUILD_BUG_ON(BTREE_ITER_FLAG_BIT_key_cache_fill > 15);
+
+enum btree_iter_update_trigger_flags {
+#define x(n) BTREE_ITER_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
+ BTREE_ITER_FLAGS()
+#undef x
+#define x(n) STR_HASH_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
+ STR_HASH_FLAGS()
+#undef x
+#define x(n) BTREE_UPDATE_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
+ BTREE_UPDATE_FLAGS()
+#undef x
+#define x(n) BTREE_TRIGGER_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
+ BTREE_TRIGGER_FLAGS()
+#undef x
+};
-enum btree_iter_uptodate {
+enum btree_path_uptodate {
BTREE_ITER_UPTODATE = 0,
- BTREE_ITER_NEED_PEEK = 1,
- BTREE_ITER_NEED_RELOCK = 2,
- BTREE_ITER_NEED_TRAVERSE = 3,
+ BTREE_ITER_NEED_RELOCK = 1,
+ BTREE_ITER_NEED_TRAVERSE = 2,
};
+#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG)
+#define TRACK_PATH_ALLOCATED
+#endif
+
+typedef u16 btree_path_idx_t;
+
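+/*
+ * A btree_path records a position within a btree and the nodes locked at each
+ * level; iterators and updates refer to paths by index:
+ */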
+struct btree_path {
+ btree_path_idx_t sorted_idx;
+ u8 ref;
+ u8 intent_ref;
+
+ /* btree_iter_copy starts here: */
+ struct bpos pos;
+
+ enum btree_id btree_id:5;
+ bool cached:1;
+ bool preserve:1;
+ enum btree_path_uptodate uptodate:2;
+ /*
+ * When true, failing to relock this path will cause the transaction to
+ * restart:
+ */
+ bool should_be_locked:1;
+ unsigned level:3,
+ locks_want:3;
+ u8 nodes_locked;
+
+ struct btree_path_level {
+ struct btree *b;
+ struct btree_node_iter iter;
+ u32 lock_seq;
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+ u64 lock_taken_time;
+#endif
+ } l[BTREE_MAX_DEPTH];
+#ifdef TRACK_PATH_ALLOCATED
+ unsigned long ip_allocated;
+#endif
+};
+
+static inline struct btree_path_level *path_l(struct btree_path *path)
+{
+ return path->l + path->level;
+}
+
+static inline unsigned long btree_path_ip_allocated(struct btree_path *path)
+{
+#ifdef TRACK_PATH_ALLOCATED
+ return path->ip_allocated;
+#else
+ return _THIS_IP_;
+#endif
+}
+
/*
* @pos - iterator's current position
* @level - current btree depth
@@ -218,76 +367,252 @@ enum btree_iter_uptodate {
* @nodes_intent_locked - bitmask indicating which locks are intent locks
*/
struct btree_iter {
- struct bch_fs *c;
- struct bpos pos;
+ struct btree_trans *trans;
+ btree_path_idx_t path;
+ btree_path_idx_t update_path;
+ btree_path_idx_t key_cache_path;
- u8 flags;
- enum btree_iter_uptodate uptodate:4;
- enum btree_id btree_id:4;
- unsigned level:4,
- locks_want:4,
- nodes_locked:4,
- nodes_intent_locked:4;
+ enum btree_id btree_id:8;
+ u8 min_depth;
- struct btree_iter_level {
- struct btree *b;
- struct btree_node_iter iter;
- } l[BTREE_MAX_DEPTH];
+ /* btree_iter_copy starts here: */
+ u16 flags;
- u32 lock_seq[BTREE_MAX_DEPTH];
+ /* When we're filtering by snapshot, the snapshot ID we're looking for: */
+ unsigned snapshot;
+ struct bpos pos;
/*
* Current unpacked key - so that bch2_btree_iter_next()/
* bch2_btree_iter_next_slot() can correctly advance pos.
*/
struct bkey k;
- /*
- * Circular linked list of linked iterators: linked iterators share
- * locks (e.g. two linked iterators may have the same node intent
- * locked, or read and write locked, at the same time), and insertions
- * through one iterator won't invalidate the other linked iterators.
- */
+ /* BTREE_ITER_with_journal: */
+ size_t journal_idx;
+#ifdef TRACK_PATH_ALLOCATED
+ unsigned long ip_allocated;
+#endif
+};
+
+#define BKEY_CACHED_ACCESSED 0
+#define BKEY_CACHED_DIRTY 1
+
+struct bkey_cached {
+ struct btree_bkey_cached_common c;
+
+ unsigned long flags;
+ u16 u64s;
+ struct bkey_cached_key key;
+
+ struct rhash_head hash;
- /* Must come last: */
- struct btree_iter *next;
+ struct journal_entry_pin journal;
+ u64 seq;
+
+ struct bkey_i *k;
+ struct rcu_head rcu;
};
-#define BTREE_ITER_MAX 8
+static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
+{
+ return !b->cached
+ ? container_of(b, struct btree, c)->key.k.p
+ : container_of(b, struct bkey_cached, c)->key.pos;
+}
struct btree_insert_entry {
- struct btree_iter *iter;
- struct bkey_i *k;
- unsigned extra_res;
+ unsigned flags;
+ u8 bkey_type;
+ enum btree_id btree_id:8;
+ u8 level:4;
+ bool cached:1;
+ bool insert_trigger_run:1;
+ bool overwrite_trigger_run:1;
+ bool key_cache_already_flushed:1;
/*
- * true if entire key was inserted - can only be false for
- * extents
+ * @old_k may be a key from the journal; @old_btree_u64s always refers
+ * to the size of the key being overwritten in the btree:
*/
- bool done;
+ u8 old_btree_u64s;
+ btree_path_idx_t path;
+ struct bkey_i *k;
+ /* key being overwritten: */
+ struct bkey old_k;
+ const struct bch_val *old_v;
+ unsigned long ip_allocated;
+};
+
+/* Number of btree paths we preallocate, usually enough */
+#define BTREE_ITER_INITIAL 64
+/*
+ * Limit for btree_trans_too_many_iters(); this is enough that almost all code
+ * paths should run inside this limit, and if they don't it usually indicates a
+ * bug (leaking/duplicated btree paths).
+ *
+ * exception: some fsck paths
+ *
+ * bugs with excessive path usage seem to have possibly been eliminated now, so
+ * we might consider eliminating this (and btree_trans_too_many_iters()) at some
+ * point.
+ */
+#define BTREE_ITER_NORMAL_LIMIT 256
+/* never exceed limit */
+#define BTREE_ITER_MAX (1U << 10)
+
+struct btree_trans_commit_hook;
+typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
+
+struct btree_trans_commit_hook {
+ btree_trans_commit_hook_fn *fn;
+ struct btree_trans_commit_hook *next;
+};
+
+#define BTREE_TRANS_MEM_MAX (1U << 16)
+
+#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000
+
+struct btree_trans_paths {
+ unsigned long nr_paths;
+ struct btree_path paths[];
};
struct btree_trans {
struct bch_fs *c;
- size_t nr_restarts;
- u8 nr_iters;
- u8 iters_live;
- u8 iters_linked;
- u8 nr_updates;
+ unsigned long *paths_allocated;
+ struct btree_path *paths;
+ btree_path_idx_t *sorted;
+ struct btree_insert_entry *updates;
+ void *mem;
unsigned mem_top;
unsigned mem_bytes;
- void *mem;
- struct btree_iter *iters;
- u64 iter_ids[BTREE_ITER_MAX];
+ btree_path_idx_t nr_sorted;
+ btree_path_idx_t nr_paths;
+ btree_path_idx_t nr_paths_max;
+ btree_path_idx_t nr_updates;
+ u8 fn_idx;
+ u8 lock_must_abort;
+ bool lock_may_not_fail:1;
+ bool srcu_held:1;
+ bool locked:1;
+ bool pf_memalloc_nofs:1;
+ bool write_locked:1;
+ bool used_mempool:1;
+ bool in_traverse_all:1;
+ bool paths_sorted:1;
+ bool memory_allocation_failure:1;
+ bool journal_transaction_names:1;
+ bool journal_replay_not_finished:1;
+ bool notrace_relock_fail:1;
+ enum bch_errcode restarted:16;
+ u32 restart_count;
+
+ u64 last_begin_time;
+ unsigned long last_begin_ip;
+ unsigned long last_restarted_ip;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ bch_stacktrace last_restarted_trace;
+#endif
+ unsigned long last_unlock_ip;
+ unsigned long srcu_lock_time;
+
+ const char *fn;
+ struct btree_bkey_cached_common *locking;
+ struct six_lock_waiter locking_wait;
+ int srcu_idx;
+
+ /* update path: */
+ u16 journal_entries_u64s;
+ u16 journal_entries_size;
+ struct jset_entry *journal_entries;
+
+ struct btree_trans_commit_hook *hooks;
+ struct journal_entry_pin *journal_pin;
- struct btree_insert_entry updates[BTREE_ITER_MAX];
+ struct journal_res journal_res;
+ u64 *journal_seq;
+ struct disk_reservation *disk_res;
- struct btree_iter iters_onstack[2];
+ struct bch_fs_usage_base fs_usage_delta;
+
+ unsigned journal_u64s;
+ unsigned extra_disk_res; /* XXX kill */
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+ /* Entries before this are zeroed out on every bch2_trans_get() call */
+
+ struct list_head list;
+ struct closure ref;
+
+ unsigned long _paths_allocated[BITS_TO_LONGS(BTREE_ITER_INITIAL)];
+ struct btree_trans_paths trans_paths;
+ struct btree_path _paths[BTREE_ITER_INITIAL];
+ btree_path_idx_t _sorted[BTREE_ITER_INITIAL + 4];
+ struct btree_insert_entry _updates[BTREE_ITER_INITIAL];
+};
+
+static inline struct btree_path *btree_iter_path(struct btree_trans *trans, struct btree_iter *iter)
+{
+ return trans->paths + iter->path;
+}
+
+static inline struct btree_path *btree_iter_key_cache_path(struct btree_trans *trans, struct btree_iter *iter)
+{
+ return iter->key_cache_path
+ ? trans->paths + iter->key_cache_path
+ : NULL;
+}
+
+#define BCH_BTREE_WRITE_TYPES() \
+ x(initial, 0) \
+ x(init_next_bset, 1) \
+ x(cache_reclaim, 2) \
+ x(journal_reclaim, 3) \
+ x(interior, 4)
+
+enum btree_write_type {
+#define x(t, n) BTREE_WRITE_##t,
+ BCH_BTREE_WRITE_TYPES()
+#undef x
+ BTREE_WRITE_TYPE_NR,
+};
+
+#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1)
+#define BTREE_WRITE_TYPE_BITS ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR))
+
+#define BTREE_FLAGS() \
+ x(read_in_flight) \
+ x(read_error) \
+ x(dirty) \
+ x(need_write) \
+ x(write_blocked) \
+ x(will_make_reachable) \
+ x(noevict) \
+ x(write_idx) \
+ x(accessed) \
+ x(write_in_flight) \
+ x(write_in_flight_inner) \
+ x(just_written) \
+ x(dying) \
+ x(fake) \
+ x(need_rewrite) \
+ x(never_write) \
+ x(pinned)
+
+enum btree_flags {
+ /* First bits for btree node write type */
+ BTREE_NODE_FLAGS_START = BTREE_WRITE_TYPE_BITS - 1,
+#define x(flag) BTREE_NODE_##flag,
+ BTREE_FLAGS()
+#undef x
};
-#define BTREE_FLAG(flag) \
+#define x(flag) \
static inline bool btree_node_ ## flag(struct btree *b) \
{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
\
@@ -297,31 +622,8 @@ static inline void set_btree_node_ ## flag(struct btree *b) \
static inline void clear_btree_node_ ## flag(struct btree *b) \
{ clear_bit(BTREE_NODE_ ## flag, &b->flags); }
-enum btree_flags {
- BTREE_NODE_read_in_flight,
- BTREE_NODE_read_error,
- BTREE_NODE_dirty,
- BTREE_NODE_need_write,
- BTREE_NODE_noevict,
- BTREE_NODE_write_idx,
- BTREE_NODE_accessed,
- BTREE_NODE_write_in_flight,
- BTREE_NODE_just_written,
- BTREE_NODE_dying,
- BTREE_NODE_fake,
-};
-
-BTREE_FLAG(read_in_flight);
-BTREE_FLAG(read_error);
-BTREE_FLAG(dirty);
-BTREE_FLAG(need_write);
-BTREE_FLAG(noevict);
-BTREE_FLAG(write_idx);
-BTREE_FLAG(accessed);
-BTREE_FLAG(write_in_flight);
-BTREE_FLAG(just_written);
-BTREE_FLAG(dying);
-BTREE_FLAG(fake);
+BTREE_FLAGS()
+#undef x
static inline struct btree_write *btree_current_write(struct btree *b)
{
@@ -339,10 +641,38 @@ static inline struct bset_tree *bset_tree_last(struct btree *b)
return b->set + b->nsets - 1;
}
+static inline void *
+__btree_node_offset_to_ptr(const struct btree *b, u16 offset)
+{
+ return (void *) ((u64 *) b->data + 1 + offset);
+}
+
+static inline u16
+__btree_node_ptr_to_offset(const struct btree *b, const void *p)
+{
+ u16 ret = (u64 *) p - 1 - (u64 *) b->data;
+
+ EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p);
+ return ret;
+}
+
static inline struct bset *bset(const struct btree *b,
const struct bset_tree *t)
{
- return (void *) b->data + t->data_offset * sizeof(u64);
+ return __btree_node_offset_to_ptr(b, t->data_offset);
+}
+
+static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
+{
+ t->end_offset =
+ __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t)));
+}
+
+static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
+ const struct bset *i)
+{
+ t->data_offset = __btree_node_ptr_to_offset(b, i);
+ set_btree_bset_end(b, t);
}
static inline struct bset *btree_bset_first(struct btree *b)
@@ -358,19 +688,27 @@ static inline struct bset *btree_bset_last(struct btree *b)
static inline u16
__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
{
- size_t ret = (u64 *) k - (u64 *) b->data - 1;
-
- EBUG_ON(ret > U16_MAX);
- return ret;
+ return __btree_node_ptr_to_offset(b, k);
}
static inline struct bkey_packed *
__btree_node_offset_to_key(const struct btree *b, u16 k)
{
- return (void *) ((u64 *) b->data + k + 1);
+ return __btree_node_offset_to_ptr(b, k);
}
-#define btree_bkey_first(_b, _t) (bset(_b, _t)->start)
+static inline unsigned btree_bkey_first_offset(const struct bset_tree *t)
+{
+ return t->data_offset + offsetof(struct bset, _data) / sizeof(u64);
+}
+
+#define btree_bkey_first(_b, _t) \
+({ \
+ EBUG_ON(bset(_b, _t)->start != \
+ __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\
+ \
+ bset(_b, _t)->start; \
+})
#define btree_bkey_last(_b, _t) \
({ \
@@ -380,21 +718,15 @@ __btree_node_offset_to_key(const struct btree *b, u16 k)
__btree_node_offset_to_key(_b, (_t)->end_offset); \
})
-static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
+static inline unsigned bset_u64s(struct bset_tree *t)
{
- t->end_offset =
- __btree_node_key_to_offset(b, vstruct_last(bset(b, t)));
- btree_bkey_last(b, t);
+ return t->end_offset - t->data_offset -
+ sizeof(struct bset) / sizeof(u64);
}
-static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
- const struct bset *i)
+static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t)
{
- t->data_offset = (u64 *) i - (u64 *) b->data;
-
- EBUG_ON(bset(b, t) != i);
-
- set_btree_bset_end(b, t);
+ return bset_u64s(t) - b->nr.bset_u64s[t - b->set];
}
static inline unsigned bset_byte_offset(struct btree *b, void *i)
@@ -402,63 +734,119 @@ static inline unsigned bset_byte_offset(struct btree *b, void *i)
return i - (void *) b->data;
}
+enum btree_node_type {
+ BKEY_TYPE_btree,
+#define x(kwd, val, ...) BKEY_TYPE_##kwd = val + 1,
+ BCH_BTREE_IDS()
+#undef x
+ BKEY_TYPE_NR
+};
+
+/* Type of a key in btree @id at level @level: */
+static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
+{
+ return level ? BKEY_TYPE_btree : (unsigned) id + 1;
+}
+
/* Type of keys @b contains: */
-static inline enum bkey_type btree_node_type(struct btree *b)
+static inline enum btree_node_type btree_node_type(struct btree *b)
+{
+ return __btree_node_type(b->c.level, b->c.btree_id);
+}
+
+const char *bch2_btree_node_type_str(enum btree_node_type);
+
+#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
+ (BIT_ULL(BKEY_TYPE_extents)| \
+ BIT_ULL(BKEY_TYPE_alloc)| \
+ BIT_ULL(BKEY_TYPE_inodes)| \
+ BIT_ULL(BKEY_TYPE_stripes)| \
+ BIT_ULL(BKEY_TYPE_reflink)| \
+ BIT_ULL(BKEY_TYPE_subvolumes)| \
+ BIT_ULL(BKEY_TYPE_btree))
+
+#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \
+ (BIT_ULL(BKEY_TYPE_alloc)| \
+ BIT_ULL(BKEY_TYPE_inodes)| \
+ BIT_ULL(BKEY_TYPE_stripes)| \
+ BIT_ULL(BKEY_TYPE_snapshots))
+
+#define BTREE_NODE_TYPE_HAS_TRIGGERS \
+ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
+ BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS)
+
+static inline bool btree_node_type_has_trans_triggers(enum btree_node_type type)
+{
+ return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS;
+}
+
+static inline bool btree_node_type_has_atomic_triggers(enum btree_node_type type)
+{
+ return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS;
+}
+
+static inline bool btree_node_type_has_triggers(enum btree_node_type type)
+{
+ return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS;
+}
+
+static inline bool btree_node_type_is_extents(enum btree_node_type type)
{
- return b->level ? BKEY_TYPE_BTREE : b->btree_id;
+ const u64 mask = 0
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1))
+ BCH_BTREE_IDS()
+#undef x
+ ;
+
+ return BIT_ULL(type) & mask;
}
-static inline const struct bkey_ops *btree_node_ops(struct btree *b)
+static inline bool btree_id_is_extents(enum btree_id btree)
{
- return &bch2_bkey_ops[btree_node_type(b)];
+ return btree_node_type_is_extents(__btree_node_type(0, btree));
}
-static inline bool btree_node_has_ptrs(struct btree *b)
+static inline bool btree_type_has_snapshots(enum btree_id id)
{
- return btree_type_has_ptrs(btree_node_type(b));
+ const u64 mask = 0
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr)
+ BCH_BTREE_IDS()
+#undef x
+ ;
+
+ return BIT_ULL(id) & mask;
}
-static inline bool btree_node_is_extents(struct btree *b)
+static inline bool btree_type_has_snapshot_field(enum btree_id id)
{
- return btree_node_type(b) == BKEY_TYPE_EXTENTS;
+ const u64 mask = 0
+#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr)
+ BCH_BTREE_IDS()
+#undef x
+ ;
+
+ return BIT_ULL(id) & mask;
+}
+
+static inline bool btree_type_has_ptrs(enum btree_id id)
+{
+ const u64 mask = 0
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_DATA)) << nr)
+ BCH_BTREE_IDS()
+#undef x
+ ;
+
+ return BIT_ULL(id) & mask;
}
struct btree_root {
struct btree *b;
- struct btree_update *as;
-
/* On disk root - see async splits: */
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
u8 level;
u8 alive;
-};
-
-/*
- * Optional hook that will be called just prior to a btree node update, when
- * we're holding the write lock and we know what key is about to be overwritten:
- */
-
-struct btree_iter;
-struct btree_node_iter;
-
-enum btree_insert_ret {
- BTREE_INSERT_OK,
- /* extent spanned multiple leaf nodes: have to traverse to next node: */
- BTREE_INSERT_NEED_TRAVERSE,
- /* write lock held for too long */
- BTREE_INSERT_NEED_RESCHED,
- /* leaf node needs to be split */
- BTREE_INSERT_BTREE_NODE_FULL,
- BTREE_INSERT_JOURNAL_RES_FULL,
- BTREE_INSERT_ENOSPC,
- BTREE_INSERT_NEED_GC_LOCK,
-};
-
-struct extent_insert_hook {
- enum btree_insert_ret
- (*fn)(struct extent_insert_hook *, struct bpos, struct bpos,
- struct bkey_s_c, const struct bkey_i *);
+ s16 error;
};
enum btree_gc_coalesce_fail_reason {
@@ -472,8 +860,9 @@ enum btree_node_sibling {
btree_next_sib,
};
-typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
- struct btree *,
- struct btree_node_iter *);
+struct get_locks_fail {
+ unsigned l;
+ struct btree *b;
+};
#endif /* _BCACHEFS_BTREE_TYPES_H */
diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c
new file mode 100644
index 00000000..06fd5aa6
--- /dev/null
+++ b/libbcachefs/btree_update.c
@@ -0,0 +1,898 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "btree_iter.h"
+#include "btree_journal_iter.h"
+#include "btree_locking.h"
+#include "buckets.h"
+#include "debug.h"
+#include "errcode.h"
+#include "error.h"
+#include "extents.h"
+#include "keylist.h"
+#include "snapshot.h"
+#include "trace.h"
+
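+/*
+ * Sort order for pending updates: btree, then cached vs. uncached, then level
+ * (descending), then key position:
+ */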
+static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
+ const struct btree_insert_entry *r)
+{
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ cmp_int(l->cached, r->cached) ?:
+ -cmp_int(l->level, r->level) ?:
+ bpos_cmp(l->k->k.p, r->k->k.p);
+}
+
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t,
+ struct bkey_i *, enum btree_iter_update_trigger_flags,
+ unsigned long ip);
+
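+/*
+ * Try to merge @insert with the existing extent ending at its start position;
+ * skipped during journal replay, or if either key has snapshot overwrites:
+ */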
+static noinline int extent_front_merge(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bkey_i **insert,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i *update;
+ int ret;
+
+ if (unlikely(trans->journal_replay_not_finished))
+ return 0;
+
+ update = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ return ret;
+
+ if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert)))
+ return 0;
+
+ ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p) ?:
+ bch2_key_has_snapshot_overwrites(trans, iter->btree_id, (*insert)->k.p);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+
+ ret = bch2_btree_delete_at(trans, iter, flags);
+ if (ret)
+ return ret;
+
+ *insert = update;
+ return 0;
+}
+
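+/*
+ * Try to merge the existing extent starting at @insert's end position into
+ * @insert, with the same journal replay and snapshot restrictions:
+ */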
+static noinline int extent_back_merge(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *insert,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ int ret;
+
+ if (unlikely(trans->journal_replay_not_finished))
+ return 0;
+
+ ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?:
+ bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+
+ bch2_bkey_merge(c, bkey_i_to_s(insert), k);
+ return 0;
+}
+
+/*
+ * When deleting, check if we need to emit a whiteout (because we're overwriting
+ * something in an ancestor snapshot)
+ */
+static int need_whiteout_for_snapshot(struct btree_trans *trans,
+ enum btree_id btree_id, struct bpos pos)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u32 snapshot = pos.snapshot;
+ int ret;
+
+ if (!bch2_snapshot_parent(trans->c, pos.snapshot))
+ return 0;
+
+ pos.snapshot++;
+
+ for_each_btree_key_norestart(trans, iter, btree_id, pos,
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_nopreserve, k, ret) {
+ if (!bkey_eq(k.k->p, pos))
+ break;
+
+ if (bch2_snapshot_is_ancestor(trans->c, snapshot,
+ k.k->p.snapshot)) {
+ ret = !bkey_whiteout(k.k);
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
+ enum btree_id id,
+ struct bpos old_pos,
+ struct bpos new_pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter old_iter, new_iter = { NULL };
+ struct bkey_s_c old_k, new_k;
+ snapshot_id_list s;
+ struct bkey_i *update;
+ int ret = 0;
+
+ if (!bch2_snapshot_has_children(c, old_pos.snapshot))
+ return 0;
+
+ darray_init(&s);
+
+ bch2_trans_iter_init(trans, &old_iter, id, old_pos,
+ BTREE_ITER_not_extents|
+ BTREE_ITER_all_snapshots);
+ while ((old_k = bch2_btree_iter_prev(&old_iter)).k &&
+ !(ret = bkey_err(old_k)) &&
+ bkey_eq(old_pos, old_k.k->p)) {
+ struct bpos whiteout_pos =
+ SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
+
+ if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
+ snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
+ continue;
+
+ new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
+ BTREE_ITER_not_extents|
+ BTREE_ITER_intent);
+ ret = bkey_err(new_k);
+ if (ret)
+ break;
+
+ if (new_k.k->type == KEY_TYPE_deleted) {
+ update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ break;
+
+ bkey_init(&update->k);
+ update->k.p = whiteout_pos;
+ update->k.type = KEY_TYPE_whiteout;
+
+ ret = bch2_trans_update(trans, &new_iter, update,
+ BTREE_UPDATE_internal_snapshot_node);
+ }
+ bch2_trans_iter_exit(trans, &new_iter);
+
+ ret = snapshot_list_add(c, &s, old_k.k->p.snapshot);
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &new_iter);
+ bch2_trans_iter_exit(trans, &old_iter);
+ darray_exit(&s);
+
+ return ret;
+}
+
+int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
+ struct btree_iter *iter,
+ enum btree_iter_update_trigger_flags flags,
+ struct bkey_s_c old,
+ struct bkey_s_c new)
+{
+ enum btree_id btree_id = iter->btree_id;
+ struct bkey_i *update;
+ struct bpos new_start = bkey_start_pos(new.k);
+ unsigned front_split = bkey_lt(bkey_start_pos(old.k), new_start);
+ unsigned back_split = bkey_gt(old.k->p, new.k->p);
+ unsigned middle_split = (front_split || back_split) &&
+ old.k->p.snapshot != new.k->p.snapshot;
+ unsigned nr_splits = front_split + back_split + middle_split;
+ int ret = 0, compressed_sectors;
+
+ /*
+ * If we're going to be splitting a compressed extent, note it
+ * so that __bch2_trans_commit() can increase our disk
+ * reservation:
+ */
+ if (nr_splits > 1 &&
+ (compressed_sectors = bch2_bkey_sectors_compressed(old)))
+ trans->extra_disk_res += compressed_sectors * (nr_splits - 1);
+
+ if (front_split) {
+ update = bch2_bkey_make_mut_noupdate(trans, old);
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ return ret;
+
+ bch2_cut_back(new_start, update);
+
+ ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
+ old.k->p, update->k.p) ?:
+ bch2_btree_insert_nonextent(trans, btree_id, update,
+ BTREE_UPDATE_internal_snapshot_node|flags);
+ if (ret)
+ return ret;
+ }
+
+ /* If we're overwriting in a different snapshot - middle split: */
+ if (middle_split) {
+ update = bch2_bkey_make_mut_noupdate(trans, old);
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ return ret;
+
+ bch2_cut_front(new_start, update);
+ bch2_cut_back(new.k->p, update);
+
+ ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
+ old.k->p, update->k.p) ?:
+ bch2_btree_insert_nonextent(trans, btree_id, update,
+ BTREE_UPDATE_internal_snapshot_node|flags);
+ if (ret)
+ return ret;
+ }
+
+ if (bkey_le(old.k->p, new.k->p)) {
+ update = bch2_trans_kmalloc(trans, sizeof(*update));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ return ret;
+
+ bkey_init(&update->k);
+ update->k.p = old.k->p;
+ update->k.p.snapshot = new.k->p.snapshot;
+
+ if (new.k->p.snapshot != old.k->p.snapshot) {
+ update->k.type = KEY_TYPE_whiteout;
+ } else if (btree_type_has_snapshots(btree_id)) {
+ ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ update->k.type = KEY_TYPE_whiteout;
+ }
+
+ ret = bch2_btree_insert_nonextent(trans, btree_id, update,
+ BTREE_UPDATE_internal_snapshot_node|flags);
+ if (ret)
+ return ret;
+ }
+
+ if (back_split) {
+ update = bch2_bkey_make_mut_noupdate(trans, old);
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ return ret;
+
+ bch2_cut_front(new.k->p, update);
+
+ ret = bch2_trans_update_by_path(trans, iter->path, update,
+ BTREE_UPDATE_internal_snapshot_node|
+ flags, _RET_IP_);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
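+/*
+ * Extent updates: walk existing extents overlapping the new extent, splitting
+ * and whiting out the overlapped portions (and attempting front/back merges),
+ * then insert the new key as a non-extent update:
+ */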
+static int bch2_trans_update_extent(struct btree_trans *trans,
+ struct btree_iter *orig_iter,
+ struct bkey_i *insert,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ enum btree_id btree_id = orig_iter->btree_id;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k),
+ BTREE_ITER_intent|
+ BTREE_ITER_with_updates|
+ BTREE_ITER_not_extents);
+ k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
+ if ((ret = bkey_err(k)))
+ goto err;
+ if (!k.k)
+ goto out;
+
+ if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) {
+ if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
+ ret = extent_front_merge(trans, &iter, k, &insert, flags);
+ if (ret)
+ goto err;
+ }
+
+ goto next;
+ }
+
+ while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) {
+ bool done = bkey_lt(insert->k.p, k.k->p);
+
+ ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
+ if (ret)
+ goto err;
+
+ if (done)
+ goto out;
+next:
+ bch2_btree_iter_advance(&iter);
+ k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
+ if ((ret = bkey_err(k)))
+ goto err;
+ if (!k.k)
+ goto out;
+ }
+
+ if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
+ ret = extent_back_merge(trans, &iter, insert, k);
+ if (ret)
+ goto err;
+ }
+out:
+ if (!bkey_deleted(&insert->k))
+ ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
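+/*
+ * When inserting a new key via the key cache, the key doesn't yet exist in the
+ * btree: also insert it directly into the btree, since a cached key must exist
+ * in the btree for coherency:
+ */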
+static noinline int flush_new_cached_update(struct btree_trans *trans,
+ struct btree_insert_entry *i,
+ enum btree_iter_update_trigger_flags flags,
+ unsigned long ip)
+{
+ struct bkey k;
+ int ret;
+
+ btree_path_idx_t path_idx =
+ bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0,
+ BTREE_ITER_intent, _THIS_IP_);
+ ret = bch2_btree_path_traverse(trans, path_idx, 0);
+ if (ret)
+ goto out;
+
+ struct btree_path *btree_path = trans->paths + path_idx;
+
+ /*
+ * The old key in the insert entry might actually refer to an existing
+ * key in the btree that has been deleted from cache and not yet
+ * flushed. Check for this and skip the flush so we don't run triggers
+ * against a stale key.
+ */
+ bch2_btree_path_peek_slot_exact(btree_path, &k);
+ if (!bkey_deleted(&k))
+ goto out;
+
+ i->key_cache_already_flushed = true;
+ i->flags |= BTREE_TRIGGER_norun;
+
+ btree_path_set_should_be_locked(trans, btree_path);
+ ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip);
+out:
+ bch2_path_put(trans, path_idx, true);
+ return ret;
+}
+
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
+ struct bkey_i *k, enum btree_iter_update_trigger_flags flags,
+ unsigned long ip)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i, n;
+ int cmp;
+
+ struct btree_path *path = trans->paths + path_idx;
+ EBUG_ON(!path->should_be_locked);
+ EBUG_ON(trans->nr_updates >= trans->nr_paths);
+ EBUG_ON(!bpos_eq(k->k.p, path->pos));
+
+ n = (struct btree_insert_entry) {
+ .flags = flags,
+ .bkey_type = __btree_node_type(path->level, path->btree_id),
+ .btree_id = path->btree_id,
+ .level = path->level,
+ .cached = path->cached,
+ .path = path_idx,
+ .k = k,
+ .ip_allocated = ip,
+ };
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ trans_for_each_update(trans, i)
+ BUG_ON(i != trans->updates &&
+ btree_insert_entry_cmp(i - 1, i) >= 0);
+#endif
+
+ /*
+ * Pending updates are kept sorted: first, find position of new update,
+ * then delete/trim any updates the new update overwrites:
+ */
+ for (i = trans->updates; i < trans->updates + trans->nr_updates; i++) {
+ cmp = btree_insert_entry_cmp(&n, i);
+ if (cmp <= 0)
+ break;
+ }
+
+ bool overwrite = !cmp && i < trans->updates + trans->nr_updates;
+
+ if (overwrite) {
+ EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
+
+ bch2_path_put(trans, i->path, true);
+ i->flags = n.flags;
+ i->cached = n.cached;
+ i->k = n.k;
+ i->path = n.path;
+ i->ip_allocated = n.ip_allocated;
+ } else {
+ array_insert_item(trans->updates, trans->nr_updates,
+ i - trans->updates, n);
+
+ i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v;
+ i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
+
+ if (unlikely(trans->journal_replay_not_finished)) {
+ struct bkey_i *j_k =
+ bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p);
+
+ if (j_k) {
+ i->old_k = j_k->k;
+ i->old_v = &j_k->v;
+ }
+ }
+ }
+
+ __btree_path_get(trans, trans->paths + i->path, true);
+
+ trace_update_by_path(trans, path, i, overwrite);
+
+ /*
+ * If a key is present in the key cache, it must also exist in the
+ * btree - this is necessary for cache coherency. When iterating over
+ * a btree that's cached in the key cache, the btree iter code checks
+ * the key cache - but the key has to exist in the btree for that to
+ * work:
+ */
+ if (path->cached && !i->old_btree_u64s)
+ return flush_new_cached_update(trans, i, flags, ip);
+
+ return 0;
+}
+
+static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct btree_path *path)
+{
+ struct btree_path *key_cache_path = btree_iter_key_cache_path(trans, iter);
+
+ if (!key_cache_path ||
+ !key_cache_path->should_be_locked ||
+ !bpos_eq(key_cache_path->pos, iter->pos)) {
+ struct bkey_cached *ck;
+ int ret;
+
+ if (!iter->key_cache_path)
+ iter->key_cache_path =
+ bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+ BTREE_ITER_intent|
+ BTREE_ITER_cached, _THIS_IP_);
+
+ iter->key_cache_path =
+ bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
+ iter->flags & BTREE_ITER_intent,
+ _THIS_IP_);
+
+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_cached);
+ if (unlikely(ret))
+ return ret;
+
+ ck = (void *) trans->paths[iter->key_cache_path].l[0].b;
+
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
+ }
+
+ btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
+ }
+
+ return 0;
+}
+
+int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
+{
+ btree_path_idx_t path_idx = iter->update_path ?: iter->path;
+ int ret;
+
+ if (iter->flags & BTREE_ITER_is_extents)
+ return bch2_trans_update_extent(trans, iter, k, flags);
+
+ if (bkey_deleted(&k->k) &&
+ !(flags & BTREE_UPDATE_key_cache_reclaim) &&
+ (iter->flags & BTREE_ITER_filter_snapshots)) {
+ ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (ret)
+ k->k.type = KEY_TYPE_whiteout;
+ }
+
+ /*
+ * Ensure that updates to cached btrees go to the key cache:
+ */
+ struct btree_path *path = trans->paths + path_idx;
+ if (!(flags & BTREE_UPDATE_key_cache_reclaim) &&
+ !path->cached &&
+ !path->level &&
+ btree_id_cached(trans->c, path->btree_id)) {
+ ret = bch2_trans_update_get_key_cache(trans, iter, path);
+ if (ret)
+ return ret;
+
+ path_idx = iter->key_cache_path;
+ }
+
+ return bch2_trans_update_by_path(trans, path_idx, k, flags, _RET_IP_);
+}
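For orientation, a minimal caller-side sketch of the non-extent path (editor's illustration, not part of the patch; it simply mirrors bch2_btree_insert_trans() further down): take an intent iterator inside an open btree_trans, queue the update, and let the enclosing transaction commit apply it.

static int example_queue_update(struct btree_trans *trans,
                                enum btree_id btree, struct bkey_i *k)
{
        struct btree_iter iter;

        /* Intent iterator at the key's start position: */
        bch2_trans_iter_init(trans, &iter, btree, bkey_start_pos(&k->k),
                             BTREE_ITER_intent);

        /* Traverse, then queue; nothing reaches the btree until commit: */
        int ret = bch2_btree_iter_traverse(&iter) ?:
                  bch2_trans_update(trans, &iter, k, 0);

        bch2_trans_iter_exit(trans, &iter);
        return ret;
}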
+
+int bch2_btree_insert_clone_trans(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bkey_i *k)
+{
+ struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k));
+ int ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bkey_copy(n, k);
+ return bch2_btree_insert_trans(trans, btree, n, 0);
+}
+
+struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
+{
+ unsigned new_top = trans->journal_entries_u64s + u64s;
+ unsigned old_size = trans->journal_entries_size;
+
+ if (new_top > trans->journal_entries_size) {
+ trans->journal_entries_size = roundup_pow_of_two(new_top);
+
+ btree_trans_stats(trans)->journal_entries_size = trans->journal_entries_size;
+ }
+
+ struct jset_entry *n =
+ bch2_trans_kmalloc_nomemzero(trans,
+ trans->journal_entries_size * sizeof(u64));
+ if (IS_ERR(n))
+ return ERR_CAST(n);
+
+ if (trans->journal_entries)
+ memcpy(n, trans->journal_entries, old_size * sizeof(u64));
+ trans->journal_entries = n;
+
+ struct jset_entry *e = btree_trans_journal_entries_top(trans);
+ trans->journal_entries_u64s = new_top;
+ return e;
+}
+
+int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
+ enum btree_id btree, struct bpos end)
+{
+ bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_intent);
+ struct bkey_s_c k = bch2_btree_iter_peek_prev(iter);
+ int ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_advance(iter);
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ BUG_ON(k.k->type != KEY_TYPE_deleted);
+
+ if (bkey_gt(k.k->p, end)) {
+ ret = -BCH_ERR_ENOSPC_btree_slot;
+ goto err;
+ }
+
+ return 0;
+err:
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
+}
+
+void bch2_trans_commit_hook(struct btree_trans *trans,
+ struct btree_trans_commit_hook *h)
+{
+ h->next = trans->hooks;
+ trans->hooks = h;
+}
+
+int bch2_btree_insert_nonextent(struct btree_trans *trans,
+ enum btree_id btree, struct bkey_i *k,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, btree, k->k.p,
+ BTREE_ITER_cached|
+ BTREE_ITER_not_extents|
+ BTREE_ITER_intent);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, k, flags);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
+ struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
+{
+ struct btree_iter iter;
+ bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
+ BTREE_ITER_intent|flags);
+ int ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, k, flags);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/**
+ * bch2_btree_insert - insert a single key into the given btree
+ * @c: pointer to struct bch_fs
+ * @id: btree to insert into
+ * @k: key to insert
+ * @disk_res: must be non-NULL whenever inserting or potentially
+ * splitting data extents
+ * @flags: transaction commit flags
+ * @iter_flags: btree iter update trigger flags
+ *
+ * Returns: 0 on success, error code on failure
+ */
+int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k,
+ struct disk_reservation *disk_res, int flags,
+ enum btree_iter_update_trigger_flags iter_flags)
+{
+ return bch2_trans_commit_do(c, disk_res, NULL, flags,
+ bch2_btree_insert_trans(trans, id, k, iter_flags));
+}
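A hedged usage sketch (editor's note; c, btree and k are assumed to already be in scope): a one-shot insert that opens its own transaction and commits with ENOSPC checking disabled.

        /* NULL disk_res is only OK when not inserting/splitting data extents: */
        int ret = bch2_btree_insert(c, btree, k, NULL,
                                    BCH_TRANS_COMMIT_no_enospc, 0);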
+
+int bch2_btree_delete_at(struct btree_trans *trans,
+ struct btree_iter *iter, unsigned update_flags)
+{
+ struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
+ int ret = PTR_ERR_OR_ZERO(k);
+ if (ret)
+ return ret;
+
+ bkey_init(&k->k);
+ k->k.p = iter->pos;
+ return bch2_trans_update(trans, iter, k, update_flags);
+}
+
+int bch2_btree_delete(struct btree_trans *trans,
+ enum btree_id btree, struct bpos pos,
+ unsigned update_flags)
+{
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, btree, pos,
+ BTREE_ITER_cached|
+ BTREE_ITER_intent);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_delete_at(trans, &iter, update_flags);
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
+ struct bpos start, struct bpos end,
+ unsigned update_flags,
+ u64 *journal_seq)
+{
+ u32 restart_count = trans->restart_count;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent);
+ while ((k = bch2_btree_iter_peek_max(&iter, end)).k) {
+ struct disk_reservation disk_res =
+ bch2_disk_reservation_init(trans->c, 0);
+ struct bkey_i delete;
+
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ bkey_init(&delete.k);
+
+ /*
+ * This could probably be more efficient for extents:
+ */
+
+ /*
+ * For extents, iter.pos won't necessarily be the same as
+ * bkey_start_pos(k.k) (for non-extents they will always be the
+ * same). It's important that we delete starting from iter.pos
+ * because the range we want to delete could start in the middle
+ * of k.
+ *
+ * (bch2_btree_iter_peek() does guarantee that iter.pos >=
+ * bkey_start_pos(k.k)).
+ */
+ delete.k.p = iter.pos;
+
+ if (iter.flags & BTREE_ITER_is_extents)
+ bch2_key_resize(&delete.k,
+ bpos_min(end, k.k->p).offset -
+ iter.pos.offset);
+
+ ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
+ bch2_trans_commit(trans, &disk_res, journal_seq,
+ BCH_TRANS_COMMIT_no_enospc);
+ bch2_disk_reservation_put(trans->c, &disk_res);
+err:
+ /*
+ * the bch2_trans_begin() call is in a weird place because we
+ * need to call it after every transaction commit, to avoid path
+ * overflow, but don't want to call it if the delete operation
+ * is a no-op and we have no work to do:
+ */
+ bch2_trans_begin(trans);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret ?: trans_was_restarted(trans, restart_count);
+}
+
+/*
+ * bch2_btree_delete_range - delete everything within a given range
+ *
+ * Range is a half open interval - [start, end)
+ */
+int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
+ struct bpos start, struct bpos end,
+ unsigned update_flags,
+ u64 *journal_seq)
+{
+ int ret = bch2_trans_run(c,
+ bch2_btree_delete_range_trans(trans, id, start, end,
+ update_flags, journal_seq));
+ if (ret == -BCH_ERR_transaction_restart_nested)
+ ret = 0;
+ return ret;
+}
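A small usage sketch (editor's illustration; BTREE_ID_extents, inum and the sector bounds are assumptions used only for the example): deleting a half-open range, with the loop above committing in chunks as it goes.

        /* Drop every extent key for one inode in [start, end): */
        int ret = bch2_btree_delete_range(c, BTREE_ID_extents,
                                          POS(inum, start_sector),
                                          POS(inum, end_sector),
                                          0, NULL);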
+
+int bch2_btree_bit_mod_iter(struct btree_trans *trans, struct btree_iter *iter, bool set)
+{
+ struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
+ int ret = PTR_ERR_OR_ZERO(k);
+ if (ret)
+ return ret;
+
+ bkey_init(&k->k);
+ k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
+ k->k.p = iter->pos;
+ if (iter->flags & BTREE_ITER_is_extents)
+ bch2_key_resize(&k->k, 1);
+
+ return bch2_trans_update(trans, iter, k, 0);
+}
+
+int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
+ struct bpos pos, bool set)
+{
+ struct btree_iter iter;
+ bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
+
+ int ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_bit_mod_iter(trans, &iter, set);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree,
+ struct bpos pos, bool set)
+{
+ struct bkey_i k;
+
+ bkey_init(&k.k);
+ k.k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
+ k.k.p = pos;
+
+ return bch2_trans_update_buffered(trans, btree, &k);
+}
+
+static int __bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf, unsigned u64s)
+{
+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s));
+ int ret = PTR_ERR_OR_ZERO(e);
+ if (ret)
+ return ret;
+
+ struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry);
+ journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s);
+ memcpy(l->d, buf->buf, buf->pos);
+ return 0;
+}
+
+__printf(3, 0)
+static int
+__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
+ va_list args)
+{
+ struct printbuf buf = PRINTBUF;
+ prt_vprintf(&buf, fmt, args);
+
+ unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
+ prt_chars(&buf, '\0', u64s * sizeof(u64) - buf.pos);
+
+ int ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
+ if (ret)
+ goto err;
+
+ if (!test_bit(JOURNAL_running, &c->journal.flags)) {
+ ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s));
+ if (ret)
+ goto err;
+
+ struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries);
+ journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s);
+ memcpy(l->d, buf.buf, buf.pos);
+ c->journal.early_journal_entries.nr += jset_u64s(u64s);
+ } else {
+ ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags,
+ __bch2_trans_log_msg(trans, &buf, u64s));
+ }
+err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+__printf(2, 3)
+int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
+{
+ va_list args;
+ int ret;
+
+ va_start(args, fmt);
+ ret = __bch2_fs_log_msg(c, 0, fmt, args);
+ va_end(args);
+ return ret;
+}
+
+/*
+ * Use for logging messages during recovery: uses reserved journal space
+ * (BCH_WATERMARK_reclaim) to avoid blocking.
+ */
+__printf(2, 3)
+int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...)
+{
+ va_list args;
+ int ret;
+
+ va_start(args, fmt);
+ ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args);
+ va_end(args);
+ return ret;
+}
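A hedged example of the two logging entry points above (editor's note; the format strings and variables are illustrative only): bch2_fs_log_msg() for normal operation, bch2_journal_log_msg() when running in recovery/journal-reclaim context.

        bch2_fs_log_msg(c, "rebalance moved %llu sectors", sectors_moved);
        bch2_journal_log_msg(c, "starting recovery pass %u", pass_nr);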
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index 5e47d4cd..58df2019 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_UPDATE_H
#define _BCACHEFS_BTREE_UPDATE_H
@@ -6,165 +7,361 @@
struct bch_fs;
struct btree;
-struct btree_insert;
-
-void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *,
- struct btree_iter *);
-bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
- struct btree_node_iter *, struct bkey_i *);
-void bch2_btree_journal_key(struct btree_insert *trans, struct btree_iter *,
- struct bkey_i *);
-
-/* Normal update interface: */
-
-struct btree_insert {
- struct bch_fs *c;
- struct disk_reservation *disk_res;
- struct journal_res journal_res;
- u64 *journal_seq;
- struct extent_insert_hook *hook;
- unsigned flags;
- bool did_work;
-
- unsigned short nr;
- struct btree_insert_entry *entries;
+
+void bch2_btree_node_prep_for_write(struct btree_trans *,
+ struct btree_path *, struct btree *);
+bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
+ struct btree *, struct btree_node_iter *,
+ struct bkey_i *);
+
+int bch2_btree_node_flush0(struct journal *, struct journal_entry_pin *, u64);
+int bch2_btree_node_flush1(struct journal *, struct journal_entry_pin *, u64);
+void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
+
+void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
+ struct bkey_i *, u64);
+
+#define BCH_TRANS_COMMIT_FLAGS() \
+ x(no_enospc, "don't check for enospc") \
+ x(no_check_rw, "don't attempt to take a ref on c->writes") \
+ x(no_journal_res, "don't take a journal reservation, instead " \
+ "pin journal entry referred to by trans->journal_res.seq") \
+ x(journal_reclaim, "operation required for journal reclaim; may return error " \
+ "instead of deadlocking if BCH_WATERMARK_reclaim not specified")\
+ x(skip_accounting_apply, "we're in journal replay - accounting updates have already been applied")
+
+enum __bch_trans_commit_flags {
+ /* First bits for bch_watermark: */
+ __BCH_TRANS_COMMIT_FLAGS_START = BCH_WATERMARK_BITS,
+#define x(n, ...) __BCH_TRANS_COMMIT_##n,
+ BCH_TRANS_COMMIT_FLAGS()
+#undef x
+};
+
+enum bch_trans_commit_flags {
+#define x(n, ...) BCH_TRANS_COMMIT_##n = BIT(__BCH_TRANS_COMMIT_##n),
+ BCH_TRANS_COMMIT_FLAGS()
+#undef x
};
-int __bch2_btree_insert_at(struct btree_insert *);
+void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags);
+
+int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
+int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned);
-#define _TENTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, N, ...) N
-#define COUNT_ARGS(...) _TENTH_ARG(__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1)
+int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
+ struct bkey_i *, enum btree_iter_update_trigger_flags);
-#define BTREE_INSERT_ENTRY(_iter, _k) \
- ((struct btree_insert_entry) { \
- .iter = (_iter), \
- .k = (_k), \
- .done = false, \
- })
+int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *,
+ enum btree_iter_update_trigger_flags);
+int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct
+ disk_reservation *, int flags, enum
+ btree_iter_update_trigger_flags iter_flags);
-#define BTREE_INSERT_ENTRY_EXTRA_RES(_iter, _k, _extra) \
- ((struct btree_insert_entry) { \
- .iter = (_iter), \
- .k = (_k), \
- .extra_res = (_extra), \
- .done = false, \
- })
+int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
+ struct bpos, struct bpos, unsigned, u64 *);
+int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
+ struct bpos, struct bpos, unsigned, u64 *);
+
+int bch2_btree_bit_mod_iter(struct btree_trans *, struct btree_iter *, bool);
+int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool);
+int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool);
+
+static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans,
+ enum btree_id btree, struct bpos pos)
+{
+ return bch2_btree_bit_mod_buffered(trans, btree, pos, false);
+}
+
+int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,
+ struct bpos, struct bpos);
+
+/*
+ * For use when splitting extents in existing snapshots:
+ *
+ * If @old_pos is an interior snapshot node, iterate over descendant snapshot
+ * nodes: for every descendant snapshot in which @old_pos is overwritten and
+ * not visible, emit a whiteout at @new_pos.
+ */
+static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bpos old_pos,
+ struct bpos new_pos)
+{
+ if (!btree_type_has_snapshots(btree) ||
+ bkey_eq(old_pos, new_pos))
+ return 0;
+
+ return __bch2_insert_snapshot_whiteouts(trans, btree, old_pos, new_pos);
+}
+
+int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *,
+ enum btree_iter_update_trigger_flags,
+ struct bkey_s_c, struct bkey_s_c);
+
+int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *,
+ enum btree_id, struct bpos);
+
+int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, enum btree_iter_update_trigger_flags);
+
+struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned);
+
+static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans)
+{
+ return (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+}
+
+static inline struct jset_entry *
+bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
+{
+ if (!trans->journal_entries ||
+ trans->journal_entries_u64s + u64s > trans->journal_entries_size)
+ return __bch2_trans_jset_entry_alloc(trans, u64s);
+
+ struct jset_entry *e = btree_trans_journal_entries_top(trans);
+ trans->journal_entries_u64s += u64s;
+ return e;
+}
+
+int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *);
+
+static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bkey_i *k)
+{
+ /*
+ * Most updates skip the btree write buffer until journal replay is
+ * finished because synchronization with journal replay relies on having
+ * a btree node locked - if we're overwriting a key in the journal that
+ * journal replay hasn't yet replayed, we have to mark it as
+ * overwritten.
+ *
+ * But accounting updates don't overwrite, they're deltas, and they have
+ * to be flushed to the btree strictly in order for journal replay to be
+ * able to tell which updates need to be applied:
+ */
+ if (k->k.type != KEY_TYPE_accounting &&
+ unlikely(trans->journal_replay_not_finished))
+ return bch2_btree_insert_clone_trans(trans, btree, k);
+
+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s));
+ int ret = PTR_ERR_OR_ZERO(e);
+ if (ret)
+ return ret;
+
+ journal_entry_init(e, BCH_JSET_ENTRY_write_buffer_keys, btree, 0, k->k.u64s);
+ bkey_copy(e->start, k);
+ return 0;
+}
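For orientation (editor's sketch, mirroring bch2_btree_bit_mod_buffered() on the .c side): a small key can be routed through the btree write buffer instead of the regular update path.

        struct bkey_i k;

        bkey_init(&k.k);
        k.k.type = KEY_TYPE_set;
        k.k.p    = pos;

        /* Queued as a BCH_JSET_ENTRY_write_buffer_keys journal entry: */
        int ret = bch2_trans_update_buffered(trans, btree, &k);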
+
+void bch2_trans_commit_hook(struct btree_trans *,
+ struct btree_trans_commit_hook *);
+int __bch2_trans_commit(struct btree_trans *, unsigned);
+
+__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
+__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
/**
- * bch_btree_insert_at - insert one or more keys at iterator positions
- * @iter: btree iterator
- * @insert_key: key to insert
- * @disk_res: disk reservation
- * @hook: extent insert callback
+ * bch2_trans_commit - insert keys at given iterator positions
+ *
+ * This is the main entry point for btree updates.
*
* Return values:
- * -EINTR: locking changed, this function should be called again. Only returned
- * if passed BTREE_INSERT_ATOMIC.
* -EROFS: filesystem read only
* -EIO: journal or btree node IO error
*/
-#define bch2_btree_insert_at(_c, _disk_res, _hook, \
- _journal_seq, _flags, ...) \
- __bch2_btree_insert_at(&(struct btree_insert) { \
- .c = (_c), \
- .disk_res = (_disk_res), \
- .journal_seq = (_journal_seq), \
- .hook = (_hook), \
- .flags = (_flags), \
- .nr = COUNT_ARGS(__VA_ARGS__), \
- .entries = (struct btree_insert_entry[]) { \
- __VA_ARGS__ \
- }})
-
-enum {
- __BTREE_INSERT_ATOMIC,
- __BTREE_INSERT_NOUNLOCK,
- __BTREE_INSERT_NOFAIL,
- __BTREE_INSERT_USE_RESERVE,
- __BTREE_INSERT_USE_ALLOC_RESERVE,
- __BTREE_INSERT_JOURNAL_REPLAY,
- __BTREE_INSERT_NOWAIT,
- __BTREE_INSERT_GC_LOCK_HELD,
- __BCH_HASH_SET_MUST_CREATE,
- __BCH_HASH_SET_MUST_REPLACE,
-};
+static inline int bch2_trans_commit(struct btree_trans *trans,
+ struct disk_reservation *disk_res,
+ u64 *journal_seq,
+ unsigned flags)
+{
+ trans->disk_res = disk_res;
+ trans->journal_seq = journal_seq;
-/*
- * Don't drop/retake locks before doing btree update, instead return -EINTR if
- * we had to drop locks for any reason
- */
-#define BTREE_INSERT_ATOMIC (1 << __BTREE_INSERT_ATOMIC)
+ return __bch2_trans_commit(trans, flags);
+}
-/*
- * Don't drop locks _after_ successfully updating btree:
- */
-#define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK)
+#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \
+ lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
+ (_journal_seq), (_flags)))
-/* Don't check for -ENOSPC: */
-#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL)
+#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \
+ nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
+ (_journal_seq), (_flags)))
-/* for copygc, or when merging btree nodes */
-#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE)
-#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE)
+#define bch2_trans_commit_do(_c, _disk_res, _journal_seq, _flags, _do) \
+ bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do))
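A hedged usage sketch of the commit helpers just defined (editor's illustration; c, btree and pos are assumed to be in scope): the _do expression runs inside a transaction-restart loop, with the commit chained on via ?:.

        int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                                       bch2_btree_delete(trans, btree, pos, 0));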
-/*
- * Insert is for journal replay: don't get journal reservations, or mark extents
- * (bch_mark_key)
- */
-#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY)
+#define trans_for_each_update(_trans, _i) \
+ for (struct btree_insert_entry *_i = (_trans)->updates; \
+ (_i) < (_trans)->updates + (_trans)->nr_updates; \
+ (_i)++)
-/* Don't block on allocation failure (for new btree nodes: */
-#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT)
-#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD)
+static inline void bch2_trans_reset_updates(struct btree_trans *trans)
+{
+ trans_for_each_update(trans, i)
+ bch2_path_put(trans, i->path, true);
-#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE)
-#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE)
+ trans->nr_updates = 0;
+ trans->journal_entries_u64s = 0;
+ trans->hooks = NULL;
+ trans->extra_disk_res = 0;
+}
-int bch2_btree_delete_at(struct btree_iter *, unsigned);
+static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k,
+ unsigned type, unsigned min_bytes)
+{
+ unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k));
+ struct bkey_i *mut;
-int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *,
- struct disk_reservation *,
- struct extent_insert_hook *, u64 *, unsigned);
+ if (type && k.k->type != type)
+ return ERR_PTR(-ENOENT);
-int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
- struct disk_reservation *,
- struct extent_insert_hook *, u64 *, int flags);
+ /* extra padding for varint_decode_fast... */
+ mut = bch2_trans_kmalloc_nomemzero(trans, bytes + 8);
+ if (!IS_ERR(mut)) {
+ bkey_reassemble(mut, k);
-int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
- struct bpos, struct bpos, struct bversion,
- struct disk_reservation *,
- struct extent_insert_hook *, u64 *);
-
-int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
- __le64, unsigned);
-int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
- struct btree *, struct bkey_i_extent *);
-
-/* new transactional interface: */
-
-void bch2_trans_update(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, unsigned);
-int bch2_trans_commit(struct btree_trans *,
- struct disk_reservation *,
- struct extent_insert_hook *,
- u64 *, unsigned);
-
-#define bch2_trans_do(_c, _journal_seq, _flags, _do) \
-({ \
- struct btree_trans trans; \
- int _ret; \
- \
- bch2_trans_init(&trans, (_c)); \
- \
- do { \
- bch2_trans_begin(&trans); \
- \
- _ret = (_do) ?: bch2_trans_commit(&trans, NULL, NULL, \
- (_journal_seq), (_flags)); \
- } while (_ret == -EINTR); \
- \
- bch2_trans_exit(&trans); \
- _ret; \
-})
+ if (unlikely(bytes > bkey_bytes(k.k))) {
+ memset((void *) mut + bkey_bytes(k.k), 0,
+ bytes - bkey_bytes(k.k));
+ mut->k.u64s = DIV_ROUND_UP(bytes, sizeof(u64));
+ }
+ }
+ return mut;
+}
+
+static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k)
+{
+ return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0);
+}
+
+#define bch2_bkey_make_mut_noupdate_typed(_trans, _k, _type) \
+ bkey_i_to_##_type(__bch2_bkey_make_mut_noupdate(_trans, _k, \
+ KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
+
+static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c *k,
+ enum btree_iter_update_trigger_flags flags,
+ unsigned type, unsigned min_bytes)
+{
+ struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes);
+ int ret;
+
+ if (IS_ERR(mut))
+ return mut;
+
+ ret = bch2_trans_update(trans, iter, mut, flags);
+ if (ret)
+ return ERR_PTR(ret);
+
+ *k = bkey_i_to_s_c(mut);
+ return mut;
+}
+
+static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans,
+ struct btree_iter *iter, struct bkey_s_c *k,
+ enum btree_iter_update_trigger_flags flags)
+{
+ return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0);
+}
+
+#define bch2_bkey_make_mut_typed(_trans, _iter, _k, _flags, _type) \
+ bkey_i_to_##_type(__bch2_bkey_make_mut(_trans, _iter, _k, _flags,\
+ KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
+
+static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ enum btree_iter_update_trigger_flags flags,
+ unsigned type, unsigned min_bytes)
+{
+ struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter,
+ btree_id, pos, flags|BTREE_ITER_intent, type);
+ struct bkey_i *ret = IS_ERR(k.k)
+ ? ERR_CAST(k.k)
+ : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes);
+ if (IS_ERR(ret))
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
+}
+
+static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ enum btree_iter_update_trigger_flags flags)
+{
+ return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0);
+}
+
+static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ enum btree_iter_update_trigger_flags flags,
+ unsigned type, unsigned min_bytes)
+{
+ struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter,
+ btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes);
+ int ret;
+
+ if (IS_ERR(mut))
+ return mut;
+
+ ret = bch2_trans_update(trans, iter, mut, flags);
+ if (ret) {
+ bch2_trans_iter_exit(trans, iter);
+ return ERR_PTR(ret);
+ }
+
+ return mut;
+}
+
+static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ enum btree_iter_update_trigger_flags flags,
+ unsigned min_bytes)
+{
+ return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes);
+}
+
+static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ enum btree_iter_update_trigger_flags flags)
+{
+ return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0);
+}
+
+#define bch2_bkey_get_mut_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\
+ bkey_i_to_##_type(__bch2_bkey_get_mut(_trans, _iter, \
+ _btree_id, _pos, _flags, \
+ KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
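A hedged sketch of the typed lookup-for-update pattern (editor's illustration; the alloc btree and the alloc_v4 key type are assumptions used only as an example): on success the returned key is already queued as an update, so the caller can mutate it in place, and must still exit the iterator.

        struct btree_iter iter;
        struct bkey_i_alloc_v4 *a =
                bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_alloc, pos,
                                        0, alloc_v4);
        int ret = PTR_ERR_OR_ZERO(a);   /* -ENOENT if the key isn't alloc_v4 */
        if (!ret) {
                /* mutate a->v here; the mutated key is what commit will write */
                bch2_trans_iter_exit(trans, &iter);
        }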
+
+static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter,
+ enum btree_iter_update_trigger_flags flags,
+ unsigned type, unsigned val_size)
+{
+ struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size);
+ int ret;
+
+ if (IS_ERR(k))
+ return k;
+
+ bkey_init(&k->k);
+ k->k.p = iter->pos;
+ k->k.type = type;
+ set_bkey_val_bytes(&k->k, val_size);
+
+ ret = bch2_trans_update(trans, iter, k, flags);
+ if (unlikely(ret))
+ return ERR_PTR(ret);
+ return k;
+}
+
+#define bch2_bkey_alloc(_trans, _iter, _flags, _type) \
+ bkey_i_to_##_type(__bch2_bkey_alloc(_trans, _iter, _flags, \
+ KEY_TYPE_##_type, sizeof(struct bch_##_type)))
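And a matching sketch for allocating a brand-new typed key at the iterator position (editor's illustration; the subvolume type is an assumption used only as an example):

        struct bkey_i_subvolume *s = bch2_bkey_alloc(trans, &iter, 0, subvolume);
        int ret = PTR_ERR_OR_ZERO(s);
        if (!ret) {
                /* fill in s->v; __bch2_bkey_alloc() already queued the update */
        }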
#endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 392ee0a0..f2a1d5d3 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -1,87 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "alloc.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_gc.h"
+#include "btree_journal_iter.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "buckets.h"
+#include "clock.h"
+#include "error.h"
#include "extents.h"
+#include "io_write.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
+#include "recovery_passes.h"
#include "replicas.h"
+#include "sb-members.h"
#include "super-io.h"
+#include "trace.h"
#include <linux/random.h>
-#include <trace/events/bcachefs.h>
-static void btree_node_will_make_reachable(struct btree_update *,
- struct btree *);
-static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
-static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
+static const char * const bch2_btree_update_modes[] = {
+#define x(t) #t,
+ BTREE_UPDATE_MODES()
+#undef x
+ NULL
+};
-/* Debug code: */
+static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
+ btree_path_idx_t, struct btree *, struct keylist *);
+static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
-static void btree_node_interior_verify(struct btree *b)
+/*
+ * Verify that child nodes correctly span parent node's range:
+ */
+int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
{
- struct btree_node_iter iter;
- struct bkey_packed *k;
+ struct bch_fs *c = trans->c;
+ struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2
+ ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
+ : b->data->min_key;
+ struct btree_and_journal_iter iter;
+ struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+ struct bkey_buf prev;
+ int ret = 0;
- BUG_ON(!b->level);
+ BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
+ b->data->min_key));
+
+ bch2_bkey_buf_init(&prev);
+ bkey_init(&prev.k->k);
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+
+ if (b == btree_node_root(c, b)) {
+ if (!bpos_eq(b->data->min_key, POS_MIN)) {
+ printbuf_reset(&buf);
+ bch2_bpos_to_text(&buf, b->data->min_key);
+ log_fsck_err(trans, btree_root_bad_min_key,
+ "btree root with incorrect min_key: %s", buf.buf);
+ goto topology_repair;
+ }
- bch2_btree_node_iter_init(&iter, b, b->key.k.p, false, false);
-#if 1
- BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) ||
- bkey_cmp_left_packed(b, k, &b->key.k.p));
+ if (!bpos_eq(b->data->max_key, SPOS_MAX)) {
+ printbuf_reset(&buf);
+ bch2_bpos_to_text(&buf, b->data->max_key);
+ log_fsck_err(trans, btree_root_bad_max_key,
+ "btree root with incorrect max_key: %s", buf.buf);
+ goto topology_repair;
+ }
+ }
- BUG_ON((bch2_btree_node_iter_advance(&iter, b),
- !bch2_btree_node_iter_end(&iter)));
-#else
- const char *msg;
+ if (!b->c.level)
+ goto out;
- msg = "not found";
- k = bch2_btree_node_iter_peek(&iter, b);
- if (!k)
- goto err;
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ if (k.k->type != KEY_TYPE_btree_ptr_v2)
+ goto out;
- msg = "isn't what it should be";
- if (bkey_cmp_left_packed(b, k, &b->key.k.p))
- goto err;
+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
- bch2_btree_node_iter_advance(&iter, b);
+ struct bpos expected_min = bkey_deleted(&prev.k->k)
+ ? node_min
+ : bpos_successor(prev.k->k.p);
- msg = "isn't last key";
- if (!bch2_btree_node_iter_end(&iter))
- goto err;
- return;
-err:
- bch2_dump_btree_node(b);
- printk(KERN_ERR "last key %llu:%llu %s\n", b->key.k.p.inode,
- b->key.k.p.offset, msg);
- BUG();
-#endif
+ if (!bpos_eq(expected_min, bp.v->min_key)) {
+ bch2_topology_error(c);
+
+ printbuf_reset(&buf);
+ prt_str(&buf, "end of prev node doesn't match start of next node\n in ");
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_str(&buf, " node ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, "\n prev ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
+ prt_str(&buf, "\n next ");
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf);
+ goto topology_repair;
+ }
+
+ bch2_bkey_buf_reassemble(&prev, c, k);
+ bch2_btree_and_journal_iter_advance(&iter);
+ }
+
+ if (bkey_deleted(&prev.k->k)) {
+ bch2_topology_error(c);
+
+ printbuf_reset(&buf);
+ prt_str(&buf, "empty interior node\n in ");
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_str(&buf, " node ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf);
+ goto topology_repair;
+ } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
+ bch2_topology_error(c);
+
+ printbuf_reset(&buf);
+ prt_str(&buf, "last child node doesn't end at end of parent node\n in ");
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_str(&buf, " node ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, "\n last key ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
+
+ log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf);
+ goto topology_repair;
+ }
+out:
+fsck_err:
+ bch2_btree_and_journal_iter_exit(&iter);
+ bch2_bkey_buf_exit(&prev, c);
+ printbuf_exit(&buf);
+ return ret;
+topology_repair:
+ ret = bch2_topology_error(c);
+ goto out;
}
/* Calculate ideal packed bkey format for new btree nodes: */
-void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
+static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
{
struct bkey_packed *k;
- struct bset_tree *t;
struct bkey uk;
- bch2_bkey_format_add_pos(s, b->data->min_key);
-
for_each_bset(b, t)
- for (k = btree_bkey_first(b, t);
- k != btree_bkey_last(b, t);
- k = bkey_next(k))
- if (!bkey_whiteout(k)) {
+ bset_tree_for_each_key(b, t, k)
+ if (!bkey_deleted(k)) {
uk = bkey_unpack_key(b, k);
bch2_bkey_format_add_key(s, &uk);
}
@@ -92,141 +170,59 @@ static struct bkey_format bch2_btree_calc_format(struct btree *b)
struct bkey_format_state s;
bch2_bkey_format_init(&s);
+ bch2_bkey_format_add_pos(&s, b->data->min_key);
+ bch2_bkey_format_add_pos(&s, b->data->max_key);
__bch2_btree_calc_format(&s, b);
return bch2_bkey_format_done(&s);
}
-static size_t btree_node_u64s_with_format(struct btree *b,
+static size_t btree_node_u64s_with_format(struct btree_nr_keys nr,
+ struct bkey_format *old_f,
struct bkey_format *new_f)
{
- struct bkey_format *old_f = &b->format;
-
/* stupid integer promotion rules */
ssize_t delta =
(((int) new_f->key_u64s - old_f->key_u64s) *
- (int) b->nr.packed_keys) +
+ (int) nr.packed_keys) +
(((int) new_f->key_u64s - BKEY_U64s) *
- (int) b->nr.unpacked_keys);
+ (int) nr.unpacked_keys);
- BUG_ON(delta + b->nr.live_u64s < 0);
+ BUG_ON(delta + nr.live_u64s < 0);
- return b->nr.live_u64s + delta;
+ return nr.live_u64s + delta;
}
/**
- * btree_node_format_fits - check if we could rewrite node with a new format
+ * bch2_btree_node_format_fits - check if we could rewrite node with a new format
*
- * This assumes all keys can pack with the new format -- it just checks if
- * the re-packed keys would fit inside the node itself.
+ * @c: filesystem handle
+ * @b: btree node to rewrite
+ * @nr: number of keys for new node (i.e. b->nr)
+ * @new_f: bkey format to translate keys to
+ *
+ * Returns: true if all re-packed keys will be able to fit in a new node.
+ *
+ * Assumes all keys will successfully pack with the new format.
*/
-bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
+static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
+ struct btree_nr_keys nr,
struct bkey_format *new_f)
{
- size_t u64s = btree_node_u64s_with_format(b, new_f);
+ size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f);
- return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
+ return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b);
}
/* Btree node freeing/allocation: */
-static bool btree_key_matches(struct bch_fs *c,
- struct bkey_s_c_extent l,
- struct bkey_s_c_extent r)
-{
- const struct bch_extent_ptr *ptr1, *ptr2;
-
- extent_for_each_ptr(l, ptr1)
- extent_for_each_ptr(r, ptr2)
- if (ptr1->dev == ptr2->dev &&
- ptr1->gen == ptr2->gen &&
- ptr1->offset == ptr2->offset)
- return true;
-
- return false;
-}
-
-/*
- * We're doing the index update that makes @b unreachable, update stuff to
- * reflect that:
- *
- * Must be called _before_ btree_update_updated_root() or
- * btree_update_updated_node:
- */
-static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
- struct bkey_s_c k,
- struct bch_fs_usage *stats)
+static void __btree_node_free(struct btree_trans *trans, struct btree *b)
{
- struct bch_fs *c = as->c;
- struct pending_btree_node_free *d;
- unsigned replicas;
-
- /*
- * btree_update lock is only needed here to avoid racing with
- * gc:
- */
- mutex_lock(&c->btree_interior_update_lock);
-
- for (d = as->pending; d < as->pending + as->nr_pending; d++)
- if (!bkey_cmp(k.k->p, d->key.k.p) &&
- btree_key_matches(c, bkey_s_c_to_extent(k),
- bkey_i_to_s_c_extent(&d->key)))
- goto found;
- BUG();
-found:
- BUG_ON(d->index_update_done);
- d->index_update_done = true;
-
- /*
- * Btree nodes are accounted as freed in bch_alloc_stats when they're
- * freed from the index:
- */
- replicas = bch2_extent_nr_dirty_ptrs(k);
- if (replicas)
- stats->s[replicas - 1].data[S_META] -= c->opts.btree_node_size;
-
- /*
- * We're dropping @k from the btree, but it's still live until the
- * index update is persistent so we need to keep a reference around for
- * mark and sweep to find - that's primarily what the
- * btree_node_pending_free list is for.
- *
- * So here (when we set index_update_done = true), we're moving an
- * existing reference to a different part of the larger "gc keyspace" -
- * and the new position comes after the old position, since GC marks
- * the pending free list after it walks the btree.
- *
- * If we move the reference while mark and sweep is _between_ the old
- * and the new position, mark and sweep will see the reference twice
- * and it'll get double accounted - so check for that here and subtract
- * to cancel out one of mark and sweep's markings if necessary:
- */
-
- /*
- * bch2_mark_key() compares the current gc pos to the pos we're
- * moving this reference from, hence one comparison here:
- */
- if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
- struct bch_fs_usage tmp = { 0 };
-
- bch2_mark_key(c, bkey_i_to_s_c(&d->key),
- -c->opts.btree_node_size, true, b
- ? gc_pos_btree_node(b)
- : gc_pos_btree_root(as->btree_id),
- &tmp, 0, 0);
- /*
- * Don't apply tmp - pending deletes aren't tracked in
- * bch_alloc_stats:
- */
- }
+ struct bch_fs *c = trans->c;
- mutex_unlock(&c->btree_interior_update_lock);
-}
-
-static void __btree_node_free(struct bch_fs *c, struct btree *b)
-{
- trace_btree_node_free(c, b);
+ trace_and_count(c, btree_node_free, trans, b);
+ BUG_ON(btree_node_write_blocked(b));
BUG_ON(btree_node_dirty(b));
BUG_ON(btree_node_need_write(b));
BUG_ON(b == btree_node_root(c, b));
@@ -235,527 +231,641 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b)
BUG_ON(b->will_make_reachable);
clear_btree_node_noevict(b);
-
- bch2_btree_node_hash_remove(&c->btree_cache, b);
-
- mutex_lock(&c->btree_cache.lock);
- list_move(&b->list, &c->btree_cache.freeable);
- mutex_unlock(&c->btree_cache.lock);
}
-void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
+static void bch2_btree_node_free_inmem(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
{
- struct btree_ob_ref ob = b->ob;
+ struct bch_fs *c = trans->c;
+ unsigned i, level = b->c.level;
- btree_update_drop_new_node(c, b);
+ bch2_btree_node_lock_write_nofail(trans, path, &b->c);
- b->ob.nr = 0;
+ __btree_node_free(trans, b);
- clear_btree_node_dirty(b);
+ mutex_lock(&c->btree_cache.lock);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+ mutex_unlock(&c->btree_cache.lock);
- btree_node_lock_type(c, b, SIX_LOCK_write);
- __btree_node_free(c, b);
- six_unlock_write(&b->lock);
+ six_unlock_write(&b->c.lock);
+ mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
- bch2_open_bucket_put_refs(c, &ob.nr, ob.refs);
+ trans_for_each_path(trans, path, i)
+ if (path->l[level].b == b) {
+ btree_node_unlock(trans, path, level);
+ path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+ }
}
-void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
- struct btree_iter *iter)
+static void bch2_btree_node_free_never_used(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree *b)
{
- /*
- * Is this a node that isn't reachable on disk yet?
- *
- * Nodes that aren't reachable yet have writes blocked until they're
- * reachable - now that we've cancelled any pending writes and moved
- * things waiting on that write to wait on this update, we can drop this
- * node from the list of nodes that the other update is making
- * reachable, prior to freeing it:
- */
- btree_update_drop_new_node(c, b);
+ struct bch_fs *c = as->c;
+ struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
+ struct btree_path *path;
+ unsigned i, level = b->c.level;
- __bch2_btree_node_lock_write(b, iter);
- __btree_node_free(c, b);
- six_unlock_write(&b->lock);
+ BUG_ON(!list_empty(&b->write_blocked));
+ BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));
- bch2_btree_iter_node_drop(iter, b);
-}
+ b->will_make_reachable = 0;
+ closure_put(&as->cl);
-static void bch2_btree_node_free_ondisk(struct bch_fs *c,
- struct pending_btree_node_free *pending)
-{
- struct bch_fs_usage stats = { 0 };
+ clear_btree_node_will_make_reachable(b);
+ clear_btree_node_accessed(b);
+ clear_btree_node_dirty_acct(c, b);
+ clear_btree_node_need_write(b);
- BUG_ON(!pending->index_update_done);
+ mutex_lock(&c->btree_cache.lock);
+ __bch2_btree_node_hash_remove(&c->btree_cache, b);
+ mutex_unlock(&c->btree_cache.lock);
- bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
- -c->opts.btree_node_size, true,
- gc_phase(GC_PHASE_PENDING_DELETE),
- &stats, 0, 0);
- /*
- * Don't apply stats - pending deletes aren't tracked in
- * bch_alloc_stats:
- */
-}
+ BUG_ON(p->nr >= ARRAY_SIZE(p->b));
+ p->b[p->nr++] = b;
-void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *b)
-{
- bch2_open_bucket_put_refs(c, &b->ob.nr, b->ob.refs);
+ six_unlock_intent(&b->c.lock);
+
+ trans_for_each_path(trans, path, i)
+ if (path->l[level].b == b) {
+ btree_node_unlock(trans, path, level);
+ path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+ }
}
-static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
+static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
struct disk_reservation *res,
struct closure *cl,
+ bool interior_node,
unsigned flags)
{
+ struct bch_fs *c = trans->c;
struct write_point *wp;
struct btree *b;
- BKEY_PADDED(k) tmp;
- struct bkey_i_extent *e;
- struct btree_ob_ref ob;
+ BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+ struct open_buckets obs = { .nr = 0 };
struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
- unsigned nr_reserve;
- enum alloc_reserve alloc_reserve;
-
- if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) {
- nr_reserve = 0;
- alloc_reserve = RESERVE_ALLOC;
- } else if (flags & BTREE_INSERT_USE_RESERVE) {
- nr_reserve = BTREE_NODE_RESERVE / 2;
- alloc_reserve = RESERVE_BTREE;
- } else {
- nr_reserve = BTREE_NODE_RESERVE;
- alloc_reserve = RESERVE_NONE;
- }
+ enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
+ unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim
+ ? BTREE_NODE_RESERVE
+ : 0;
+ int ret;
+
+ b = bch2_btree_node_mem_alloc(trans, interior_node);
+ if (IS_ERR(b))
+ return b;
+
+ BUG_ON(b->ob.nr);
mutex_lock(&c->btree_reserve_cache_lock);
if (c->btree_reserve_cache_nr > nr_reserve) {
struct btree_alloc *a =
&c->btree_reserve_cache[--c->btree_reserve_cache_nr];
- ob = a->ob;
+ obs = a->ob;
bkey_copy(&tmp.k, &a->k);
mutex_unlock(&c->btree_reserve_cache_lock);
- goto mem_alloc;
+ goto out;
}
mutex_unlock(&c->btree_reserve_cache_lock);
-
retry:
- wp = bch2_alloc_sectors_start(c, c->opts.foreground_target,
+ ret = bch2_alloc_sectors_start_trans(trans,
+ c->opts.metadata_target ?:
+ c->opts.foreground_target,
+ 0,
writepoint_ptr(&c->btree_write_point),
&devs_have,
res->nr_replicas,
- c->opts.metadata_replicas_required,
- alloc_reserve, 0, cl);
- if (IS_ERR(wp))
- return ERR_CAST(wp);
+ min(res->nr_replicas,
+ c->opts.metadata_replicas_required),
+ watermark, 0, cl, &wp);
+ if (unlikely(ret))
+ goto err;
- if (wp->sectors_free < c->opts.btree_node_size) {
+ if (wp->sectors_free < btree_sectors(c)) {
struct open_bucket *ob;
unsigned i;
- writepoint_for_each_ptr(wp, ob, i)
- if (ob->sectors_free < c->opts.btree_node_size)
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ if (ob->sectors_free < btree_sectors(c))
ob->sectors_free = 0;
bch2_alloc_sectors_done(c, wp);
goto retry;
}
- e = bkey_extent_init(&tmp.k);
- bch2_alloc_sectors_append_ptrs(c, wp, e, c->opts.btree_node_size);
+ bkey_btree_ptr_v2_init(&tmp.k);
+ bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);
- ob.nr = 0;
- bch2_open_bucket_get(c, wp, &ob.nr, ob.refs);
+ bch2_open_bucket_get(c, wp, &obs);
bch2_alloc_sectors_done(c, wp);
-mem_alloc:
- b = bch2_btree_node_mem_alloc(c);
-
- /* we hold cannibalize_lock: */
- BUG_ON(IS_ERR(b));
- BUG_ON(b->ob.nr);
-
+out:
bkey_copy(&b->key, &tmp.k);
- b->ob = ob;
+ b->ob = obs;
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
return b;
+err:
+ bch2_btree_node_to_freelist(c, b);
+ return ERR_PTR(ret);
}
-static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level)
+static struct btree *bch2_btree_node_alloc(struct btree_update *as,
+ struct btree_trans *trans,
+ unsigned level)
{
struct bch_fs *c = as->c;
struct btree *b;
+ struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
+ int ret;
BUG_ON(level >= BTREE_MAX_DEPTH);
- BUG_ON(!as->reserve->nr);
+ BUG_ON(!p->nr);
- b = as->reserve->b[--as->reserve->nr];
+ b = p->b[--p->nr];
- BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id));
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
set_btree_node_accessed(b);
- set_btree_node_dirty(b);
+ set_btree_node_dirty_acct(c, b);
+ set_btree_node_need_write(b);
bch2_bset_init_first(b, &b->data->keys);
+ b->c.level = level;
+ b->c.btree_id = as->btree_id;
+ b->version_ondisk = c->sb.version;
+
memset(&b->nr, 0, sizeof(b->nr));
b->data->magic = cpu_to_le64(bset_magic(c));
+ memset(&b->data->_ptr, 0, sizeof(b->data->_ptr));
b->data->flags = 0;
SET_BTREE_NODE_ID(b->data, as->btree_id);
SET_BTREE_NODE_LEVEL(b->data, level);
- b->data->ptr = bkey_i_to_extent(&b->key)->v.start->ptr;
+
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
+
+ bp->v.mem_ptr = 0;
+ bp->v.seq = b->data->keys.seq;
+ bp->v.sectors_written = 0;
+ }
+
+ SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
bch2_btree_build_aux_trees(b);
- btree_node_will_make_reachable(as, b);
+ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
+ BUG_ON(ret);
- trace_btree_node_alloc(c, b);
+ trace_and_count(c, btree_node_alloc, trans, b);
+ bch2_increment_clock(c, btree_sectors(c), WRITE);
return b;
}
-struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
- struct btree *b,
- struct bkey_format format)
+static void btree_set_min(struct btree *b, struct bpos pos)
{
- struct btree *n;
-
- n = bch2_btree_node_alloc(as, b->level);
-
- n->data->min_key = b->data->min_key;
- n->data->max_key = b->data->max_key;
- n->data->format = format;
- SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
-
- btree_node_set_format(n, format);
-
- bch2_btree_sort_into(as->c, n, b);
-
- btree_node_reset_sib_u64s(n);
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
+ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
+ b->data->min_key = pos;
+}
- n->key.k.p = b->key.k.p;
- return n;
+static void btree_set_max(struct btree *b, struct bpos pos)
+{
+ b->key.k.p = pos;
+ b->data->max_key = pos;
}
static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
+ struct btree_trans *trans,
struct btree *b)
{
- struct bkey_format new_f = bch2_btree_calc_format(b);
+ struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level);
+ struct bkey_format format = bch2_btree_calc_format(b);
/*
* The keys might expand with the new format - if they wouldn't fit in
* the btree node anymore, use the old format for now:
*/
- if (!bch2_btree_node_format_fits(as->c, b, &new_f))
- new_f = b->format;
+ if (!bch2_btree_node_format_fits(as->c, b, b->nr, &format))
+ format = b->format;
+
+ SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
+
+ btree_set_min(n, b->data->min_key);
+ btree_set_max(n, b->data->max_key);
+
+ n->data->format = format;
+ btree_node_set_format(n, format);
+
+ bch2_btree_sort_into(as->c, n, b);
- return __bch2_btree_node_alloc_replacement(as, b, new_f);
+ btree_node_reset_sib_u64s(n);
+ return n;
}
-static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
+static struct btree *__btree_root_alloc(struct btree_update *as,
+ struct btree_trans *trans, unsigned level)
{
- struct btree *b = bch2_btree_node_alloc(as, level);
+ struct btree *b = bch2_btree_node_alloc(as, trans, level);
- b->data->min_key = POS_MIN;
- b->data->max_key = POS_MAX;
+ btree_set_min(b, POS_MIN);
+ btree_set_max(b, SPOS_MAX);
b->data->format = bch2_btree_calc_format(b);
- b->key.k.p = POS_MAX;
btree_node_set_format(b, b->data->format);
bch2_btree_build_aux_trees(b);
- six_unlock_write(&b->lock);
-
return b;
}
-static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reserve)
+static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans)
{
- bch2_disk_reservation_put(c, &reserve->disk_res);
-
- mutex_lock(&c->btree_reserve_cache_lock);
-
- while (reserve->nr) {
- struct btree *b = reserve->b[--reserve->nr];
-
- six_unlock_write(&b->lock);
+ struct bch_fs *c = as->c;
+ struct prealloc_nodes *p;
+
+ for (p = as->prealloc_nodes;
+ p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
+ p++) {
+ while (p->nr) {
+ struct btree *b = p->b[--p->nr];
+
+ mutex_lock(&c->btree_reserve_cache_lock);
+
+ if (c->btree_reserve_cache_nr <
+ ARRAY_SIZE(c->btree_reserve_cache)) {
+ struct btree_alloc *a =
+ &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
+
+ a->ob = b->ob;
+ b->ob.nr = 0;
+ bkey_copy(&a->k, &b->key);
+ } else {
+ bch2_open_buckets_put(c, &b->ob);
+ }
- if (c->btree_reserve_cache_nr <
- ARRAY_SIZE(c->btree_reserve_cache)) {
- struct btree_alloc *a =
- &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
+ mutex_unlock(&c->btree_reserve_cache_lock);
- a->ob = b->ob;
- b->ob.nr = 0;
- bkey_copy(&a->k, &b->key);
- } else {
- bch2_btree_open_bucket_put(c, b);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
+ __btree_node_free(trans, b);
+ bch2_btree_node_to_freelist(c, b);
}
-
- btree_node_lock_type(c, b, SIX_LOCK_write);
- __btree_node_free(c, b);
- six_unlock_write(&b->lock);
-
- six_unlock_intent(&b->lock);
}
-
- mutex_unlock(&c->btree_reserve_cache_lock);
-
- mempool_free(reserve, &c->btree_reserve_pool);
}
-static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
- unsigned nr_nodes,
- unsigned flags,
- struct closure *cl)
+static int bch2_btree_reserve_get(struct btree_trans *trans,
+ struct btree_update *as,
+ unsigned nr_nodes[2],
+ unsigned flags,
+ struct closure *cl)
{
- struct btree_reserve *reserve;
struct btree *b;
- struct disk_reservation disk_res = { 0, 0 };
- unsigned sectors = nr_nodes * c->opts.btree_node_size;
- int ret, disk_res_flags = BCH_DISK_RESERVATION_GC_LOCK_HELD;
-
- if (flags & BTREE_INSERT_NOFAIL)
- disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
-
- /*
- * This check isn't necessary for correctness - it's just to potentially
- * prevent us from doing a lot of work that'll end up being wasted:
- */
- ret = bch2_journal_error(&c->journal);
- if (ret)
- return ERR_PTR(ret);
-
- if (bch2_disk_reservation_get(c, &disk_res, sectors,
- c->opts.metadata_replicas,
- disk_res_flags))
- return ERR_PTR(-ENOSPC);
+ unsigned interior;
+ int ret = 0;
- BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
+ BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);
/*
* Protects reaping from the btree node cache and using the btree node
* open bucket reserve:
*/
- ret = bch2_btree_cache_cannibalize_lock(c, cl);
- if (ret) {
- bch2_disk_reservation_put(c, &disk_res);
- return ERR_PTR(ret);
- }
+ ret = bch2_btree_cache_cannibalize_lock(trans, cl);
+ if (ret)
+ return ret;
- reserve = mempool_alloc(&c->btree_reserve_pool, GFP_NOIO);
+ for (interior = 0; interior < 2; interior++) {
+ struct prealloc_nodes *p = as->prealloc_nodes + interior;
- reserve->disk_res = disk_res;
- reserve->nr = 0;
+ while (p->nr < nr_nodes[interior]) {
+ b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
+ interior, flags);
+ if (IS_ERR(b)) {
+ ret = PTR_ERR(b);
+ goto err;
+ }
- while (reserve->nr < nr_nodes) {
- b = __bch2_btree_node_alloc(c, &disk_res,
- flags & BTREE_INSERT_NOWAIT
- ? NULL : cl, flags);
- if (IS_ERR(b)) {
- ret = PTR_ERR(b);
- goto err_free;
+ p->b[p->nr++] = b;
}
-
- ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
- bkey_i_to_s_c(&b->key));
- if (ret)
- goto err_free;
-
- reserve->b[reserve->nr++] = b;
}
-
- bch2_btree_cache_cannibalize_unlock(c);
- return reserve;
-err_free:
- bch2_btree_reserve_put(c, reserve);
- bch2_btree_cache_cannibalize_unlock(c);
- trace_btree_reserve_get_fail(c, nr_nodes, cl);
- return ERR_PTR(ret);
+err:
+ bch2_btree_cache_cannibalize_unlock(trans);
+ return ret;
}
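/*
 * Worked example (illustrative only, not part of this change): nr_nodes[0]
 * counts non-interior (leaf) nodes and nr_nodes[1] counts interior nodes to
 * preallocate into as->prealloc_nodes[].  A caller expecting to split one
 * leaf whose parent still has room might do:
 *
 *	unsigned nr_nodes[2] = { 2, 0 };
 *	int ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
 *	if (bch2_err_matches(ret, ENOSPC) || bch2_err_matches(ret, ENOMEM))
 *		ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
 *
 * Passing a closure lets the caller wait on the allocator instead of failing
 * immediately, which is how bch2_btree_update_start() below retries.
 */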
/* Asynchronous interior node update machinery */
-static void bch2_btree_update_free(struct btree_update *as)
+static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans)
{
struct bch_fs *c = as->c;
- BUG_ON(as->nr_new_nodes);
- BUG_ON(as->nr_pending);
+ if (as->took_gc_lock)
+ up_read(&c->gc_lock);
+ as->took_gc_lock = false;
+
+ bch2_journal_pin_drop(&c->journal, &as->journal);
+ bch2_journal_pin_flush(&c->journal, &as->journal);
+ bch2_disk_reservation_put(c, &as->disk_res);
+ bch2_btree_reserve_put(as, trans);
- if (as->reserve)
- bch2_btree_reserve_put(c, as->reserve);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
+ as->start_time);
mutex_lock(&c->btree_interior_update_lock);
+ list_del(&as->unwritten_list);
list_del(&as->list);
closure_debug_destroy(&as->cl);
mempool_free(as, &c->btree_interior_update_pool);
- percpu_ref_put(&c->writes);
+ /*
+ * Have to do the wakeup with btree_interior_update_lock still held,
+ * since being on btree_interior_update_list is our ref on @c:
+ */
closure_wake_up(&c->btree_interior_update_wait);
+
mutex_unlock(&c->btree_interior_update_lock);
}
-static void btree_update_nodes_reachable(struct closure *cl)
+static void btree_update_add_key(struct btree_update *as,
+ struct keylist *keys, struct btree *b)
+{
+ struct bkey_i *k = &b->key;
+
+ BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s >
+ ARRAY_SIZE(as->_old_keys));
+
+ bkey_copy(keys->top, k);
+ bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1;
+
+ bch2_keylist_push(keys);
+}
+
+static bool btree_update_new_nodes_marked_sb(struct btree_update *as)
+{
+ for_each_keylist_key(&as->new_keys, k)
+ if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k)))
+ return false;
+ return true;
+}
+
+static void btree_update_new_nodes_mark_sb(struct btree_update *as)
{
- struct btree_update *as = container_of(cl, struct btree_update, cl);
struct bch_fs *c = as->c;
- bch2_journal_pin_drop(&c->journal, &as->journal);
+ mutex_lock(&c->sb_lock);
+ for_each_keylist_key(&as->new_keys, k)
+ bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k));
- mutex_lock(&c->btree_interior_update_lock);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+}
- while (as->nr_new_nodes) {
- struct btree *b = as->new_nodes[--as->nr_new_nodes];
+/*
+ * The transactional part of an interior btree node update, where we journal the
+ * update we did to the interior node and update alloc info:
+ */
+static int btree_update_nodes_written_trans(struct btree_trans *trans,
+ struct btree_update *as)
+{
+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s);
+ int ret = PTR_ERR_OR_ZERO(e);
+ if (ret)
+ return ret;
- BUG_ON(b->will_make_reachable != (unsigned long) as);
- b->will_make_reachable = 0;
- mutex_unlock(&c->btree_interior_update_lock);
+ memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64));
- /*
- * b->will_make_reachable prevented it from being written, so
- * write it now if it needs to be written:
- */
- btree_node_lock_type(c, b, SIX_LOCK_read);
- bch2_btree_node_write_cond(c, b, btree_node_need_write(b));
- six_unlock_read(&b->lock);
- mutex_lock(&c->btree_interior_update_lock);
- }
+ trans->journal_pin = &as->journal;
- while (as->nr_pending)
- bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]);
+ for_each_keylist_key(&as->old_keys, k) {
+ unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
- mutex_unlock(&c->btree_interior_update_lock);
+ ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k),
+ BTREE_TRIGGER_transactional);
+ if (ret)
+ return ret;
+ }
+
+ for_each_keylist_key(&as->new_keys, k) {
+ unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
- closure_wake_up(&as->wait);
+ ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k),
+ BTREE_TRIGGER_transactional);
+ if (ret)
+ return ret;
+ }
- bch2_btree_update_free(as);
+ return 0;
}
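/*
 * Note on the mem_ptr trick used above: btree_update_add_key() stashes
 * b->c.level + 1 in the btree_ptr_v2's mem_ptr field, and the triggers here
 * read it back as the key's level.  The +1 appears to be because the pointer
 * key for node @b lives in its parent, one level above @b itself, so the
 * triggers run at the level of the node containing the key.
 */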
-static void btree_update_wait_on_journal(struct closure *cl)
+static void btree_update_nodes_written(struct btree_update *as)
{
- struct btree_update *as = container_of(cl, struct btree_update, cl);
struct bch_fs *c = as->c;
+ struct btree *b;
+ struct btree_trans *trans = bch2_trans_get(c);
+ u64 journal_seq = 0;
+ unsigned i;
int ret;
- ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
- if (ret < 0)
+ /*
+ * If we're already in an error state, it might be because a btree node
+ * was never written, and we might be trying to free that same btree
+ * node here, but it won't have been marked as allocated and we'll see
+ * spurious disk usage inconsistencies in the transactional part below
+ * if we don't skip it:
+ */
+ ret = bch2_journal_error(&c->journal);
+ if (ret)
goto err;
- if (!ret) {
- continue_at(cl, btree_update_wait_on_journal, system_wq);
- return;
- }
- bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
-err:
- continue_at(cl, btree_update_nodes_reachable, system_wq);
-}
+ if (!btree_update_new_nodes_marked_sb(as))
+ btree_update_new_nodes_mark_sb(as);
-static void btree_update_nodes_written(struct closure *cl)
-{
- struct btree_update *as = container_of(cl, struct btree_update, cl);
- struct bch_fs *c = as->c;
- struct btree *b;
+ /*
+ * Wait for any in flight writes to finish before we free the old nodes
+ * on disk:
+ */
+ for (i = 0; i < as->nr_old_nodes; i++) {
+ __le64 seq;
+
+ b = as->old_nodes[i];
+
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+ seq = b->data ? b->data->keys.seq : 0;
+ six_unlock_read(&b->c.lock);
+
+ if (seq == as->old_nodes_seq[i])
+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
+ TASK_UNINTERRUPTIBLE);
+ }
/*
* We did an update to a parent node where the pointers we added pointed
* to child nodes that weren't written yet: now, the child nodes have
* been written so we can write out the update to the interior node.
*/
-retry:
- mutex_lock(&c->btree_interior_update_lock);
- as->nodes_written = true;
-
- switch (as->mode) {
- case BTREE_INTERIOR_NO_UPDATE:
- BUG();
- case BTREE_INTERIOR_UPDATING_NODE:
- /* The usual case: */
- b = READ_ONCE(as->b);
-
- if (!six_trylock_read(&b->lock)) {
- mutex_unlock(&c->btree_interior_update_lock);
- btree_node_lock_type(c, b, SIX_LOCK_read);
- six_unlock_read(&b->lock);
- goto retry;
- }
-
- BUG_ON(!btree_node_dirty(b));
- closure_wait(&btree_current_write(b)->wait, cl);
- list_del(&as->write_blocked_list);
- mutex_unlock(&c->btree_interior_update_lock);
+ /*
+ * We can't call into journal reclaim here: we'd block on the journal
+ * reclaim lock, but we may need to release the open buckets we have
+ * pinned in order for other btree updates to make forward progress, and
+ * journal reclaim does btree updates when flushing bkey_cached entries,
+ * which may require allocations as well.
+ */
+ ret = commit_do(trans, &as->disk_res, &journal_seq,
+ BCH_WATERMARK_interior_updates|
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_journal_reclaim,
+ btree_update_nodes_written_trans(trans, as));
+ bch2_trans_unlock(trans);
+
+ bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
+ "%s", bch2_err_str(ret));
+err:
+ /*
+ * Ensure transaction is unlocked before using btree_node_lock_nopath()
+ * (the use of which is always suspect, we need to work on removing this
+ * in the future)
+ *
+ * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
+ * calls bch2_path_upgrade(), before we call path_make_mut(), so we may
+ * rarely end up with a locked path besides the one we have here:
+ */
+ bch2_trans_unlock(trans);
+ bch2_trans_begin(trans);
+ /*
+ * We have to be careful because another thread might be getting ready
+ * to free as->b and calling btree_update_reparent() on us - we'll
+ * recheck under btree_update_lock below:
+ */
+ b = READ_ONCE(as->b);
+ if (b) {
/*
- * b->write_blocked prevented it from being written, so
- * write it now if it needs to be written:
+ * @b is the node we did the final insert into:
+ *
+ * On failure to get a journal reservation, we still have to
+ * unblock the write and allow most of the write path to happen
+ * so that shutdown works, but the i->journal_seq mechanism
+ * won't work to prevent the btree write from being visible (we
+ * didn't get a journal sequence number) - instead
+ * __bch2_btree_node_write() doesn't do the actual write if
+ * we're in journal error state:
*/
- bch2_btree_node_write_cond(c, b, true);
- six_unlock_read(&b->lock);
- break;
- case BTREE_INTERIOR_UPDATING_AS:
- /*
- * The btree node we originally updated has been freed and is
- * being rewritten - so we need to write anything here, we just
- * need to signal to that btree_update that it's ok to make the
- * new replacement node visible:
- */
- closure_put(&as->parent_as->cl);
+ btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans,
+ as->btree_id, b->c.level, b->key.k.p);
+ struct btree_path *path = trans->paths + path_idx;
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
+ path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
+ path->l[b->c.level].b = b;
+
+ bch2_btree_node_lock_write_nofail(trans, path, &b->c);
+
+ mutex_lock(&c->btree_interior_update_lock);
+
+ list_del(&as->write_blocked_list);
+ if (list_empty(&b->write_blocked))
+ clear_btree_node_write_blocked(b);
/*
- * and then we have to wait on that btree_update to finish:
+ * Node might have been freed, recheck under
+ * btree_interior_update_lock:
*/
- closure_wait(&as->parent_as->wait, cl);
+ if (as->b == b) {
+ BUG_ON(!b->c.level);
+ BUG_ON(!btree_node_dirty(b));
+
+ if (!ret) {
+ struct bset *last = btree_bset_last(b);
+
+ last->journal_seq = cpu_to_le64(
+ max(journal_seq,
+ le64_to_cpu(last->journal_seq)));
+
+ bch2_btree_add_journal_pin(c, b, journal_seq);
+ } else {
+ /*
+ * If we didn't get a journal sequence number we
+ * can't write this btree node, because recovery
+ * won't know to ignore this write:
+ */
+ set_btree_node_never_write(b);
+ }
+ }
+
mutex_unlock(&c->btree_interior_update_lock);
- break;
- case BTREE_INTERIOR_UPDATING_ROOT:
- /* b is the new btree root: */
- b = READ_ONCE(as->b);
+ mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
+ six_unlock_write(&b->c.lock);
- if (!six_trylock_read(&b->lock)) {
- mutex_unlock(&c->btree_interior_update_lock);
- btree_node_lock_type(c, b, SIX_LOCK_read);
- six_unlock_read(&b->lock);
- goto retry;
- }
+ btree_node_write_if_need(c, b, SIX_LOCK_intent);
+ btree_node_unlock(trans, path, b->c.level);
+ bch2_path_put(trans, path_idx, true);
+ }
- BUG_ON(c->btree_roots[b->btree_id].as != as);
- c->btree_roots[b->btree_id].as = NULL;
+ bch2_journal_pin_drop(&c->journal, &as->journal);
- bch2_btree_set_root_ondisk(c, b, WRITE);
+ mutex_lock(&c->btree_interior_update_lock);
+ for (i = 0; i < as->nr_new_nodes; i++) {
+ b = as->new_nodes[i];
- /*
- * We don't have to wait anything anything here (before
- * btree_update_nodes_reachable frees the old nodes
- * ondisk) - we've ensured that the very next journal write will
- * have the pointer to the new root, and before the allocator
- * can reuse the old nodes it'll have to do a journal commit:
- */
- six_unlock_read(&b->lock);
- mutex_unlock(&c->btree_interior_update_lock);
+ BUG_ON(b->will_make_reachable != (unsigned long) as);
+ b->will_make_reachable = 0;
+ clear_btree_node_will_make_reachable(b);
+ }
+ mutex_unlock(&c->btree_interior_update_lock);
- /*
- * Bit of funny circularity going on here we have to break:
- *
- * We have to drop our journal pin before writing the journal
- * entry that points to the new btree root: else, we could
- * deadlock if the journal currently happens to be full.
- *
- * This mean we're dropping the journal pin _before_ the new
- * nodes are technically reachable - but this is safe, because
- * after the bch2_btree_set_root_ondisk() call above they will
- * be reachable as of the very next journal write:
- */
- bch2_journal_pin_drop(&c->journal, &as->journal);
+ for (i = 0; i < as->nr_new_nodes; i++) {
+ b = as->new_nodes[i];
- as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+ btree_node_write_if_need(c, b, SIX_LOCK_read);
+ six_unlock_read(&b->c.lock);
+ }
- btree_update_wait_on_journal(cl);
- return;
+ for (i = 0; i < as->nr_open_buckets; i++)
+ bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
+
+ bch2_btree_update_free(as, trans);
+ bch2_trans_put(trans);
+}
+
+static void btree_interior_update_work(struct work_struct *work)
+{
+ struct bch_fs *c =
+ container_of(work, struct bch_fs, btree_interior_update_work);
+ struct btree_update *as;
+
+ while (1) {
+ mutex_lock(&c->btree_interior_update_lock);
+ as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
+ struct btree_update, unwritten_list);
+ if (as && !as->nodes_written)
+ as = NULL;
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ if (!as)
+ break;
+
+ btree_update_nodes_written(as);
}
+}
+
+static CLOSURE_CALLBACK(btree_update_set_nodes_written)
+{
+ closure_type(as, struct btree_update, cl);
+ struct bch_fs *c = as->c;
- continue_at(cl, btree_update_nodes_reachable, system_wq);
+ mutex_lock(&c->btree_interior_update_lock);
+ as->nodes_written = true;
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
}
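/*
 * Ordering recap for the completion path wired up above (illustrative, no
 * new behaviour): when the last ref on as->cl is dropped,
 * btree_update_set_nodes_written() only flips as->nodes_written under
 * btree_interior_update_lock and queues c->btree_interior_update_work; the
 * heavyweight btree_update_nodes_written() - transactional commit, freeing
 * old nodes, unblocking the parent write - then runs from the workqueue.
 * Because btree_interior_update_work() only ever takes the first entry of
 * btree_interior_updates_unwritten, updates complete in the order they were
 * added to that list.
 */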
/*
@@ -766,53 +876,28 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
{
struct bch_fs *c = as->c;
+ BUG_ON(as->mode != BTREE_UPDATE_none);
+ BUG_ON(as->update_level_end < b->c.level);
+ BUG_ON(!btree_node_dirty(b));
+ BUG_ON(!b->c.level);
+
mutex_lock(&c->btree_interior_update_lock);
+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
- BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
- BUG_ON(!btree_node_dirty(b));
+ as->mode = BTREE_UPDATE_node;
+ as->b = b;
+ as->update_level_end = b->c.level;
- as->mode = BTREE_INTERIOR_UPDATING_NODE;
- as->b = b;
+ set_btree_node_write_blocked(b);
list_add(&as->write_blocked_list, &b->write_blocked);
mutex_unlock(&c->btree_interior_update_lock);
-
- /*
- * In general, when you're staging things in a journal that will later
- * be written elsewhere, and you also want to guarantee ordering: that
- * is, if you have updates a, b, c, after a crash you should never see c
- * and not a or b - there's a problem:
- *
- * If the final destination of the update(s) (i.e. btree node) can be
- * written/flushed _before_ the relevant journal entry - oops, that
- * breaks ordering, since the various leaf nodes can be written in any
- * order.
- *
- * Normally we use bset->journal_seq to deal with this - if during
- * recovery we find a btree node write that's newer than the newest
- * journal entry, we just ignore it - we don't need it, anything we're
- * supposed to have (that we reported as completed via fsync()) will
- * still be in the journal, and as far as the state of the journal is
- * concerned that btree node write never happened.
- *
- * That breaks when we're rewriting/splitting/merging nodes, since we're
- * mixing btree node writes that haven't happened yet with previously
- * written data that has been reported as completed to the journal.
- *
- * Thus, before making the new nodes reachable, we have to wait the
- * newest journal sequence number we have data for to be written (if it
- * hasn't been yet).
- */
- bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
}
-static void interior_update_flush(struct journal *j,
- struct journal_entry_pin *pin, u64 seq)
+static int bch2_update_reparent_journal_pin_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
{
- struct btree_update *as =
- container_of(pin, struct btree_update, journal);
-
- bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
+ return 0;
}
static void btree_update_reparent(struct btree_update *as,
@@ -820,77 +905,80 @@ static void btree_update_reparent(struct btree_update *as,
{
struct bch_fs *c = as->c;
- child->b = NULL;
- child->mode = BTREE_INTERIOR_UPDATING_AS;
- child->parent_as = as;
- closure_get(&as->cl);
+ lockdep_assert_held(&c->btree_interior_update_lock);
- /*
- * When we write a new btree root, we have to drop our journal pin
- * _before_ the new nodes are technically reachable; see
- * btree_update_nodes_written().
- *
- * This goes for journal pins that are recursively blocked on us - so,
- * just transfer the journal pin to the new interior update so
- * btree_update_nodes_written() can drop it.
- */
- bch2_journal_pin_add_if_older(&c->journal, &child->journal,
- &as->journal, interior_update_flush);
- bch2_journal_pin_drop(&c->journal, &child->journal);
+ child->b = NULL;
+ child->mode = BTREE_UPDATE_update;
- as->journal_seq = max(as->journal_seq, child->journal_seq);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
+ bch2_update_reparent_journal_pin_flush);
}
-static void btree_update_updated_root(struct btree_update *as)
+static void btree_update_updated_root(struct btree_update *as, struct btree *b)
{
+ struct bkey_i *insert = &b->key;
struct bch_fs *c = as->c;
- struct btree_root *r = &c->btree_roots[as->btree_id];
- mutex_lock(&c->btree_interior_update_lock);
+ BUG_ON(as->mode != BTREE_UPDATE_none);
- BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
+ ARRAY_SIZE(as->journal_entries));
- /*
- * Old root might not be persistent yet - if so, redirect its
- * btree_update operation to point to us:
- */
- if (r->as)
- btree_update_reparent(as, r->as);
+ as->journal_u64s +=
+ journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
+ BCH_JSET_ENTRY_btree_root,
+ b->c.btree_id, b->c.level,
+ insert, insert->k.u64s);
- as->mode = BTREE_INTERIOR_UPDATING_ROOT;
- as->b = r->b;
- r->as = as;
+ mutex_lock(&c->btree_interior_update_lock);
+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
+ as->mode = BTREE_UPDATE_root;
mutex_unlock(&c->btree_interior_update_lock);
-
- /*
- * When we're rewriting nodes and updating interior nodes, there's an
- * issue with updates that haven't been written in the journal getting
- * mixed together with older data - see btree_update_updated_node()
- * for the explanation.
- *
- * However, this doesn't affect us when we're writing a new btree root -
- * because to make that new root reachable we have to write out a new
- * journal entry, which must necessarily be newer than as->journal_seq.
- */
}
-static void btree_node_will_make_reachable(struct btree_update *as,
- struct btree *b)
+/*
+ * bch2_btree_update_add_new_node:
+ *
+ * This causes @as to wait on @b to be written, before it gets to
+ * bch2_btree_update_nodes_written
+ *
+ * Additionally, it sets b->will_make_reachable to prevent any additional writes
+ * to @b from happening besides the first until @b is reachable on disk
+ *
+ * And it adds @b to the list of @as's new nodes, so that we can update sector
+ * counts in bch2_btree_update_nodes_written:
+ */
+static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
{
struct bch_fs *c = as->c;
+ closure_get(&as->cl);
+
mutex_lock(&c->btree_interior_update_lock);
BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
BUG_ON(b->will_make_reachable);
as->new_nodes[as->nr_new_nodes++] = b;
b->will_make_reachable = 1UL|(unsigned long) as;
+ set_btree_node_will_make_reachable(b);
- closure_get(&as->cl);
mutex_unlock(&c->btree_interior_update_lock);
+
+ btree_update_add_key(as, &as->new_keys, b);
+
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data;
+ unsigned sectors = round_up(bytes, block_bytes(c)) >> 9;
+
+ bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
+ cpu_to_le16(sectors);
+ }
}
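/*
 * Worked example (illustrative only) of the sectors_written computation
 * above: with a 4096-byte block size and 5000 bytes of node data,
 * round_up(5000, 4096) == 8192 bytes, and 8192 >> 9 == 16 sectors, so
 * sectors_written records how much of the node the first (and only
 * permitted) write will cover.
 */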
+/*
+ * Detach @b from the btree_update, if any, that was going to make it reachable
+ */
static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
{
struct btree_update *as;
@@ -898,7 +986,13 @@ static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
unsigned i;
mutex_lock(&c->btree_interior_update_lock);
+ /*
+ * When b->will_make_reachable != 0, it owns a ref on as->cl that's
+ * dropped when it gets written by bch2_btree_complete_write - the
+ * xchg() is for synchronization with bch2_btree_complete_write:
+ */
v = xchg(&b->will_make_reachable, 0);
+ clear_btree_node_will_make_reachable(b);
as = (struct btree_update *) (v & ~1UL);
if (!as) {
@@ -919,25 +1013,17 @@ found:
closure_put(&as->cl);
}
-static void btree_interior_update_add_node_reference(struct btree_update *as,
- struct btree *b)
+static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
{
- struct bch_fs *c = as->c;
- struct pending_btree_node_free *d;
-
- mutex_lock(&c->btree_interior_update_lock);
-
- /* Add this node to the list of nodes being freed: */
- BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
-
- d = &as->pending[as->nr_pending++];
- d->index_update_done = false;
- d->seq = b->data->keys.seq;
- d->btree_id = b->btree_id;
- d->level = b->level;
- bkey_copy(&d->key, &b->key);
+ while (b->ob.nr)
+ as->open_buckets[as->nr_open_buckets++] =
+ b->ob.v[--b->ob.nr];
+}
- mutex_unlock(&c->btree_interior_update_lock);
+static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
+{
+ return 0;
}
/*
@@ -945,34 +1031,18 @@ static void btree_interior_update_add_node_reference(struct btree_update *as,
* nodes and thus outstanding btree_updates - redirect @b's
* btree_updates to point to this btree_update:
*/
-void bch2_btree_interior_update_will_free_node(struct btree_update *as,
- struct btree *b)
+static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
+ struct btree *b)
{
struct bch_fs *c = as->c;
- struct closure *cl, *cl_n;
struct btree_update *p, *n;
struct btree_write *w;
- struct bset_tree *t;
set_btree_node_dying(b);
if (btree_node_fake(b))
return;
- btree_interior_update_add_node_reference(as, b);
-
- /*
- * Does this node have data that hasn't been written in the journal?
- *
- * If so, we have to wait for the corresponding journal entry to be
- * written before making the new nodes reachable - we can't just carry
- * over the bset->journal_seq tracking, since we'll be mixing those keys
- * in with keys that aren't in the journal anymore:
- */
- for_each_bset(b, t)
- as->journal_seq = max(as->journal_seq,
- le64_to_cpu(bset(b, t)->journal_seq));
-
mutex_lock(&c->btree_interior_update_lock);
/*
@@ -984,22 +1054,19 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
* operations complete
*/
list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
- list_del(&p->write_blocked_list);
+ list_del_init(&p->write_blocked_list);
btree_update_reparent(as, p);
+
+ /*
+ * for flush_held_btree_writes() waiting on updates to flush or
+ * nodes to be writeable:
+ */
+ closure_wake_up(&c->btree_interior_update_wait);
}
- clear_btree_node_dirty(b);
+ clear_btree_node_dirty_acct(c, b);
clear_btree_node_need_write(b);
- w = btree_current_write(b);
-
- /*
- * Does this node have any btree_update operations waiting on this node
- * to be written?
- *
- * If so, wake them up when this btree_update operation is reachable:
- */
- llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
- llist_add(&cl->list, &as->wait.list);
+ clear_btree_node_write_blocked(b);
/*
* Does this node have unwritten data that has a pin on the journal?
@@ -1009,152 +1076,258 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
* oldest pin of any of the nodes we're freeing. We'll release the pin
* when the new nodes are persistent and reachable on disk:
*/
- bch2_journal_pin_add_if_older(&c->journal, &w->journal,
- &as->journal, interior_update_flush);
+ w = btree_current_write(b);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
+ bch2_btree_update_will_free_node_journal_pin_flush);
bch2_journal_pin_drop(&c->journal, &w->journal);
w = btree_prev_write(b);
- bch2_journal_pin_add_if_older(&c->journal, &w->journal,
- &as->journal, interior_update_flush);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
+ bch2_btree_update_will_free_node_journal_pin_flush);
bch2_journal_pin_drop(&c->journal, &w->journal);
mutex_unlock(&c->btree_interior_update_lock);
+
+ /*
+ * Is this a node that isn't reachable on disk yet?
+ *
+ * Nodes that aren't reachable yet have writes blocked until they're
+ * reachable - now that we've cancelled any pending writes and moved
+ * things waiting on that write to wait on this update, we can drop this
+ * node from the list of nodes that the other update is making
+ * reachable, prior to freeing it:
+ */
+ btree_update_drop_new_node(c, b);
+
+ btree_update_add_key(as, &as->old_keys, b);
+
+ as->old_nodes[as->nr_old_nodes] = b;
+ as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
+ as->nr_old_nodes++;
}
-void bch2_btree_update_done(struct btree_update *as)
+static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans)
{
- BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
+ struct bch_fs *c = as->c;
+ u64 start_time = as->start_time;
+
+ BUG_ON(as->mode == BTREE_UPDATE_none);
+
+ if (as->took_gc_lock)
+ up_read(&as->c->gc_lock);
+ as->took_gc_lock = false;
+
+ bch2_btree_reserve_put(as, trans);
- bch2_btree_reserve_put(as->c, as->reserve);
- as->reserve = NULL;
+ continue_at(&as->cl, btree_update_set_nodes_written,
+ as->c->btree_interior_update_worker);
- continue_at(&as->cl, btree_update_nodes_written, system_freezable_wq);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
+ start_time);
}
-struct btree_update *
-bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
- unsigned nr_nodes, unsigned flags,
- struct closure *cl)
+static struct btree_update *
+bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
+ unsigned level_start, bool split, unsigned flags)
{
- struct btree_reserve *reserve;
+ struct bch_fs *c = trans->c;
struct btree_update *as;
+ u64 start_time = local_clock();
+ int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
+ ? BCH_DISK_RESERVATION_NOFAIL : 0;
+ unsigned nr_nodes[2] = { 0, 0 };
+ unsigned level_end = level_start;
+ enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
+ int ret = 0;
+ u32 restart_count = trans->restart_count;
+
+ BUG_ON(!path->should_be_locked);
- if (unlikely(!percpu_ref_tryget(&c->writes)))
- return ERR_PTR(-EROFS);
+ if (watermark == BCH_WATERMARK_copygc)
+ watermark = BCH_WATERMARK_btree_copygc;
+ if (watermark < BCH_WATERMARK_btree)
+ watermark = BCH_WATERMARK_btree;
- reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl);
- if (IS_ERR(reserve)) {
- percpu_ref_put(&c->writes);
- return ERR_CAST(reserve);
+ flags &= ~BCH_WATERMARK_MASK;
+ flags |= watermark;
+
+ if (watermark < BCH_WATERMARK_reclaim &&
+ test_bit(JOURNAL_space_low, &c->journal.flags)) {
+ if (flags & BCH_TRANS_COMMIT_journal_reclaim)
+ return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock);
+
+ ret = drop_locks_do(trans,
+ ({ wait_event(c->journal.wait, !test_bit(JOURNAL_space_low, &c->journal.flags)); 0; }));
+ if (ret)
+ return ERR_PTR(ret);
}
- as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
+ while (1) {
+ nr_nodes[!!level_end] += 1 + split;
+ level_end++;
+
+ ret = bch2_btree_path_upgrade(trans, path, level_end + 1);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (!btree_path_node(path, level_end)) {
+ /* Allocating new root? */
+ nr_nodes[1] += split;
+ level_end = BTREE_MAX_DEPTH;
+ break;
+ }
+
+ /*
+ * Always check for space for two keys, even if we won't have to
+ * split at prior level - it might have been a merge instead:
+ */
+ if (bch2_btree_node_insert_fits(path->l[level_end].b,
+ BKEY_BTREE_PTR_U64s_MAX * 2))
+ break;
+
+ split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
+ }
+
+ if (!down_read_trylock(&c->gc_lock)) {
+ ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
+ if (ret) {
+ up_read(&c->gc_lock);
+ return ERR_PTR(ret);
+ }
+ }
+
+ as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
memset(as, 0, sizeof(*as));
closure_init(&as->cl, NULL);
- as->c = c;
- as->mode = BTREE_INTERIOR_NO_UPDATE;
- as->btree_id = id;
- as->reserve = reserve;
+ as->c = c;
+ as->start_time = start_time;
+ as->ip_started = _RET_IP_;
+ as->mode = BTREE_UPDATE_none;
+ as->flags = flags;
+ as->took_gc_lock = true;
+ as->btree_id = path->btree_id;
+ as->update_level_start = level_start;
+ as->update_level_end = level_end;
+ INIT_LIST_HEAD(&as->list);
+ INIT_LIST_HEAD(&as->unwritten_list);
INIT_LIST_HEAD(&as->write_blocked_list);
-
+ bch2_keylist_init(&as->old_keys, as->_old_keys);
+ bch2_keylist_init(&as->new_keys, as->_new_keys);
bch2_keylist_init(&as->parent_keys, as->inline_keys);
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->list, &c->btree_interior_update_list);
mutex_unlock(&c->btree_interior_update_lock);
- return as;
-}
+ /*
+ * We don't want to allocate if we're in an error state, that can cause
+ * deadlock on emergency shutdown due to open buckets getting stuck in
+ * the btree_reserve_cache after allocator shutdown has cleared it out.
+ * This check needs to come after adding us to the btree_interior_update
+ * list but before calling bch2_btree_reserve_get, to synchronize with
+ * __bch2_fs_read_only().
+ */
+ ret = bch2_journal_error(&c->journal);
+ if (ret)
+ goto err;
-/* Btree root updates: */
+ ret = bch2_disk_reservation_get(c, &as->disk_res,
+ (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
+ c->opts.metadata_replicas,
+ disk_res_flags);
+ if (ret)
+ goto err;
-static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
-{
- /* Root nodes cannot be reaped */
- mutex_lock(&c->btree_cache.lock);
- list_del_init(&b->list);
- mutex_unlock(&c->btree_cache.lock);
+ ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
+ if (bch2_err_matches(ret, ENOSPC) ||
+ bch2_err_matches(ret, ENOMEM)) {
+ struct closure cl;
- mutex_lock(&c->btree_root_lock);
- BUG_ON(btree_node_root(c, b) &&
- (b->level < btree_node_root(c, b)->level ||
- !btree_node_dying(btree_node_root(c, b))));
+ /*
+ * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
+ * flag
+ */
+ if (bch2_err_matches(ret, ENOSPC) &&
+ (flags & BCH_TRANS_COMMIT_journal_reclaim) &&
+ watermark < BCH_WATERMARK_reclaim) {
+ ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ goto err;
+ }
- btree_node_root(c, b) = b;
- mutex_unlock(&c->btree_root_lock);
+ closure_init_stack(&cl);
- bch2_recalc_btree_reserve(c);
-}
+ do {
+ ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
-static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
-{
- struct bch_fs *c = as->c;
- struct btree *old = btree_node_root(c, b);
- struct bch_fs_usage stats = { 0 };
+ bch2_trans_unlock(trans);
+ bch2_wait_on_allocator(c, &cl);
+ } while (bch2_err_matches(ret, BCH_ERR_operation_blocked));
+ }
- __bch2_btree_set_root_inmem(c, b);
+ if (ret) {
+ trace_and_count(c, btree_reserve_get_fail, trans->fn,
+ _RET_IP_, nr_nodes[0] + nr_nodes[1], ret);
+ goto err;
+ }
- bch2_mark_key(c, bkey_i_to_s_c(&b->key),
- c->opts.btree_node_size, true,
- gc_pos_btree_root(b->btree_id),
- &stats, 0, 0);
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ goto err;
- if (old && !btree_node_fake(old))
- bch2_btree_node_free_index(as, NULL,
- bkey_i_to_s_c(&old->key),
- &stats);
- bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
- gc_pos_btree_root(b->btree_id));
+ bch2_trans_verify_not_restarted(trans, restart_count);
+ return as;
+err:
+ bch2_btree_update_free(as, trans);
+ if (!bch2_err_matches(ret, ENOSPC) &&
+ !bch2_err_matches(ret, EROFS) &&
+ ret != -BCH_ERR_journal_reclaim_would_deadlock)
+ bch_err_fn_ratelimited(c, ret);
+ return ERR_PTR(ret);
}
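/*
 * Worked example (illustrative only) of the sizing loop at the top of this
 * function: splitting a leaf (level_start == 0, split == true) whose parent
 * can still absorb BKEY_BTREE_PTR_U64s_MAX * 2 worth of keys:
 *
 *	iteration 1: level_end == 0, so nr_nodes[0] += 1 + 1 == 2 (two new
 *	             leaves); level_end becomes 1, the parent fits, break
 *
 * giving two preallocated leaves and no interior nodes.  If the parent were
 * also full, the next iteration would add to nr_nodes[1], and a missing
 * ancestor means a new root (nr_nodes[1] += split).  The disk reservation
 * taken below then covers (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c).
 */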
-static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
+/* Btree root updates: */
+
+static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
{
- struct btree_root *r = &c->btree_roots[b->btree_id];
+ /* Root nodes cannot be reaped */
+ mutex_lock(&c->btree_cache.lock);
+ list_del_init(&b->list);
+ mutex_unlock(&c->btree_cache.lock);
mutex_lock(&c->btree_root_lock);
-
- BUG_ON(b != r->b);
- bkey_copy(&r->key, &b->key);
- r->level = b->level;
- r->alive = true;
- if (rw == WRITE)
- c->btree_roots_dirty = true;
-
+ bch2_btree_id_root(c, b->c.btree_id)->b = b;
mutex_unlock(&c->btree_root_lock);
+
+ bch2_recalc_btree_reserve(c);
}
-/**
- * bch_btree_set_root - update the root in memory and on disk
- *
- * To ensure forward progress, the current task must not be holding any
- * btree node write locks. However, you must hold an intent lock on the
- * old root.
- *
- * Note: This allocates a journal entry but doesn't add any keys to
- * it. All the btree roots are part of every journal write, so there
- * is nothing new to be done. This just guarantees that there is a
- * journal write.
- */
-static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
- struct btree_iter *iter)
+static int bch2_btree_set_root(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ bool nofail)
{
struct bch_fs *c = as->c;
- struct btree *old;
- trace_btree_set_root(c, b);
- BUG_ON(!b->written);
+ trace_and_count(c, btree_node_set_root, trans, b);
- old = btree_node_root(c, b);
+ struct btree *old = btree_node_root(c, b);
/*
* Ensure no one is using the old root while we switch to the
* new root:
*/
- bch2_btree_node_lock_write(old, iter);
+ if (nofail) {
+ bch2_btree_node_lock_write_nofail(trans, path, &old->c);
+ } else {
+ int ret = bch2_btree_node_lock_write(trans, path, &old->c);
+ if (ret)
+ return ret;
+ }
- bch2_btree_set_root_inmem(as, b);
+ bch2_btree_set_root_inmem(c, b);
- btree_update_updated_root(as);
+ btree_update_updated_root(as, b);
/*
* Unlock old root after new root is visible:
@@ -1163,136 +1336,227 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
* an intent lock on the new root, and any updates that would
* depend on the new root would have to update the new root.
*/
- bch2_btree_node_unlock_write(old, iter);
+ bch2_btree_node_unlock_write(trans, path, old);
+ return 0;
}
/* Interior node updates: */
-static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b,
- struct btree_iter *iter,
- struct bkey_i *insert,
- struct btree_node_iter *node_iter)
+static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter *node_iter,
+ struct bkey_i *insert)
{
struct bch_fs *c = as->c;
- struct bch_fs_usage stats = { 0 };
struct bkey_packed *k;
- struct bkey tmp;
+ struct printbuf buf = PRINTBUF;
+ unsigned long old, new;
+
+ BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
+ !btree_ptr_sectors_written(bkey_i_to_s_c(insert)));
+
+ if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)))
+ bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
+
+ struct bkey_validate_context from = (struct bkey_validate_context) {
+ .from = BKEY_VALIDATE_btree_node,
+ .level = b->c.level,
+ .btree = b->c.btree_id,
+ .flags = BCH_VALIDATE_commit,
+ };
+ if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), from) ?:
+ bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), from)) {
+ bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__);
+ dump_stack();
+ }
- BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b));
+ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
+ ARRAY_SIZE(as->journal_entries));
- if (bkey_extent_is_data(&insert->k))
- bch2_mark_key(c, bkey_i_to_s_c(insert),
- c->opts.btree_node_size, true,
- gc_pos_btree_node(b), &stats, 0, 0);
+ as->journal_u64s +=
+ journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
+ BCH_JSET_ENTRY_btree_keys,
+ b->c.btree_id, b->c.level,
+ insert, insert->k.u64s);
while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
- !btree_iter_pos_cmp_packed(b, &insert->k.p, k, false))
+ bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
bch2_btree_node_iter_advance(node_iter, b);
- /*
- * If we're overwriting, look up pending delete and mark so that gc
- * marks it on the pending delete list:
- */
- if (k && !bkey_cmp_packed(b, k, &insert->k))
- bch2_btree_node_free_index(as, b,
- bkey_disassemble(b, k, &tmp),
- &stats);
+ bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
+ set_btree_node_dirty_acct(c, b);
+
+ old = READ_ONCE(b->flags);
+ do {
+ new = old;
- bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
- gc_pos_btree_node(b));
+ new &= ~BTREE_WRITE_TYPE_MASK;
+ new |= BTREE_WRITE_interior;
+ new |= 1 << BTREE_NODE_need_write;
+ } while (!try_cmpxchg(&b->flags, &old, new));
- bch2_btree_bset_insert_key(iter, b, node_iter, insert);
- set_btree_node_dirty(b);
- set_btree_node_need_write(b);
+ printbuf_exit(&buf);
+}
+
+static void
+bch2_btree_insert_keys_interior(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter node_iter,
+ struct keylist *keys)
+{
+ struct bkey_i *insert = bch2_keylist_front(keys);
+ struct bkey_packed *k;
+
+ BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);
+
+ while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
+ (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
+ ;
+
+ for (;
+ insert != keys->top && bpos_le(insert->k.p, b->key.k.p);
+ insert = bkey_next(insert))
+ bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
+
+ if (bch2_btree_node_check_topology(trans, b)) {
+ struct printbuf buf = PRINTBUF;
+
+ for (struct bkey_i *k = keys->keys;
+ k != insert;
+ k = bkey_next(k)) {
+ bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k));
+ prt_newline(&buf);
+ }
+
+ panic("%s(): check_topology error: inserted keys\n%s", __func__, buf.buf);
+ }
+
+ memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data);
+ keys->top_p -= insert->_data - keys->keys_p;
+}
+
+static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos)
+{
+ if (insert_keys)
+ for_each_keylist_key(insert_keys, k)
+ if (bkey_deleted(&k->k) && bpos_eq(k->k.p, pos))
+ return true;
+ return false;
}
/*
* Move keys from n1 (original replacement node, now lower node) to n2 (higher
* node)
*/
-static struct btree *__btree_split_node(struct btree_update *as,
- struct btree *n1,
- struct btree_iter *iter)
+static void __btree_split_node(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree *b,
+ struct btree *n[2],
+ struct keylist *insert_keys)
{
- size_t nr_packed = 0, nr_unpacked = 0;
- struct btree *n2;
- struct bset *set1, *set2;
- struct bkey_packed *k, *prev = NULL;
+ struct bkey_packed *k;
+ struct bpos n1_pos = POS_MIN;
+ struct btree_node_iter iter;
+ struct bset *bsets[2];
+ struct bkey_format_state format[2];
+ struct bkey_packed *out[2];
+ struct bkey uk;
+ unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5;
+ struct { unsigned nr_keys, val_u64s; } nr_keys[2];
+ int i;
- n2 = bch2_btree_node_alloc(as, n1->level);
+ memset(&nr_keys, 0, sizeof(nr_keys));
- n2->data->max_key = n1->data->max_key;
- n2->data->format = n1->format;
- SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
- n2->key.k.p = n1->key.k.p;
+ for (i = 0; i < 2; i++) {
+ BUG_ON(n[i]->nsets != 1);
- btree_node_set_format(n2, n2->data->format);
+ bsets[i] = btree_bset_first(n[i]);
+ out[i] = bsets[i]->start;
- set1 = btree_bset_first(n1);
- set2 = btree_bset_first(n2);
+ SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1);
+ bch2_bkey_format_init(&format[i]);
+ }
- /*
- * Has to be a linear search because we don't have an auxiliary
- * search tree yet
- */
- k = set1->start;
- while (1) {
- if (bkey_next(k) == vstruct_last(set1))
- break;
- if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
- break;
+ u64s = 0;
+ for_each_btree_node_key(b, k, &iter) {
+ if (bkey_deleted(k))
+ continue;
- if (bkey_packed(k))
- nr_packed++;
- else
- nr_unpacked++;
+ uk = bkey_unpack_key(b, k);
- prev = k;
- k = bkey_next(k);
- }
+ if (b->c.level &&
+ u64s < n1_u64s &&
+ u64s + k->u64s >= n1_u64s &&
+ (bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p) ||
+ key_deleted_in_insert(insert_keys, uk.p)))
+ n1_u64s += k->u64s;
- BUG_ON(!prev);
+ i = u64s >= n1_u64s;
+ u64s += k->u64s;
+ if (!i)
+ n1_pos = uk.p;
+ bch2_bkey_format_add_key(&format[i], &uk);
- n1->key.k.p = bkey_unpack_pos(n1, prev);
- n1->data->max_key = n1->key.k.p;
- n2->data->min_key =
- btree_type_successor(n1->btree_id, n1->key.k.p);
+ nr_keys[i].nr_keys++;
+ nr_keys[i].val_u64s += bkeyp_val_u64s(&b->format, k);
+ }
+
+ btree_set_min(n[0], b->data->min_key);
+ btree_set_max(n[0], n1_pos);
+ btree_set_min(n[1], bpos_successor(n1_pos));
+ btree_set_max(n[1], b->data->max_key);
- set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k);
- set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s));
+ for (i = 0; i < 2; i++) {
+ bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key);
+ bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key);
- set_btree_bset_end(n1, n1->set);
- set_btree_bset_end(n2, n2->set);
+ n[i]->data->format = bch2_bkey_format_done(&format[i]);
- n2->nr.live_u64s = le16_to_cpu(set2->u64s);
- n2->nr.bset_u64s[0] = le16_to_cpu(set2->u64s);
- n2->nr.packed_keys = n1->nr.packed_keys - nr_packed;
- n2->nr.unpacked_keys = n1->nr.unpacked_keys - nr_unpacked;
+ unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s +
+ nr_keys[i].val_u64s;
+ if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b))
+ n[i]->data->format = b->format;
- n1->nr.live_u64s = le16_to_cpu(set1->u64s);
- n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s);
- n1->nr.packed_keys = nr_packed;
- n1->nr.unpacked_keys = nr_unpacked;
+ btree_node_set_format(n[i], n[i]->data->format);
+ }
- BUG_ON(!set1->u64s);
- BUG_ON(!set2->u64s);
+ u64s = 0;
+ for_each_btree_node_key(b, k, &iter) {
+ if (bkey_deleted(k))
+ continue;
- memcpy_u64s(set2->start,
- vstruct_end(set1),
- le16_to_cpu(set2->u64s));
+ i = u64s >= n1_u64s;
+ u64s += k->u64s;
- btree_node_reset_sib_u64s(n1);
- btree_node_reset_sib_u64s(n2);
+ if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k)
+ ? &b->format: &bch2_bkey_format_current, k))
+ out[i]->format = KEY_FORMAT_LOCAL_BTREE;
+ else
+ bch2_bkey_unpack(b, (void *) out[i], k);
- bch2_verify_btree_nr_keys(n1);
- bch2_verify_btree_nr_keys(n2);
+ out[i]->needs_whiteout = false;
- if (n1->level) {
- btree_node_interior_verify(n1);
- btree_node_interior_verify(n2);
+ btree_keys_account_key_add(&n[i]->nr, 0, out[i]);
+ out[i] = bkey_p_next(out[i]);
}
- return n2;
+ for (i = 0; i < 2; i++) {
+ bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data);
+
+ BUG_ON(!bsets[i]->u64s);
+
+ set_btree_bset_end(n[i], n[i]->set);
+
+ btree_node_reset_sib_u64s(n[i]);
+
+ bch2_verify_btree_nr_keys(n[i]);
+
+ BUG_ON(bch2_btree_node_check_topology(trans, n[i]));
+ }
}
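/*
 * Worked example (illustrative only) of the split point chosen above: with
 * b->nr.live_u64s == 100, n1_u64s == (100 * 3) / 5 == 60, so keys are copied
 * into n[0] until the running u64s total reaches 60 and the remainder go to
 * n[1].  n1_pos ends up as the position of the last key placed in n[0], and
 *
 *	btree_set_max(n[0], n1_pos);
 *	btree_set_min(n[1], bpos_successor(n1_pos));
 *
 * leave the two new nodes covering [b->min_key, n1_pos] and
 * [successor(n1_pos), b->max_key] with no gap or overlap.
 */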
/*
@@ -1306,84 +1570,78 @@ static struct btree *__btree_split_node(struct btree_update *as,
* nodes that were coalesced, and thus in the middle of a child node post
* coalescing:
*/
-static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
- struct btree_iter *iter,
+static void btree_split_insert_keys(struct btree_update *as,
+ struct btree_trans *trans,
+ btree_path_idx_t path_idx,
+ struct btree *b,
struct keylist *keys)
{
- struct btree_node_iter node_iter;
- struct bkey_i *k = bch2_keylist_front(keys);
- struct bkey_packed *p;
- struct bset *i;
-
- BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
+ struct btree_path *path = trans->paths + path_idx;
- bch2_btree_node_iter_init(&node_iter, b, k->k.p, false, false);
+ if (!bch2_keylist_empty(keys) &&
+ bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) {
+ struct btree_node_iter node_iter;
- while (!bch2_keylist_empty(keys)) {
- k = bch2_keylist_front(keys);
+ bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
- BUG_ON(bch_keylist_u64s(keys) >
- bch_btree_keys_u64s_remaining(as->c, b));
- BUG_ON(bkey_cmp(k->k.p, b->data->min_key) < 0);
- BUG_ON(bkey_cmp(k->k.p, b->data->max_key) > 0);
-
- bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter);
- bch2_keylist_pop_front(keys);
+ bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
}
-
- /*
- * We can't tolerate whiteouts here - with whiteouts there can be
- * duplicate keys, and it would be rather bad if we picked a duplicate
- * for the pivot:
- */
- i = btree_bset_first(b);
- p = i->start;
- while (p != vstruct_last(i))
- if (bkey_deleted(p)) {
- le16_add_cpu(&i->u64s, -p->u64s);
- set_btree_bset_end(b, b->set);
- memmove_u64s_down(p, bkey_next(p),
- (u64 *) vstruct_last(i) -
- (u64 *) p);
- } else
- p = bkey_next(p);
-
- BUG_ON(b->nsets != 1 ||
- b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));
-
- btree_node_interior_verify(b);
}
-static void btree_split(struct btree_update *as, struct btree *b,
- struct btree_iter *iter, struct keylist *keys,
- unsigned flags)
+static int btree_split(struct btree_update *as, struct btree_trans *trans,
+ btree_path_idx_t path, struct btree *b,
+ struct keylist *keys)
{
struct bch_fs *c = as->c;
- struct btree *parent = btree_node_parent(iter, b);
+ struct btree *parent = btree_node_parent(trans->paths + path, b);
struct btree *n1, *n2 = NULL, *n3 = NULL;
+ btree_path_idx_t path1 = 0, path2 = 0;
u64 start_time = local_clock();
+ int ret = 0;
+ bch2_verify_btree_nr_keys(b);
BUG_ON(!parent && (b != btree_node_root(c, b)));
- BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
+ BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1));
+
+ ret = bch2_btree_node_check_topology(trans, b);
+ if (ret)
+ return ret;
bch2_btree_interior_update_will_free_node(as, b);
- n1 = bch2_btree_node_alloc_replacement(as, b);
+ if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
+ struct btree *n[2];
+
+ trace_and_count(c, btree_node_split, trans, b);
- if (keys)
- btree_split_insert_keys(as, n1, iter, keys);
+ n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
+ n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);
- if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) {
- trace_btree_split(c, b);
+ __btree_split_node(as, trans, b, n, keys);
- n2 = __btree_split_node(as, n1, iter);
+ if (keys) {
+ btree_split_insert_keys(as, trans, path, n1, keys);
+ btree_split_insert_keys(as, trans, path, n2, keys);
+ BUG_ON(!bch2_keylist_empty(keys));
+ }
bch2_btree_build_aux_trees(n2);
bch2_btree_build_aux_trees(n1);
- six_unlock_write(&n2->lock);
- six_unlock_write(&n1->lock);
- bch2_btree_node_write(c, n2, SIX_LOCK_intent);
+ bch2_btree_update_add_new_node(as, n1);
+ bch2_btree_update_add_new_node(as, n2);
+ six_unlock_write(&n2->c.lock);
+ six_unlock_write(&n1->c.lock);
+
+ path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
+ six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + path1, n1);
+
+ path2 = bch2_path_get_unlocked_mut(trans, as->btree_id, n2->c.level, n2->key.k.p);
+ six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + path2, n2);
/*
* Note that on recursive parent_keys == keys, so we
@@ -1395,139 +1653,173 @@ static void btree_split(struct btree_update *as, struct btree *b,
if (!parent) {
/* Depth increases, make a new root */
- n3 = __btree_root_alloc(as, b->level + 1);
+ n3 = __btree_root_alloc(as, trans, b->c.level + 1);
+
+ bch2_btree_update_add_new_node(as, n3);
+ six_unlock_write(&n3->c.lock);
+
+ trans->paths[path2].locks_want++;
+ BUG_ON(btree_node_locked(trans->paths + path2, n3->c.level));
+ six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, trans->paths + path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + path2, n3);
n3->sib_u64s[0] = U16_MAX;
n3->sib_u64s[1] = U16_MAX;
- btree_split_insert_keys(as, n3, iter, &as->parent_keys);
-
- bch2_btree_node_write(c, n3, SIX_LOCK_intent);
+ btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
}
} else {
- trace_btree_compact(c, b);
+ trace_and_count(c, btree_node_compact, trans, b);
+
+ n1 = bch2_btree_node_alloc_replacement(as, trans, b);
+
+ if (keys) {
+ btree_split_insert_keys(as, trans, path, n1, keys);
+ BUG_ON(!bch2_keylist_empty(keys));
+ }
bch2_btree_build_aux_trees(n1);
- six_unlock_write(&n1->lock);
+ bch2_btree_update_add_new_node(as, n1);
+ six_unlock_write(&n1->c.lock);
- bch2_keylist_add(&as->parent_keys, &n1->key);
- }
+ path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
+ six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + path1, n1);
- bch2_btree_node_write(c, n1, SIX_LOCK_intent);
+ if (parent)
+ bch2_keylist_add(&as->parent_keys, &n1->key);
+ }
/* New nodes all written, now make them visible: */
if (parent) {
/* Split a non root node */
- bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
+ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
} else if (n3) {
- bch2_btree_set_root(as, n3, iter);
+ ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false);
} else {
/* Root filled up but didn't need to be split */
- bch2_btree_set_root(as, n1, iter);
+ ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false);
}
- bch2_btree_open_bucket_put(c, n1);
- if (n2)
- bch2_btree_open_bucket_put(c, n2);
- if (n3)
- bch2_btree_open_bucket_put(c, n3);
+ if (ret)
+ goto err;
+
+ if (n3) {
+ bch2_btree_update_get_open_buckets(as, n3);
+ bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
+ }
+ if (n2) {
+ bch2_btree_update_get_open_buckets(as, n2);
+ bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);
+ }
+ bch2_btree_update_get_open_buckets(as, n1);
+ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
/*
- * Note - at this point other linked iterators could still have @b read
- * locked; we're depending on the bch2_btree_iter_node_replace() calls
- * below removing all references to @b so we don't return with other
- * iterators pointing to a node they have locked that's been freed.
- *
- * We have to free the node first because the bch2_iter_node_replace()
- * calls will drop _our_ iterator's reference - and intent lock - to @b.
+ * The old node must be freed (in memory) _before_ unlocking the new
+ * nodes - else another thread could re-acquire a read lock on the old
+ * node after another thread has locked and updated the new node, thus
+ * seeing stale data:
*/
- bch2_btree_node_free_inmem(c, b, iter);
-
- /* Successful split, update the iterator to point to the new nodes: */
+ bch2_btree_node_free_inmem(trans, trans->paths + path, b);
if (n3)
- bch2_btree_iter_node_replace(iter, n3);
+ bch2_trans_node_add(trans, trans->paths + path, n3);
if (n2)
- bch2_btree_iter_node_replace(iter, n2);
- bch2_btree_iter_node_replace(iter, n1);
-
- bch2_time_stats_update(&c->times[BCH_TIME_btree_split], start_time);
-}
-
-static void
-bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
- struct btree_iter *iter, struct keylist *keys)
-{
- struct btree_iter *linked;
- struct btree_node_iter node_iter;
- struct bkey_i *insert = bch2_keylist_front(keys);
- struct bkey_packed *k;
-
- /* Don't screw up @iter's position: */
- node_iter = iter->l[b->level].iter;
-
- /*
- * btree_split(), btree_gc_coalesce() will insert keys before
- * the iterator's current position - they know the keys go in
- * the node the iterator points to:
- */
- while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
- (bkey_cmp_packed(b, k, &insert->k) >= 0))
- ;
-
- while (!bch2_keylist_empty(keys)) {
- insert = bch2_keylist_front(keys);
+ bch2_trans_node_add(trans, trans->paths + path2, n2);
+ bch2_trans_node_add(trans, trans->paths + path1, n1);
- bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter);
- bch2_keylist_pop_front(keys);
+ if (n3)
+ six_unlock_intent(&n3->c.lock);
+ if (n2)
+ six_unlock_intent(&n2->c.lock);
+ six_unlock_intent(&n1->c.lock);
+out:
+ if (path2) {
+ __bch2_btree_path_unlock(trans, trans->paths + path2);
+ bch2_path_put(trans, path2, true);
+ }
+ if (path1) {
+ __bch2_btree_path_unlock(trans, trans->paths + path1);
+ bch2_path_put(trans, path1, true);
}
- btree_update_updated_node(as, b);
-
- for_each_btree_iter_with_node(iter, b, linked)
- bch2_btree_node_iter_peek(&linked->l[b->level].iter, b);
+ bch2_trans_verify_locks(trans);
- bch2_btree_iter_verify(iter, b);
+ bch2_time_stats_update(&c->times[n2
+ ? BCH_TIME_btree_node_split
+ : BCH_TIME_btree_node_compact],
+ start_time);
+ return ret;
+err:
+ if (n3)
+ bch2_btree_node_free_never_used(as, trans, n3);
+ if (n2)
+ bch2_btree_node_free_never_used(as, trans, n2);
+ bch2_btree_node_free_never_used(as, trans, n1);
+ goto out;
}
/**
- * bch_btree_insert_node - insert bkeys into a given btree node
+ * bch2_btree_insert_node - insert bkeys into a given btree node
*
- * @iter: btree iterator
+ * @as: btree_update object
+ * @trans: btree_trans object
+ * @path_idx: path that points to current node
+ * @b: node to insert keys into
* @keys: list of keys to insert
- * @hook: insert callback
- * @persistent: if not null, @persistent will wait on journal write
+ *
+ * Returns: 0 on success, typically transaction restart error on failure
*
* Inserts as many keys as it can into a given btree node, splitting it if full.
* If a split occurred, this function will return early. This can only happen
* for leaf nodes -- inserts into interior nodes have to be atomic.
*/
-void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
- struct btree_iter *iter, struct keylist *keys,
- unsigned flags)
+static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
+ btree_path_idx_t path_idx, struct btree *b,
+ struct keylist *keys)
{
struct bch_fs *c = as->c;
+ struct btree_path *path = trans->paths + path_idx, *linked;
+ unsigned i;
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
+ int ret;
- BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
- BUG_ON(!b->level);
+ lockdep_assert_held(&c->gc_lock);
+ BUG_ON(!btree_node_intent_locked(path, b->c.level));
+ BUG_ON(!b->c.level);
BUG_ON(!as || as->b);
bch2_verify_keylist_sorted(keys);
- if (as->must_rewrite)
- goto split;
+ ret = bch2_btree_node_lock_write(trans, path, &b->c);
+ if (ret)
+ return ret;
- bch2_btree_node_lock_for_insert(c, b, iter);
+ bch2_btree_node_prep_for_write(trans, path, b);
- if (!bch2_btree_node_insert_fits(c, b, bch_keylist_u64s(keys))) {
- bch2_btree_node_unlock_write(b, iter);
+ if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) {
+ bch2_btree_node_unlock_write(trans, path, b);
goto split;
}
- bch2_btree_insert_keys_interior(as, b, iter, keys);
+ ret = bch2_btree_node_check_topology(trans, b);
+ if (ret) {
+ bch2_btree_node_unlock_write(trans, path, b);
+ return ret;
+ }
+
+ bch2_btree_insert_keys_interior(as, trans, path, b,
+ path->l[b->c.level].iter, keys);
+
+ trans_for_each_path_with_node(trans, b, linked, i)
+ bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
+
+ bch2_trans_verify_paths(trans);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
@@ -1539,124 +1831,178 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
if (u64s_added > live_u64s_added &&
bch2_maybe_compact_whiteouts(c, b))
- bch2_btree_iter_reinit_node(iter, b);
-
- bch2_btree_node_unlock_write(b, iter);
+ bch2_trans_node_reinit_iter(trans, b);
- btree_node_interior_verify(b);
-
- bch2_foreground_maybe_merge(c, iter, b->level, flags);
- return;
+ btree_update_updated_node(as, b);
+ bch2_btree_node_unlock_write(trans, path, b);
+ return 0;
split:
- btree_split(as, b, iter, keys, flags);
+ /*
+ * We could attempt to avoid the transaction restart, by calling
+ * bch2_btree_path_upgrade() and allocating more nodes:
+ */
+ if (b->c.level >= as->update_level_end) {
+ trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
+ }
+
+ return btree_split(as, trans, path_idx, b, keys);
}
-int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
+int bch2_btree_split_leaf(struct btree_trans *trans,
+ btree_path_idx_t path,
unsigned flags)
{
- struct btree *b = iter->l[0].b;
+ /* btree_split & merge may both cause paths array to be reallocated */
+ struct btree *b = path_l(trans->paths + path)->b;
struct btree_update *as;
- struct closure cl;
+ unsigned l;
int ret = 0;
- struct btree_iter *linked;
- /*
- * We already have a disk reservation and open buckets pinned; this
- * allocation must not block:
- */
- for_each_btree_iter(iter, linked)
- if (linked->btree_id == BTREE_ID_EXTENTS)
- flags |= BTREE_INSERT_USE_RESERVE;
+ as = bch2_btree_update_start(trans, trans->paths + path,
+ trans->paths[path].level,
+ true, flags);
+ if (IS_ERR(as))
+ return PTR_ERR(as);
- closure_init_stack(&cl);
+ ret = btree_split(as, trans, path, b, NULL);
+ if (ret) {
+ bch2_btree_update_free(as, trans);
+ return ret;
+ }
- /* Hack, because gc and splitting nodes doesn't mix yet: */
- if (!down_read_trylock(&c->gc_lock)) {
- if (flags & BTREE_INSERT_NOUNLOCK)
- return -EINTR;
+ bch2_btree_update_done(as, trans);
- bch2_btree_iter_unlock(iter);
- down_read(&c->gc_lock);
+ for (l = trans->paths[path].level + 1;
+ btree_node_intent_locked(&trans->paths[path], l) && !ret;
+ l++)
+ ret = bch2_foreground_maybe_merge(trans, path, l, flags);
- if (btree_iter_linked(iter))
- ret = -EINTR;
- }
+ return ret;
+}
- /*
- * XXX: figure out how far we might need to split,
- * instead of locking/reserving all the way to the root:
- */
- if (!bch2_btree_iter_upgrade(iter, U8_MAX,
- !(flags & BTREE_INSERT_NOUNLOCK))) {
- ret = -EINTR;
- goto out;
- }
+static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans,
+ btree_path_idx_t path_idx)
+{
+ struct bch_fs *c = as->c;
+ struct btree_path *path = trans->paths + path_idx;
+ struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b;
- as = bch2_btree_update_start(c, iter->btree_id,
- btree_update_reserve_required(c, b), flags,
- !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
- if (IS_ERR(as)) {
- ret = PTR_ERR(as);
- if (ret == -EAGAIN) {
- BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
- bch2_btree_iter_unlock(iter);
- ret = -EINTR;
- }
- goto out;
- }
+ BUG_ON(!btree_node_locked(path, b->c.level));
- btree_split(as, b, iter, NULL, flags);
- bch2_btree_update_done(as);
+ n = __btree_root_alloc(as, trans, b->c.level + 1);
- /*
- * We haven't successfully inserted yet, so don't downgrade all the way
- * back to read locks;
- */
- __bch2_btree_iter_downgrade(iter, 1);
-out:
- up_read(&c->gc_lock);
- closure_sync(&cl);
- return ret;
+ bch2_btree_update_add_new_node(as, n);
+ six_unlock_write(&n->c.lock);
+
+ path->locks_want++;
+ BUG_ON(btree_node_locked(path, n->c.level));
+ six_lock_increment(&n->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, path, n);
+
+ n->sib_u64s[0] = U16_MAX;
+ n->sib_u64s[1] = U16_MAX;
+
+ bch2_keylist_add(&as->parent_keys, &b->key);
+ btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);
+
+ int ret = bch2_btree_set_root(as, trans, path, n, true);
+ BUG_ON(ret);
+
+ bch2_btree_update_get_open_buckets(as, n);
+ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+ bch2_trans_node_add(trans, path, n);
+ six_unlock_intent(&n->c.lock);
+
+ mutex_lock(&c->btree_cache.lock);
+ list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list);
+ mutex_unlock(&c->btree_cache.lock);
+
+ bch2_trans_verify_locks(trans);
+}
+
+int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;
+
+ if (btree_node_fake(b))
+ return bch2_btree_split_leaf(trans, path, flags);
+
+ struct btree_update *as =
+ bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags);
+ if (IS_ERR(as))
+ return PTR_ERR(as);
+
+ __btree_increase_depth(as, trans, path);
+ bch2_btree_update_done(as, trans);
+ return 0;
}
-void __bch2_foreground_maybe_merge(struct bch_fs *c,
- struct btree_iter *iter,
- unsigned level,
- unsigned flags,
- enum btree_node_sibling sib)
+int __bch2_foreground_maybe_merge(struct btree_trans *trans,
+ btree_path_idx_t path,
+ unsigned level,
+ unsigned flags,
+ enum btree_node_sibling sib)
{
+ struct bch_fs *c = trans->c;
struct btree_update *as;
struct bkey_format_state new_s;
struct bkey_format new_f;
struct bkey_i delete;
struct btree *b, *m, *n, *prev, *next, *parent;
- struct closure cl;
+ struct bpos sib_pos;
size_t sib_u64s;
+ enum btree_id btree = trans->paths[path].btree_id;
+ btree_path_idx_t sib_path = 0, new_path = 0;
+ u64 start_time = local_clock();
int ret = 0;
- closure_init_stack(&cl);
-retry:
- BUG_ON(!btree_node_locked(iter, level));
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+ BUG_ON(!trans->paths[path].should_be_locked);
+ BUG_ON(!btree_node_locked(&trans->paths[path], level));
- b = iter->l[level].b;
+ /*
+ * Work around a deadlock caused by the btree write buffer not doing
+ * merges and leaving tons of merges for us to do - we really don't need
+ * to be doing merges at all from the interior update path, and if the
+ * interior update path is generating too many new interior updates we
+ * deadlock:
+ */
+ if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates)
+ return 0;
- parent = btree_node_parent(iter, b);
- if (!parent)
- goto out;
+ if ((flags & BCH_WATERMARK_MASK) <= BCH_WATERMARK_reclaim) {
+ flags &= ~BCH_WATERMARK_MASK;
+ flags |= BCH_WATERMARK_btree;
+ flags |= BCH_TRANS_COMMIT_journal_reclaim;
+ }
- if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
- goto out;
+ b = trans->paths[path].l[level].b;
- /* XXX: can't be holding read locks */
- m = bch2_btree_node_get_sibling(c, iter, b,
- !(flags & BTREE_INSERT_NOUNLOCK), sib);
- if (IS_ERR(m)) {
- ret = PTR_ERR(m);
- goto err;
+ if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
+ (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) {
+ b->sib_u64s[sib] = U16_MAX;
+ return 0;
}
- /* NULL means no sibling: */
- if (!m) {
+ sib_pos = sib == btree_prev_sib
+ ? bpos_predecessor(b->data->min_key)
+ : bpos_successor(b->data->max_key);
+
+ sib_path = bch2_path_get(trans, btree, sib_pos,
+ U8_MAX, level, BTREE_ITER_intent, _THIS_IP_);
+ ret = bch2_btree_path_traverse(trans, sib_path, false);
+ if (ret)
+ goto err;
+
+ btree_path_set_should_be_locked(trans, trans->paths + sib_path);
+
+ m = trans->paths[sib_path].l[level].b;
+
+ if (btree_node_parent(trans->paths + path, b) !=
+ btree_node_parent(trans->paths + sib_path, m)) {
b->sib_u64s[sib] = U16_MAX;
goto out;
}
@@ -1669,13 +2015,31 @@ retry:
next = m;
}
+ if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) {
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+
+ bch2_bpos_to_text(&buf1, prev->data->max_key);
+ bch2_bpos_to_text(&buf2, next->data->min_key);
+ bch_err(c,
+ "%s(): btree topology error:\n"
+ " prev ends at %s\n"
+ " next starts at %s",
+ __func__, buf1.buf, buf2.buf);
+ printbuf_exit(&buf1);
+ printbuf_exit(&buf2);
+ ret = bch2_topology_error(c);
+ goto err;
+ }
+
bch2_bkey_format_init(&new_s);
- __bch2_btree_calc_format(&new_s, b);
- __bch2_btree_calc_format(&new_s, m);
+ bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
+ __bch2_btree_calc_format(&new_s, prev);
+ __bch2_btree_calc_format(&new_s, next);
+ bch2_bkey_format_add_pos(&new_s, next->data->max_key);
new_f = bch2_bkey_format_done(&new_s);
- sib_u64s = btree_node_u64s_with_format(b, &new_f) +
- btree_node_u64s_with_format(m, &new_f);
+ sib_u64s = btree_node_u64s_with_format(b->nr, &b->format, &new_f) +
+ btree_node_u64s_with_format(m->nr, &m->format, &new_f);
if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
@@ -1684,403 +2048,486 @@ retry:
}
sib_u64s = min(sib_u64s, btree_max_u64s(c));
+ sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
b->sib_u64s[sib] = sib_u64s;
- if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
- six_unlock_intent(&m->lock);
+ if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
goto out;
- }
-
- /* We're changing btree topology, doesn't mix with gc: */
- if (!down_read_trylock(&c->gc_lock))
- goto err_cycle_gc_lock;
- if (!bch2_btree_iter_upgrade(iter, U8_MAX,
- !(flags & BTREE_INSERT_NOUNLOCK))) {
- ret = -EINTR;
- goto err_unlock;
- }
-
- as = bch2_btree_update_start(c, iter->btree_id,
- btree_update_reserve_required(c, parent) + 1,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE,
- !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
- if (IS_ERR(as)) {
- ret = PTR_ERR(as);
- goto err_unlock;
- }
+ parent = btree_node_parent(trans->paths + path, b);
+ as = bch2_btree_update_start(trans, trans->paths + path, level, false,
+ BCH_TRANS_COMMIT_no_enospc|flags);
+ ret = PTR_ERR_OR_ZERO(as);
+ if (ret)
+ goto err;
- trace_btree_merge(c, b);
+ trace_and_count(c, btree_node_merge, trans, b);
bch2_btree_interior_update_will_free_node(as, b);
bch2_btree_interior_update_will_free_node(as, m);
- n = bch2_btree_node_alloc(as, b->level);
+ n = bch2_btree_node_alloc(as, trans, b->c.level);
- n->data->min_key = prev->data->min_key;
- n->data->max_key = next->data->max_key;
- n->data->format = new_f;
- n->key.k.p = next->key.k.p;
+ SET_BTREE_NODE_SEQ(n->data,
+ max(BTREE_NODE_SEQ(b->data),
+ BTREE_NODE_SEQ(m->data)) + 1);
+ btree_set_min(n, prev->data->min_key);
+ btree_set_max(n, next->data->max_key);
+
+ n->data->format = new_f;
btree_node_set_format(n, new_f);
bch2_btree_sort_into(c, n, prev);
bch2_btree_sort_into(c, n, next);
bch2_btree_build_aux_trees(n);
- six_unlock_write(&n->lock);
+ bch2_btree_update_add_new_node(as, n);
+ six_unlock_write(&n->c.lock);
+
+ new_path = bch2_path_get_unlocked_mut(trans, btree, n->c.level, n->key.k.p);
+ six_lock_increment(&n->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + new_path, n);
bkey_init(&delete.k);
delete.k.p = prev->key.k.p;
bch2_keylist_add(&as->parent_keys, &delete);
bch2_keylist_add(&as->parent_keys, &n->key);
- bch2_btree_node_write(c, n, SIX_LOCK_intent);
-
- bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
+ bch2_trans_verify_paths(trans);
- bch2_btree_open_bucket_put(c, n);
- bch2_btree_node_free_inmem(c, b, iter);
- bch2_btree_node_free_inmem(c, m, iter);
- bch2_btree_iter_node_replace(iter, n);
+ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
+ if (ret)
+ goto err_free_update;
- bch2_btree_iter_verify(iter, n);
+ bch2_trans_verify_paths(trans);
- bch2_btree_update_done(as);
+ bch2_btree_update_get_open_buckets(as, n);
+ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
- six_unlock_intent(&m->lock);
- up_read(&c->gc_lock);
-out:
- /*
- * Don't downgrade locks here: we're called after successful insert,
- * and the caller will downgrade locks after a successful insert
- * anyways (in case e.g. a split was required first)
- *
- * And we're also called when inserting into interior nodes in the
- * split path, and downgrading to read locks in there is potentially
- * confusing:
- */
- closure_sync(&cl);
- return;
+ bch2_btree_node_free_inmem(trans, trans->paths + path, b);
+ bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m);
-err_cycle_gc_lock:
- six_unlock_intent(&m->lock);
+ bch2_trans_node_add(trans, trans->paths + path, n);
- if (flags & BTREE_INSERT_NOUNLOCK)
- goto out;
+ bch2_trans_verify_paths(trans);
- bch2_btree_iter_unlock(iter);
+ six_unlock_intent(&n->c.lock);
- down_read(&c->gc_lock);
- up_read(&c->gc_lock);
- ret = -EINTR;
- goto err;
+ bch2_btree_update_done(as, trans);
-err_unlock:
- six_unlock_intent(&m->lock);
- up_read(&c->gc_lock);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
+out:
err:
- BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
-
- if ((ret == -EAGAIN || ret == -EINTR) &&
- !(flags & BTREE_INSERT_NOUNLOCK)) {
- bch2_btree_iter_unlock(iter);
- closure_sync(&cl);
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- goto out;
-
- goto retry;
- }
-
+ if (new_path)
+ bch2_path_put(trans, new_path, true);
+ bch2_path_put(trans, sib_path, true);
+ bch2_trans_verify_locks(trans);
+ if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
+ ret = 0;
+ if (!ret)
+ ret = bch2_trans_relock(trans);
+ return ret;
+err_free_update:
+ bch2_btree_node_free_never_used(as, trans, n);
+ bch2_btree_update_free(as, trans);
goto out;
}
-static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
- struct btree *b, unsigned flags,
- struct closure *cl)
+int bch2_btree_node_rewrite(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct btree *b,
+ unsigned flags)
{
- struct btree *n, *parent = btree_node_parent(iter, b);
+ struct bch_fs *c = trans->c;
+ struct btree *n, *parent;
struct btree_update *as;
+ btree_path_idx_t new_path = 0;
+ int ret;
- as = bch2_btree_update_start(c, iter->btree_id,
- (parent
- ? btree_update_reserve_required(c, parent)
- : 0) + 1,
- flags, cl);
- if (IS_ERR(as)) {
- trace_btree_gc_rewrite_node_fail(c, b);
- return PTR_ERR(as);
- }
+ flags |= BCH_TRANS_COMMIT_no_enospc;
+
+ struct btree_path *path = btree_iter_path(trans, iter);
+ parent = btree_node_parent(path, b);
+ as = bch2_btree_update_start(trans, path, b->c.level, false, flags);
+ ret = PTR_ERR_OR_ZERO(as);
+ if (ret)
+ goto out;
bch2_btree_interior_update_will_free_node(as, b);
- n = bch2_btree_node_alloc_replacement(as, b);
+ n = bch2_btree_node_alloc_replacement(as, trans, b);
bch2_btree_build_aux_trees(n);
- six_unlock_write(&n->lock);
+ bch2_btree_update_add_new_node(as, n);
+ six_unlock_write(&n->c.lock);
- trace_btree_gc_rewrite_node(c, b);
+ new_path = bch2_path_get_unlocked_mut(trans, iter->btree_id, n->c.level, n->key.k.p);
+ six_lock_increment(&n->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + new_path, n);
- bch2_btree_node_write(c, n, SIX_LOCK_intent);
+ trace_and_count(c, btree_node_rewrite, trans, b);
if (parent) {
bch2_keylist_add(&as->parent_keys, &n->key);
- bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
+ ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
} else {
- bch2_btree_set_root(as, n, iter);
+ ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false);
}
- bch2_btree_open_bucket_put(c, n);
+ if (ret)
+ goto err;
+
+ bch2_btree_update_get_open_buckets(as, n);
+ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
- bch2_btree_node_free_inmem(c, b, iter);
+ bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b);
- bch2_btree_iter_node_replace(iter, n);
+ bch2_trans_node_add(trans, trans->paths + iter->path, n);
+ six_unlock_intent(&n->c.lock);
- bch2_btree_update_done(as);
- return 0;
+ bch2_btree_update_done(as, trans);
+out:
+ if (new_path)
+ bch2_path_put(trans, new_path, true);
+ bch2_trans_downgrade(trans);
+ return ret;
+err:
+ bch2_btree_node_free_never_used(as, trans, n);
+ bch2_btree_update_free(as, trans);
+ goto out;
}
-/**
- * bch_btree_node_rewrite - Rewrite/move a btree node
- *
- * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e.
- * btree_check_reserve() has to wait)
- */
-int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
- __le64 seq, unsigned flags)
+struct async_btree_rewrite {
+ struct bch_fs *c;
+ struct work_struct work;
+ struct list_head list;
+ enum btree_id btree_id;
+ unsigned level;
+ struct bkey_buf key;
+};
+
+static int async_btree_node_rewrite_trans(struct btree_trans *trans,
+ struct async_btree_rewrite *a)
{
- struct closure cl;
- struct btree *b;
- int ret;
-
- flags |= BTREE_INSERT_NOFAIL;
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter,
+ a->btree_id, a->key.k->k.p,
+ BTREE_MAX_DEPTH, a->level, 0);
+ struct btree *b = bch2_btree_iter_peek_node(&iter);
+ int ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto out;
- closure_init_stack(&cl);
+ bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k);
+ ret = found
+ ? bch2_btree_node_rewrite(trans, &iter, b, 0)
+ : -ENOENT;
- bch2_btree_iter_upgrade(iter, U8_MAX, true);
+#if 0
+ /* Tracepoint... */
+ if (!ret || ret == -ENOENT) {
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
- if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) {
- if (!down_read_trylock(&c->gc_lock)) {
- bch2_btree_iter_unlock(iter);
- down_read(&c->gc_lock);
+ if (!ret) {
+ prt_printf(&buf, "rewrite node:\n ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
+ } else {
+ prt_printf(&buf, "node to rewrite not found:\n want: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
+ prt_printf(&buf, "\n got: ");
+ if (b)
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ else
+ prt_str(&buf, "(null)");
}
+ bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
}
+#endif
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
- while (1) {
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- break;
+static void async_btree_node_rewrite_work(struct work_struct *work)
+{
+ struct async_btree_rewrite *a =
+ container_of(work, struct async_btree_rewrite, work);
+ struct bch_fs *c = a->c;
- b = bch2_btree_iter_peek_node(iter);
- if (!b || b->data->keys.seq != seq)
- break;
+ int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a));
+ if (ret != -ENOENT)
+ bch_err_fn_ratelimited(c, ret);
+
+ spin_lock(&c->btree_node_rewrites_lock);
+ list_del(&a->list);
+ spin_unlock(&c->btree_node_rewrites_lock);
+
+ closure_wake_up(&c->btree_node_rewrites_wait);
+
+ bch2_bkey_buf_exit(&a->key, c);
+ bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
+ kfree(a);
+}
+
+void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
+{
+ struct async_btree_rewrite *a = kmalloc(sizeof(*a), GFP_NOFS);
+ if (!a)
+ return;
+
+ a->c = c;
+ a->btree_id = b->c.btree_id;
+ a->level = b->c.level;
+ INIT_WORK(&a->work, async_btree_node_rewrite_work);
- ret = __btree_node_rewrite(c, iter, b, flags, &cl);
- if (ret != -EAGAIN &&
- ret != -EINTR)
+ bch2_bkey_buf_init(&a->key);
+ bch2_bkey_buf_copy(&a->key, c, &b->key);
+
+ bool now = false, pending = false;
+
+ spin_lock(&c->btree_node_rewrites_lock);
+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
+ list_add(&a->list, &c->btree_node_rewrites);
+ now = true;
+ } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
+ list_add(&a->list, &c->btree_node_rewrites_pending);
+ pending = true;
+ }
+ spin_unlock(&c->btree_node_rewrites_lock);
+
+ if (now) {
+ queue_work(c->btree_node_rewrite_worker, &a->work);
+ } else if (pending) {
+ /* bch2_do_pending_node_rewrites will execute */
+ } else {
+ bch2_bkey_buf_exit(&a->key, c);
+ kfree(a);
+ }
+}
+
+void bch2_async_btree_node_rewrites_flush(struct bch_fs *c)
+{
+ closure_wait_event(&c->btree_node_rewrites_wait,
+ list_empty(&c->btree_node_rewrites));
+}
+
+void bch2_do_pending_node_rewrites(struct bch_fs *c)
+{
+ while (1) {
+ spin_lock(&c->btree_node_rewrites_lock);
+ struct async_btree_rewrite *a =
+ list_pop_entry(&c->btree_node_rewrites_pending,
+ struct async_btree_rewrite, list);
+ if (a)
+ list_add(&a->list, &c->btree_node_rewrites);
+ spin_unlock(&c->btree_node_rewrites_lock);
+
+ if (!a)
break;
- bch2_btree_iter_unlock(iter);
- closure_sync(&cl);
+ bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
+ queue_work(c->btree_node_rewrite_worker, &a->work);
}
+}
- bch2_btree_iter_downgrade(iter);
+void bch2_free_pending_node_rewrites(struct bch_fs *c)
+{
+ while (1) {
+ spin_lock(&c->btree_node_rewrites_lock);
+ struct async_btree_rewrite *a =
+ list_pop_entry(&c->btree_node_rewrites_pending,
+ struct async_btree_rewrite, list);
+ spin_unlock(&c->btree_node_rewrites_lock);
- if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
- up_read(&c->gc_lock);
+ if (!a)
+ break;
- closure_sync(&cl);
- return ret;
+ bch2_bkey_buf_exit(&a->key, c);
+ kfree(a);
+ }
}
-static void __bch2_btree_node_update_key(struct bch_fs *c,
- struct btree_update *as,
- struct btree_iter *iter,
- struct btree *b, struct btree *new_hash,
- struct bkey_i_extent *new_key)
+static int __bch2_btree_node_update_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct btree *b, struct btree *new_hash,
+ struct bkey_i *new_key,
+ unsigned commit_flags,
+ bool skip_triggers)
{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter2 = { NULL };
struct btree *parent;
int ret;
- /*
- * Two corner cases that need to be thought about here:
- *
- * @b may not be reachable yet - there might be another interior update
- * operation waiting on @b to be written, and we're gonna deliver the
- * write completion to that interior update operation _before_
- * persisting the new_key update
- *
- * That ends up working without us having to do anything special here:
- * the reason is, we do kick off (and do the in memory updates) for the
- * update for @new_key before we return, creating a new interior_update
- * operation here.
- *
- * The new interior update operation here will in effect override the
- * previous one. The previous one was going to terminate - make @b
- * reachable - in one of two ways:
- * - updating the btree root pointer
- * In that case,
- * no, this doesn't work. argh.
- */
-
- if (b->will_make_reachable)
- as->must_rewrite = true;
+ if (!skip_triggers) {
+ ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1,
+ bkey_i_to_s_c(&b->key),
+ BTREE_TRIGGER_transactional) ?:
+ bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1,
+ bkey_i_to_s(new_key),
+ BTREE_TRIGGER_transactional);
+ if (ret)
+ return ret;
+ }
- btree_interior_update_add_node_reference(as, b);
+ if (new_hash) {
+ bkey_copy(&new_hash->key, new_key);
+ ret = bch2_btree_node_hash_insert(&c->btree_cache,
+ new_hash, b->c.level, b->c.btree_id);
+ BUG_ON(ret);
+ }
- parent = btree_node_parent(iter, b);
+ parent = btree_node_parent(btree_iter_path(trans, iter), b);
if (parent) {
- if (new_hash) {
- bkey_copy(&new_hash->key, &new_key->k_i);
- ret = bch2_btree_node_hash_insert(&c->btree_cache,
- new_hash, b->level, b->btree_id);
- BUG_ON(ret);
- }
+ bch2_trans_copy_iter(&iter2, iter);
- bch2_keylist_add(&as->parent_keys, &new_key->k_i);
- bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0);
+ iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
+ iter2.flags & BTREE_ITER_intent,
+ _THIS_IP_);
- if (new_hash) {
- mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
+ struct btree_path *path2 = btree_iter_path(trans, &iter2);
+ BUG_ON(path2->level != b->c.level);
+ BUG_ON(!bpos_eq(path2->pos, new_key->k.p));
- bch2_btree_node_hash_remove(&c->btree_cache, b);
+ btree_path_set_level_up(trans, path2);
- bkey_copy(&b->key, &new_key->k_i);
- ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
- BUG_ON(ret);
- mutex_unlock(&c->btree_cache.lock);
- } else {
- bkey_copy(&b->key, &new_key->k_i);
- }
- } else {
- struct bch_fs_usage stats = { 0 };
+ trans->paths_sorted = false;
+ ret = bch2_btree_iter_traverse(&iter2) ?:
+ bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun);
+ if (ret)
+ goto err;
+ } else {
BUG_ON(btree_node_root(c, b) != b);
- bch2_btree_node_lock_write(b, iter);
-
- bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i),
- c->opts.btree_node_size, true,
- gc_pos_btree_root(b->btree_id),
- &stats, 0, 0);
- bch2_btree_node_free_index(as, NULL,
- bkey_i_to_s_c(&b->key),
- &stats);
- bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
- gc_pos_btree_root(b->btree_id));
-
- if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
- mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
-
- bkey_copy(&b->key, &new_key->k_i);
- ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
- BUG_ON(ret);
- mutex_unlock(&c->btree_cache.lock);
- } else {
- bkey_copy(&b->key, &new_key->k_i);
- }
+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
+ jset_u64s(new_key->k.u64s));
+ ret = PTR_ERR_OR_ZERO(e);
+ if (ret)
+ return ret;
- btree_update_updated_root(as);
- bch2_btree_node_unlock_write(b, iter);
+ journal_entry_set(e,
+ BCH_JSET_ENTRY_btree_root,
+ b->c.btree_id, b->c.level,
+ new_key, new_key->k.u64s);
}
- bch2_btree_update_done(as);
-}
+ ret = bch2_trans_commit(trans, NULL, NULL, commit_flags);
+ if (ret)
+ goto err;
-int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
- struct btree *b, struct bkey_i_extent *new_key)
-{
- struct btree *parent = btree_node_parent(iter, b);
- struct btree_update *as = NULL;
- struct btree *new_hash = NULL;
- struct closure cl;
- int ret;
+ bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c);
- closure_init_stack(&cl);
+ if (new_hash) {
+ mutex_lock(&c->btree_cache.lock);
+ bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
- if (!bch2_btree_iter_upgrade(iter, U8_MAX, true))
- return -EINTR;
+ __bch2_btree_node_hash_remove(&c->btree_cache, b);
- if (!down_read_trylock(&c->gc_lock)) {
- bch2_btree_iter_unlock(iter);
- down_read(&c->gc_lock);
+ bkey_copy(&b->key, new_key);
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+ BUG_ON(ret);
+ mutex_unlock(&c->btree_cache.lock);
+ } else {
+ bkey_copy(&b->key, new_key);
+ }
- if (!bch2_btree_iter_relock(iter)) {
- ret = -EINTR;
- goto err;
- }
+ bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b);
+out:
+ bch2_trans_iter_exit(trans, &iter2);
+ return ret;
+err:
+ if (new_hash) {
+ mutex_lock(&c->btree_cache.lock);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+ mutex_unlock(&c->btree_cache.lock);
}
+ goto out;
+}
- /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
- if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
- /* bch2_btree_reserve_get will unlock */
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
- if (ret) {
- ret = -EINTR;
+int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter,
+ struct btree *b, struct bkey_i *new_key,
+ unsigned commit_flags, bool skip_triggers)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *new_hash = NULL;
+ struct btree_path *path = btree_iter_path(trans, iter);
+ struct closure cl;
+ int ret = 0;
- bch2_btree_iter_unlock(iter);
- up_read(&c->gc_lock);
- closure_sync(&cl);
- down_read(&c->gc_lock);
+ ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1);
+ if (ret)
+ return ret;
- if (!bch2_btree_iter_relock(iter))
- goto err;
+ closure_init_stack(&cl);
+
+ /*
+ * check btree_ptr_hash_val() after @b is locked by
+ * btree_iter_traverse():
+ */
+ if (btree_ptr_hash_val(new_key) != b->hash_val) {
+ ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
+ if (ret) {
+ ret = drop_locks_do(trans, (closure_sync(&cl), 0));
+ if (ret)
+ return ret;
}
- new_hash = bch2_btree_node_mem_alloc(c);
+ new_hash = bch2_btree_node_mem_alloc(trans, false);
+ ret = PTR_ERR_OR_ZERO(new_hash);
+ if (ret)
+ goto err;
}
- as = bch2_btree_update_start(c, iter->btree_id,
- parent ? btree_update_reserve_required(c, parent) : 0,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_USE_ALLOC_RESERVE,
- &cl);
+ path->intent_ref++;
+ ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key,
+ commit_flags, skip_triggers);
+ --path->intent_ref;
- if (IS_ERR(as)) {
- ret = PTR_ERR(as);
- if (ret == -EAGAIN)
- ret = -EINTR;
+ if (new_hash)
+ bch2_btree_node_to_freelist(c, new_hash);
+err:
+ closure_sync(&cl);
+ bch2_btree_cache_cannibalize_unlock(trans);
+ return ret;
+}
- if (ret != -EINTR)
- goto err;
+int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
+ struct btree *b, struct bkey_i *new_key,
+ unsigned commit_flags, bool skip_triggers)
+{
+ struct btree_iter iter;
+ int ret;
- bch2_btree_iter_unlock(iter);
- up_read(&c->gc_lock);
- closure_sync(&cl);
- down_read(&c->gc_lock);
+ bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
+ BTREE_MAX_DEPTH, b->c.level,
+ BTREE_ITER_intent);
+ ret = bch2_btree_iter_traverse(&iter);
+ if (ret)
+ goto out;
- if (!bch2_btree_iter_relock(iter))
- goto err;
+ /* has node been freed? */
+ if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) {
+ /* node has been freed: */
+ BUG_ON(!btree_node_dying(b));
+ goto out;
}
- ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
- extent_i_to_s_c(new_key).s_c);
- if (ret)
- goto err_free_update;
-
- __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
+ BUG_ON(!btree_node_hashed(b));
- bch2_btree_iter_downgrade(iter);
-err:
- if (new_hash) {
- mutex_lock(&c->btree_cache.lock);
- list_move(&new_hash->list, &c->btree_cache.freeable);
- mutex_unlock(&c->btree_cache.lock);
+ bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
+ !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
- six_unlock_write(&new_hash->lock);
- six_unlock_intent(&new_hash->lock);
- }
- up_read(&c->gc_lock);
- closure_sync(&cl);
+ ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
+ commit_flags, skip_triggers);
+out:
+ bch2_trans_iter_exit(trans, &iter);
return ret;
-err_free_update:
- bch2_btree_update_free(as);
- goto err;
}
/* Init code: */
@@ -2093,12 +2540,12 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
{
BUG_ON(btree_node_root(c, b));
- __bch2_btree_set_root_inmem(c, b);
- bch2_btree_set_root_ondisk(c, b, READ);
+ bch2_btree_set_root_inmem(c, b);
}
-void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
+int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id, unsigned level)
{
+ struct bch_fs *c = trans->c;
struct closure cl;
struct btree *b;
int ret;
@@ -2106,65 +2553,197 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
closure_init_stack(&cl);
do {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
closure_sync(&cl);
} while (ret);
- b = bch2_btree_node_mem_alloc(c);
- bch2_btree_cache_cannibalize_unlock(c);
+ b = bch2_btree_node_mem_alloc(trans, false);
+ bch2_btree_cache_cannibalize_unlock(trans);
+
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ return ret;
set_btree_node_fake(b);
- b->level = 0;
- b->btree_id = id;
+ set_btree_node_need_rewrite(b);
+ b->c.level = level;
+ b->c.btree_id = id;
- bkey_extent_init(&b->key);
- b->key.k.p = POS_MAX;
- bkey_i_to_extent(&b->key)->v._data[0] = U64_MAX - id;
+ bkey_btree_ptr_init(&b->key);
+ b->key.k.p = SPOS_MAX;
+ *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;
bch2_bset_init_first(b, &b->data->keys);
bch2_btree_build_aux_trees(b);
- b->data->min_key = POS_MIN;
- b->data->max_key = POS_MAX;
+ b->data->flags = 0;
+ btree_set_min(b, POS_MIN);
+ btree_set_max(b, SPOS_MAX);
b->data->format = bch2_btree_calc_format(b);
btree_node_set_format(b, b->data->format);
- ret = bch2_btree_node_hash_insert(&c->btree_cache, b, b->level, b->btree_id);
+ ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
+ b->c.level, b->c.btree_id);
BUG_ON(ret);
- __bch2_btree_set_root_inmem(c, b);
+ bch2_btree_set_root_inmem(c, b);
- six_unlock_write(&b->lock);
- six_unlock_intent(&b->lock);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ return 0;
}
-ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf)
+void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
+{
+ bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level)));
+}
+
+static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
+{
+ prt_printf(out, "%ps: ", (void *) as->ip_started);
+ bch2_trans_commit_flags_to_text(out, as->flags);
+
+ prt_str(out, " ");
+ bch2_btree_id_to_text(out, as->btree_id);
+ prt_printf(out, " l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
+ as->update_level_start,
+ as->update_level_end,
+ bch2_btree_update_modes[as->mode],
+ as->nodes_written,
+ closure_nr_remaining(&as->cl),
+ as->journal.seq);
+}
+
+void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
{
- char *out = buf, *end = buf + PAGE_SIZE;
struct btree_update *as;
mutex_lock(&c->btree_interior_update_lock);
list_for_each_entry(as, &c->btree_interior_update_list, list)
- out += scnprintf(out, end - out, "%p m %u w %u r %u j %llu\n",
- as,
- as->mode,
- as->nodes_written,
- atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK,
- bch2_journal_pin_seq(&c->journal, &as->journal));
+ bch2_btree_update_to_text(out, as);
mutex_unlock(&c->btree_interior_update_lock);
-
- return out - buf;
}
-size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
+static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
{
- size_t ret = 0;
- struct list_head *i;
+ bool ret;
mutex_lock(&c->btree_interior_update_lock);
- list_for_each(i, &c->btree_interior_update_list)
- ret++;
+ ret = !list_empty(&c->btree_interior_update_list);
mutex_unlock(&c->btree_interior_update_lock);
return ret;
}
+
+bool bch2_btree_interior_updates_flush(struct bch_fs *c)
+{
+ bool ret = bch2_btree_interior_updates_pending(c);
+
+ if (ret)
+ closure_wait_event(&c->btree_interior_update_wait,
+ !bch2_btree_interior_updates_pending(c));
+ return ret;
+}
+
+void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry)
+{
+ struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);
+
+ mutex_lock(&c->btree_root_lock);
+
+ r->level = entry->level;
+ r->alive = true;
+ bkey_copy(&r->key, (struct bkey_i *) entry->start);
+
+ mutex_unlock(&c->btree_root_lock);
+}
+
+struct jset_entry *
+bch2_btree_roots_to_journal_entries(struct bch_fs *c,
+ struct jset_entry *end,
+ unsigned long skip)
+{
+ unsigned i;
+
+ mutex_lock(&c->btree_root_lock);
+
+ for (i = 0; i < btree_id_nr_alive(c); i++) {
+ struct btree_root *r = bch2_btree_id_root(c, i);
+
+ if (r->alive && !test_bit(i, &skip)) {
+ journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
+ i, r->level, &r->key, r->key.k.u64s);
+ end = vstruct_next(end);
+ }
+ }
+
+ mutex_unlock(&c->btree_root_lock);
+
+ return end;
+}
+
+static void bch2_btree_alloc_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct btree_alloc *a)
+{
+ printbuf_indent_add(out, 2);
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&a->k));
+ prt_newline(out);
+
+ struct open_bucket *ob;
+ unsigned i;
+ open_bucket_for_each(c, &a->ob, ob, i)
+ bch2_open_bucket_to_text(out, c, ob);
+
+ printbuf_indent_sub(out, 2);
+}
+
+void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ for (unsigned i = 0; i < c->btree_reserve_cache_nr; i++)
+ bch2_btree_alloc_to_text(out, c, &c->btree_reserve_cache[i]);
+}
+
+void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
+{
+ WARN_ON(!list_empty(&c->btree_node_rewrites));
+ WARN_ON(!list_empty(&c->btree_node_rewrites_pending));
+
+ if (c->btree_node_rewrite_worker)
+ destroy_workqueue(c->btree_node_rewrite_worker);
+ if (c->btree_interior_update_worker)
+ destroy_workqueue(c->btree_interior_update_worker);
+ mempool_exit(&c->btree_interior_update_pool);
+}
+
+void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
+{
+ mutex_init(&c->btree_reserve_cache_lock);
+ INIT_LIST_HEAD(&c->btree_interior_update_list);
+ INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
+ mutex_init(&c->btree_interior_update_lock);
+ INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);
+
+ INIT_LIST_HEAD(&c->btree_node_rewrites);
+ INIT_LIST_HEAD(&c->btree_node_rewrites_pending);
+ spin_lock_init(&c->btree_node_rewrites_lock);
+}
+
+int bch2_fs_btree_interior_update_init(struct bch_fs *c)
+{
+ c->btree_interior_update_worker =
+ alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
+ if (!c->btree_interior_update_worker)
+ return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+
+ c->btree_node_rewrite_worker =
+ alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
+ if (!c->btree_node_rewrite_worker)
+ return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+
+ if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
+ sizeof(struct btree_update)))
+ return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;
+
+ return 0;
+}
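
A minimal sketch of the start/done/free lifecycle that bch2_btree_split_leaf(), bch2_btree_increase_depth() and the merge/rewrite paths above all follow; the caller and do_the_actual_update() are hypothetical placeholders, only the bch2_btree_update_* helpers are taken from this file:

static int example_interior_update(struct btree_trans *trans,
				   btree_path_idx_t path, unsigned level,
				   unsigned flags)
{
	struct btree_update *as =
		bch2_btree_update_start(trans, trans->paths + path, level,
					false /* as at the call sites above */, flags);
	int ret = PTR_ERR_OR_ZERO(as);
	if (ret)
		return ret;

	/* allocate/replace nodes, queue keys on as->parent_keys, ... */
	ret = do_the_actual_update(as, trans);	/* placeholder */
	if (ret) {
		bch2_btree_update_free(as, trans);	/* tear down on error */
		return ret;
	}

	bch2_btree_update_done(as, trans);		/* commit and release */
	return 0;
}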
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index e6f05071..7930ffea 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H
#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H
@@ -5,30 +6,22 @@
#include "btree_locking.h"
#include "btree_update.h"
-struct btree_reserve {
- struct disk_reservation disk_res;
- unsigned nr;
- struct btree *b[BTREE_RESERVE_MAX];
-};
+#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES)
-void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *);
-bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *,
- struct bkey_format *);
+#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
-/* Btree node freeing/allocation: */
+int bch2_btree_node_check_topology(struct btree_trans *, struct btree *);
-/*
- * Tracks a btree node that has been (or is about to be) freed in memory, but
- * has _not_ yet been freed on disk (because the write that makes the new
- * node(s) visible and frees the old hasn't completed yet)
- */
-struct pending_btree_node_free {
- bool index_update_done;
+#define BTREE_UPDATE_MODES() \
+ x(none) \
+ x(node) \
+ x(root) \
+ x(update)
- __le64 seq;
- enum btree_id btree_id;
- unsigned level;
- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+enum btree_update_mode {
+#define x(n) BTREE_UPDATE_##n,
+ BTREE_UPDATE_MODES()
+#undef x
};
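
The BTREE_UPDATE_MODES() x-macro above generates the enum; the matching name table used by bch2_btree_update_to_text() in the .c file (bch2_btree_update_modes[as->mode]) is presumably produced from the same list. A minimal sketch of the idiom, not the exact definition:

const char * const bch2_btree_update_modes[] = {
#define x(n)	#n,
	BTREE_UPDATE_MODES()
#undef x
	NULL
};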
/*
@@ -52,26 +45,25 @@ struct pending_btree_node_free {
struct btree_update {
struct closure cl;
struct bch_fs *c;
+ u64 start_time;
+ unsigned long ip_started;
struct list_head list;
+ struct list_head unwritten_list;
- /* What kind of update are we doing? */
- enum {
- BTREE_INTERIOR_NO_UPDATE,
- BTREE_INTERIOR_UPDATING_NODE,
- BTREE_INTERIOR_UPDATING_ROOT,
- BTREE_INTERIOR_UPDATING_AS,
- } mode;
-
- unsigned must_rewrite:1;
+ enum btree_update_mode mode;
+ enum bch_trans_commit_flags flags;
unsigned nodes_written:1;
+ unsigned took_gc_lock:1;
enum btree_id btree_id;
+ unsigned update_level_start;
+ unsigned update_level_end;
- struct btree_reserve *reserve;
+ struct disk_reservation disk_res;
/*
- * BTREE_INTERIOR_UPDATING_NODE:
+ * BTREE_UPDATE_node:
* The update that made the new nodes visible was a regular update to an
* existing interior node - @b. We can't write out the update to @b
* until the new nodes we created are finished writing, so we block @b
@@ -82,18 +74,6 @@ struct btree_update {
struct list_head write_blocked_list;
/*
- * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now
- * we're now blocking another btree_update
- * @parent_as - btree_update that's waiting on our nodes to finish
- * writing, before it can make new nodes visible on disk
- * @wait - list of child btree_updates that are waiting on this
- * btree_update to make all the new nodes visible before they can free
- * their old btree nodes
- */
- struct btree_update *parent_as;
- struct closure_waitlist wait;
-
- /*
* We may be freeing nodes that were dirty, and thus had journal entries
* pinned: we need to transfer the oldest of those pins to the
* btree_update operation, and release it when the new node(s)
@@ -101,19 +81,37 @@ struct btree_update {
*/
struct journal_entry_pin journal;
- u64 journal_seq;
+ /* Preallocated nodes we reserve when we start the update: */
+ struct prealloc_nodes {
+ struct btree *b[BTREE_UPDATE_NODES_MAX];
+ unsigned nr;
+ } prealloc_nodes[2];
- /*
- * Nodes being freed:
- * Protected by c->btree_node_pending_free_lock
- */
- struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES];
- unsigned nr_pending;
+ /* Nodes being freed: */
+ struct keylist old_keys;
+ u64 _old_keys[BTREE_UPDATE_NODES_MAX *
+ BKEY_BTREE_PTR_U64s_MAX];
+
+ /* Nodes being added: */
+ struct keylist new_keys;
+ u64 _new_keys[BTREE_UPDATE_NODES_MAX *
+ BKEY_BTREE_PTR_U64s_MAX];
/* New nodes, that will be made reachable by this update: */
- struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES];
+ struct btree *new_nodes[BTREE_UPDATE_NODES_MAX];
unsigned nr_new_nodes;
+ struct btree *old_nodes[BTREE_UPDATE_NODES_MAX];
+ __le64 old_nodes_seq[BTREE_UPDATE_NODES_MAX];
+ unsigned nr_old_nodes;
+
+ open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX *
+ BCH_REPLICAS_MAX];
+ open_bucket_idx_t nr_open_buckets;
+
+ unsigned journal_u64s;
+ u64 journal_entries[BTREE_UPDATE_JOURNAL_RES];
+
/* Only here to reduce stack usage on recursive splits: */
struct keylist parent_keys;
/*
@@ -124,82 +122,69 @@ struct btree_update {
u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
};
-#define for_each_pending_btree_node_free(c, as, p) \
- list_for_each_entry(as, &c->btree_interior_update_list, list) \
- for (p = as->pending; p < as->pending + as->nr_pending; p++)
-
-void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *,
- struct btree_iter *);
-void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *);
-void bch2_btree_open_bucket_put(struct bch_fs *, struct btree *);
-
struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
+ struct btree_trans *,
struct btree *,
struct bkey_format);
-void bch2_btree_update_done(struct btree_update *);
-struct btree_update *
-bch2_btree_update_start(struct bch_fs *, enum btree_id, unsigned,
- unsigned, struct closure *);
-
-void bch2_btree_interior_update_will_free_node(struct btree_update *,
- struct btree *);
+int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned);
-void bch2_btree_insert_node(struct btree_update *, struct btree *,
- struct btree_iter *, struct keylist *,
- unsigned);
-int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned);
+int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned);
-void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
- unsigned, unsigned, enum btree_node_sibling);
+int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t,
+ unsigned, unsigned, enum btree_node_sibling);
-static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
- struct btree_iter *iter,
+static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
+ btree_path_idx_t path_idx,
unsigned level, unsigned flags,
enum btree_node_sibling sib)
{
+ struct btree_path *path = trans->paths + path_idx;
struct btree *b;
- /*
- * iterators are inconsistent when they hit end of leaf, until
- * traversed again
- *
- * XXX inconsistent how?
- */
- if (iter->flags & BTREE_ITER_AT_END_OF_LEAF)
- return;
-
- if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
- return;
+ EBUG_ON(!btree_node_locked(path, level));
- if (!bch2_btree_node_relock(iter, level))
- return;
+ if (bch2_btree_node_merging_disabled)
+ return 0;
- b = iter->l[level].b;
- if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
- return;
+ b = path->l[level].b;
+ if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
+ return 0;
- __bch2_foreground_maybe_merge(c, iter, level, flags, sib);
+ return __bch2_foreground_maybe_merge(trans, path_idx, level, flags, sib);
}
-static inline void bch2_foreground_maybe_merge(struct bch_fs *c,
- struct btree_iter *iter,
- unsigned level,
- unsigned flags)
+static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
+ btree_path_idx_t path,
+ unsigned level,
+ unsigned flags)
{
- bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
- btree_prev_sib);
- bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
- btree_next_sib);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+
+ return bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
+ btree_prev_sib) ?:
+ bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
+ btree_next_sib);
}
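
bch2_foreground_maybe_merge() above chains the two sibling merges with GCC's binary ?: extension: the left operand is evaluated once and returned if nonzero (the first error, typically a transaction-restart code); only if it is zero is the right operand evaluated. A standalone illustration with hypothetical step_one()/step_two():

/* a ?: b is equivalent to a ? a : b, but evaluates a only once */
static int try_both(void)
{
	return step_one() ?: step_two();	/* step_two() runs only if step_one() returned 0 */
}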
+int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
+ struct btree *, unsigned);
+void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
+int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
+ struct btree *, struct bkey_i *,
+ unsigned, bool);
+int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *,
+ struct bkey_i *, unsigned, bool);
+
void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
-void bch2_btree_root_alloc(struct bch_fs *, enum btree_id);
+
+int bch2_btree_root_alloc_fake_trans(struct btree_trans *, enum btree_id, unsigned);
+void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned);
static inline unsigned btree_update_reserve_required(struct bch_fs *c,
struct btree *b)
{
- unsigned depth = btree_node_root(c, b)->level + 1;
+ unsigned depth = btree_node_root(c, b)->c.level + 1;
/*
* Number of nodes we might have to allocate in a worst case btree
@@ -207,9 +192,9 @@ static inline unsigned btree_update_reserve_required(struct bch_fs *c,
* a new root, unless we're already at max depth:
*/
if (depth < BTREE_MAX_DEPTH)
- return (depth - b->level) * 2 + 1;
+ return (depth - b->c.level) * 2 + 1;
else
- return (depth - b->level) * 2 - 1;
+ return (depth - b->c.level) * 2 - 1;
}
static inline void btree_node_reset_sib_u64s(struct btree *b)
@@ -218,21 +203,19 @@ static inline void btree_node_reset_sib_u64s(struct btree *b)
b->sib_u64s[1] = b->nr.live_u64s;
}
-static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
+static inline void *btree_data_end(struct btree *b)
{
- return (void *) b->data + btree_bytes(c);
+ return (void *) b->data + btree_buf_bytes(b);
}
-static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
- struct btree *b)
+static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b)
{
- return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
+ return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s);
}
-static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
- struct btree *b)
+static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b)
{
- return btree_data_end(c, b);
+ return btree_data_end(b);
}
static inline void *write_block(struct btree *b)
@@ -240,32 +223,36 @@ static inline void *write_block(struct btree *b)
return (void *) b->data + (b->written << 9);
}
+static inline bool __btree_addr_written(struct btree *b, void *p)
+{
+ return p < write_block(b);
+}
+
static inline bool bset_written(struct btree *b, struct bset *i)
{
- return (void *) i < write_block(b);
+ return __btree_addr_written(b, i);
}
-static inline bool bset_unwritten(struct btree *b, struct bset *i)
+static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
{
- return (void *) i > write_block(b);
+ return __btree_addr_written(b, k);
}
-static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
- struct btree *b,
- void *end)
+static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end)
{
ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
- b->whiteout_u64s +
- b->uncompacted_whiteout_u64s;
- ssize_t total = c->opts.btree_node_size << 6;
+ b->whiteout_u64s;
+ ssize_t total = btree_buf_bytes(b) >> 3;
+
+ /* Always leave one extra u64 for bch2_varint_decode: */
+ used++;
return total - used;
}
-static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
- struct btree *b)
+static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b)
{
- ssize_t remaining = __bch_btree_u64s_remaining(c, b,
+ ssize_t remaining = __bch2_btree_u64s_remaining(b,
btree_bkey_last(b, bset_tree_last(b)));
BUG_ON(remaining < 0);
@@ -276,29 +263,30 @@ static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
return remaining;
}
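
A worked example of the arithmetic in __bch2_btree_u64s_remaining() above (all numbers illustrative, not taken from the patch): with a 256 KiB node buffer, total = btree_buf_bytes(b) >> 3 = 32768 u64s; if the last bkey ends 200000 bytes into the buffer (25000 u64s) and b->whiteout_u64s is 100, then used = 25000 + 100 + 1 = 25101 and remaining = 32768 - 25101 = 7667 u64s, the extra +1 being the u64 always reserved for bch2_varint_decode.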
+#define BTREE_WRITE_SET_U64s_BITS 9
+
static inline unsigned btree_write_set_buffer(struct btree *b)
{
/*
* Could buffer up larger amounts of keys for btrees with larger keys,
* pending benchmarking:
*/
- return 4 << 10;
+ return 8 << BTREE_WRITE_SET_U64s_BITS;
}
-static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
- struct btree *b)
+static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b)
{
- struct bset *i = btree_bset_last(b);
+ struct bset_tree *t = bset_tree_last(b);
struct btree_node_entry *bne = max(write_block(b),
(void *) btree_bkey_last(b, bset_tree_last(b)));
ssize_t remaining_space =
- __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]);
+ __bch2_btree_u64s_remaining(b, bne->keys.start);
- if (unlikely(bset_written(b, i))) {
+ if (unlikely(bset_written(b, bset(b, t)))) {
if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
return bne;
} else {
- if (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
+ if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&
remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
return bne;
}
@@ -306,68 +294,54 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
return NULL;
}
-static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t,
- struct bkey_packed *k)
+static inline void push_whiteout(struct btree *b, struct bpos pos)
{
- if (bset_written(b, bset(b, t))) {
- EBUG_ON(b->uncompacted_whiteout_u64s <
- bkeyp_key_u64s(&b->format, k));
- b->uncompacted_whiteout_u64s -=
- bkeyp_key_u64s(&b->format, k);
- }
-}
+ struct bkey_packed k;
-static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
- struct bkey_packed *k)
-{
- if (bset_written(b, bset(b, t))) {
- BUG_ON(!k->needs_whiteout);
- b->uncompacted_whiteout_u64s +=
- bkeyp_key_u64s(&b->format, k);
+ BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s);
+ EBUG_ON(btree_node_just_written(b));
+
+ if (!bkey_pack_pos(&k, pos, b)) {
+ struct bkey *u = (void *) &k;
+
+ bkey_init(u);
+ u->p = pos;
}
+
+ k.needs_whiteout = true;
+
+ b->whiteout_u64s += k.u64s;
+ bkey_p_copy(unwritten_whiteouts_start(b), &k);
}
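
push_whiteout() above appends to the region described by the unwritten_whiteouts_* helpers earlier in this header: packed whiteout keys accumulate at the very end of the node buffer and grow downward as b->whiteout_u64s increases. A small illustration of the invariant those helpers encode (assertion added here only for exposition):

/* whiteouts occupy [btree_data_end(b) - 8 * b->whiteout_u64s, btree_data_end(b)) */
BUG_ON((u64 *) unwritten_whiteouts_end(b) -
       (u64 *) unwritten_whiteouts_start(b) != b->whiteout_u64s);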
/*
* write lock must be held on @b (else the dirty bset that we were going to
* insert into could be written out from under us)
*/
-static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
- struct btree *b, unsigned u64s)
+static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s)
{
- if (unlikely(btree_node_fake(b)))
+ if (unlikely(btree_node_need_rewrite(b)))
return false;
- if (btree_node_is_extents(b)) {
- /* The insert key might split an existing key
- * (bch2_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case:
- */
- u64s += BKEY_EXTENT_U64s_MAX;
- }
-
- return u64s <= bch_btree_keys_u64s_remaining(c, b);
+ return u64s <= bch2_btree_keys_u64s_remaining(b);
}
-static inline bool journal_res_insert_fits(struct btree_insert *trans,
- struct btree_insert_entry *insert)
-{
- unsigned u64s = 0;
- struct btree_insert_entry *i;
+void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
- /*
- * If we didn't get a journal reservation, we're in journal replay and
- * we're not journalling updates:
- */
- if (!trans->journal_res.ref)
- return true;
+bool bch2_btree_interior_updates_flush(struct bch_fs *);
- for (i = insert; i < trans->entries + trans->nr; i++)
- u64s += jset_u64s(i->k->k.u64s + i->extra_res);
+void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
+struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
+ struct jset_entry *, unsigned long);
- return u64s <= trans->journal_res.u64s;
-}
+void bch2_async_btree_node_rewrites_flush(struct bch_fs *);
+void bch2_do_pending_node_rewrites(struct bch_fs *);
+void bch2_free_pending_node_rewrites(struct bch_fs *);
-ssize_t bch2_btree_updates_print(struct bch_fs *, char *);
+void bch2_btree_reserve_cache_to_text(struct printbuf *, struct bch_fs *);
-size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *);
+void bch2_fs_btree_interior_update_exit(struct bch_fs *);
+void bch2_fs_btree_interior_update_init_early(struct bch_fs *);
+int bch2_fs_btree_interior_update_init(struct bch_fs *);
#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */
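
A hypothetical ordering sketch for the flush helpers declared above; the pairing with a shutdown path is an assumption, only the two function names and their bch_fs argument are taken from this header:

static void example_shutdown(struct bch_fs *c)
{
	/* drain queued async node rewrites ... */
	bch2_async_btree_node_rewrites_flush(c);

	/* ... then wait for in-flight interior updates to complete */
	bch2_btree_interior_updates_flush(c);
}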
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
deleted file mode 100644
index a481b0d6..00000000
--- a/libbcachefs/btree_update_leaf.c
+++ /dev/null
@@ -1,749 +0,0 @@
-
-#include "bcachefs.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "debug.h"
-#include "extents.h"
-#include "journal.h"
-#include "journal_reclaim.h"
-#include "keylist.h"
-
-#include <linux/sort.h>
-#include <trace/events/bcachefs.h>
-
-/* Inserting into a given leaf node (last stage of insert): */
-
-/* Handle overwrites and do insert, for non extents: */
-bool bch2_btree_bset_insert_key(struct btree_iter *iter,
- struct btree *b,
- struct btree_node_iter *node_iter,
- struct bkey_i *insert)
-{
- const struct bkey_format *f = &b->format;
- struct bkey_packed *k;
- struct bset_tree *t;
- unsigned clobber_u64s;
-
- EBUG_ON(btree_node_just_written(b));
- EBUG_ON(bset_written(b, btree_bset_last(b)));
- EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
- EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 ||
- bkey_cmp(insert->k.p, b->data->max_key) > 0);
-
- k = bch2_btree_node_iter_peek_all(node_iter, b);
- if (k && !bkey_cmp_packed(b, k, &insert->k)) {
- BUG_ON(bkey_whiteout(k));
-
- t = bch2_bkey_to_bset(b, k);
-
- if (bset_unwritten(b, bset(b, t)) &&
- bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k) &&
- !bkey_whiteout(&insert->k)) {
- k->type = insert->k.type;
- memcpy_u64s(bkeyp_val(f, k), &insert->v,
- bkey_val_u64s(&insert->k));
- return true;
- }
-
- insert->k.needs_whiteout = k->needs_whiteout;
-
- btree_keys_account_key_drop(&b->nr, t - b->set, k);
-
- if (t == bset_tree_last(b)) {
- clobber_u64s = k->u64s;
-
- /*
- * If we're deleting, and the key we're deleting doesn't
- * need a whiteout (it wasn't overwriting a key that had
- * been written to disk) - just delete it:
- */
- if (bkey_whiteout(&insert->k) && !k->needs_whiteout) {
- bch2_bset_delete(b, k, clobber_u64s);
- bch2_btree_node_iter_fix(iter, b, node_iter, t,
- k, clobber_u64s, 0);
- return true;
- }
-
- goto overwrite;
- }
-
- k->type = KEY_TYPE_DELETED;
- bch2_btree_node_iter_fix(iter, b, node_iter, t, k,
- k->u64s, k->u64s);
-
- if (bkey_whiteout(&insert->k)) {
- reserve_whiteout(b, t, k);
- return true;
- } else {
- k->needs_whiteout = false;
- }
- } else {
- /*
- * Deleting, but the key to delete wasn't found - nothing to do:
- */
- if (bkey_whiteout(&insert->k))
- return false;
-
- insert->k.needs_whiteout = false;
- }
-
- t = bset_tree_last(b);
- k = bch2_btree_node_iter_bset_pos(node_iter, b, t);
- clobber_u64s = 0;
-overwrite:
- bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
- if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k))
- bch2_btree_node_iter_fix(iter, b, node_iter, t, k,
- clobber_u64s, k->u64s);
- return true;
-}
-
-static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
- unsigned i, u64 seq)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct btree_write *w = container_of(pin, struct btree_write, journal);
- struct btree *b = container_of(w, struct btree, writes[i]);
-
- btree_node_lock_type(c, b, SIX_LOCK_read);
- bch2_btree_node_write_cond(c, b,
- (btree_current_write(b) == w &&
- w->journal.pin_list == journal_seq_pin(j, seq)));
- six_unlock_read(&b->lock);
-}
-
-static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
-{
- return __btree_node_flush(j, pin, 0, seq);
-}
-
-static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
-{
- return __btree_node_flush(j, pin, 1, seq);
-}
-
-void bch2_btree_journal_key(struct btree_insert *trans,
- struct btree_iter *iter,
- struct bkey_i *insert)
-{
- struct bch_fs *c = trans->c;
- struct journal *j = &c->journal;
- struct btree *b = iter->l[0].b;
- struct btree_write *w = btree_current_write(b);
-
- EBUG_ON(iter->level || b->level);
- EBUG_ON(trans->journal_res.ref !=
- !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
-
- if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
- u64 seq = trans->journal_res.seq;
- bool needs_whiteout = insert->k.needs_whiteout;
-
- /* ick */
- insert->k.needs_whiteout = false;
- bch2_journal_add_keys(j, &trans->journal_res,
- iter->btree_id, insert);
- insert->k.needs_whiteout = needs_whiteout;
-
- bch2_journal_set_has_inode(j, &trans->journal_res,
- insert->k.p.inode);
-
- if (trans->journal_seq)
- *trans->journal_seq = seq;
- btree_bset_last(b)->journal_seq = cpu_to_le64(seq);
- }
-
- if (unlikely(!journal_pin_active(&w->journal))) {
- u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
- ? trans->journal_res.seq
- : j->replay_journal_seq;
-
- bch2_journal_pin_add(j, seq, &w->journal,
- btree_node_write_idx(b) == 0
- ? btree_node_flush0
- : btree_node_flush1);
- }
-
- if (unlikely(!btree_node_dirty(b)))
- set_btree_node_dirty(b);
-}
-
-static enum btree_insert_ret
-bch2_insert_fixup_key(struct btree_insert *trans,
- struct btree_insert_entry *insert)
-{
- struct btree_iter *iter = insert->iter;
- struct btree_iter_level *l = &iter->l[0];
-
- EBUG_ON(iter->level);
- EBUG_ON(insert->k->k.u64s >
- bch_btree_keys_u64s_remaining(trans->c, l->b));
-
- if (bch2_btree_bset_insert_key(iter, l->b, &l->iter,
- insert->k))
- bch2_btree_journal_key(trans, iter, insert->k);
-
- trans->did_work = true;
- return BTREE_INSERT_OK;
-}
-
-/**
- * btree_insert_key - insert a key one key into a leaf node
- */
-static enum btree_insert_ret
-btree_insert_key_leaf(struct btree_insert *trans,
- struct btree_insert_entry *insert)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter *iter = insert->iter;
- struct btree *b = iter->l[0].b;
- enum btree_insert_ret ret;
- int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
- int old_live_u64s = b->nr.live_u64s;
- int live_u64s_added, u64s_added;
-
- ret = !btree_node_is_extents(b)
- ? bch2_insert_fixup_key(trans, insert)
- : bch2_insert_fixup_extent(trans, insert);
-
- live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
- u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
-
- if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
- b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
- if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
- b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
-
- if (u64s_added > live_u64s_added &&
- bch2_maybe_compact_whiteouts(c, b))
- bch2_btree_iter_reinit_node(iter, b);
-
- trace_btree_insert_key(c, b, insert->k);
- return ret;
-}
-
-#define trans_for_each_entry(trans, i) \
- for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++)
-
-/*
- * We sort transaction entries so that if multiple iterators point to the same
- * leaf node they'll be adjacent:
- */
-static bool same_leaf_as_prev(struct btree_insert *trans,
- struct btree_insert_entry *i)
-{
- return i != trans->entries &&
- i[0].iter->l[0].b == i[-1].iter->l[0].b;
-}
-
-static inline struct btree_insert_entry *trans_next_leaf(struct btree_insert *trans,
- struct btree_insert_entry *i)
-{
- struct btree *b = i->iter->l[0].b;
-
- do {
- i++;
- } while (i < trans->entries + trans->nr && b == i->iter->l[0].b);
-
- return i;
-}
-
-#define trans_for_each_leaf(trans, i) \
- for ((i) = (trans)->entries; \
- (i) < (trans)->entries + (trans)->nr; \
- (i) = trans_next_leaf(trans, i))
-
-inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
- struct btree_iter *iter)
-{
- bch2_btree_node_lock_write(b, iter);
-
- if (btree_node_just_written(b) &&
- bch2_btree_post_write_cleanup(c, b))
- bch2_btree_iter_reinit_node(iter, b);
-
- /*
- * If the last bset has been written, or if it's gotten too big - start
- * a new bset to insert into:
- */
- if (want_new_bset(c, b))
- bch2_btree_init_next(c, b, iter);
-}
-
-static void multi_lock_write(struct bch_fs *c, struct btree_insert *trans)
-{
- struct btree_insert_entry *i;
-
- trans_for_each_leaf(trans, i)
- bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter);
-}
-
-static void multi_unlock_write(struct btree_insert *trans)
-{
- struct btree_insert_entry *i;
-
- trans_for_each_leaf(trans, i)
- bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
-}
-
-static inline int btree_trans_cmp(struct btree_insert_entry l,
- struct btree_insert_entry r)
-{
- return btree_iter_cmp(l.iter, r.iter);
-}
-
-/* Normal update interface: */
-
-/*
- * Get journal reservation, take write locks, and attempt to do btree update(s):
- */
-static inline int do_btree_insert_at(struct btree_insert *trans,
- struct btree_iter **split,
- bool *cycle_gc_lock)
-{
- struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
- unsigned u64s;
- int ret;
-
- trans_for_each_entry(trans, i) {
- BUG_ON(i->done);
- BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
- }
-
- u64s = 0;
- trans_for_each_entry(trans, i)
- u64s += jset_u64s(i->k->k.u64s + i->extra_res);
-
- memset(&trans->journal_res, 0, sizeof(trans->journal_res));
-
- ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)
- ? bch2_journal_res_get(&c->journal,
- &trans->journal_res,
- u64s, u64s)
- : 0;
- if (ret)
- return ret;
-
- multi_lock_write(c, trans);
-
- if (race_fault()) {
- ret = -EINTR;
- trans_restart(" (race)");
- goto out;
- }
-
- u64s = 0;
- trans_for_each_entry(trans, i) {
- /* Multiple inserts might go to same leaf: */
- if (!same_leaf_as_prev(trans, i))
- u64s = 0;
-
- /*
- * bch2_btree_node_insert_fits() must be called under write lock:
- * with only an intent lock, another thread can still call
- * bch2_btree_node_write(), converting an unwritten bset to a
- * written one
- */
- u64s += i->k->k.u64s + i->extra_res;
- if (!bch2_btree_node_insert_fits(c,
- i->iter->l[0].b, u64s)) {
- ret = -EINTR;
- *split = i->iter;
- goto out;
- }
- }
-
- if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
- if (journal_seq_verify(c))
- trans_for_each_entry(trans, i)
- i->k->k.version.lo = trans->journal_res.seq;
- else if (inject_invalid_keys(c))
- trans_for_each_entry(trans, i)
- i->k->k.version = MAX_VERSION;
- }
-
- trans_for_each_entry(trans, i) {
- switch (btree_insert_key_leaf(trans, i)) {
- case BTREE_INSERT_OK:
- i->done = true;
- break;
- case BTREE_INSERT_JOURNAL_RES_FULL:
- case BTREE_INSERT_NEED_TRAVERSE:
- case BTREE_INSERT_NEED_RESCHED:
- ret = -EINTR;
- break;
- case BTREE_INSERT_BTREE_NODE_FULL:
- ret = -EINTR;
- *split = i->iter;
- break;
- case BTREE_INSERT_ENOSPC:
- ret = -ENOSPC;
- break;
- case BTREE_INSERT_NEED_GC_LOCK:
- ret = -EINTR;
- *cycle_gc_lock = true;
- break;
- default:
- BUG();
- }
-
- /*
- * If we did some work (i.e. inserted part of an extent),
- * we have to do all the other updates as well:
- */
- if (!trans->did_work && (ret || *split))
- break;
- }
-out:
- multi_unlock_write(trans);
- bch2_journal_res_put(&c->journal, &trans->journal_res);
-
- return ret;
-}
-
-static inline void btree_insert_entry_checks(struct bch_fs *c,
- struct btree_insert_entry *i)
-{
- BUG_ON(i->iter->level);
- BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
- BUG_ON(debug_check_bkeys(c) &&
- !bkey_deleted(&i->k->k) &&
- bch2_bkey_invalid(c, i->iter->btree_id,
- bkey_i_to_s_c(i->k)));
-}
-
-/**
- * __bch_btree_insert_at - insert keys at given iterator positions
- *
- * This is main entry point for btree updates.
- *
- * Return values:
- * -EINTR: locking changed, this function should be called again. Only returned
- * if passed BTREE_INSERT_ATOMIC.
- * -EROFS: filesystem read only
- * -EIO: journal or btree node IO error
- */
-int __bch2_btree_insert_at(struct btree_insert *trans)
-{
- struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
- struct btree_iter *linked, *split = NULL;
- bool cycle_gc_lock = false;
- unsigned flags;
- int ret;
-
- BUG_ON(!trans->nr);
-
- for_each_btree_iter(trans->entries[0].iter, linked)
- bch2_btree_iter_verify_locks(linked);
-
- /* for the sake of sanity: */
- BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC));
-
- trans_for_each_entry(trans, i)
- btree_insert_entry_checks(c, i);
-
- bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
-
- if (unlikely(!percpu_ref_tryget(&c->writes)))
- return -EROFS;
-retry:
- split = NULL;
- cycle_gc_lock = false;
-
- trans_for_each_entry(trans, i) {
- unsigned old_locks_want = i->iter->locks_want;
- unsigned old_uptodate = i->iter->uptodate;
-
- if (!bch2_btree_iter_upgrade(i->iter, 1, true)) {
- trans_restart(" (failed upgrade, locks_want %u uptodate %u)",
- old_locks_want, old_uptodate);
- ret = -EINTR;
- goto err;
- }
-
- if (i->iter->flags & BTREE_ITER_ERROR) {
- ret = -EIO;
- goto err;
- }
- }
-
- ret = do_btree_insert_at(trans, &split, &cycle_gc_lock);
- if (unlikely(ret))
- goto err;
-
- trans_for_each_leaf(trans, i)
- bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags);
-
- trans_for_each_entry(trans, i)
- bch2_btree_iter_downgrade(i->iter);
-out:
- percpu_ref_put(&c->writes);
-
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
- /* make sure we didn't drop or screw up locks: */
- for_each_btree_iter(trans->entries[0].iter, linked) {
- bch2_btree_iter_verify_locks(linked);
- BUG_ON((trans->flags & BTREE_INSERT_NOUNLOCK) &&
- trans->did_work &&
- linked->uptodate >= BTREE_ITER_NEED_RELOCK);
- }
-
- /* make sure we didn't lose an error: */
- if (!ret)
- trans_for_each_entry(trans, i)
- BUG_ON(!i->done);
- }
-
- BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR);
-
- return ret;
-err:
- flags = trans->flags;
-
- /*
- * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree
- * update; if we haven't done anything yet it doesn't apply
- */
- if (!trans->did_work)
- flags &= ~BTREE_INSERT_NOUNLOCK;
-
- if (split) {
- ret = bch2_btree_split_leaf(c, split, flags);
-
- /*
- * if the split succeeded without dropping locks the insert will
- * still be atomic (in the BTREE_INSERT_ATOMIC sense, what the
- * caller peeked() and is overwriting won't have changed)
- */
-#if 0
- /*
- * XXX:
- * split -> btree node merging (of parent node) might still drop
- * locks when we're not passing it BTREE_INSERT_NOUNLOCK
- */
- if (!ret && !trans->did_work)
- goto retry;
-#endif
-
- /*
- * don't care if we got ENOSPC because we told split it
- * couldn't block:
- */
- if (!ret || (flags & BTREE_INSERT_NOUNLOCK)) {
- trans_restart(" (split)");
- ret = -EINTR;
- }
- }
-
- if (cycle_gc_lock) {
- if (!down_read_trylock(&c->gc_lock)) {
- if (flags & BTREE_INSERT_NOUNLOCK)
- goto out;
-
- bch2_btree_iter_unlock(trans->entries[0].iter);
- down_read(&c->gc_lock);
- }
- up_read(&c->gc_lock);
- }
-
- if (ret == -EINTR) {
- if (flags & BTREE_INSERT_NOUNLOCK) {
- trans_restart(" (can't unlock)");
- goto out;
- }
-
- trans_for_each_entry(trans, i) {
- int ret2 = bch2_btree_iter_traverse(i->iter);
- if (ret2) {
- ret = ret2;
- trans_restart(" (traverse)");
- goto out;
- }
-
- BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK);
- }
-
- /*
- * BTREE_ITER_ATOMIC means we have to return -EINTR if we
- * dropped locks:
- */
- if (!(flags & BTREE_INSERT_ATOMIC))
- goto retry;
-
- trans_restart(" (atomic)");
- }
-
- goto out;
-}
-
-void bch2_trans_update(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *k,
- unsigned extra_journal_res)
-{
- struct btree_insert_entry *i;
-
- BUG_ON(trans->nr_updates >= ARRAY_SIZE(trans->updates));
-
- i = &trans->updates[trans->nr_updates++];
-
- *i = (struct btree_insert_entry) {
- .iter = iter,
- .k = k,
- .extra_res = extra_journal_res,
- };
-
- btree_insert_entry_checks(trans->c, i);
-}
-
-int bch2_trans_commit(struct btree_trans *trans,
- struct disk_reservation *disk_res,
- struct extent_insert_hook *hook,
- u64 *journal_seq,
- unsigned flags)
-{
- struct btree_insert insert = {
- .c = trans->c,
- .disk_res = disk_res,
- .journal_seq = journal_seq,
- .flags = flags,
- .nr = trans->nr_updates,
- .entries = trans->updates,
- };
-
- if (!trans->nr_updates)
- return 0;
-
- trans->nr_updates = 0;
-
- return __bch2_btree_insert_at(&insert);
-}
-
-int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags)
-{
- struct bkey_i k;
-
- bkey_init(&k.k);
- k.k.p = iter->pos;
-
- return bch2_btree_insert_at(iter->c, NULL, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|flags,
- BTREE_INSERT_ENTRY(iter, &k));
-}
-
-int bch2_btree_insert_list_at(struct btree_iter *iter,
- struct keylist *keys,
- struct disk_reservation *disk_res,
- struct extent_insert_hook *hook,
- u64 *journal_seq, unsigned flags)
-{
- BUG_ON(flags & BTREE_INSERT_ATOMIC);
- BUG_ON(bch2_keylist_empty(keys));
- bch2_verify_keylist_sorted(keys);
-
- while (!bch2_keylist_empty(keys)) {
- int ret = bch2_btree_insert_at(iter->c, disk_res, hook,
- journal_seq, flags,
- BTREE_INSERT_ENTRY(iter, bch2_keylist_front(keys)));
- if (ret)
- return ret;
-
- bch2_keylist_pop_front(keys);
- }
-
- return 0;
-}
-
-/**
- * bch_btree_insert - insert keys into the extent btree
- * @c: pointer to struct bch_fs
- * @id: btree to insert into
- * @insert_keys: list of keys to insert
- * @hook: insert callback
- */
-int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
- struct bkey_i *k,
- struct disk_reservation *disk_res,
- struct extent_insert_hook *hook,
- u64 *journal_seq, int flags)
-{
- struct btree_iter iter;
- int ret;
-
- bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k),
- BTREE_ITER_INTENT);
- ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, flags,
- BTREE_INSERT_ENTRY(&iter, k));
- bch2_btree_iter_unlock(&iter);
-
- return ret;
-}
-
-/*
- * bch_btree_delete_range - delete everything within a given range
- *
- * Range is a half open interval - [start, end)
- */
-int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
- struct bpos start,
- struct bpos end,
- struct bversion version,
- struct disk_reservation *disk_res,
- struct extent_insert_hook *hook,
- u64 *journal_seq)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- bch2_btree_iter_init(&iter, c, id, start,
- BTREE_ITER_INTENT);
-
- while ((k = bch2_btree_iter_peek(&iter)).k &&
- !(ret = btree_iter_err(k))) {
- unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
- /* really shouldn't be using a bare, unpadded bkey_i */
- struct bkey_i delete;
-
- if (bkey_cmp(iter.pos, end) >= 0)
- break;
-
- bkey_init(&delete.k);
-
- /*
- * For extents, iter.pos won't necessarily be the same as
- * bkey_start_pos(k.k) (for non extents they always will be the
- * same). It's important that we delete starting from iter.pos
- * because the range we want to delete could start in the middle
- * of k.
- *
- * (bch2_btree_iter_peek() does guarantee that iter.pos >=
- * bkey_start_pos(k.k)).
- */
- delete.k.p = iter.pos;
- delete.k.version = version;
-
- if (iter.flags & BTREE_ITER_IS_EXTENTS) {
- /* create the biggest key we can */
- bch2_key_resize(&delete.k, max_sectors);
- bch2_cut_back(end, &delete.k);
- }
-
- ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq,
- BTREE_INSERT_NOFAIL,
- BTREE_INSERT_ENTRY(&iter, &delete));
- if (ret)
- break;
-
- bch2_btree_iter_cond_resched(&iter);
- }
-
- bch2_btree_iter_unlock(&iter);
- return ret;
-}
diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c
new file mode 100644
index 00000000..49ce2d1e
--- /dev/null
+++ b/libbcachefs/btree_write_buffer.c
@@ -0,0 +1,854 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
+#include "disk_accounting.h"
+#include "error.h"
+#include "extents.h"
+#include "journal.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+
+#include <linux/prefetch.h>
+#include <linux/sort.h>
+
+static int bch2_btree_write_buffer_journal_flush(struct journal *,
+ struct journal_entry_pin *, u64);
+
+static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
+{
+ return (cmp_int(l->hi, r->hi) ?:
+ cmp_int(l->mi, r->mi) ?:
+ cmp_int(l->lo, r->lo)) >= 0;
+}
+
+static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
+{
+#ifdef CONFIG_X86_64
+ int cmp;
+
+ asm("mov (%[l]), %%rax;"
+ "sub (%[r]), %%rax;"
+ "mov 8(%[l]), %%rax;"
+ "sbb 8(%[r]), %%rax;"
+ "mov 16(%[l]), %%rax;"
+ "sbb 16(%[r]), %%rax;"
+ : "=@ccae" (cmp)
+ : [l] "r" (l), [r] "r" (r)
+ : "rax", "cc");
+
+ EBUG_ON(cmp != __wb_key_ref_cmp(l, r));
+ return cmp;
+#else
+ return __wb_key_ref_cmp(l, r);
+#endif
+}
+
+static int wb_key_seq_cmp(const void *_l, const void *_r)
+{
+ const struct btree_write_buffered_key *l = _l;
+ const struct btree_write_buffered_key *r = _r;
+
+ return cmp_int(l->journal_seq, r->journal_seq);
+}
+
+/* Compare excluding idx, the low 24 bits: */
+static inline bool wb_key_eq(const void *_l, const void *_r)
+{
+ const struct wb_key_ref *l = _l;
+ const struct wb_key_ref *r = _r;
+
+ return !((l->hi ^ r->hi)|
+ (l->mi ^ r->mi)|
+ ((l->lo >> 24) ^ (r->lo >> 24)));
+}
+
+static noinline void wb_sort(struct wb_key_ref *base, size_t num)
+{
+ size_t n = num, a = num / 2;
+
+	if (!a)			/* num < 2 */
+ return;
+
+ for (;;) {
+ size_t b, c, d;
+
+ if (a) /* Building heap: sift down --a */
+ --a;
+ else if (--n) /* Sorting: Extract root to --n */
+ swap(base[0], base[n]);
+ else /* Sort complete */
+ break;
+
+ /*
+ * Sift element at "a" down into heap. This is the
+ * "bottom-up" variant, which significantly reduces
+ * calls to cmp_func(): we find the sift-down path all
+ * the way to the leaves (one compare per level), then
+ * backtrack to find where to insert the target element.
+ *
+ * Because elements tend to sift down close to the leaves,
+ * this uses fewer compares than doing two per level
+ * on the way down. (A bit more than half as many on
+ * average, 3/4 worst-case.)
+ */
+ for (b = a; c = 2*b + 1, (d = c + 1) < n;)
+ b = wb_key_ref_cmp(base + c, base + d) ? c : d;
+ if (d == n) /* Special case last leaf with no sibling */
+ b = c;
+
+ /* Now backtrack from "b" to the correct location for "a" */
+ while (b != a && wb_key_ref_cmp(base + a, base + b))
+ b = (b - 1) / 2;
+ c = b; /* Where "a" belongs */
+ while (b != a) { /* Shift it into place */
+ b = (b - 1) / 2;
+ swap(base[b], base[c]);
+ }
+ }
+}
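
The comment above explains the bottom-up strategy: follow the larger-child path all the way down to a leaf, then backtrack to find where the displaced element belongs. As an illustrative aside (a sketch, not part of the patch), the same control flow on a plain int array makes the binary-heap index arithmetic - children of b at 2*b+1 and 2*b+2, parent at (b-1)/2 - easier to follow:

#include <stddef.h>

static void swap_int(int *a, int *b)
{
	int t = *a; *a = *b; *b = t;
}

/* Same structure as wb_sort(), with "x >= y" playing the role of wb_key_ref_cmp(): */
static void int_heapsort(int *base, size_t num)
{
	size_t n = num, a = num / 2;

	if (!a)				/* num < 2 */
		return;

	for (;;) {
		size_t b, c, d;

		if (a)			/* building heap: sift down --a */
			--a;
		else if (--n)		/* sorting: move root to the sorted tail */
			swap_int(&base[0], &base[n]);
		else			/* sort complete */
			break;

		/* follow the larger-child path down to a leaf */
		for (b = a; c = 2*b + 1, (d = c + 1) < n;)
			b = base[c] >= base[d] ? c : d;
		if (d == n)		/* last leaf has no sibling */
			b = c;

		/* backtrack to where base[a] belongs on that path */
		while (b != a && base[a] >= base[b])
			b = (b - 1) / 2;
		c = b;			/* where base[a] belongs */
		while (b != a) {	/* shift it into place */
			b = (b - 1) / 2;
			swap_int(&base[b], &base[c]);
		}
	}
}
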
+
+static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct btree_write_buffered_key *wb)
+{
+ struct btree_path *path = btree_iter_path(trans, iter);
+
+ bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+
+ trans->journal_res.seq = wb->journal_seq;
+
+ return bch2_trans_update(trans, iter, &wb->k,
+ BTREE_UPDATE_internal_snapshot_node) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_journal_res|
+ BCH_TRANS_COMMIT_journal_reclaim);
+}
+
+static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
+ struct btree_write_buffered_key *wb,
+ bool *write_locked,
+ bool *accounting_accumulated,
+ size_t *fast)
+{
+ struct btree_path *path;
+ int ret;
+
+ EBUG_ON(!wb->journal_seq);
+ EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq);
+ EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
+
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ return ret;
+
+ if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) {
+ struct bkey u;
+ struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u);
+
+ if (k.k->type == KEY_TYPE_accounting)
+ bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k),
+ bkey_s_c_to_accounting(k));
+ }
+ *accounting_accumulated = true;
+
+ /*
+ * We can't clone a path that has write locks: unshare it now, before
+ * set_pos and traverse():
+ */
+ if (btree_iter_path(trans, iter)->ref > 1)
+ iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_);
+
+ path = btree_iter_path(trans, iter);
+
+ if (!*write_locked) {
+ ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c);
+ if (ret)
+ return ret;
+
+ bch2_btree_node_prep_for_write(trans, path, path->l[0].b);
+ *write_locked = true;
+ }
+
+ if (unlikely(!bch2_btree_node_insert_fits(path->l[0].b, wb->k.k.u64s))) {
+ *write_locked = false;
+ return wb_flush_one_slowpath(trans, iter, wb);
+ }
+
+ bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
+ (*fast)++;
+ return 0;
+}
+
+/*
+ * Update a btree with a write buffered key using the journal seq of the
+ * original write buffer insert.
+ *
+ * It is not safe to rejournal the key once it has been inserted into the write
+ * buffer because that may break recovery ordering. For example, the key may
+ * have already been modified in the active write buffer in a seq that comes
+ * before the current transaction. If we were to journal this key again and
+ * crash, recovery would process updates in the wrong order.
+ */
+static int
+btree_write_buffered_insert(struct btree_trans *trans,
+ struct btree_write_buffered_key *wb)
+{
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
+ BTREE_ITER_cached|BTREE_ITER_intent);
+
+ trans->journal_res.seq = wb->journal_seq;
+
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, &wb->k,
+ BTREE_UPDATE_internal_snapshot_node);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb)
+{
+ struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer);
+ struct journal *j = &c->journal;
+
+ if (!wb->inc.keys.nr)
+ return;
+
+ bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin,
+ bch2_btree_write_buffer_journal_flush);
+
+ darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr));
+ darray_resize(&wb->sorted, wb->flushing.keys.size);
+
+ if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) {
+ swap(wb->flushing.keys, wb->inc.keys);
+ goto out;
+ }
+
+ size_t nr = min(darray_room(wb->flushing.keys),
+ wb->sorted.size - wb->flushing.keys.nr);
+ nr = min(nr, wb->inc.keys.nr);
+
+ memcpy(&darray_top(wb->flushing.keys),
+ wb->inc.keys.data,
+ sizeof(wb->inc.keys.data[0]) * nr);
+
+ memmove(wb->inc.keys.data,
+ wb->inc.keys.data + nr,
+ sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr));
+
+ wb->flushing.keys.nr += nr;
+ wb->inc.keys.nr -= nr;
+out:
+ if (!wb->inc.keys.nr)
+ bch2_journal_pin_drop(j, &wb->inc.pin);
+ else
+ bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin,
+ bch2_btree_write_buffer_journal_flush);
+
+ if (j->watermark) {
+ spin_lock(&j->lock);
+ bch2_journal_set_watermark(j);
+ spin_unlock(&j->lock);
+ }
+
+ BUG_ON(wb->sorted.size < wb->flushing.keys.nr);
+}
+
+static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct journal *j = &c->journal;
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ struct btree_iter iter = { NULL };
+ size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0;
+ bool write_locked = false;
+ bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags);
+ int ret = 0;
+
+ ret = bch2_journal_error(&c->journal);
+ if (ret)
+ return ret;
+
+ bch2_trans_unlock(trans);
+ bch2_trans_begin(trans);
+
+ mutex_lock(&wb->inc.lock);
+ move_keys_from_inc_to_flushing(wb);
+ mutex_unlock(&wb->inc.lock);
+
+ for (size_t i = 0; i < wb->flushing.keys.nr; i++) {
+ wb->sorted.data[i].idx = i;
+ wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree;
+ memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos));
+ }
+ wb->sorted.nr = wb->flushing.keys.nr;
+
+ /*
+ * We first sort so that we can detect and skip redundant updates, and
+ * then we attempt to flush in sorted btree order, as this is most
+ * efficient.
+ *
+ * However, since we're not flushing in the order they appear in the
+ * journal we won't be able to drop our journal pin until everything is
+ * flushed - which means this could deadlock the journal if we weren't
+	 * passing BCH_TRANS_COMMIT_journal_reclaim. That flag causes the update
+	 * to fail if it would block taking a journal reservation.
+ *
+ * If that happens, simply skip the key so we can optimistically insert
+ * as many keys as possible in the fast path.
+ */
+ wb_sort(wb->sorted.data, wb->sorted.nr);
+
+ darray_for_each(wb->sorted, i) {
+ struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
+
+ for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
+ prefetch(&wb->flushing.keys.data[n->idx]);
+
+ BUG_ON(!k->journal_seq);
+
+ if (!accounting_replay_done &&
+ k->k.k.type == KEY_TYPE_accounting) {
+ slowpath++;
+ continue;
+ }
+
+ if (i + 1 < &darray_top(wb->sorted) &&
+ wb_key_eq(i, i + 1)) {
+ struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
+
+ if (k->k.k.type == KEY_TYPE_accounting &&
+ n->k.k.type == KEY_TYPE_accounting)
+ bch2_accounting_accumulate(bkey_i_to_accounting(&n->k),
+ bkey_i_to_s_c_accounting(&k->k));
+
+ overwritten++;
+ n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
+ k->journal_seq = 0;
+ continue;
+ }
+
+ if (write_locked) {
+ struct btree_path *path = btree_iter_path(trans, &iter);
+
+ if (path->btree_id != i->btree ||
+ bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) {
+ bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+ write_locked = false;
+
+ ret = lockrestart_do(trans,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_foreground_maybe_merge(trans, iter.path, 0,
+ BCH_WATERMARK_reclaim|
+ BCH_TRANS_COMMIT_journal_reclaim|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc));
+ if (ret)
+ goto err;
+ }
+ }
+
+ if (!iter.path || iter.btree_id != k->btree) {
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p,
+ BTREE_ITER_intent|BTREE_ITER_all_snapshots);
+ }
+
+ bch2_btree_iter_set_pos(&iter, k->k.k.p);
+ btree_iter_path(trans, &iter)->preserve = false;
+
+ bool accounting_accumulated = false;
+ do {
+ if (race_fault()) {
+ ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ break;
+ }
+
+ ret = wb_flush_one(trans, &iter, k, &write_locked,
+ &accounting_accumulated, &fast);
+ if (!write_locked)
+ bch2_trans_begin(trans);
+ } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
+
+ if (!ret) {
+ k->journal_seq = 0;
+ } else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
+ slowpath++;
+ ret = 0;
+ } else
+ break;
+ }
+
+ if (write_locked) {
+ struct btree_path *path = btree_iter_path(trans, &iter);
+ bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ goto err;
+
+ if (slowpath) {
+ /*
+		 * Flush the remaining keys in the order they appear in the
+		 * journal, so that we can release journal pins as we go.
+		 * The fast path zeroed journal_seq on the keys it flushed
+		 * successfully, so those are skipped here.
+ */
+ trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
+
+ sort(wb->flushing.keys.data,
+ wb->flushing.keys.nr,
+ sizeof(wb->flushing.keys.data[0]),
+ wb_key_seq_cmp, NULL);
+
+ darray_for_each(wb->flushing.keys, i) {
+ if (!i->journal_seq)
+ continue;
+
+ if (!accounting_replay_done &&
+ i->k.k.type == KEY_TYPE_accounting) {
+ could_not_insert++;
+ continue;
+ }
+
+ if (!could_not_insert)
+ bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
+ bch2_btree_write_buffer_journal_flush);
+
+ bch2_trans_begin(trans);
+
+ ret = commit_do(trans, NULL, NULL,
+ BCH_WATERMARK_reclaim|
+ BCH_TRANS_COMMIT_journal_reclaim|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc|
+				  BCH_TRANS_COMMIT_no_journal_res,
+ btree_write_buffered_insert(trans, i));
+ if (ret)
+ goto err;
+
+ i->journal_seq = 0;
+ }
+
+ /*
+ * If journal replay hasn't finished with accounting keys we
+ * can't flush accounting keys at all - condense them and leave
+ * them for next time.
+ *
+ * Q: Can the write buffer overflow?
+		 * A: Shouldn't be any actual risk. It's just new accounting
+ * updates that the write buffer can't flush, and those are only
+ * going to be generated by interior btree node updates as
+ * journal replay has to split/rewrite nodes to make room for
+ * its updates.
+ *
+		 * And for those new accounting updates, updates to the same
+		 * counters get accumulated as they're flushed from the journal
+		 * to the write buffer - see the eytzinger tree accumulation in
+		 * bch2_accounting_key_to_wb(). So we could only overflow if the number of
+ * distinct counters touched somehow was very large.
+ */
+ if (could_not_insert) {
+ struct btree_write_buffered_key *dst = wb->flushing.keys.data;
+
+ darray_for_each(wb->flushing.keys, i)
+ if (i->journal_seq)
+ *dst++ = *i;
+ wb->flushing.keys.nr = dst - wb->flushing.keys.data;
+ }
+ }
+err:
+ if (ret || !could_not_insert) {
+ bch2_journal_pin_drop(j, &wb->flushing.pin);
+ wb->flushing.keys.nr = 0;
+ }
+
+ bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret));
+ trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0);
+ return ret;
+}
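
Restating the redundant-update skip in isolation: a minimal sketch (hypothetical simplified types, not the real wb_key_ref / btree_write_buffered_key) of what the wb_key_eq() branch above accomplishes once the refs are sorted by (btree, pos). Accounting keys are the exception - they are accumulated into the survivor rather than simply dropped:

#include <stddef.h>

struct demo_update {
	unsigned	btree;
	unsigned long	pos;
	unsigned long	journal_seq;	/* 0: nothing left to flush */
};

/*
 * After sorting, only the last update to each position has to reach the
 * btree; earlier ones are dropped, but the survivor inherits the oldest
 * journal_seq so its journal pin isn't released too early:
 */
static void demo_drop_redundant(struct demo_update *u, size_t nr)
{
	for (size_t i = 0; i + 1 < nr; i++)
		if (u[i].btree == u[i + 1].btree && u[i].pos == u[i + 1].pos) {
			if (u[i].journal_seq < u[i + 1].journal_seq)
				u[i + 1].journal_seq = u[i].journal_seq;
			u[i].journal_seq = 0;
		}
}
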
+
+static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
+{
+ struct journal_keys_to_wb dst;
+ int ret = 0;
+
+ bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
+
+ for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) {
+ jset_entry_for_each_key(entry, k) {
+ ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k);
+ if (ret)
+ goto out;
+ }
+
+ entry->type = BCH_JSET_ENTRY_btree_keys;
+ }
+out:
+ ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret;
+ return ret;
+}
+
+static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq)
+{
+ struct journal *j = &c->journal;
+ struct journal_buf *buf;
+ bool blocked;
+ int ret = 0;
+
+ while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq, &blocked))) {
+ ret = bch2_journal_keys_to_write_buffer(c, buf);
+
+ if (!blocked && !ret) {
+ spin_lock(&j->lock);
+ buf->need_flush_to_write_buffer = false;
+ spin_unlock(&j->lock);
+ }
+
+ mutex_unlock(&j->buf_lock);
+
+ if (blocked) {
+ bch2_journal_unblock(j);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 max_seq,
+ bool *did_work)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ int ret = 0, fetch_from_journal_err;
+
+ do {
+ bch2_trans_unlock(trans);
+
+ fetch_from_journal_err = fetch_wb_keys_from_journal(c, max_seq);
+
+ *did_work |= wb->inc.keys.nr || wb->flushing.keys.nr;
+
+ /*
+ * On memory allocation failure, bch2_btree_write_buffer_flush_locked()
+ * is not guaranteed to empty wb->inc:
+ */
+ mutex_lock(&wb->flushing.lock);
+ ret = bch2_btree_write_buffer_flush_locked(trans);
+ mutex_unlock(&wb->flushing.lock);
+ } while (!ret &&
+ (fetch_from_journal_err ||
+ (wb->inc.pin.seq && wb->inc.pin.seq <= max_seq) ||
+ (wb->flushing.pin.seq && wb->flushing.pin.seq <= max_seq)));
+
+ return ret;
+}
+
+static int bch2_btree_write_buffer_journal_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ bool did_work = false;
+
+ return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq, &did_work));
+}
+
+int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ bool did_work = false;
+
+ trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_);
+
+ return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal), &did_work);
+}
+
+/*
+ * The write buffer requires flushing when going RO: keys in the journal for the
+ * write buffer don't have a journal pin yet
+ */
+bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *c)
+{
+ if (bch2_journal_error(&c->journal))
+ return false;
+
+ bool did_work = false;
+ bch2_trans_run(c, btree_write_buffer_flush_seq(trans,
+ journal_cur_seq(&c->journal), &did_work));
+ return did_work;
+}
+
+int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ int ret = 0;
+
+ if (mutex_trylock(&wb->flushing.lock)) {
+ ret = bch2_btree_write_buffer_flush_locked(trans);
+ mutex_unlock(&wb->flushing.lock);
+ }
+
+ return ret;
+}
+
+int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer))
+ return -BCH_ERR_erofs_no_writes;
+
+ int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans);
+ bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+ return ret;
+}
+
+/*
+ * In check and repair code, when checking references to write buffer btrees we
+ * need to issue a flush before we can report a definitive error: this issues a
+ * flush if the referring key is one we haven't yet checked.
+ */
+int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans,
+ struct bkey_s_c referring_k,
+ struct bkey_buf *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_buf tmp;
+ int ret = 0;
+
+ bch2_bkey_buf_init(&tmp);
+
+ if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) {
+ bch2_bkey_buf_reassemble(&tmp, c, referring_k);
+
+ if (bkey_is_btree_ptr(referring_k.k)) {
+ bch2_trans_unlock(trans);
+ bch2_btree_interior_updates_flush(c);
+ }
+
+ ret = bch2_btree_write_buffer_flush_sync(trans);
+ if (ret)
+ goto err;
+
+ bch2_bkey_buf_copy(last_flushed, c, tmp.k);
+ ret = -BCH_ERR_transaction_restart_write_buffer_flush;
+ }
+err:
+ bch2_bkey_buf_exit(&tmp, c);
+ return ret;
+}
+
+static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work);
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ int ret;
+
+ mutex_lock(&wb->flushing.lock);
+ do {
+ ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans));
+ } while (!ret && bch2_btree_write_buffer_should_flush(c));
+ mutex_unlock(&wb->flushing.lock);
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+}
+
+static void wb_accounting_sort(struct btree_write_buffer *wb)
+{
+ eytzinger0_sort(wb->accounting.data, wb->accounting.nr,
+ sizeof(wb->accounting.data[0]),
+ wb_key_cmp, NULL);
+}
+
+int bch2_accounting_key_to_wb_slowpath(struct bch_fs *c, enum btree_id btree,
+ struct bkey_i_accounting *k)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ struct btree_write_buffered_key new = { .btree = btree };
+
+ bkey_copy(&new.k, &k->k_i);
+
+ int ret = darray_push(&wb->accounting, new);
+ if (ret)
+ return ret;
+
+ wb_accounting_sort(wb);
+ return 0;
+}
+
+int bch2_journal_key_to_wb_slowpath(struct bch_fs *c,
+ struct journal_keys_to_wb *dst,
+ enum btree_id btree, struct bkey_i *k)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ int ret;
+retry:
+ ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL);
+ if (!ret && dst->wb == &wb->flushing)
+ ret = darray_resize(&wb->sorted, wb->flushing.keys.size);
+
+ if (unlikely(ret)) {
+ if (dst->wb == &c->btree_write_buffer.flushing) {
+ mutex_unlock(&dst->wb->lock);
+ dst->wb = &c->btree_write_buffer.inc;
+ bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin,
+ bch2_btree_write_buffer_journal_flush);
+ goto retry;
+ }
+
+ return ret;
+ }
+
+ dst->room = darray_room(dst->wb->keys);
+ if (dst->wb == &wb->flushing)
+ dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
+ BUG_ON(!dst->room);
+ BUG_ON(!dst->seq);
+
+ struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
+ wb_k->journal_seq = dst->seq;
+ wb_k->btree = btree;
+ bkey_copy(&wb_k->k, k);
+ dst->wb->keys.nr++;
+ dst->room--;
+ return 0;
+}
+
+void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ if (mutex_trylock(&wb->flushing.lock)) {
+ mutex_lock(&wb->inc.lock);
+ move_keys_from_inc_to_flushing(wb);
+
+ /*
+ * Attempt to skip wb->inc, and add keys directly to
+ * wb->flushing, saving us a copy later:
+ */
+
+ if (!wb->inc.keys.nr) {
+ dst->wb = &wb->flushing;
+ } else {
+ mutex_unlock(&wb->flushing.lock);
+ dst->wb = &wb->inc;
+ }
+ } else {
+ mutex_lock(&wb->inc.lock);
+ dst->wb = &wb->inc;
+ }
+
+ dst->room = darray_room(dst->wb->keys);
+ if (dst->wb == &wb->flushing)
+ dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
+ dst->seq = seq;
+
+ bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin,
+ bch2_btree_write_buffer_journal_flush);
+
+ darray_for_each(wb->accounting, i)
+ memset(&i->k.v, 0, bkey_val_bytes(&i->k.k));
+}
+
+int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ unsigned live_accounting_keys = 0;
+ int ret = 0;
+
+ darray_for_each(wb->accounting, i)
+ if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&i->k))) {
+ i->journal_seq = dst->seq;
+ live_accounting_keys++;
+ ret = __bch2_journal_key_to_wb(c, dst, i->btree, &i->k);
+ if (ret)
+ break;
+ }
+
+ if (live_accounting_keys * 2 < wb->accounting.nr) {
+ struct btree_write_buffered_key *dst = wb->accounting.data;
+
+ darray_for_each(wb->accounting, src)
+ if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&src->k)))
+ *dst++ = *src;
+ wb->accounting.nr = dst - wb->accounting.data;
+ wb_accounting_sort(wb);
+ }
+
+ if (!dst->wb->keys.nr)
+ bch2_journal_pin_drop(&c->journal, &dst->wb->pin);
+
+ if (bch2_btree_write_buffer_should_flush(c) &&
+ __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) &&
+ !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+
+ if (dst->wb == &wb->flushing)
+ mutex_unlock(&wb->flushing.lock);
+ mutex_unlock(&wb->inc.lock);
+
+ return ret;
+}
+
+static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size)
+{
+ if (wb->keys.size >= new_size)
+ return 0;
+
+ if (!mutex_trylock(&wb->lock))
+ return -EINTR;
+
+ int ret = darray_resize(&wb->keys, new_size);
+ mutex_unlock(&wb->lock);
+ return ret;
+}
+
+int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ return wb_keys_resize(&wb->flushing, new_size) ?:
+ wb_keys_resize(&wb->inc, new_size);
+}
+
+void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) &&
+ !bch2_journal_error(&c->journal));
+
+ darray_exit(&wb->accounting);
+ darray_exit(&wb->sorted);
+ darray_exit(&wb->flushing.keys);
+ darray_exit(&wb->inc.keys);
+}
+
+int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ mutex_init(&wb->inc.lock);
+ mutex_init(&wb->flushing.lock);
+ INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work);
+
+ /* Will be resized by journal as needed: */
+ unsigned initial_size = 1 << 16;
+
+ return darray_make_room(&wb->inc.keys, initial_size) ?:
+ darray_make_room(&wb->flushing.keys, initial_size) ?:
+ darray_make_room(&wb->sorted, initial_size);
+}
diff --git a/libbcachefs/btree_write_buffer.h b/libbcachefs/btree_write_buffer.h
new file mode 100644
index 00000000..d535cea2
--- /dev/null
+++ b/libbcachefs/btree_write_buffer.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H
+#define _BCACHEFS_BTREE_WRITE_BUFFER_H
+
+#include "bkey.h"
+#include "disk_accounting.h"
+
+static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4;
+}
+
+static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4;
+}
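
For a sense of scale: with the initial allocation of 1 << 16 entries made in bch2_fs_btree_write_buffer_init() (and assuming the journal hasn't resized the buffers yet), bch2_btree_write_buffer_should_flush() fires once inc and flushing together hold more than 16384 buffered keys, and bch2_btree_write_buffer_must_wait() signals back-pressure once the incoming buffer alone exceeds 49152 keys.
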
+
+struct btree_trans;
+int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
+bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *);
+int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *);
+int bch2_btree_write_buffer_tryflush(struct btree_trans *);
+
+struct bkey_buf;
+int bch2_btree_write_buffer_maybe_flush(struct btree_trans *, struct bkey_s_c, struct bkey_buf *);
+
+struct journal_keys_to_wb {
+ struct btree_write_buffer_keys *wb;
+ size_t room;
+ u64 seq;
+};
+
+static inline int wb_key_cmp(const void *_l, const void *_r)
+{
+ const struct btree_write_buffered_key *l = _l;
+ const struct btree_write_buffered_key *r = _r;
+
+ return cmp_int(l->btree, r->btree) ?: bpos_cmp(l->k.k.p, r->k.k.p);
+}
+
+int bch2_accounting_key_to_wb_slowpath(struct bch_fs *,
+ enum btree_id, struct bkey_i_accounting *);
+
+static inline int bch2_accounting_key_to_wb(struct bch_fs *c,
+ enum btree_id btree, struct bkey_i_accounting *k)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ struct btree_write_buffered_key search;
+ search.btree = btree;
+ search.k.k.p = k->k.p;
+
+ unsigned idx = eytzinger0_find(wb->accounting.data, wb->accounting.nr,
+ sizeof(wb->accounting.data[0]),
+ wb_key_cmp, &search);
+
+ if (idx >= wb->accounting.nr)
+ return bch2_accounting_key_to_wb_slowpath(c, btree, k);
+
+ struct bkey_i_accounting *dst = bkey_i_to_accounting(&wb->accounting.data[idx].k);
+ bch2_accounting_accumulate(dst, accounting_i_to_s_c(k));
+ return 0;
+}
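
The fast path above can accumulate in place because accounting keys carry deltas: two updates to the same counter merge by summation instead of occupying two write buffer slots, which is also why the unflushable-accounting case in the flush path shouldn't overflow the buffer. A minimal sketch of that idea, with hypothetical simplified types standing in for bch_accounting (the real accumulation is bch2_accounting_accumulate()):

struct demo_counter {
	unsigned long	id;	/* which counter this delta applies to */
	long		d[2];	/* delta values, e.g. sectors and buckets */
};

/* Merging two deltas to the same counter: just sum the components. */
static void demo_accumulate(struct demo_counter *dst, const struct demo_counter *src)
{
	for (unsigned i = 0; i < 2; i++)
		dst->d[i] += src->d[i];
}
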
+
+int bch2_journal_key_to_wb_slowpath(struct bch_fs *,
+ struct journal_keys_to_wb *,
+ enum btree_id, struct bkey_i *);
+
+static inline int __bch2_journal_key_to_wb(struct bch_fs *c,
+ struct journal_keys_to_wb *dst,
+ enum btree_id btree, struct bkey_i *k)
+{
+ if (unlikely(!dst->room))
+ return bch2_journal_key_to_wb_slowpath(c, dst, btree, k);
+
+ struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
+ wb_k->journal_seq = dst->seq;
+ wb_k->btree = btree;
+ bkey_copy(&wb_k->k, k);
+ dst->wb->keys.nr++;
+ dst->room--;
+ return 0;
+}
+
+static inline int bch2_journal_key_to_wb(struct bch_fs *c,
+ struct journal_keys_to_wb *dst,
+ enum btree_id btree, struct bkey_i *k)
+{
+ EBUG_ON(!dst->seq);
+
+ return k->k.type == KEY_TYPE_accounting
+ ? bch2_accounting_key_to_wb(c, btree, bkey_i_to_accounting(k))
+ : __bch2_journal_key_to_wb(c, dst, btree, k);
+}
+
+void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64);
+int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *);
+
+int bch2_btree_write_buffer_resize(struct bch_fs *, size_t);
+void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
+int bch2_fs_btree_write_buffer_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */
diff --git a/libbcachefs/btree_write_buffer_types.h b/libbcachefs/btree_write_buffer_types.h
new file mode 100644
index 00000000..e9e76e20
--- /dev/null
+++ b/libbcachefs/btree_write_buffer_types.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
+#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
+
+#include "darray.h"
+#include "journal_types.h"
+
+#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4
+#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX)
+
+struct wb_key_ref {
+union {
+ struct {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ unsigned idx:24;
+ u8 pos[sizeof(struct bpos)];
+ enum btree_id btree:8;
+#else
+ enum btree_id btree:8;
+ u8 pos[sizeof(struct bpos)];
+ unsigned idx:24;
+#endif
+ } __packed;
+ struct {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ u64 lo;
+ u64 mi;
+ u64 hi;
+#else
+ u64 hi;
+ u64 mi;
+ u64 lo;
+#endif
+ };
+};
+};
+
+struct btree_write_buffered_key {
+ enum btree_id btree:8;
+ u64 journal_seq:56;
+ __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
+};
+
+struct btree_write_buffer_keys {
+ DARRAY(struct btree_write_buffered_key) keys;
+ struct journal_entry_pin pin;
+ struct mutex lock;
+};
+
+struct btree_write_buffer {
+ DARRAY(struct wb_key_ref) sorted;
+ struct btree_write_buffer_keys inc;
+ struct btree_write_buffer_keys flushing;
+ struct work_struct flush_work;
+
+ DARRAY(struct btree_write_buffered_key) accounting;
+};
+
+#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
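
A note on wb_key_ref: the flush path fills in idx, btree and a raw copy of the bpos through the first view, then sorts by comparing hi/mi/lo through the second. On little endian this works because, read as a 24-byte integer, the most significant byte is the btree id, followed by the bpos fields, with idx in the low 24 bits that wb_key_eq() masks off - assuming struct bpos stores snapshot, offset, inode in ascending address order (an assumption here; its definition lives elsewhere). A self-contained sketch of the same trick, where demo_pos and demo_ref are stand-ins rather than the real definitions:

#include <stdint.h>
#include <string.h>

/* 20-byte stand-in for struct bpos, least significant field first: */
struct demo_pos {
	uint32_t	snapshot;
	uint64_t	offset;
	uint64_t	inode;
} __attribute__((packed));

struct demo_ref {
	union {
		struct {
			unsigned	idx:24;			/* least significant */
			uint8_t		pos[sizeof(struct demo_pos)];
			uint8_t		btree;			/* most significant */
		} __attribute__((packed));
		struct { uint64_t lo, mi, hi; };		/* little endian only */
	};
};

static void demo_fill(struct demo_ref *ref, unsigned idx, uint8_t btree, struct demo_pos pos)
{
	ref->idx	= idx;
	ref->btree	= btree;
	memcpy(ref->pos, &pos, sizeof(pos));	/* raw bpos bytes */
}

/* Same ordering as __wb_key_ref_cmp(): hi, then mi, then lo - i.e. btree,
 * then inode, offset, snapshot, with idx only breaking ties. */
static int demo_ref_ge(const struct demo_ref *l, const struct demo_ref *r)
{
	return l->hi != r->hi ? l->hi > r->hi :
	       l->mi != r->mi ? l->mi > r->mi :
	       l->lo >= r->lo;
}
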
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 43112445..afd35c93 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -1,970 +1,1361 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Code for manipulating bucket marks for garbage collection.
*
* Copyright 2014 Datera, Inc.
- *
- * Bucket states:
- * - free bucket: mark == 0
- * The bucket contains no data and will not be read
- *
- * - allocator bucket: owned_by_allocator == 1
- * The bucket is on a free list, or it is an open bucket
- *
- * - cached bucket: owned_by_allocator == 0 &&
- * dirty_sectors == 0 &&
- * cached_sectors > 0
- * The bucket contains data but may be safely discarded as there are
- * enough replicas of the data on other cache devices, or it has been
- * written back to the backing device
- *
- * - dirty bucket: owned_by_allocator == 0 &&
- * dirty_sectors > 0
- * The bucket contains data that we must not discard (either only copy,
- * or one of the 'main copies' for data requiring multiple replicas)
- *
- * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1
- * This is a btree node, journal or gen/prio bucket
- *
- * Lifecycle:
- *
- * bucket invalidated => bucket on freelist => open bucket =>
- * [dirty bucket =>] cached bucket => bucket invalidated => ...
- *
- * Note that cache promotion can skip the dirty bucket step, as data
- * is copied from a deeper tier to a shallower tier, onto a cached
- * bucket.
- * Note also that a cached bucket can spontaneously become dirty --
- * see below.
- *
- * Only a traversal of the key space can determine whether a bucket is
- * truly dirty or cached.
- *
- * Transitions:
- *
- * - free => allocator: bucket was invalidated
- * - cached => allocator: bucket was invalidated
- *
- * - allocator => dirty: open bucket was filled up
- * - allocator => cached: open bucket was filled up
- * - allocator => metadata: metadata was allocated
- *
- * - dirty => cached: dirty sectors were copied to a deeper tier
- * - dirty => free: dirty sectors were overwritten or moved (copy gc)
- * - cached => free: cached sectors were overwritten
- *
- * - metadata => free: metadata was freed
- *
- * Oddities:
- * - cached => dirty: a device was removed so formerly replicated data
- * is no longer sufficiently replicated
- * - free => cached: cannot happen
- * - free => dirty: cannot happen
- * - free => metadata: cannot happen
*/
#include "bcachefs.h"
-#include "alloc.h"
+#include "alloc_background.h"
+#include "backpointers.h"
+#include "bset.h"
#include "btree_gc.h"
+#include "btree_update.h"
#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
+#include "disk_accounting.h"
+#include "ec.h"
#include "error.h"
+#include "inode.h"
#include "movinggc.h"
+#include "rebalance.h"
+#include "recovery.h"
+#include "recovery_passes.h"
+#include "reflink.h"
+#include "replicas.h"
+#include "subvolume.h"
+#include "trace.h"
#include <linux/preempt.h>
-#include <trace/events/bcachefs.h>
-#ifdef DEBUG_BUCKETS
+void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
+{
+ memset(usage, 0, sizeof(*usage));
+ acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, dev_usage_u64s());
+}
-#define lg_local_lock lg_global_lock
-#define lg_local_unlock lg_global_unlock
+static u64 reserve_factor(u64 r)
+{
+ return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
+}
-static void bch2_fs_stats_verify(struct bch_fs *c)
+static struct bch_fs_usage_short
+__bch2_fs_usage_read_short(struct bch_fs *c)
{
- struct bch_fs_usage stats =
- __bch2_fs_usage_read(c);
- unsigned i;
+ struct bch_fs_usage_short ret;
+ u64 data, reserved;
- for (i = 0; i < ARRAY_SIZE(stats.s); i++) {
- if ((s64) stats.s[i].data[S_META] < 0)
- panic("replicas %u meta underflow: %lli\n",
- i + 1, stats.s[i].data[S_META]);
+ ret.capacity = c->capacity -
+ percpu_u64_get(&c->usage->hidden);
- if ((s64) stats.s[i].data[S_DIRTY] < 0)
- panic("replicas %u dirty underflow: %lli\n",
- i + 1, stats.s[i].data[S_DIRTY]);
+ data = percpu_u64_get(&c->usage->data) +
+ percpu_u64_get(&c->usage->btree);
+ reserved = percpu_u64_get(&c->usage->reserved) +
+ percpu_u64_get(c->online_reserved);
- if ((s64) stats.s[i].persistent_reserved < 0)
- panic("replicas %u reserved underflow: %lli\n",
- i + 1, stats.s[i].persistent_reserved);
- }
+ ret.used = min(ret.capacity, data + reserve_factor(reserved));
+ ret.free = ret.capacity - ret.used;
- if ((s64) stats.online_reserved < 0)
- panic("sectors_online_reserved underflow: %lli\n",
- stats.online_reserved);
+ ret.nr_inodes = percpu_u64_get(&c->usage->nr_inodes);
+
+ return ret;
}
-static void bch2_dev_stats_verify(struct bch_dev *ca)
+struct bch_fs_usage_short
+bch2_fs_usage_read_short(struct bch_fs *c)
{
- struct bch_dev_usage stats =
- __bch2_dev_usage_read(ca);
- u64 n = ca->mi.nbuckets - ca->mi.first_bucket;
- unsigned i;
+ struct bch_fs_usage_short ret;
+
+ percpu_down_read(&c->mark_lock);
+ ret = __bch2_fs_usage_read_short(c);
+ percpu_up_read(&c->mark_lock);
- for (i = 0; i < ARRAY_SIZE(stats.buckets); i++)
- BUG_ON(stats.buckets[i] > n);
- BUG_ON(stats.buckets_alloc > n);
- BUG_ON(stats.buckets_unavailable > n);
+ return ret;
}
-static void bch2_disk_reservations_verify(struct bch_fs *c, int flags)
+void bch2_dev_usage_to_text(struct printbuf *out,
+ struct bch_dev *ca,
+ struct bch_dev_usage *usage)
{
- if (!(flags & BCH_DISK_RESERVATION_NOFAIL)) {
- u64 used = __bch2_fs_sectors_used(c);
- u64 cached = 0;
- u64 avail = atomic64_read(&c->sectors_available);
- int cpu;
+ if (out->nr_tabstops < 5) {
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 12);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ }
- for_each_possible_cpu(cpu)
- cached += per_cpu_ptr(c->usage_percpu, cpu)->available_cache;
+ prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n");
- if (used + avail + cached > c->capacity)
- panic("used %llu avail %llu cached %llu capacity %llu\n",
- used, avail, cached, c->capacity);
+ for (unsigned i = 0; i < BCH_DATA_NR; i++) {
+ bch2_prt_data_type(out, i);
+ prt_printf(out, "\t%llu\r%llu\r%llu\r\n",
+ usage->d[i].buckets,
+ usage->d[i].sectors,
+ usage->d[i].fragmented);
}
-}
-#else
+ prt_printf(out, "capacity\t%llu\r\n", ca->mi.nbuckets);
+}
-static void bch2_fs_stats_verify(struct bch_fs *c) {}
-static void bch2_dev_stats_verify(struct bch_dev *ca) {}
-static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) {}
+static int bch2_check_fix_ptr(struct btree_trans *trans,
+ struct bkey_s_c k,
+ struct extent_ptr_decoded p,
+ const union bch_extent_entry *entry,
+ bool *do_update)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
+ if (!ca) {
+ if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID,
+ trans, ptr_to_invalid_device,
+ "pointer to missing device %u\n"
+ "while marking %s",
+ p.ptr.dev,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+ return 0;
+ }
-#endif
+ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+ if (!g) {
+ if (fsck_err(trans, ptr_to_invalid_device,
+ "pointer to invalid bucket on device %u\n"
+ "while marking %s",
+ p.ptr.dev,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+ goto out;
+ }
-/*
- * Clear journal_seq_valid for buckets for which it's not needed, to prevent
- * wraparound:
- */
-void bch2_bucket_seq_cleanup(struct bch_fs *c)
-{
- u16 last_seq_ondisk = c->journal.last_seq_ondisk;
- struct bch_dev *ca;
- struct bucket_array *buckets;
- struct bucket *g;
- struct bucket_mark m;
- unsigned i;
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
+
+ if (fsck_err_on(!g->gen_valid,
+ trans, ptr_to_missing_alloc_key,
+ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ if (!p.ptr.cached) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ } else {
+ *do_update = true;
+ }
+ }
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
+ if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
+ trans, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ if (!p.ptr.cached &&
+ (g->data_type != BCH_DATA_btree ||
+ data_type == BCH_DATA_btree)) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ g->data_type = 0;
+ g->stripe_sectors = 0;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
+ } else {
+ *do_update = true;
+ }
+ }
- for_each_bucket(g, buckets) {
- bucket_cmpxchg(g, m, ({
- if (!m.journal_seq_valid ||
- bucket_needs_journal_commit(m, last_seq_ondisk))
- break;
+ if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
+ trans, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+
+ if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
+ trans, stale_dirty_ptr,
+ "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+
+ if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
+ goto out;
- m.journal_seq_valid = 0;
- }));
+ if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type),
+ trans, ptr_bucket_data_type_mismatch,
+ "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+ bch2_data_type_str(g->data_type),
+ bch2_data_type_str(data_type),
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ if (data_type == BCH_DATA_btree) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ g->data_type = data_type;
+ g->stripe_sectors = 0;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
+ } else {
+ *do_update = true;
}
- up_read(&ca->bucket_lock);
}
+
+ if (p.has_ec) {
+ struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
+
+ if (fsck_err_on(!m || !m->alive,
+ trans, ptr_to_missing_stripe,
+ "pointer to nonexistent stripe %llu\n"
+ "while marking %s",
+ (u64) p.ec.idx,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+
+ if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p),
+ trans, ptr_to_incorrect_stripe,
+ "pointer does not match stripe %llu\n"
+ "while marking %s",
+ (u64) p.ec.idx,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+ }
+out:
+fsck_err:
+ bch2_dev_put(ca);
+ printbuf_exit(&buf);
+ return ret;
}
-#define bch2_usage_add(_acc, _stats) \
-do { \
- typeof(_acc) _a = (_acc), _s = (_stats); \
- unsigned i; \
- \
- for (i = 0; i < sizeof(*_a) / sizeof(u64); i++) \
- ((u64 *) (_a))[i] += ((u64 *) (_s))[i]; \
-} while (0)
+int bch2_check_fix_ptrs(struct btree_trans *trans,
+ enum btree_id btree, unsigned level, struct bkey_s_c k,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry_c;
+ struct extent_ptr_decoded p = { 0 };
+ bool do_update = false;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ percpu_down_read(&c->mark_lock);
+
+ bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
+ ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update);
+ if (ret)
+ goto err;
+ }
-#define bch2_usage_read_raw(_stats) \
-({ \
- typeof(*this_cpu_ptr(_stats)) _acc; \
- int cpu; \
- \
- memset(&_acc, 0, sizeof(_acc)); \
- \
- for_each_possible_cpu(cpu) \
- bch2_usage_add(&_acc, per_cpu_ptr((_stats), cpu)); \
- \
- _acc; \
-})
+ if (do_update) {
+ if (flags & BTREE_TRIGGER_is_root) {
+ bch_err(c, "cannot update btree roots yet");
+ ret = -EINVAL;
+ goto err;
+ }
-#define bch2_usage_read_cached(_c, _cached, _uncached) \
-({ \
- typeof(_cached) _ret; \
- unsigned _seq; \
- \
- do { \
- _seq = read_seqcount_begin(&(_c)->gc_pos_lock); \
- _ret = (_c)->gc_pos.phase == GC_PHASE_DONE \
- ? bch2_usage_read_raw(_uncached) \
- : (_cached); \
- } while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq)); \
- \
- _ret; \
-})
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto err;
+
+ rcu_read_lock();
+ bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev));
+ rcu_read_unlock();
+
+ if (level) {
+ /*
+ * We don't want to drop btree node pointers - if the
+ * btree node isn't there anymore, the read path will
+ * sort it out:
+ */
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ rcu_read_lock();
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ struct bucket *g = PTR_GC_BUCKET(ca, ptr);
+
+ ptr->gen = g->gen;
+ }
+ rcu_read_unlock();
+ } else {
+ struct bkey_ptrs ptrs;
+ union bch_extent_entry *entry;
+
+ rcu_read_lock();
+restart_drop_ptrs:
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) {
+ struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
+ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry);
+
+ if ((p.ptr.cached &&
+ (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) ||
+ (!p.ptr.cached &&
+ gen_cmp(p.ptr.gen, g->gen) < 0) ||
+ gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX ||
+ (g->data_type &&
+ g->data_type != data_type)) {
+ bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr);
+ goto restart_drop_ptrs;
+ }
+ }
+ rcu_read_unlock();
+again:
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_extent_entry_for_each(ptrs, entry) {
+ if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
+ struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
+ entry->stripe_ptr.idx);
+ union bch_extent_entry *next_ptr;
+
+ bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
+ if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
+ goto found;
+ next_ptr = NULL;
+found:
+ if (!next_ptr) {
+ bch_err(c, "aieee, found stripe ptr with no data ptr");
+ continue;
+ }
+
+ if (!m || !m->alive ||
+ !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
+ &next_ptr->ptr,
+ m->sectors)) {
+ bch2_bkey_extent_entry_drop(new, entry);
+ goto again;
+ }
+ }
+ }
+ }
-struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca)
-{
- return bch2_usage_read_raw(ca->usage_percpu);
-}
+ if (0) {
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch_info(c, "updated %s", buf.buf);
-struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
-{
- return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu);
-}
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
+ bch_info(c, "new key %s", buf.buf);
+ }
-struct bch_fs_usage
-__bch2_fs_usage_read(struct bch_fs *c)
-{
- return bch2_usage_read_raw(c->usage_percpu);
+ percpu_up_read(&c->mark_lock);
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
+ BTREE_ITER_intent|BTREE_ITER_all_snapshots);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, new,
+ BTREE_UPDATE_internal_snapshot_node|
+ BTREE_TRIGGER_norun);
+ bch2_trans_iter_exit(trans, &iter);
+ percpu_down_read(&c->mark_lock);
+
+ if (ret)
+ goto err;
+
+ if (level)
+ bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
+ }
+err:
+ percpu_up_read(&c->mark_lock);
+ printbuf_exit(&buf);
+ return ret;
}
-struct bch_fs_usage
-bch2_fs_usage_read(struct bch_fs *c)
+int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
+ struct bkey_s_c k,
+ const struct bch_extent_ptr *ptr,
+ s64 sectors, enum bch_data_type ptr_data_type,
+ u8 b_gen, u8 bucket_data_type,
+ u32 *bucket_sectors)
{
- return bch2_usage_read_cached(c,
- c->usage_cached,
- c->usage_percpu);
-}
+ struct bch_fs *c = trans->c;
+ size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
+ struct printbuf buf = PRINTBUF;
+ bool inserting = sectors > 0;
+ int ret = 0;
+
+ BUG_ON(!sectors);
+
+ if (gen_after(ptr->gen, b_gen)) {
+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
+ log_fsck_err(trans, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
+ "while marking %s",
+ ptr->dev, bucket_nr, b_gen,
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
+ ptr->gen,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ if (inserting)
+ goto err;
+ goto out;
+ }
-struct fs_usage_sum {
- u64 data;
- u64 reserved;
-};
+ if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
+ log_fsck_err(trans, ptr_too_stale,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ "while marking %s",
+ ptr->dev, bucket_nr, b_gen,
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
+ ptr->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ if (inserting)
+ goto err;
+ goto out;
+ }
-static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats)
-{
- struct fs_usage_sum sum = { 0 };
- unsigned i;
+ if (b_gen != ptr->gen && ptr->cached) {
+ ret = 1;
+ goto out;
+ }
- for (i = 0; i < ARRAY_SIZE(stats.s); i++) {
- sum.data += (stats.s[i].data[S_META] +
- stats.s[i].data[S_DIRTY]) * (i + 1);
- sum.reserved += stats.s[i].persistent_reserved * (i + 1);
+ if (b_gen != ptr->gen) {
+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
+ log_fsck_err(trans, stale_dirty_ptr,
+ "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
+ "while marking %s",
+ ptr->dev, bucket_nr, b_gen,
+ bucket_gen_get(ca, bucket_nr),
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
+ ptr->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ if (inserting)
+ goto err;
+ goto out;
}
- sum.reserved += stats.online_reserved;
- return sum;
-}
+ if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) {
+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
+ log_fsck_err(trans, ptr_bucket_data_type_mismatch,
+ "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
+ "while marking %s",
+ ptr->dev, bucket_nr, b_gen,
+ bch2_data_type_str(bucket_data_type),
+ bch2_data_type_str(ptr_data_type),
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ if (inserting)
+ goto err;
+ goto out;
+ }
-#define RESERVE_FACTOR 6
+ if ((u64) *bucket_sectors + sectors > U32_MAX) {
+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
+ log_fsck_err(trans, bucket_sector_count_overflow,
+ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
+ "while marking %s",
+ ptr->dev, bucket_nr, b_gen,
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
+ *bucket_sectors, sectors,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ if (inserting)
+ goto err;
+ sectors = -*bucket_sectors;
+ }
-static u64 reserve_factor(u64 r)
-{
- return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
+ *bucket_sectors += sectors;
+out:
+ printbuf_exit(&buf);
+ return ret;
+err:
+fsck_err:
+ bch2_dump_trans_updates(trans);
+ bch2_inconsistent_error(c);
+ ret = -BCH_ERR_bucket_ref_update;
+ goto out;
}
-static u64 avail_factor(u64 r)
+void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
{
- return (r << RESERVE_FACTOR) / (1 << RESERVE_FACTOR) + 1;
-}
+ struct bch_fs *c = trans->c;
+ u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
+ static int warned_disk_usage = 0;
+ bool warn = false;
-u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
-{
- struct fs_usage_sum sum = __fs_usage_sum(stats);
+ percpu_down_read(&c->mark_lock);
+ struct bch_fs_usage_base *src = &trans->fs_usage_delta;
- return sum.data + reserve_factor(sum.reserved);
-}
+ s64 added = src->btree + src->data + src->reserved;
-u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
-{
- return min(c->capacity, __bch2_fs_sectors_used(c, stats));
-}
+ /*
+ * Not allowed to reduce sectors_available except by getting a
+ * reservation:
+ */
+ s64 should_not_have_added = added - (s64) disk_res_sectors;
+ if (unlikely(should_not_have_added > 0)) {
+ u64 old, new;
+
+ old = atomic64_read(&c->sectors_available);
+ do {
+ new = max_t(s64, 0, old - should_not_have_added);
+ } while (!atomic64_try_cmpxchg(&c->sectors_available,
+ &old, new));
+
+ added -= should_not_have_added;
+ warn = true;
+ }
-u64 bch2_fs_sectors_free(struct bch_fs *c, struct bch_fs_usage stats)
-{
- return avail_factor(c->capacity - bch2_fs_sectors_used(c, stats));
-}
+ if (added > 0) {
+ trans->disk_res->sectors -= added;
+ this_cpu_sub(*c->online_reserved, added);
+ }
-static inline int is_unavailable_bucket(struct bucket_mark m)
-{
- return !is_available_bucket(m);
+ preempt_disable();
+ struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
+ acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64));
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
+
+ if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
+ bch2_trans_inconsistent(trans,
+ "disk usage increased %lli more than %llu sectors reserved)",
+ should_not_have_added, disk_res_sectors);
}
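
/*
 * Editor's note - an illustrative walk-through of the clamp above, not part
 * of the patch.  Suppose the transaction reserved disk_res_sectors = 8 but
 * its usage delta adds 20 sectors of btree + data + reserved:
 *
 *   added                 = 20
 *   should_not_have_added = 20 - 8 = 12    (> 0, so we warn once)
 *   c->sectors_available  = max(0, old - 12), applied via atomic64_try_cmpxchg
 *   added                 = 20 - 12 = 8    only the genuinely reserved portion
 *                                          is charged to disk_res / online_reserved
 */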
-static inline int is_fragmented_bucket(struct bucket_mark m,
- struct bch_dev *ca)
+/* KEY_TYPE_extent: */
+
+static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca,
+ struct bkey_s_c k,
+ const struct extent_ptr_decoded *p,
+ s64 sectors, enum bch_data_type ptr_data_type,
+ struct bch_alloc_v4 *a)
{
- if (!m.owned_by_allocator &&
- m.data_type == BCH_DATA_USER &&
- bucket_sectors_used(m))
- return max_t(int, 0, (int) ca->mi.bucket_size -
- bucket_sectors_used(m));
+ u32 *dst_sectors = p->has_ec ? &a->stripe_sectors :
+ !p->ptr.cached ? &a->dirty_sectors :
+ &a->cached_sectors;
+ int ret = bch2_bucket_ref_update(trans, ca, k, &p->ptr, sectors, ptr_data_type,
+ a->gen, a->data_type, dst_sectors);
+
+ if (ret)
+ return ret;
+
+ alloc_data_type_set(a, ptr_data_type);
return 0;
}
-static inline enum bch_data_type bucket_type(struct bucket_mark m)
+static int bch2_trigger_pointer(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, struct extent_ptr_decoded p,
+ const union bch_extent_entry *entry,
+ s64 *sectors,
+ enum btree_iter_update_trigger_flags flags)
{
- return m.cached_sectors && !m.dirty_sectors
- ? BCH_DATA_CACHED
- : m.data_type;
-}
+ struct bch_fs *c = trans->c;
+ bool insert = !(flags & BTREE_TRIGGER_overwrite);
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ u64 abs_sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p);
+ *sectors = insert ? abs_sectors : -abs_sectors;
+
+ struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
+ if (unlikely(!ca)) {
+ if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID)
+ ret = -BCH_ERR_trigger_pointer;
+ goto err;
+ }
-static bool bucket_became_unavailable(struct bch_fs *c,
- struct bucket_mark old,
- struct bucket_mark new)
-{
- return is_available_bucket(old) &&
- !is_available_bucket(new) &&
- (!c || c->gc_pos.phase == GC_PHASE_DONE);
+ struct bpos bucket;
+ struct bkey_i_backpointer bp;
+ __bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp, abs_sectors);
+
+ if (flags & BTREE_TRIGGER_transactional) {
+ struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0);
+ ret = PTR_ERR_OR_ZERO(a) ?:
+ __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v);
+ if (ret)
+ goto err;
+
+ if (!p.ptr.cached) {
+ ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert);
+ if (ret)
+ goto err;
+ }
+ }
+
+ if (flags & BTREE_TRIGGER_gc) {
+ percpu_down_read(&c->mark_lock);
+ struct bucket *g = gc_bucket(ca, bucket.offset);
+ if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
+ p.ptr.dev,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = -BCH_ERR_trigger_pointer;
+ goto err_unlock;
+ }
+
+ bucket_lock(g);
+ struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
+ ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new);
+ alloc_to_bucket(g, new);
+ bucket_unlock(g);
+err_unlock:
+ percpu_up_read(&c->mark_lock);
+
+ if (!ret)
+ ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
+ }
+err:
+ bch2_dev_put(ca);
+ printbuf_exit(&buf);
+ return ret;
}
-void bch2_fs_usage_apply(struct bch_fs *c,
- struct bch_fs_usage *stats,
- struct disk_reservation *disk_res,
- struct gc_pos gc_pos)
+static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
+ struct bkey_s_c k,
+ struct extent_ptr_decoded p,
+ enum bch_data_type data_type,
+ s64 sectors,
+ enum btree_iter_update_trigger_flags flags)
{
- struct fs_usage_sum sum = __fs_usage_sum(*stats);
- s64 added = sum.data + sum.reserved;
+ if (flags & BTREE_TRIGGER_transactional) {
+ struct btree_iter iter;
+ struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_stripes, POS(0, p.ec.idx),
+ BTREE_ITER_with_updates, stripe);
+ int ret = PTR_ERR_OR_ZERO(s);
+ if (unlikely(ret)) {
+ bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
+ "pointer to nonexistent stripe %llu",
+ (u64) p.ec.idx);
+ goto err;
+ }
- /*
- * Not allowed to reduce sectors_available except by getting a
- * reservation:
- */
- BUG_ON(added > (s64) (disk_res ? disk_res->sectors : 0));
+ if (!bch2_ptr_matches_stripe(&s->v, p)) {
+ bch2_trans_inconsistent(trans,
+ "stripe pointer doesn't match stripe %llu",
+ (u64) p.ec.idx);
+ ret = -BCH_ERR_trigger_stripe_pointer;
+ goto err;
+ }
- if (added > 0) {
- disk_res->sectors -= added;
- stats->online_reserved -= added;
+ stripe_blockcount_set(&s->v, p.ec.block,
+ stripe_blockcount_get(&s->v, p.ec.block) +
+ sectors);
+
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+ bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
+ acc.replicas.data_type = data_type;
+ ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
- percpu_down_read_preempt_disable(&c->usage_lock);
- /* online_reserved not subject to gc: */
- this_cpu_ptr(c->usage_percpu)->online_reserved +=
- stats->online_reserved;
- stats->online_reserved = 0;
+ if (flags & BTREE_TRIGGER_gc) {
+ struct bch_fs *c = trans->c;
+
+ struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL);
+ if (!m) {
+ bch_err(c, "error allocating memory for gc_stripes, idx %llu",
+ (u64) p.ec.idx);
+ return -BCH_ERR_ENOMEM_mark_stripe_ptr;
+ }
+
+ mutex_lock(&c->ec_stripes_heap_lock);
+
+ if (!m || !m->alive) {
+ mutex_unlock(&c->ec_stripes_heap_lock);
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n while marking %s",
+ (u64) p.ec.idx, buf.buf);
+ printbuf_exit(&buf);
+ bch2_inconsistent_error(c);
+ return -BCH_ERR_trigger_stripe_pointer;
+ }
- if (!gc_will_visit(c, gc_pos))
- bch2_usage_add(this_cpu_ptr(c->usage_percpu), stats);
+ m->block_sectors[p.ec.block] += sectors;
- bch2_fs_stats_verify(c);
- percpu_up_read_preempt_enable(&c->usage_lock);
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+ memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e));
+ mutex_unlock(&c->ec_stripes_heap_lock);
- memset(stats, 0, sizeof(*stats));
+ acc.replicas.data_type = data_type;
+ int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, true);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
}
-static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
- struct bucket_mark old, struct bucket_mark new)
+static int __trigger_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k,
+ enum btree_iter_update_trigger_flags flags,
+ s64 *replicas_sectors)
{
- struct bch_dev_usage *dev_usage;
+ bool gc = flags & BTREE_TRIGGER_gc;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
+ ? BCH_DATA_btree
+ : BCH_DATA_user;
+ int ret = 0;
+
+ struct disk_accounting_pos acc_replicas_key = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ .replicas.data_type = data_type,
+ .replicas.nr_devs = 0,
+ .replicas.nr_required = 1,
+ };
+
+ struct disk_accounting_pos acct_compression_key = {
+ .type = BCH_DISK_ACCOUNTING_compression,
+ };
+ u64 compression_acct[3] = { 1, 0, 0 };
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ s64 disk_sectors = 0;
+ ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags);
+ if (ret < 0)
+ return ret;
+
+ bool stale = ret > 0;
+
+ if (p.ptr.cached && stale)
+ continue;
+
+ if (p.ptr.cached) {
+ ret = bch2_mod_dev_cached_sectors(trans, p.ptr.dev, disk_sectors, gc);
+ if (ret)
+ return ret;
+ } else if (!p.has_ec) {
+ *replicas_sectors += disk_sectors;
+ replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev);
+ } else {
+ ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
+ if (ret)
+ return ret;
+
+ /*
+ * There may be other dirty pointers in this extent, but
+ * if so they're not required for mounting if we have an
+ * erasure coded pointer in this extent:
+ */
+ acc_replicas_key.replicas.nr_required = 0;
+ }
- if (c)
- percpu_rwsem_assert_held(&c->usage_lock);
+ if (acct_compression_key.compression.type &&
+ acct_compression_key.compression.type != p.crc.compression_type) {
+ if (flags & BTREE_TRIGGER_overwrite)
+ bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
- if (old.data_type && new.data_type &&
- old.data_type != new.data_type) {
- BUG_ON(!c);
- bch2_fs_inconsistent(c,
- "different types of data in same bucket: %s, %s",
- bch2_data_types[old.data_type],
- bch2_data_types[new.data_type]);
+ ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct,
+ ARRAY_SIZE(compression_acct), gc);
+ if (ret)
+ return ret;
+
+ compression_acct[0] = 1;
+ compression_acct[1] = 0;
+ compression_acct[2] = 0;
+ }
+
+ acct_compression_key.compression.type = p.crc.compression_type;
+ if (p.crc.compression_type) {
+ compression_acct[1] += p.crc.uncompressed_size;
+ compression_acct[2] += p.crc.compressed_size;
+ }
}
- dev_usage = this_cpu_ptr(ca->usage_percpu);
+ if (acc_replicas_key.replicas.nr_devs) {
+ ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc);
+ if (ret)
+ return ret;
+ }
- dev_usage->buckets[bucket_type(old)]--;
- dev_usage->buckets[bucket_type(new)]++;
+ if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) {
+ struct disk_accounting_pos acc_snapshot_key = {
+ .type = BCH_DISK_ACCOUNTING_snapshot,
+ .snapshot.id = k.k->p.snapshot,
+ };
+ ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc);
+ if (ret)
+ return ret;
+ }
- dev_usage->buckets_alloc +=
- (int) new.owned_by_allocator - (int) old.owned_by_allocator;
- dev_usage->buckets_unavailable +=
- is_unavailable_bucket(new) - is_unavailable_bucket(old);
+ if (acct_compression_key.compression.type) {
+ if (flags & BTREE_TRIGGER_overwrite)
+ bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
- dev_usage->sectors[old.data_type] -= old.dirty_sectors;
- dev_usage->sectors[new.data_type] += new.dirty_sectors;
- dev_usage->sectors[BCH_DATA_CACHED] +=
- (int) new.cached_sectors - (int) old.cached_sectors;
- dev_usage->sectors_fragmented +=
- is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca);
+ ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct,
+ ARRAY_SIZE(compression_acct), gc);
+ if (ret)
+ return ret;
+ }
- if (!is_available_bucket(old) && is_available_bucket(new))
- bch2_wake_allocator(ca);
+ if (level) {
+ struct disk_accounting_pos acc_btree_key = {
+ .type = BCH_DISK_ACCOUNTING_btree,
+ .btree.id = btree_id,
+ };
+ ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc);
+ if (ret)
+ return ret;
+ } else {
+ bool insert = !(flags & BTREE_TRIGGER_overwrite);
+ struct disk_accounting_pos acc_inum_key = {
+ .type = BCH_DISK_ACCOUNTING_inum,
+ .inum.inum = k.k->p.inode,
+ };
+ s64 v[3] = {
+ insert ? 1 : -1,
+ insert ? k.k->size : -((s64) k.k->size),
+ *replicas_sectors,
+ };
+ ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc);
+ if (ret)
+ return ret;
+ }
- bch2_dev_stats_verify(ca);
+ return 0;
}
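
/*
 * Editor's note - illustrative only, not part of the patch.  For an extent
 * with two pointers, both using the same compression type with
 * crc.uncompressed_size = 128 and crc.compressed_size = 40, the loop above
 * accumulates compression_acct = { 1, 256, 80 } and flushes it once to the
 * BCH_DISK_ACCOUNTING_compression counter for that type (negated when the
 * trigger runs with BTREE_TRIGGER_overwrite).
 */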
-#define bucket_data_cmpxchg(c, ca, g, new, expr) \
-({ \
- struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
- \
- bch2_dev_usage_update(c, ca, _old, new); \
- _old; \
-})
-
-bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, struct bucket_mark *old)
+int bch2_trigger_extent(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ enum btree_iter_update_trigger_flags flags)
{
- struct bucket *g;
- struct bucket_mark new;
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
+ struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
+ unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
+ unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start;
+
+ if (unlikely(flags & BTREE_TRIGGER_check_repair))
+ return bch2_check_fix_ptrs(trans, btree, level, new.s_c, flags);
+
+ /* if pointers aren't changing - nothing to do: */
+ if (new_ptrs_bytes == old_ptrs_bytes &&
+ !memcmp(new_ptrs.start,
+ old_ptrs.start,
+ new_ptrs_bytes))
+ return 0;
+
+ if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
+ s64 old_replicas_sectors = 0, new_replicas_sectors = 0;
+
+ if (old.k->type) {
+ int ret = __trigger_extent(trans, btree, level, old,
+ flags & ~BTREE_TRIGGER_insert,
+ &old_replicas_sectors);
+ if (ret)
+ return ret;
+ }
+
+ if (new.k->type) {
+ int ret = __trigger_extent(trans, btree, level, new.s_c,
+ flags & ~BTREE_TRIGGER_overwrite,
+ &new_replicas_sectors);
+ if (ret)
+ return ret;
+ }
- percpu_rwsem_assert_held(&c->usage_lock);
+ int need_rebalance_delta = 0;
+ s64 need_rebalance_sectors_delta = 0;
- g = bucket(ca, b);
+ s64 s = bch2_bkey_sectors_need_rebalance(c, old);
+ need_rebalance_delta -= s != 0;
+ need_rebalance_sectors_delta -= s;
- *old = bucket_data_cmpxchg(c, ca, g, new, ({
- if (!is_available_bucket(new)) {
- percpu_up_read_preempt_enable(&c->usage_lock);
- return false;
+ s = bch2_bkey_sectors_need_rebalance(c, new.s_c);
+ need_rebalance_delta += s != 0;
+ need_rebalance_sectors_delta += s;
+
+ if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
+ int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
+ new.k->p, need_rebalance_delta > 0);
+ if (ret)
+ return ret;
}
- new.owned_by_allocator = 1;
- new.data_type = 0;
- new.cached_sectors = 0;
- new.dirty_sectors = 0;
- new.gen++;
- }));
+ if (need_rebalance_sectors_delta) {
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_rebalance_work,
+ };
+ int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1,
+ flags & BTREE_TRIGGER_gc);
+ if (ret)
+ return ret;
+ }
+ }
- if (!old->owned_by_allocator && old->cached_sectors)
- trace_invalidate(ca, bucket_to_sector(ca, b),
- old->cached_sectors);
- return true;
+ return 0;
}
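
/*
 * Editor's note - illustrative only, not part of the patch.  If the old
 * extent needed rebalancing (bch2_bkey_sectors_need_rebalance() returned,
 * say, 100) and the new one does not (returns 0), need_rebalance_delta ends
 * up -1 and need_rebalance_sectors_delta ends up -100: the key's bit in the
 * rebalance_work btree is cleared and 100 sectors are subtracted from the
 * rebalance_work accounting counter.
 */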
-void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, bool owned_by_allocator,
- struct gc_pos pos, unsigned flags)
+/* KEY_TYPE_reservation */
+
+static int __trigger_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level, struct bkey_s_c k,
+ enum btree_iter_update_trigger_flags flags)
{
- struct bucket *g;
- struct bucket_mark old, new;
+ if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
+ s64 sectors = k.k->size;
- percpu_rwsem_assert_held(&c->usage_lock);
- g = bucket(ca, b);
+ if (flags & BTREE_TRIGGER_overwrite)
+ sectors = -sectors;
- if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
- gc_will_visit(c, pos))
- return;
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_persistent_reserved,
+ .persistent_reserved.nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas,
+ };
- old = bucket_data_cmpxchg(c, ca, g, new, ({
- new.owned_by_allocator = owned_by_allocator;
- }));
+ return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, flags & BTREE_TRIGGER_gc);
+ }
- BUG_ON(!owned_by_allocator && !old.owned_by_allocator &&
- c->gc_pos.phase == GC_PHASE_DONE);
+ return 0;
}
-#define saturated_add(ca, dst, src, max) \
-do { \
- BUG_ON((int) (dst) + (src) < 0); \
- if ((dst) == (max)) \
- ; \
- else if ((dst) + (src) <= (max)) \
- dst += (src); \
- else { \
- dst = (max); \
- trace_sectors_saturated(ca); \
- } \
-} while (0)
-
-void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, enum bch_data_type type,
- unsigned sectors, struct gc_pos pos,
- unsigned flags)
+int bch2_trigger_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ enum btree_iter_update_trigger_flags flags)
{
- struct bucket *g;
- struct bucket_mark old, new;
+ return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags);
+}
- BUG_ON(!type);
+/* Mark superblocks: */
- if (likely(c)) {
- percpu_rwsem_assert_held(&c->usage_lock);
+static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
+ struct bch_dev *ca, u64 b,
+ enum bch_data_type type,
+ unsigned sectors)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ int ret = 0;
+
+ struct bkey_i_alloc_v4 *a =
+ bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(ca->dev_idx, b));
+ if (IS_ERR(a))
+ return PTR_ERR(a);
+
+ if (a->v.data_type && type && a->v.data_type != type) {
+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
+ log_fsck_err(trans, bucket_metadata_type_mismatch,
+ "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
+ "while marking %s",
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_type_str(a->v.data_type),
+ bch2_data_type_str(type),
+ bch2_data_type_str(type));
+ ret = -BCH_ERR_metadata_bucket_inconsistency;
+ goto err;
+ }
- if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
- gc_will_visit(c, pos))
- return;
+ if (a->v.data_type != type ||
+ a->v.dirty_sectors != sectors) {
+ a->v.data_type = type;
+ a->v.dirty_sectors = sectors;
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
}
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
- rcu_read_lock();
+static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca,
+ u64 b, enum bch_data_type data_type, unsigned sectors,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct bch_fs *c = trans->c;
+ int ret = 0;
- g = bucket(ca, b);
- old = bucket_data_cmpxchg(c, ca, g, new, ({
- saturated_add(ca, new.dirty_sectors, sectors,
- GC_MAX_SECTORS_USED);
- new.data_type = type;
- }));
+ percpu_down_read(&c->mark_lock);
+ struct bucket *g = gc_bucket(ca, b);
+ if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s",
+ ca->dev_idx, bch2_data_type_str(data_type)))
+ goto err_unlock;
- rcu_read_unlock();
+ bucket_lock(g);
+ struct bch_alloc_v4 old = bucket_m_to_alloc(*g);
- BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
- bucket_became_unavailable(c, old, new));
-}
+ if (bch2_fs_inconsistent_on(g->data_type &&
+ g->data_type != data_type, c,
+ "different types of data in same bucket: %s, %s",
+ bch2_data_type_str(g->data_type),
+ bch2_data_type_str(data_type)))
+ goto err;
+
+ if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
+ "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size",
+ ca->dev_idx, b, g->gen,
+ bch2_data_type_str(g->data_type ?: data_type),
+ g->dirty_sectors, sectors))
+ goto err;
-/* Reverting this until the copygc + compression issue is fixed: */
+ g->data_type = data_type;
+ g->dirty_sectors += sectors;
+ struct bch_alloc_v4 new = bucket_m_to_alloc(*g);
+ bucket_unlock(g);
+ percpu_up_read(&c->mark_lock);
+ ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
+ return ret;
+err:
+ bucket_unlock(g);
+err_unlock:
+ percpu_up_read(&c->mark_lock);
+ return -BCH_ERR_metadata_bucket_inconsistency;
+}
-static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors)
+int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
+ struct bch_dev *ca, u64 b,
+ enum bch_data_type type, unsigned sectors,
+ enum btree_iter_update_trigger_flags flags)
{
- if (!sectors)
+ BUG_ON(type != BCH_DATA_free &&
+ type != BCH_DATA_sb &&
+ type != BCH_DATA_journal);
+
+ /*
+ * Backup superblock might be past the end of our normal usable space:
+ */
+ if (b >= ca->mi.nbuckets)
return 0;
- return max(1U, DIV_ROUND_UP(sectors * crc.compressed_size,
- crc.uncompressed_size));
+ if (flags & BTREE_TRIGGER_gc)
+ return bch2_mark_metadata_bucket(trans, ca, b, type, sectors, flags);
+ else if (flags & BTREE_TRIGGER_transactional)
+ return commit_do(trans, NULL, NULL, 0,
+ __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
+ else
+ BUG();
}
-/*
- * Checking against gc's position has to be done here, inside the cmpxchg()
- * loop, to avoid racing with the start of gc clearing all the marks - GC does
- * that with the gc pos seqlock held.
- */
-static void bch2_mark_pointer(struct bch_fs *c,
- struct bkey_s_c_extent e,
- const struct bch_extent_ptr *ptr,
- struct bch_extent_crc_unpacked crc,
- s64 sectors, enum s_alloc type,
- struct bch_fs_usage *stats,
- u64 journal_seq, unsigned flags)
-{
- struct bucket_mark old, new;
- unsigned saturated;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr);
- enum bch_data_type data_type = type == S_META
- ? BCH_DATA_BTREE : BCH_DATA_USER;
- u64 v;
-
- if (crc.compression_type) {
- unsigned old_sectors, new_sectors;
-
- if (sectors > 0) {
- old_sectors = 0;
- new_sectors = sectors;
- } else {
- old_sectors = e.k->size;
- new_sectors = e.k->size + sectors;
- }
-
- sectors = -__disk_sectors(crc, old_sectors)
- +__disk_sectors(crc, new_sectors);
- }
-
- if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) {
- if (journal_seq)
- bucket_cmpxchg(g, new, ({
- new.journal_seq_valid = 1;
- new.journal_seq = journal_seq;
- }));
+static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
+ struct bch_dev *ca, u64 start, u64 end,
+ enum bch_data_type type, u64 *bucket, unsigned *bucket_sectors,
+ enum btree_iter_update_trigger_flags flags)
+{
+ do {
+ u64 b = sector_to_bucket(ca, start);
+ unsigned sectors =
+ min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
- return;
- }
+ if (b != *bucket && *bucket_sectors) {
+ int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket,
+ type, *bucket_sectors, flags);
+ if (ret)
+ return ret;
- v = atomic64_read(&g->_mark.v);
- do {
- new.v.counter = old.v.counter = v;
- saturated = 0;
-
- /*
- * Check this after reading bucket mark to guard against
- * the allocator invalidating a bucket after we've already
- * checked the gen
- */
- if (gen_after(new.gen, ptr->gen)) {
- BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags));
- EBUG_ON(!ptr->cached &&
- test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
- return;
+ *bucket_sectors = 0;
}
- if (!ptr->cached &&
- new.dirty_sectors == GC_MAX_SECTORS_USED &&
- sectors < 0)
- saturated = -sectors;
-
- if (ptr->cached)
- saturated_add(ca, new.cached_sectors, sectors,
- GC_MAX_SECTORS_USED);
- else
- saturated_add(ca, new.dirty_sectors, sectors,
- GC_MAX_SECTORS_USED);
-
- if (!new.dirty_sectors &&
- !new.cached_sectors) {
- new.data_type = 0;
-
- if (journal_seq) {
- new.journal_seq_valid = 1;
- new.journal_seq = journal_seq;
- }
- } else {
- new.data_type = data_type;
- }
+ *bucket = b;
+ *bucket_sectors += sectors;
+ start += sectors;
+ } while (start < end);
- if (flags & BCH_BUCKET_MARK_NOATOMIC) {
- g->_mark = new;
- break;
- }
- } while ((v = atomic64_cmpxchg(&g->_mark.v,
- old.v.counter,
- new.v.counter)) != old.v.counter);
-
- bch2_dev_usage_update(c, ca, old, new);
-
- BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
- bucket_became_unavailable(c, old, new));
-
- if (saturated &&
- atomic_long_add_return(saturated,
- &ca->saturated_count) >=
- bucket_to_sector(ca, ca->free_inc.size)) {
- if (c->gc_thread) {
- trace_gc_sectors_saturated(c);
- wake_up_process(c->gc_thread);
- }
- }
+ return 0;
}
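
/*
 * Editor's note - illustrative only, not part of the patch.  With a
 * hypothetical bucket size of 512 sectors, marking the range [300, 1300)
 * proceeds as:
 *
 *   b = 0: accumulate 512 - 300  = 212 sectors
 *   b = 1: flush bucket 0 (212), accumulate 512 sectors
 *   b = 2: flush bucket 1 (512), accumulate 1300 - 1024 = 276 sectors
 *
 * The trailing partial bucket (276 sectors here) is flushed by the caller,
 * as __bch2_trans_mark_dev_sb() does below.
 */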
-void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
- s64 sectors, bool metadata,
- struct gc_pos pos,
- struct bch_fs_usage *stats,
- u64 journal_seq, unsigned flags)
+static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca,
+ enum btree_iter_update_trigger_flags flags)
{
- /*
- * synchronization w.r.t. GC:
- *
- * Normally, bucket sector counts/marks are updated on the fly, as
- * references are added/removed from the btree, the lists of buckets the
- * allocator owns, other metadata buckets, etc.
- *
- * When GC is in progress and going to mark this reference, we do _not_
- * mark this reference here, to avoid double counting - GC will count it
- * when it gets to it.
- *
- * To know whether we should mark a given reference (GC either isn't
- * running, or has already marked references at this position) we
- * construct a total order for everything GC walks. Then, we can simply
- * compare the position of the reference we're marking - @pos - with
- * GC's current position. If GC is going to mark this reference, GC's
- * current position will be less than @pos; if GC's current position is
- * greater than @pos GC has either already walked this position, or
- * isn't running.
- *
- * To avoid racing with GC's position changing, we have to deal with
- * - GC's position being set to GC_POS_MIN when GC starts:
- * usage_lock guards against this
- * - GC's position overtaking @pos: we guard against this with
- * whatever lock protects the data structure the reference lives in
- * (e.g. the btree node lock, or the relevant allocator lock).
- */
+ struct bch_fs *c = trans->c;
- percpu_down_read_preempt_disable(&c->usage_lock);
- if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
- gc_will_visit(c, pos))
- flags |= BCH_BUCKET_MARK_GC_WILL_VISIT;
-
- if (!stats)
- stats = this_cpu_ptr(c->usage_percpu);
-
- switch (k.k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED: {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const struct bch_extent_ptr *ptr;
- struct bch_extent_crc_unpacked crc;
- enum s_alloc type = metadata ? S_META : S_DIRTY;
- unsigned replicas = 0;
-
- BUG_ON(metadata && bkey_extent_is_cached(e.k));
- BUG_ON(!sectors);
-
- extent_for_each_ptr_crc(e, ptr, crc) {
- bch2_mark_pointer(c, e, ptr, crc, sectors, type,
- stats, journal_seq, flags);
- replicas += !ptr->cached;
- }
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_layout layout = ca->disk_sb.sb->layout;
+ mutex_unlock(&c->sb_lock);
+
+ u64 bucket = 0;
+ unsigned i, bucket_sectors = 0;
+ int ret;
- if (replicas) {
- BUG_ON(replicas - 1 > ARRAY_SIZE(stats->s));
- stats->s[replicas - 1].data[type] += sectors;
+ for (i = 0; i < layout.nr_superblocks; i++) {
+ u64 offset = le64_to_cpu(layout.sb_offset[i]);
+
+ if (offset == BCH_SB_SECTOR) {
+ ret = bch2_trans_mark_metadata_sectors(trans, ca,
+ 0, BCH_SB_SECTOR,
+ BCH_DATA_sb, &bucket, &bucket_sectors, flags);
+ if (ret)
+ return ret;
}
- break;
+
+ ret = bch2_trans_mark_metadata_sectors(trans, ca, offset,
+ offset + (1 << layout.sb_max_size_bits),
+ BCH_DATA_sb, &bucket, &bucket_sectors, flags);
+ if (ret)
+ return ret;
}
- case BCH_RESERVATION: {
- struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
- if (r.v->nr_replicas) {
- BUG_ON(r.v->nr_replicas - 1 > ARRAY_SIZE(stats->s));
- stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors;
- }
- break;
+ if (bucket_sectors) {
+ ret = bch2_trans_mark_metadata_bucket(trans, ca,
+ bucket, BCH_DATA_sb, bucket_sectors, flags);
+ if (ret)
+ return ret;
}
+
+ for (i = 0; i < ca->journal.nr; i++) {
+ ret = bch2_trans_mark_metadata_bucket(trans, ca,
+ ca->journal.buckets[i],
+ BCH_DATA_journal, ca->mi.bucket_size, flags);
+ if (ret)
+ return ret;
}
- percpu_up_read_preempt_enable(&c->usage_lock);
-}
-/* Disk reservations: */
+ return 0;
+}
-static u64 __recalc_sectors_available(struct bch_fs *c)
+int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca,
+ enum btree_iter_update_trigger_flags flags)
{
- int cpu;
+ int ret = bch2_trans_run(c,
+ __bch2_trans_mark_dev_sb(trans, ca, flags));
+ bch_err_fn(c, ret);
+ return ret;
+}
- for_each_possible_cpu(cpu)
- per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
+int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c,
+ enum btree_iter_update_trigger_flags flags)
+{
+ for_each_online_member(c, ca) {
+ int ret = bch2_trans_mark_dev_sb(c, ca, flags);
+ if (ret) {
+ percpu_ref_put(&ca->io_ref);
+ return ret;
+ }
+ }
- return bch2_fs_sectors_free(c, bch2_fs_usage_read(c));
+ return 0;
}
-/* Used by gc when it's starting: */
-void bch2_recalc_sectors_available(struct bch_fs *c)
+int bch2_trans_mark_dev_sbs(struct bch_fs *c)
{
- percpu_down_write(&c->usage_lock);
- atomic64_set(&c->sectors_available, __recalc_sectors_available(c));
- percpu_up_write(&c->usage_lock);
+ return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional);
}
-void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
+bool bch2_is_superblock_bucket(struct bch_dev *ca, u64 b)
{
- percpu_down_read_preempt_disable(&c->usage_lock);
- this_cpu_sub(c->usage_percpu->online_reserved,
- res->sectors);
+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+ u64 b_offset = bucket_to_sector(ca, b);
+ u64 b_end = bucket_to_sector(ca, b + 1);
+ unsigned i;
+
+ if (!b)
+ return true;
- bch2_fs_stats_verify(c);
- percpu_up_read_preempt_enable(&c->usage_lock);
+ for (i = 0; i < layout->nr_superblocks; i++) {
+ u64 offset = le64_to_cpu(layout->sb_offset[i]);
+ u64 end = offset + (1 << layout->sb_max_size_bits);
- res->sectors = 0;
+ if (!(offset >= b_end || end <= b_offset))
+ return true;
+ }
+
+ for (i = 0; i < ca->journal.nr; i++)
+ if (b == ca->journal.buckets[i])
+ return true;
+
+ return false;
}
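
/*
 * Editor's note - illustrative only, not part of the patch.  The test above
 * checks two half-open ranges for overlap: with a hypothetical 1024-sector
 * bucket size, bucket 2 covers sectors [2048, 3072); a superblock at offset
 * 3064 with 1 << sb_max_size_bits = 16 covers [3064, 3080), which overlaps,
 * so bucket 2 is reported as a superblock bucket.
 */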
+/* Disk reservations: */
+
#define SECTORS_CACHE 1024
-int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
- unsigned sectors, int flags)
+int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
+ u64 sectors, enum bch_reservation_flags flags)
{
- struct bch_fs_usage *stats;
- u64 old, v, get;
- s64 sectors_available;
+ struct bch_fs_pcpu *pcpu;
+ u64 old, get;
+ u64 sectors_available;
int ret;
- percpu_down_read_preempt_disable(&c->usage_lock);
- stats = this_cpu_ptr(c->usage_percpu);
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
+ pcpu = this_cpu_ptr(c->pcpu);
- if (sectors <= stats->available_cache)
+ if (sectors <= pcpu->sectors_available)
goto out;
- v = atomic64_read(&c->sectors_available);
+ old = atomic64_read(&c->sectors_available);
do {
- old = v;
get = min((u64) sectors + SECTORS_CACHE, old);
if (get < sectors) {
- percpu_up_read_preempt_enable(&c->usage_lock);
+ preempt_enable();
goto recalculate;
}
- } while ((v = atomic64_cmpxchg(&c->sectors_available,
- old, old - get)) != old);
+ } while (!atomic64_try_cmpxchg(&c->sectors_available,
+ &old, old - get));
- stats->available_cache += get;
+ pcpu->sectors_available += get;
out:
- stats->available_cache -= sectors;
- stats->online_reserved += sectors;
- res->sectors += sectors;
+ pcpu->sectors_available -= sectors;
+ this_cpu_add(*c->online_reserved, sectors);
+ res->sectors += sectors;
- bch2_disk_reservations_verify(c, flags);
- bch2_fs_stats_verify(c);
- percpu_up_read_preempt_enable(&c->usage_lock);
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
return 0;
recalculate:
- /*
- * GC recalculates sectors_available when it starts, so that hopefully
- * we don't normally end up blocking here:
- */
-
- /*
- * Piss fuck, we can be called from extent_insert_fixup() with btree
- * locks held:
- */
+ mutex_lock(&c->sectors_available_lock);
- if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) {
- if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD))
- down_read(&c->gc_lock);
- else if (!down_read_trylock(&c->gc_lock))
- return -EINTR;
- }
+ percpu_u64_set(&c->pcpu->sectors_available, 0);
+ sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
- percpu_down_write(&c->usage_lock);
- sectors_available = __recalc_sectors_available(c);
+ if (sectors_available && (flags & BCH_DISK_RESERVATION_PARTIAL))
+ sectors = min(sectors, sectors_available);
if (sectors <= sectors_available ||
(flags & BCH_DISK_RESERVATION_NOFAIL)) {
atomic64_set(&c->sectors_available,
max_t(s64, 0, sectors_available - sectors));
- stats->online_reserved += sectors;
- res->sectors += sectors;
+ this_cpu_add(*c->online_reserved, sectors);
+ res->sectors += sectors;
ret = 0;
-
- bch2_disk_reservations_verify(c, flags);
} else {
atomic64_set(&c->sectors_available, sectors_available);
- ret = -ENOSPC;
+ ret = -BCH_ERR_ENOSPC_disk_reservation;
}
- bch2_fs_stats_verify(c);
- percpu_up_write(&c->usage_lock);
-
- if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
- up_read(&c->gc_lock);
+ mutex_unlock(&c->sectors_available_lock);
+ percpu_up_read(&c->mark_lock);
return ret;
}
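
/*
 * Editor's note - illustrative only, not part of the patch.  Worked example
 * of the fast path above: with sectors = 10 and only 4 sectors left in this
 * CPU's cache, we pull get = min(10 + SECTORS_CACHE, c->sectors_available)
 * out of the shared atomic in one try_cmpxchg, add it to
 * pcpu->sectors_available, then charge the 10 sectors locally - so subsequent
 * small reservations on this CPU usually never touch the shared counter.
 */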
/* Startup/shutdown: */
-static void buckets_free_rcu(struct rcu_head *rcu)
+void bch2_buckets_nouse_free(struct bch_fs *c)
{
- struct bucket_array *buckets =
- container_of(rcu, struct bucket_array, rcu);
-
- kvpfree(buckets,
- sizeof(struct bucket_array) +
- buckets->nbuckets * sizeof(struct bucket));
+ for_each_member_device(c, ca) {
+ kvfree_rcu_mightsleep(ca->buckets_nouse);
+ ca->buckets_nouse = NULL;
+ }
}
-int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
+int bch2_buckets_nouse_alloc(struct bch_fs *c)
{
- struct bucket_array *buckets = NULL, *old_buckets = NULL;
- unsigned long *buckets_dirty = NULL;
- u8 *oldest_gens = NULL;
- alloc_fifo free[RESERVE_NR];
- alloc_fifo free_inc;
- alloc_heap alloc_heap;
- copygc_heap copygc_heap;
-
- size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
- ca->mi.bucket_size / c->opts.btree_node_size);
- /* XXX: these should be tunable */
- size_t reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9);
- size_t copygc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7);
- size_t free_inc_reserve = copygc_reserve / 2;
- bool resize = ca->buckets != NULL,
- start_copygc = ca->copygc_thread != NULL;
- int ret = -ENOMEM;
- unsigned i;
+ for_each_member_device(c, ca) {
+ BUG_ON(ca->buckets_nouse);
- memset(&free, 0, sizeof(free));
- memset(&free_inc, 0, sizeof(free_inc));
- memset(&alloc_heap, 0, sizeof(alloc_heap));
- memset(&copygc_heap, 0, sizeof(copygc_heap));
-
- if (!(buckets = kvpmalloc(sizeof(struct bucket_array) +
- nbuckets * sizeof(struct bucket),
- GFP_KERNEL|__GFP_ZERO)) ||
- !(oldest_gens = kvpmalloc(nbuckets * sizeof(u8),
- GFP_KERNEL|__GFP_ZERO)) ||
- !(buckets_dirty = kvpmalloc(BITS_TO_LONGS(nbuckets) *
+ ca->buckets_nouse = kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) *
sizeof(unsigned long),
- GFP_KERNEL|__GFP_ZERO)) ||
- !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
- !init_fifo(&free[RESERVE_MOVINGGC],
- copygc_reserve, GFP_KERNEL) ||
- !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
- !init_fifo(&free_inc, free_inc_reserve, GFP_KERNEL) ||
- !init_heap(&alloc_heap, free_inc_reserve, GFP_KERNEL) ||
- !init_heap(&copygc_heap, copygc_reserve, GFP_KERNEL))
- goto err;
+ GFP_KERNEL|__GFP_ZERO);
+ if (!ca->buckets_nouse) {
+ bch2_dev_put(ca);
+ return -BCH_ERR_ENOMEM_buckets_nouse;
+ }
+ }
- buckets->first_bucket = ca->mi.first_bucket;
- buckets->nbuckets = nbuckets;
+ return 0;
+}
- bch2_copygc_stop(ca);
+static void bucket_gens_free_rcu(struct rcu_head *rcu)
+{
+ struct bucket_gens *buckets =
+ container_of(rcu, struct bucket_gens, rcu);
- if (resize) {
- down_write(&c->gc_lock);
- down_write(&ca->bucket_lock);
- percpu_down_write(&c->usage_lock);
- }
+ kvfree(buckets);
+}
- old_buckets = bucket_array(ca);
+int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
+{
+ struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
+ bool resize = ca->bucket_gens != NULL;
+ int ret;
- if (resize) {
- size_t n = min(buckets->nbuckets, old_buckets->nbuckets);
+ BUG_ON(resize && ca->buckets_nouse);
- memcpy(buckets->b,
- old_buckets->b,
- n * sizeof(struct bucket));
- memcpy(oldest_gens,
- ca->oldest_gens,
- n * sizeof(u8));
- memcpy(buckets_dirty,
- ca->buckets_dirty,
- BITS_TO_LONGS(n) * sizeof(unsigned long));
+ bucket_gens = kvmalloc(struct_size(bucket_gens, b, nbuckets),
+ GFP_KERNEL|__GFP_ZERO);
+ if (!bucket_gens) {
+ ret = -BCH_ERR_ENOMEM_bucket_gens;
+ goto err;
}
- rcu_assign_pointer(ca->buckets, buckets);
- buckets = old_buckets;
+ bucket_gens->first_bucket = ca->mi.first_bucket;
+ bucket_gens->nbuckets = nbuckets;
+ bucket_gens->nbuckets_minus_first =
+ bucket_gens->nbuckets - bucket_gens->first_bucket;
- swap(ca->oldest_gens, oldest_gens);
- swap(ca->buckets_dirty, buckets_dirty);
+ if (resize) {
+ down_write(&ca->bucket_lock);
+ percpu_down_write(&c->mark_lock);
+ }
- if (resize)
- percpu_up_write(&c->usage_lock);
+ old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
- spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++) {
- fifo_move(&free[i], &ca->free[i]);
- swap(ca->free[i], free[i]);
+ if (resize) {
+ bucket_gens->nbuckets = min(bucket_gens->nbuckets,
+ old_bucket_gens->nbuckets);
+ bucket_gens->nbuckets_minus_first =
+ bucket_gens->nbuckets - bucket_gens->first_bucket;
+ memcpy(bucket_gens->b,
+ old_bucket_gens->b,
+ bucket_gens->nbuckets);
}
- fifo_move(&free_inc, &ca->free_inc);
- swap(ca->free_inc, free_inc);
- spin_unlock(&c->freelist_lock);
-
- /* with gc lock held, alloc_heap can't be in use: */
- swap(ca->alloc_heap, alloc_heap);
- /* and we shut down copygc: */
- swap(ca->copygc_heap, copygc_heap);
+ rcu_assign_pointer(ca->bucket_gens, bucket_gens);
+ bucket_gens = old_bucket_gens;
nbuckets = ca->mi.nbuckets;
if (resize) {
+ percpu_up_write(&c->mark_lock);
up_write(&ca->bucket_lock);
- up_write(&c->gc_lock);
}
- if (start_copygc &&
- bch2_copygc_start(c, ca))
- bch_err(ca, "error restarting copygc thread");
-
ret = 0;
err:
- free_heap(&copygc_heap);
- free_heap(&alloc_heap);
- free_fifo(&free_inc);
- for (i = 0; i < RESERVE_NR; i++)
- free_fifo(&free[i]);
- kvpfree(buckets_dirty,
- BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
- kvpfree(oldest_gens,
- nbuckets * sizeof(u8));
- if (buckets)
- call_rcu(&old_buckets->rcu, buckets_free_rcu);
+ if (bucket_gens)
+ call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
return ret;
}
void bch2_dev_buckets_free(struct bch_dev *ca)
{
- unsigned i;
-
- free_heap(&ca->copygc_heap);
- free_heap(&ca->alloc_heap);
- free_fifo(&ca->free_inc);
- for (i = 0; i < RESERVE_NR; i++)
- free_fifo(&ca->free[i]);
- kvpfree(ca->buckets_dirty,
- BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
- kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
- kvpfree(rcu_dereference_protected(ca->buckets, 1),
- sizeof(struct bucket_array) +
- ca->mi.nbuckets * sizeof(struct bucket));
-
- free_percpu(ca->usage_percpu);
+ kvfree(ca->buckets_nouse);
+ kvfree(rcu_dereference_protected(ca->bucket_gens, 1));
+ free_percpu(ca->usage);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
- if (!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)))
- return -ENOMEM;
+ ca->usage = alloc_percpu(struct bch_dev_usage);
+ if (!ca->usage)
+ return -BCH_ERR_ENOMEM_usage_init;
- return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);;
+ return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
}
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 4deb6c37..3bebc4c3 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Code for manipulating bucket marks for garbage collection.
*
@@ -8,62 +9,112 @@
#define _BUCKETS_H
#include "buckets_types.h"
-#include "super.h"
+#include "extents.h"
+#include "sb-members.h"
+
+static inline u64 sector_to_bucket(const struct bch_dev *ca, sector_t s)
+{
+ return div_u64(s, ca->mi.bucket_size);
+}
+
+static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
+{
+ return ((sector_t) b) * ca->mi.bucket_size;
+}
+
+static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
+{
+ u32 remainder;
+
+ div_u64_rem(s, ca->mi.bucket_size, &remainder);
+ return remainder;
+}
+
+static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, u32 *offset)
+{
+ return div_u64_rem(s, ca->mi.bucket_size, offset);
+}
#define for_each_bucket(_b, _buckets) \
for (_b = (_buckets)->b + (_buckets)->first_bucket; \
_b < (_buckets)->b + (_buckets)->nbuckets; _b++)
-#define bucket_cmpxchg(g, new, expr) \
-({ \
- u64 _v = atomic64_read(&(g)->_mark.v); \
- struct bucket_mark _old; \
- \
- do { \
- (new).v.counter = _old.v.counter = _v; \
- expr; \
- } while ((_v = atomic64_cmpxchg(&(g)->_mark.v, \
- _old.v.counter, \
- (new).v.counter)) != _old.v.counter);\
- _old; \
-})
+/*
+ * Ugly hack alert:
+ *
+ * We need to cram a spinlock in a single byte, because that's what we have left
+ * in struct bucket, and we care about the size of these - during fsck, we need
+ * in-memory state for every single bucket on every device.
+ *
+ * We used to do
+ * while (xchg(&b->lock, 1)) cpu_relax();
+ * but it turns out not all architectures support xchg on a single byte.
+ *
+ * So now we use bit_spin_lock(), with fun games since we can't burn a whole
+ * ulong for this - we just need to make sure the lock bit always ends up in the
+ * first byte.
+ */
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define BUCKET_LOCK_BITNR 0
+#else
+#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1)
+#endif
+
+union ulong_byte_assert {
+ ulong ulong;
+ u8 byte;
+};
-static inline struct bucket_array *bucket_array(struct bch_dev *ca)
+static inline void bucket_unlock(struct bucket *b)
{
- return rcu_dereference_check(ca->buckets,
- !ca->fs ||
- percpu_rwsem_is_held(&ca->fs->usage_lock) ||
- lockdep_is_held(&ca->fs->gc_lock) ||
- lockdep_is_held(&ca->bucket_lock));
+ BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
+
+ clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock);
+ wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR);
}
-static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
+static inline void bucket_lock(struct bucket *b)
{
- struct bucket_array *buckets = bucket_array(ca);
+ wait_on_bit_lock((void *) &b->lock, BUCKET_LOCK_BITNR,
+ TASK_UNINTERRUPTIBLE);
+}
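
/*
 * Editor's note - a minimal, hypothetical user-space sketch (not part of the
 * patch) of the property bucket_unlock() asserts at compile time: the chosen
 * BUCKET_LOCK_BITNR must land in the first byte of an unsigned long, because
 * struct bucket only dedicates a single u8 to the lock.
 */
#include <assert.h>
#include <limits.h>

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define DEMO_LOCK_BITNR 0				/* LSB lives in byte 0 on LE */
#else
#define DEMO_LOCK_BITNR (sizeof(long) * CHAR_BIT - 1)	/* MSB lives in byte 0 on BE */
#endif

int main(void)
{
	unsigned long v = 1UL << DEMO_LOCK_BITNR;

	/* The lock bit must be visible through the first byte of the word: */
	assert(*(unsigned char *) &v != 0);
	return 0;
}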
- BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
- return buckets->b + b;
+static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
+{
+ return genradix_ptr(&ca->buckets_gc, b);
}
-static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
- size_t b, int rw)
+static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
{
- bucket(ca, b)->io_time[rw] = c->bucket_clock[rw].hand;
+ return rcu_dereference_check(ca->bucket_gens,
+ !ca->fs ||
+ percpu_rwsem_is_held(&ca->fs->mark_lock) ||
+ lockdep_is_held(&ca->fs->state_lock) ||
+ lockdep_is_held(&ca->bucket_lock));
}
-static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
+static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
{
- return c->bucket_clock[rw].hand - g->io_time[rw];
+ struct bucket_gens *gens = bucket_gens(ca);
+
+ if (b - gens->first_bucket >= gens->nbuckets_minus_first)
+ return NULL;
+ return gens->b + b;
}
-/*
- * bucket_gc_gen() returns the difference between the bucket's current gen and
- * the oldest gen of any pointer into that bucket in the btree.
- */
+static inline int bucket_gen_get_rcu(struct bch_dev *ca, size_t b)
+{
+ u8 *gen = bucket_gen(ca, b);
+ return gen ? *gen : -1;
+}
-static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
+static inline int bucket_gen_get(struct bch_dev *ca, size_t b)
{
- return bucket(ca, b)->mark.gen - ca->oldest_gens[b];
+ rcu_read_lock();
+ int ret = bucket_gen_get_rcu(ca, b);
+ rcu_read_unlock();
+ return ret;
}
static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
@@ -72,22 +123,42 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
return sector_to_bucket(ca, ptr->offset);
}
-static inline struct bucket *PTR_BUCKET(struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
+static inline struct bpos PTR_BUCKET_POS(const struct bch_dev *ca,
+ const struct bch_extent_ptr *ptr)
{
- return bucket(ca, PTR_BUCKET_NR(ca, ptr));
+ return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
}
-static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
+static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_dev *ca,
+ const struct bch_extent_ptr *ptr,
+ u32 *bucket_offset)
{
- struct bucket_mark m;
+ return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset));
+}
- rcu_read_lock();
- m = READ_ONCE(bucket(ca, PTR_BUCKET_NR(ca, ptr))->mark);
- rcu_read_unlock();
+static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca,
+ const struct bch_extent_ptr *ptr)
+{
+ return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr));
+}
+
+static inline enum bch_data_type ptr_data_type(const struct bkey *k,
+ const struct bch_extent_ptr *ptr)
+{
+ if (bkey_is_btree_ptr(k))
+ return BCH_DATA_btree;
- return m;
+ return ptr->cached ? BCH_DATA_cached : BCH_DATA_user;
+}
+
+static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
+{
+ EBUG_ON(sectors < 0);
+
+ return crc_is_compressed(p.crc)
+ ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
+ p.crc.uncompressed_size)
+ : sectors;
}
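
/*
 * Editor's note - illustrative only, not part of the patch.  For a compressed
 * pointer with crc.uncompressed_size = 128 and crc.compressed_size = 32,
 * charging 120 live sectors costs DIV_ROUND_UP(120 * 32, 128) = 30 sectors of
 * disk; uncompressed pointers are charged 1:1.
 */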
static inline int gen_cmp(u8 a, u8 b)
@@ -102,147 +173,190 @@ static inline int gen_after(u8 a, u8 b)
return r > 0 ? r : 0;
}
+static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
+{
+ int gen = bucket_gen_get_rcu(ca, PTR_BUCKET_NR(ca, ptr));
+ return gen < 0 ? gen : gen_after(gen, ptr->gen);
+}
+
/**
- * ptr_stale() - check if a pointer points into a bucket that has been
+ * dev_ptr_stale() - check if a pointer points into a bucket that has been
* invalidated.
*/
-static inline u8 ptr_stale(struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
+static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
{
- return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen);
+ rcu_read_lock();
+ int ret = dev_ptr_stale_rcu(ca, ptr);
+ rcu_read_unlock();
+ return ret;
}
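
/*
 * Editor's note - illustrative only, not part of the patch.  Bucket gens are
 * 8 bits and wrap, so staleness uses signed 8-bit arithmetic: if a pointer
 * was written at gen 250 and the bucket has since been invalidated seven
 * times, the bucket gen wraps to 1, yet gen_after(1, 250) = max(0, (s8)(1 - 250))
 * = 7, so dev_ptr_stale() still reports the pointer as 7 generations stale.
 */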
-/* bucket gc marks */
-
-/* The dirty and cached sector counts saturate. If this occurs,
- * reference counting alone will not free the bucket, and a btree
- * GC must be performed. */
-#define GC_MAX_SECTORS_USED ((1U << 15) - 1)
+/* Device usage: */
-static inline unsigned bucket_sectors_used(struct bucket_mark mark)
+void bch2_dev_usage_read_fast(struct bch_dev *, struct bch_dev_usage *);
+static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
{
- return mark.dirty_sectors + mark.cached_sectors;
-}
+ struct bch_dev_usage ret;
-static inline bool bucket_unused(struct bucket_mark mark)
-{
- return !mark.owned_by_allocator &&
- !mark.data_type &&
- !bucket_sectors_used(mark);
+ bch2_dev_usage_read_fast(ca, &ret);
+ return ret;
}
-/* Device usage: */
+void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage *);
-struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *);
-struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
-
-static inline u64 __dev_buckets_available(struct bch_dev *ca,
- struct bch_dev_usage stats)
+static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark)
{
- u64 total = ca->mi.nbuckets - ca->mi.first_bucket;
+ s64 reserved = 0;
- if (WARN_ONCE(stats.buckets_unavailable > total,
- "buckets_unavailable overflow (%llu > %llu)\n",
- stats.buckets_unavailable, total))
- return 0;
+ switch (watermark) {
+ case BCH_WATERMARK_NR:
+ BUG();
+ case BCH_WATERMARK_stripe:
+ reserved += ca->mi.nbuckets >> 6;
+ fallthrough;
+ case BCH_WATERMARK_normal:
+ reserved += ca->mi.nbuckets >> 6;
+ fallthrough;
+ case BCH_WATERMARK_copygc:
+ reserved += ca->nr_btree_reserve;
+ fallthrough;
+ case BCH_WATERMARK_btree:
+ reserved += ca->nr_btree_reserve;
+ fallthrough;
+ case BCH_WATERMARK_btree_copygc:
+ case BCH_WATERMARK_reclaim:
+ case BCH_WATERMARK_interior_updates:
+ break;
+ }
- return total - stats.buckets_unavailable;
+ return reserved;
}
-/*
- * Number of reclaimable buckets - only for use by the allocator thread:
- */
-static inline u64 dev_buckets_available(struct bch_fs *c, struct bch_dev *ca)
+static inline u64 dev_buckets_free(struct bch_dev *ca,
+ struct bch_dev_usage usage,
+ enum bch_watermark watermark)
{
- return __dev_buckets_available(ca, bch2_dev_usage_read(c, ca));
+ return max_t(s64, 0,
+ usage.d[BCH_DATA_free].buckets -
+ ca->nr_open_buckets -
+ bch2_dev_buckets_reserved(ca, watermark));
}
-static inline u64 __dev_buckets_free(struct bch_dev *ca,
- struct bch_dev_usage stats)
+static inline u64 __dev_buckets_available(struct bch_dev *ca,
+ struct bch_dev_usage usage,
+ enum bch_watermark watermark)
{
- return __dev_buckets_available(ca, stats) +
- fifo_used(&ca->free[RESERVE_NONE]) +
- fifo_used(&ca->free_inc);
+ return max_t(s64, 0,
+ usage.d[BCH_DATA_free].buckets
+ + usage.d[BCH_DATA_cached].buckets
+ + usage.d[BCH_DATA_need_gc_gens].buckets
+ + usage.d[BCH_DATA_need_discard].buckets
+ - ca->nr_open_buckets
+ - bch2_dev_buckets_reserved(ca, watermark));
}
-static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
+static inline u64 dev_buckets_available(struct bch_dev *ca,
+ enum bch_watermark watermark)
{
- return __dev_buckets_free(ca, bch2_dev_usage_read(c, ca));
+ return __dev_buckets_available(ca, bch2_dev_usage_read(ca), watermark);
}
/* Filesystem usage: */
-static inline enum bch_data_type s_alloc_to_data_type(enum s_alloc s)
+static inline unsigned dev_usage_u64s(void)
{
- switch (s) {
- case S_META:
- return BCH_DATA_BTREE;
- case S_DIRTY:
- return BCH_DATA_USER;
- default:
- BUG();
- }
+ return sizeof(struct bch_dev_usage) / sizeof(u64);
}
-struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *);
-struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *);
-void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
- struct disk_reservation *, struct gc_pos);
+struct bch_fs_usage_short
+bch2_fs_usage_read_short(struct bch_fs *);
+
+int bch2_bucket_ref_update(struct btree_trans *, struct bch_dev *,
+ struct bkey_s_c, const struct bch_extent_ptr *,
+ s64, enum bch_data_type, u8, u8, u32 *);
+
+int bch2_check_fix_ptrs(struct btree_trans *,
+ enum btree_id, unsigned, struct bkey_s_c,
+ enum btree_iter_update_trigger_flags);
+
+int bch2_trigger_extent(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
+int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
+
+#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
+({ \
+ int ret = 0; \
+ \
+ if (_old.k->type) \
+ ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_insert); \
+ if (!ret && _new.k->type) \
+ ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_overwrite);\
+ ret; \
+})
+
+void bch2_trans_account_disk_usage_change(struct btree_trans *);
-u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
-u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
-u64 bch2_fs_sectors_free(struct bch_fs *, struct bch_fs_usage);
+int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, u64,
+ enum bch_data_type, unsigned,
+ enum btree_iter_update_trigger_flags);
+int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *,
+ enum btree_iter_update_trigger_flags);
+int bch2_trans_mark_dev_sbs_flags(struct bch_fs *,
+ enum btree_iter_update_trigger_flags);
+int bch2_trans_mark_dev_sbs(struct bch_fs *);
-static inline bool is_available_bucket(struct bucket_mark mark)
-{
- return (!mark.owned_by_allocator &&
- !mark.dirty_sectors &&
- !mark.nouse);
-}
+bool bch2_is_superblock_bucket(struct bch_dev *, u64);
-static inline bool bucket_needs_journal_commit(struct bucket_mark m,
- u16 last_seq_ondisk)
+static inline const char *bch2_data_type_str(enum bch_data_type type)
{
- return m.journal_seq_valid &&
- ((s16) m.journal_seq - (s16) last_seq_ondisk > 0);
+ return type < BCH_DATA_NR
+ ? __bch2_data_types[type]
+ : "(invalid data type)";
}
-void bch2_bucket_seq_cleanup(struct bch_fs *);
-
-bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
- size_t, struct bucket_mark *);
-void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *,
- size_t, bool, struct gc_pos, unsigned);
-void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
- size_t, enum bch_data_type, unsigned,
- struct gc_pos, unsigned);
-
-#define BCH_BUCKET_MARK_NOATOMIC (1 << 0)
-#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 1)
-#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2)
-#define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3)
-
-void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, struct gc_pos,
- struct bch_fs_usage *, u64, unsigned);
-
-void bch2_recalc_sectors_available(struct bch_fs *);
-
-void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
+/* disk reservations: */
static inline void bch2_disk_reservation_put(struct bch_fs *c,
struct disk_reservation *res)
{
- if (res->sectors)
- __bch2_disk_reservation_put(c, res);
+ if (res->sectors) {
+ this_cpu_sub(*c->online_reserved, res->sectors);
+ res->sectors = 0;
+ }
}
-#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
-#define BCH_DISK_RESERVATION_GC_LOCK_HELD (1 << 1)
-#define BCH_DISK_RESERVATION_BTREE_LOCKS_HELD (1 << 2)
+enum bch_reservation_flags {
+ BCH_DISK_RESERVATION_NOFAIL = 1 << 0,
+ BCH_DISK_RESERVATION_PARTIAL = 1 << 1,
+};
+
+int __bch2_disk_reservation_add(struct bch_fs *, struct disk_reservation *,
+ u64, enum bch_reservation_flags);
-int bch2_disk_reservation_add(struct bch_fs *,
- struct disk_reservation *,
- unsigned, int);
+static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
+ u64 sectors, enum bch_reservation_flags flags)
+{
+#ifdef __KERNEL__
+ u64 old, new;
+
+ old = this_cpu_read(c->pcpu->sectors_available);
+ do {
+ if (sectors > old)
+ return __bch2_disk_reservation_add(c, res, sectors, flags);
+
+ new = old - sectors;
+ } while (!this_cpu_try_cmpxchg(c->pcpu->sectors_available, &old, new));
+
+ this_cpu_add(*c->online_reserved, sectors);
+ res->sectors += sectors;
+ return 0;
+#else
+ return __bch2_disk_reservation_add(c, res, sectors, flags);
+#endif
+}
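
For readers unfamiliar with the pattern, the bch2_disk_reservation_add() fast path above optimistically carves sectors out of a per-CPU pool of pre-reserved space with a compare-exchange retry loop, and falls back to __bch2_disk_reservation_add() when the pool cannot cover the request. A rough userspace analogue (illustrative only, not part of the patch; a single C11 atomic stands in for the per-CPU counter and the slow path is a stub):

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t sectors_available = 1024;	/* cached fast-path pool */

/* stand-in for the slow path: refill the pool under a lock, etc. */
static int reservation_add_slowpath(uint64_t sectors)
{
	(void) sectors;
	return -1;	/* pretend the slow path failed: space exhausted */
}

static int reservation_add(uint64_t *res_sectors, uint64_t sectors)
{
	uint64_t old = atomic_load(&sectors_available);

	do {
		if (sectors > old)
			return reservation_add_slowpath(sectors);
	} while (!atomic_compare_exchange_weak(&sectors_available, &old,
					       old - sectors));

	*res_sectors += sectors;
	return 0;
}

int main(void)
{
	uint64_t reserved = 0;

	return reservation_add(&reserved, 100) == 0 && reserved == 100 ? 0 : 1;
}
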
static inline struct disk_reservation
bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
@@ -259,8 +373,7 @@ bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
static inline int bch2_disk_reservation_get(struct bch_fs *c,
struct disk_reservation *res,
- unsigned sectors,
- unsigned nr_replicas,
+ u64 sectors, unsigned nr_replicas,
int flags)
{
*res = bch2_disk_reservation_init(c, nr_replicas);
@@ -268,6 +381,16 @@ static inline int bch2_disk_reservation_get(struct bch_fs *c,
return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags);
}
+#define RESERVE_FACTOR 6
+
+static inline u64 avail_factor(u64 r)
+{
+ return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1);
+}
+
+void bch2_buckets_nouse_free(struct bch_fs *);
+int bch2_buckets_nouse_alloc(struct bch_fs *);
+
int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64);
void bch2_dev_buckets_free(struct bch_dev *);
int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *);
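
As an aside, the avail_factor() helper added above scales raw space by 2^RESERVE_FACTOR / (2^RESERVE_FACTOR + 1) = 64/65, i.e. roughly 1.5% of capacity is held back. A minimal standalone sketch of the same arithmetic (example values only, not part of the patch):

#include <stdint.h>
#include <stdio.h>

#define RESERVE_FACTOR	6

/* same arithmetic as avail_factor(): r * 64 / 65 when RESERVE_FACTOR == 6 */
static uint64_t avail_factor(uint64_t r)
{
	return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
}

int main(void)
{
	uint64_t capacity = 1000000;	/* sectors, arbitrary example */

	/* prints 984615: about 1.5% of the raw capacity is held back */
	printf("%llu\n", (unsigned long long) avail_factor(capacity));
	return 0;
}
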
diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index 10f00861..7174047b 100644
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -1,95 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BUCKETS_TYPES_H
#define _BUCKETS_TYPES_H
+#include "bcachefs_format.h"
#include "util.h"
-struct bucket_mark {
- union {
- struct {
- atomic64_t v;
- };
-
- struct {
- u8 gen;
- u8 data_type:3,
- gen_valid:1,
- owned_by_allocator:1,
- nouse:1,
- journal_seq_valid:1;
- u16 dirty_sectors;
- u16 cached_sectors;
-
- /*
- * low bits of journal sequence number when this bucket was most
- * recently modified: if journal_seq_valid is set, this bucket
- * can't be reused until the journal sequence number written to
- * disk is >= the bucket's journal sequence number:
- */
- u16 journal_seq;
- };
- };
-};
+#define BUCKET_JOURNAL_SEQ_BITS 16
struct bucket {
- union {
- struct bucket_mark _mark;
- const struct bucket_mark mark;
- };
-
- u16 io_time[2];
-};
-
-struct bucket_array {
+ u8 lock;
+ u8 gen_valid:1;
+ u8 data_type:7;
+ u8 gen;
+ u8 stripe_redundancy;
+ u32 stripe;
+ u32 dirty_sectors;
+ u32 cached_sectors;
+ u32 stripe_sectors;
+} __aligned(sizeof(long));
+
+struct bucket_gens {
struct rcu_head rcu;
u16 first_bucket;
size_t nbuckets;
- struct bucket b[];
+ size_t nbuckets_minus_first;
+ u8 b[] __counted_by(nbuckets);
};
struct bch_dev_usage {
- u64 buckets[BCH_DATA_NR];
- u64 buckets_alloc;
- u64 buckets_unavailable;
-
- /* _compressed_ sectors: */
- u64 sectors[BCH_DATA_NR];
- u64 sectors_fragmented;
+ struct bch_dev_usage_type {
+ u64 buckets;
+ u64 sectors; /* _compressed_ sectors: */
+ /*
+ * XXX
+ * Why do we have this? Isn't it just buckets * bucket_size -
+ * sectors?
+ */
+ u64 fragmented;
+ } d[BCH_DATA_NR];
};
-/* kill, switch to bch_data_type? */
-enum s_alloc {
- S_META,
- S_DIRTY,
- S_ALLOC_NR,
+struct bch_fs_usage_base {
+ u64 hidden;
+ u64 btree;
+ u64 data;
+ u64 cached;
+ u64 reserved;
+ u64 nr_inodes;
};
-struct bch_fs_usage {
- /* all fields are in units of 512 byte sectors: */
- /* _uncompressed_ sectors: */
- u64 online_reserved;
- u64 available_cache;
-
- struct {
- u64 data[S_ALLOC_NR];
- u64 persistent_reserved;
- } s[BCH_REPLICAS_MAX];
+struct bch_fs_usage_short {
+ u64 capacity;
+ u64 used;
+ u64 free;
+ u64 nr_inodes;
};
/*
* A reservation for space on disk:
*/
struct disk_reservation {
- u64 sectors;
- u32 gen;
- unsigned nr_replicas;
+ u64 sectors;
+ u32 gen;
+ unsigned nr_replicas;
};
-struct copygc_heap_entry {
- u8 gen;
- u32 sectors;
- u64 offset;
-};
-
-typedef HEAP(struct copygc_heap_entry) copygc_heap;
-
#endif /* _BUCKETS_TYPES_H */
diff --git a/libbcachefs/buckets_waiting_for_journal.c b/libbcachefs/buckets_waiting_for_journal.c
new file mode 100644
index 00000000..f9fb150e
--- /dev/null
+++ b/libbcachefs/buckets_waiting_for_journal.c
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "buckets_waiting_for_journal.h"
+#include <linux/hash.h>
+#include <linux/random.h>
+
+static inline struct bucket_hashed *
+bucket_hash(struct buckets_waiting_for_journal_table *t,
+ unsigned hash_seed_idx, u64 dev_bucket)
+{
+ return t->d + hash_64(dev_bucket ^ t->hash_seeds[hash_seed_idx], t->bits);
+}
+
+static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t bits)
+{
+ unsigned i;
+
+ t->bits = bits;
+ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++)
+ get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i]));
+ memset(t->d, 0, sizeof(t->d[0]) << t->bits);
+}
+
+bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
+ u64 flushed_seq,
+ unsigned dev, u64 bucket)
+{
+ struct buckets_waiting_for_journal_table *t;
+ u64 dev_bucket = (u64) dev << 56 | bucket;
+ bool ret = false;
+ unsigned i;
+
+ mutex_lock(&b->lock);
+ t = b->t;
+
+ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
+ struct bucket_hashed *h = bucket_hash(t, i, dev_bucket);
+
+ if (h->dev_bucket == dev_bucket) {
+ ret = h->journal_seq > flushed_seq;
+ break;
+ }
+ }
+
+ mutex_unlock(&b->lock);
+
+ return ret;
+}
+
+static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t,
+ struct bucket_hashed *new,
+ u64 flushed_seq)
+{
+ struct bucket_hashed *last_evicted = NULL;
+ unsigned tries, i;
+
+ for (tries = 0; tries < 10; tries++) {
+ struct bucket_hashed *old, *victim = NULL;
+
+ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
+ old = bucket_hash(t, i, new->dev_bucket);
+
+ if (old->dev_bucket == new->dev_bucket ||
+ old->journal_seq <= flushed_seq) {
+ *old = *new;
+ return true;
+ }
+
+ if (last_evicted != old)
+ victim = old;
+ }
+
+ /* hashed to same slot 3 times: */
+ if (!victim)
+ break;
+
+ /* Failed to find an empty slot: */
+ swap(*new, *victim);
+ last_evicted = victim;
+ }
+
+ return false;
+}
+
+int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
+ u64 flushed_seq,
+ unsigned dev, u64 bucket,
+ u64 journal_seq)
+{
+ struct buckets_waiting_for_journal_table *t, *n;
+ struct bucket_hashed tmp, new = {
+ .dev_bucket = (u64) dev << 56 | bucket,
+ .journal_seq = journal_seq,
+ };
+ size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0, nr_rehashes_this_size = 0;
+ int ret = 0;
+
+ mutex_lock(&b->lock);
+
+ if (likely(bucket_table_insert(b->t, &new, flushed_seq)))
+ goto out;
+
+ t = b->t;
+ size = 1UL << t->bits;
+ for (i = 0; i < size; i++)
+ nr_elements += t->d[i].journal_seq > flushed_seq;
+
+ new_bits = ilog2(roundup_pow_of_two(nr_elements * 3));
+realloc:
+ n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL);
+ if (!n) {
+ ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set;
+ goto out;
+ }
+
+retry_rehash:
+ if (nr_rehashes_this_size == 3) {
+ new_bits++;
+ nr_rehashes_this_size = 0;
+ kvfree(n);
+ goto realloc;
+ }
+
+ nr_rehashes++;
+ nr_rehashes_this_size++;
+
+ bucket_table_init(n, new_bits);
+
+ tmp = new;
+ BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq));
+
+ for (i = 0; i < 1UL << t->bits; i++) {
+ if (t->d[i].journal_seq <= flushed_seq)
+ continue;
+
+ tmp = t->d[i];
+ if (!bucket_table_insert(n, &tmp, flushed_seq))
+ goto retry_rehash;
+ }
+
+ b->t = n;
+ kvfree(t);
+
+ pr_debug("took %zu rehashes, table at %zu/%lu elements",
+ nr_rehashes, nr_elements, 1UL << b->t->bits);
+out:
+ mutex_unlock(&b->lock);
+
+ return ret;
+}
+
+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c)
+{
+ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+
+ kvfree(b->t);
+}
+
+#define INITIAL_TABLE_BITS 3
+
+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c)
+{
+ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+
+ mutex_init(&b->lock);
+
+ b->t = kvmalloc(sizeof(*b->t) +
+ (sizeof(b->t->d[0]) << INITIAL_TABLE_BITS), GFP_KERNEL);
+ if (!b->t)
+ return -BCH_ERR_ENOMEM_buckets_waiting_for_journal_init;
+
+ bucket_table_init(b->t, INITIAL_TABLE_BITS);
+ return 0;
+}
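
The insert path above is a small cuckoo-style hash: each key gets one candidate slot per hash seed, any slot whose entry has already been flushed counts as free, and when every candidate is live the insert evicts a victim and retries with the displaced entry, growing and rehashing the table if that loops. A simplified userspace sketch of the same strategy (hypothetical names and a toy hash, not part of the patch):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define NR_SEEDS	3
#define TABLE_BITS	3

struct entry {
	uint64_t	key;
	uint64_t	journal_seq;
};

static struct entry table[1 << TABLE_BITS];
static const uint64_t seeds[NR_SEEDS] = {
	0x9e3779b97f4a7c15ULL, 0xc2b2ae3d27d4eb4fULL, 0x165667b19e3779f9ULL,
};

static unsigned slot(uint64_t key, unsigned seed_idx)
{
	/* stand-in for hash_64(key ^ seed, bits) */
	uint64_t h = (key ^ seeds[seed_idx]) * 0x2545f4914f6cdd1dULL;

	return h >> (64 - TABLE_BITS);
}

static bool insert(struct entry new, uint64_t flushed_seq)
{
	struct entry *last_evicted = NULL;

	for (unsigned tries = 0; tries < 10; tries++) {
		struct entry *victim = NULL;

		for (unsigned i = 0; i < NR_SEEDS; i++) {
			struct entry *e = &table[slot(new.key, i)];

			/* same key, or already flushed: slot is reusable */
			if (e->key == new.key || e->journal_seq <= flushed_seq) {
				*e = new;
				return true;
			}
			if (e != last_evicted)
				victim = e;
		}

		if (!victim)		/* every candidate was the slot we just evicted */
			return false;	/* caller would grow and rehash */

		/* cuckoo step: displace the victim, then try to re-place it */
		struct entry tmp = *victim;
		*victim = new;
		new = tmp;
		last_evicted = victim;
	}

	return false;
}

int main(void)
{
	uint64_t flushed_seq = 10;

	/* entries newer than flushed_seq must be retained */
	assert(insert((struct entry) { .key = 1, .journal_seq = 20 }, flushed_seq));
	assert(insert((struct entry) { .key = 2, .journal_seq = 25 }, flushed_seq));
	return 0;
}
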
diff --git a/libbcachefs/buckets_waiting_for_journal.h b/libbcachefs/buckets_waiting_for_journal.h
new file mode 100644
index 00000000..d2ae19cb
--- /dev/null
+++ b/libbcachefs/buckets_waiting_for_journal.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H
+#define _BUCKETS_WAITING_FOR_JOURNAL_H
+
+#include "buckets_waiting_for_journal_types.h"
+
+bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
+ u64, unsigned, u64);
+int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
+ u64, unsigned, u64, u64);
+
+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *);
+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *);
+
+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */
diff --git a/libbcachefs/buckets_waiting_for_journal_types.h b/libbcachefs/buckets_waiting_for_journal_types.h
new file mode 100644
index 00000000..e593db06
--- /dev/null
+++ b/libbcachefs/buckets_waiting_for_journal_types.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+
+#include <linux/siphash.h>
+
+struct bucket_hashed {
+ u64 dev_bucket;
+ u64 journal_seq;
+};
+
+struct buckets_waiting_for_journal_table {
+ unsigned bits;
+ u64 hash_seeds[3];
+ struct bucket_hashed d[];
+};
+
+struct buckets_waiting_for_journal {
+ struct mutex lock;
+ struct buckets_waiting_for_journal_table *t;
+};
+
+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */
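
For reference, the dev_bucket value stored in struct bucket_hashed packs the device index into the top 8 bits of a u64 and the bucket number into the low 56 bits, matching the "(u64) dev << 56 | bucket" expression in buckets_waiting_for_journal.c. A tiny illustrative check (not part of the patch):

#include <assert.h>
#include <stdint.h>

static uint64_t dev_bucket_pack(unsigned dev, uint64_t bucket)
{
	return (uint64_t) dev << 56 | bucket;
}

int main(void)
{
	uint64_t k = dev_bucket_pack(3, 123456);

	assert(k >> 56 == 3);				/* device index */
	assert((k & ((1ULL << 56) - 1)) == 123456);	/* bucket number */
	return 0;
}
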
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index 5593b9a1..46e9e321 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -1,21 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_CHARDEV
#include "bcachefs.h"
-#include "alloc.h"
#include "bcachefs_ioctl.h"
#include "buckets.h"
#include "chardev.h"
+#include "disk_accounting.h"
+#include "fsck.h"
+#include "journal.h"
#include "move.h"
-#include "super.h"
+#include "recovery_passes.h"
+#include "replicas.h"
#include "super-io.h"
+#include "thread_with_file.h"
-#include <linux/anon_inodes.h>
#include <linux/cdev.h>
#include <linux/device.h>
-#include <linux/file.h>
#include <linux/fs.h>
#include <linux/ioctl.h>
-#include <linux/kthread.h>
#include <linux/major.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
@@ -31,12 +33,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
if (dev >= c->sb.nr_devices)
return ERR_PTR(-EINVAL);
- rcu_read_lock();
- ca = rcu_dereference(c->devs[dev]);
- if (ca)
- percpu_ref_get(&ca->ref);
- rcu_read_unlock();
-
+ ca = bch2_dev_tryget_noerror(c, dev);
if (!ca)
return ERR_PTR(-EINVAL);
} else {
@@ -84,10 +81,9 @@ static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
devs[i] = strndup_user((const char __user *)(unsigned long)
user_devs[i],
PATH_MAX);
- if (!devs[i]) {
- ret = -ENOMEM;
+		ret = PTR_ERR_OR_ZERO(devs[i]);
+ if (ret)
goto err;
- }
}
c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty());
@@ -115,8 +111,9 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
return -EINVAL;
path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
- if (!path)
- return -ENOMEM;
+ ret = PTR_ERR_OR_ZERO(path);
+ if (ret)
+ return ret;
err = bch2_fs_open_incremental(path);
kfree(path);
@@ -132,6 +129,8 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
static long bch2_global_ioctl(unsigned cmd, void __user *arg)
{
+ long ret;
+
switch (cmd) {
#if 0
case BCH_IOCTL_ASSEMBLE:
@@ -139,30 +138,44 @@ static long bch2_global_ioctl(unsigned cmd, void __user *arg)
case BCH_IOCTL_INCREMENTAL:
return bch2_ioctl_incremental(arg);
#endif
+ case BCH_IOCTL_FSCK_OFFLINE: {
+ ret = bch2_ioctl_fsck_offline(arg);
+ break;
+ }
default:
- return -ENOTTY;
+ ret = -ENOTTY;
+ break;
}
+
+ if (ret < 0)
+ ret = bch2_err_class(ret);
+ return ret;
}
static long bch2_ioctl_query_uuid(struct bch_fs *c,
struct bch_ioctl_query_uuid __user *user_arg)
{
- return copy_to_user(&user_arg->uuid,
- &c->sb.user_uuid,
- sizeof(c->sb.user_uuid));
+ return copy_to_user_errcode(&user_arg->uuid, &c->sb.user_uuid,
+ sizeof(c->sb.user_uuid));
}
#if 0
static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg)
{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
if (arg.flags || arg.pad)
return -EINVAL;
- return bch2_fs_start(c) ? -EIO : 0;
+ return bch2_fs_start(c);
}
static long bch2_ioctl_stop(struct bch_fs *c)
{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
bch2_fs_stop(c);
return 0;
}
@@ -173,15 +186,20 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
char *path;
int ret;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
if (arg.flags || arg.pad)
return -EINVAL;
path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
- if (!path)
- return -ENOMEM;
+ ret = PTR_ERR_OR_ZERO(path);
+ if (ret)
+ return ret;
ret = bch2_dev_add(c, path);
- kfree(path);
+ if (!IS_ERR(path))
+ kfree(path);
return ret;
}
@@ -190,6 +208,9 @@ static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg)
{
struct bch_dev *ca;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
BCH_FORCE_IF_METADATA_LOST|
BCH_FORCE_IF_DEGRADED|
@@ -209,12 +230,16 @@ static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
char *path;
int ret;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
if (arg.flags || arg.pad)
return -EINVAL;
path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
- if (!path)
- return -ENOMEM;
+ ret = PTR_ERR_OR_ZERO(path);
+ if (ret)
+ return ret;
ret = bch2_dev_online(c, path);
kfree(path);
@@ -226,6 +251,9 @@ static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
struct bch_dev *ca;
int ret;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
BCH_FORCE_IF_METADATA_LOST|
BCH_FORCE_IF_DEGRADED|
@@ -238,7 +266,7 @@ static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
return PTR_ERR(ca);
ret = bch2_dev_offline(c, ca, arg.flags);
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return ret;
}
@@ -248,11 +276,15 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c,
struct bch_dev *ca;
int ret;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
BCH_FORCE_IF_METADATA_LOST|
BCH_FORCE_IF_DEGRADED|
BCH_BY_INDEX)) ||
- arg.pad[0] || arg.pad[1] || arg.pad[2])
+ arg.pad[0] || arg.pad[1] || arg.pad[2] ||
+ arg.new_state >= BCH_MEMBER_STATE_NR)
return -EINVAL;
ca = bch2_device_lookup(c, arg.dev, arg.flags);
@@ -260,37 +292,35 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c,
return PTR_ERR(ca);
ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags);
+ if (ret)
+ bch_err(c, "Error setting device state: %s", bch2_err_str(ret));
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
return ret;
}
struct bch_data_ctx {
+ struct thread_with_file thr;
+
struct bch_fs *c;
struct bch_ioctl_data arg;
struct bch_move_stats stats;
-
- int ret;
-
- struct task_struct *thread;
};
static int bch2_data_thread(void *arg)
{
- struct bch_data_ctx *ctx = arg;
-
- ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
+ struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr);
+ ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
ctx->stats.data_type = U8_MAX;
return 0;
}
static int bch2_data_job_release(struct inode *inode, struct file *file)
{
- struct bch_data_ctx *ctx = file->private_data;
+ struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
- kthread_stop(ctx->thread);
- put_task_struct(ctx->thread);
+ bch2_thread_with_file_exit(&ctx->thr);
kfree(ctx);
return 0;
}
@@ -298,36 +328,36 @@ static int bch2_data_job_release(struct inode *inode, struct file *file)
static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
size_t len, loff_t *ppos)
{
- struct bch_data_ctx *ctx = file->private_data;
+ struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
struct bch_fs *c = ctx->c;
struct bch_ioctl_data_event e = {
.type = BCH_DATA_EVENT_PROGRESS,
.p.data_type = ctx->stats.data_type,
- .p.btree_id = ctx->stats.iter.btree_id,
- .p.pos = ctx->stats.iter.pos,
+ .p.btree_id = ctx->stats.pos.btree,
+ .p.pos = ctx->stats.pos.pos,
.p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
- .p.sectors_total = bch2_fs_sectors_used(c, bch2_fs_usage_read(c)),
+ .p.sectors_total = bch2_fs_usage_read_short(c).used,
};
if (len < sizeof(e))
return -EINVAL;
- return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e);
+ return copy_to_user_errcode(buf, &e, sizeof(e)) ?: sizeof(e);
}
static const struct file_operations bcachefs_data_ops = {
.release = bch2_data_job_release,
.read = bch2_data_job_read,
- .llseek = no_llseek,
};
static long bch2_ioctl_data(struct bch_fs *c,
struct bch_ioctl_data arg)
{
- struct bch_data_ctx *ctx = NULL;
- struct file *file = NULL;
- unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK;
- int ret, fd = -1;
+ struct bch_data_ctx *ctx;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
if (arg.op >= BCH_DATA_OP_NR || arg.flags)
return -EINVAL;
@@ -339,112 +369,176 @@ static long bch2_ioctl_data(struct bch_fs *c,
ctx->c = c;
ctx->arg = arg;
- ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]");
- if (IS_ERR(ctx->thread)) {
- ret = PTR_ERR(ctx->thread);
- goto err;
- }
-
- ret = get_unused_fd_flags(flags);
+ ret = bch2_run_thread_with_file(&ctx->thr,
+ &bcachefs_data_ops,
+ bch2_data_thread);
if (ret < 0)
- goto err;
- fd = ret;
+ kfree(ctx);
+ return ret;
+}
- file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags);
- if (IS_ERR(file)) {
- ret = PTR_ERR(file);
+static long bch2_ioctl_fs_usage(struct bch_fs *c,
+ struct bch_ioctl_fs_usage __user *user_arg)
+{
+ struct bch_ioctl_fs_usage arg = {};
+ darray_char replicas = {};
+ u32 replica_entries_bytes;
+ int ret = 0;
+
+ if (!test_bit(BCH_FS_started, &c->flags))
+ return -EINVAL;
+
+ if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes))
+ return -EFAULT;
+
+ ret = bch2_fs_replicas_usage_read(c, &replicas) ?:
+ (replica_entries_bytes < replicas.nr ? -ERANGE : 0) ?:
+ copy_to_user_errcode(&user_arg->replicas, replicas.data, replicas.nr);
+ if (ret)
goto err;
+
+ struct bch_fs_usage_short u = bch2_fs_usage_read_short(c);
+ arg.capacity = c->capacity;
+ arg.used = u.used;
+ arg.online_reserved = percpu_u64_get(c->online_reserved);
+ arg.replica_entries_bytes = replicas.nr;
+
+ for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) {
+ struct disk_accounting_pos k = {
+ .type = BCH_DISK_ACCOUNTING_persistent_reserved,
+ .persistent_reserved.nr_replicas = i,
+ };
+
+ bch2_accounting_mem_read(c,
+ disk_accounting_pos_to_bpos(&k),
+ &arg.persistent_reserved[i], 1);
}
- fd_install(fd, file);
+ ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
+err:
+ darray_exit(&replicas);
+ return ret;
+}
+
+static long bch2_ioctl_query_accounting(struct bch_fs *c,
+ struct bch_ioctl_query_accounting __user *user_arg)
+{
+ struct bch_ioctl_query_accounting arg;
+ darray_char accounting = {};
+ int ret = 0;
+
+ if (!test_bit(BCH_FS_started, &c->flags))
+ return -EINVAL;
- get_task_struct(ctx->thread);
- wake_up_process(ctx->thread);
+ ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)) ?:
+ bch2_fs_accounting_read(c, &accounting, arg.accounting_types_mask) ?:
+ (arg.accounting_u64s * sizeof(u64) < accounting.nr ? -ERANGE : 0) ?:
+ copy_to_user_errcode(&user_arg->accounting, accounting.data, accounting.nr);
+ if (ret)
+ goto err;
- return fd;
+ arg.capacity = c->capacity;
+ arg.used = bch2_fs_usage_read_short(c).used;
+ arg.online_reserved = percpu_u64_get(c->online_reserved);
+ arg.accounting_u64s = accounting.nr / sizeof(u64);
+
+ ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
err:
- if (fd >= 0)
- put_unused_fd(fd);
- if (!IS_ERR_OR_NULL(ctx->thread))
- kthread_stop(ctx->thread);
- kfree(ctx);
+ darray_exit(&accounting);
return ret;
}
-static long bch2_ioctl_usage(struct bch_fs *c,
- struct bch_ioctl_usage __user *user_arg)
+/* obsolete, didn't allow for new data types: */
+static long bch2_ioctl_dev_usage(struct bch_fs *c,
+ struct bch_ioctl_dev_usage __user *user_arg)
{
- struct bch_ioctl_usage arg;
+ struct bch_ioctl_dev_usage arg;
+ struct bch_dev_usage src;
struct bch_dev *ca;
- unsigned i, j;
- int ret;
+ unsigned i;
- if (!test_bit(BCH_FS_STARTED, &c->flags))
+ if (!test_bit(BCH_FS_started, &c->flags))
return -EINVAL;
if (copy_from_user(&arg, user_arg, sizeof(arg)))
return -EFAULT;
- for (i = 0; i < arg.nr_devices; i++) {
- struct bch_ioctl_dev_usage dst = { .alive = 0 };
+ if ((arg.flags & ~BCH_BY_INDEX) ||
+ arg.pad[0] ||
+ arg.pad[1] ||
+ arg.pad[2])
+ return -EINVAL;
- ret = copy_to_user(&user_arg->devs[i], &dst, sizeof(dst));
- if (ret)
- return ret;
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ src = bch2_dev_usage_read(ca);
+
+ arg.state = ca->mi.state;
+ arg.bucket_size = ca->mi.bucket_size;
+ arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
+
+ for (i = 0; i < ARRAY_SIZE(arg.d); i++) {
+ arg.d[i].buckets = src.d[i].buckets;
+ arg.d[i].sectors = src.d[i].sectors;
+ arg.d[i].fragmented = src.d[i].fragmented;
}
- {
- struct bch_fs_usage src = bch2_fs_usage_read(c);
- struct bch_ioctl_fs_usage dst = {
- .capacity = c->capacity,
- .used = bch2_fs_sectors_used(c, src),
- .online_reserved = src.online_reserved,
- };
+ bch2_dev_put(ca);
- for (i = 0; i < BCH_REPLICAS_MAX; i++) {
- dst.persistent_reserved[i] =
- src.s[i].persistent_reserved;
+ return copy_to_user_errcode(user_arg, &arg, sizeof(arg));
+}
- for (j = 0; j < S_ALLOC_NR; j++)
- dst.sectors[s_alloc_to_data_type(j)][i] =
- src.s[i].data[j];
- }
+static long bch2_ioctl_dev_usage_v2(struct bch_fs *c,
+ struct bch_ioctl_dev_usage_v2 __user *user_arg)
+{
+ struct bch_ioctl_dev_usage_v2 arg;
+ struct bch_dev_usage src;
+ struct bch_dev *ca;
+ int ret = 0;
- ret = copy_to_user(&user_arg->fs, &dst, sizeof(dst));
- if (ret)
- return ret;
- }
+ if (!test_bit(BCH_FS_started, &c->flags))
+ return -EINVAL;
- for_each_member_device(ca, c, i) {
- struct bch_dev_usage src = bch2_dev_usage_read(c, ca);
- struct bch_ioctl_dev_usage dst = {
- .alive = 1,
- .state = ca->mi.state,
- .bucket_size = ca->mi.bucket_size,
- .nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket,
- };
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
- if (ca->dev_idx >= arg.nr_devices) {
- percpu_ref_put(&ca->ref);
- return -ERANGE;
- }
+ if ((arg.flags & ~BCH_BY_INDEX) ||
+ arg.pad[0] ||
+ arg.pad[1] ||
+ arg.pad[2])
+ return -EINVAL;
- if (percpu_ref_tryget(&ca->io_ref)) {
- dst.dev = huge_encode_dev(ca->disk_sb.bdev->bd_dev);
- percpu_ref_put(&ca->io_ref);
- }
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
- for (j = 0; j < BCH_DATA_NR; j++) {
- dst.buckets[j] = src.buckets[j];
- dst.sectors[j] = src.sectors[j];
- }
+ src = bch2_dev_usage_read(ca);
+
+ arg.state = ca->mi.state;
+ arg.bucket_size = ca->mi.bucket_size;
+ arg.nr_data_types = min(arg.nr_data_types, BCH_DATA_NR);
+ arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
+
+ ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
+ if (ret)
+ goto err;
+
+ for (unsigned i = 0; i < arg.nr_data_types; i++) {
+ struct bch_ioctl_dev_usage_type t = {
+ .buckets = src.d[i].buckets,
+ .sectors = src.d[i].sectors,
+ .fragmented = src.d[i].fragmented,
+ };
- ret = copy_to_user(&user_arg->devs[i], &dst, sizeof(dst));
+ ret = copy_to_user_errcode(&user_arg->d[i], &t, sizeof(t));
if (ret)
- return ret;
+ goto err;
}
-
- return 0;
+err:
+ bch2_dev_put(ca);
+ return ret;
}
static long bch2_ioctl_read_super(struct bch_fs *c,
@@ -454,6 +548,9 @@ static long bch2_ioctl_read_super(struct bch_fs *c,
struct bch_sb *sb;
int ret = 0;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) ||
arg.pad)
return -EINVAL;
@@ -462,11 +559,9 @@ static long bch2_ioctl_read_super(struct bch_fs *c,
if (arg.flags & BCH_READ_DEV) {
ca = bch2_device_lookup(c, arg.dev, arg.flags);
-
- if (IS_ERR(ca)) {
- ret = PTR_ERR(ca);
- goto err;
- }
+ ret = PTR_ERR_OR_ZERO(ca);
+ if (ret)
+ goto err_unlock;
sb = ca->disk_sb.sb;
} else {
@@ -478,11 +573,11 @@ static long bch2_ioctl_read_super(struct bch_fs *c,
goto err;
}
- ret = copy_to_user((void __user *)(unsigned long)arg.sb,
- sb, vstruct_bytes(sb));
+ ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb,
+ vstruct_bytes(sb));
err:
- if (ca)
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
+err_unlock:
mutex_unlock(&c->sb_lock);
return ret;
}
@@ -491,16 +586,20 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
struct bch_ioctl_disk_get_idx arg)
{
dev_t dev = huge_decode_dev(arg.dev);
- struct bch_dev *ca;
- unsigned i;
- for_each_online_member(ca, c, i)
- if (ca->disk_sb.bdev->bd_dev == dev) {
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!dev)
+ return -EINVAL;
+
+ for_each_online_member(c, ca)
+ if (ca->dev == dev) {
percpu_ref_put(&ca->io_ref);
- return i;
+ return ca->dev_idx;
}
- return -ENOENT;
+ return -BCH_ERR_ENOENT_dev_idx_not_found;
}
static long bch2_ioctl_disk_resize(struct bch_fs *c,
@@ -509,6 +608,9 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c,
struct bch_dev *ca;
int ret;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
if ((arg.flags & ~BCH_BY_INDEX) ||
arg.pad)
return -EINVAL;
@@ -519,7 +621,33 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c,
ret = bch2_dev_resize(c, ca, arg.nbuckets);
- percpu_ref_put(&ca->ref);
+ bch2_dev_put(ca);
+ return ret;
+}
+
+static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
+ struct bch_ioctl_disk_resize_journal arg)
+{
+ struct bch_dev *ca;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~BCH_BY_INDEX) ||
+ arg.pad)
+ return -EINVAL;
+
+ if (arg.nbuckets > U32_MAX)
+ return -EINVAL;
+
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets);
+
+ bch2_dev_put(ca);
return ret;
}
@@ -529,23 +657,23 @@ do { \
\
if (copy_from_user(&i, arg, sizeof(i))) \
return -EFAULT; \
- return bch2_ioctl_##_name(c, i); \
+ ret = bch2_ioctl_##_name(c, i); \
+ goto out; \
} while (0)
long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
{
- /* ioctls that don't require admin cap: */
+ long ret;
+
switch (cmd) {
case BCH_IOCTL_QUERY_UUID:
return bch2_ioctl_query_uuid(c, arg);
- case BCH_IOCTL_USAGE:
- return bch2_ioctl_usage(c, arg);
- }
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- switch (cmd) {
+ case BCH_IOCTL_FS_USAGE:
+ return bch2_ioctl_fs_usage(c, arg);
+ case BCH_IOCTL_DEV_USAGE:
+ return bch2_ioctl_dev_usage(c, arg);
+ case BCH_IOCTL_DEV_USAGE_V2:
+ return bch2_ioctl_dev_usage_v2(c, arg);
#if 0
case BCH_IOCTL_START:
BCH_IOCTL(start, struct bch_ioctl_start);
@@ -558,10 +686,9 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
}
- if (!test_bit(BCH_FS_STARTED, &c->flags))
+ if (!test_bit(BCH_FS_started, &c->flags))
return -EINVAL;
- /* ioctls that do require admin cap: */
switch (cmd) {
case BCH_IOCTL_DISK_ADD:
BCH_IOCTL(disk_add, struct bch_ioctl_disk);
@@ -577,10 +704,19 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
BCH_IOCTL(data, struct bch_ioctl_data);
case BCH_IOCTL_DISK_RESIZE:
BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
-
+ case BCH_IOCTL_DISK_RESIZE_JOURNAL:
+ BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal);
+ case BCH_IOCTL_FSCK_ONLINE:
+ BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online);
+ case BCH_IOCTL_QUERY_ACCOUNTING:
+ return bch2_ioctl_query_accounting(c, arg);
default:
return -ENOTTY;
}
+out:
+ if (ret < 0)
+ ret = bch2_err_class(ret);
+ return ret;
}
static DEFINE_IDR(bch_chardev_minor);
@@ -603,7 +739,9 @@ static const struct file_operations bch_chardev_fops = {
};
static int bch_chardev_major;
-static struct class *bch_chardev_class;
+static const struct class bch_chardev_class = {
+ .name = "bcachefs",
+};
static struct device *bch_chardev;
void bch2_fs_chardev_exit(struct bch_fs *c)
@@ -620,7 +758,7 @@ int bch2_fs_chardev_init(struct bch_fs *c)
if (c->minor < 0)
return c->minor;
- c->chardev = device_create(bch_chardev_class, NULL,
+ c->chardev = device_create(&bch_chardev_class, NULL,
MKDEV(bch_chardev_major, c->minor), c,
"bcachefs%u-ctl", c->minor);
if (IS_ERR(c->chardev))
@@ -631,32 +769,39 @@ int bch2_fs_chardev_init(struct bch_fs *c)
void bch2_chardev_exit(void)
{
- if (!IS_ERR_OR_NULL(bch_chardev_class))
- device_destroy(bch_chardev_class,
- MKDEV(bch_chardev_major, U8_MAX));
- if (!IS_ERR_OR_NULL(bch_chardev_class))
- class_destroy(bch_chardev_class);
+ device_destroy(&bch_chardev_class, MKDEV(bch_chardev_major, U8_MAX));
+ class_unregister(&bch_chardev_class);
if (bch_chardev_major > 0)
unregister_chrdev(bch_chardev_major, "bcachefs");
}
int __init bch2_chardev_init(void)
{
+ int ret;
+
bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops);
if (bch_chardev_major < 0)
return bch_chardev_major;
- bch_chardev_class = class_create(THIS_MODULE, "bcachefs");
- if (IS_ERR(bch_chardev_class))
- return PTR_ERR(bch_chardev_class);
+ ret = class_register(&bch_chardev_class);
+ if (ret)
+ goto major_out;
- bch_chardev = device_create(bch_chardev_class, NULL,
+ bch_chardev = device_create(&bch_chardev_class, NULL,
MKDEV(bch_chardev_major, U8_MAX),
NULL, "bcachefs-ctl");
- if (IS_ERR(bch_chardev))
- return PTR_ERR(bch_chardev);
+ if (IS_ERR(bch_chardev)) {
+ ret = PTR_ERR(bch_chardev);
+ goto class_out;
+ }
return 0;
+
+class_out:
+ class_unregister(&bch_chardev_class);
+major_out:
+ unregister_chrdev(bch_chardev_major, "bcachefs-ctl");
+ return ret;
}
#endif /* NO_BCACHEFS_CHARDEV */
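
The ioctl conversions above lean on copy_to_user_errcode()/copy_from_user_errcode() helpers whose definitions are not part of this hunk; they are assumed to translate the "bytes not copied" return of copy_{to,from}_user() into 0 or -EFAULT, so the result can be chained with the ?: operator as in bch2_ioctl_fs_usage(). A hedged sketch of what such helpers would look like:

#include <linux/uaccess.h>

/* assumed shape of the helpers used above (not shown in this diff) */
static inline int copy_to_user_errcode(void __user *to, const void *from,
				       unsigned long n)
{
	return copy_to_user(to, from, n) ? -EFAULT : 0;
}

static inline int copy_from_user_errcode(void *to, const void __user *from,
					 unsigned long n)
{
	return copy_from_user(to, from, n) ? -EFAULT : 0;
}
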
diff --git a/libbcachefs/chardev.h b/libbcachefs/chardev.h
index c3057b07..0f563ca5 100644
--- a/libbcachefs/chardev.h
+++ b/libbcachefs/chardev.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_CHARDEV_H
#define _BCACHEFS_CHARDEV_H
@@ -16,7 +17,7 @@ int __init bch2_chardev_init(void);
static inline long bch2_fs_ioctl(struct bch_fs *c,
unsigned cmd, void __user * arg)
{
- return -ENOSYS;
+ return -ENOTTY;
}
static inline void bch2_fs_chardev_exit(struct bch_fs *c) {}
diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c
index 28d086bc..23a38357 100644
--- a/libbcachefs/checksum.c
+++ b/libbcachefs/checksum.c
@@ -1,276 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "checksum.h"
+#include "errcode.h"
+#include "error.h"
#include "super.h"
#include "super-io.h"
#include <linux/crc32c.h>
#include <linux/crypto.h>
+#include <linux/xxhash.h>
#include <linux/key.h>
#include <linux/random.h>
+#include <linux/ratelimit.h>
#include <linux/scatterlist.h>
#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
#include <crypto/hash.h>
#include <crypto/poly1305.h>
+#include <crypto/skcipher.h>
#include <keys/user-type.h>
/*
- * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
- * use permitted, subject to terms of PostgreSQL license; see.)
-
- * If we have a 64-bit integer type, then a 64-bit CRC looks just like the
- * usual sort of implementation. (See Ross Williams' excellent introduction
- * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from
- * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.)
- * If we have no working 64-bit type, then fake it with two 32-bit registers.
- *
- * The present implementation is a normal (not "reflected", in Williams'
- * terms) 64-bit CRC, using initial all-ones register contents and a final
- * bit inversion. The chosen polynomial is borrowed from the DLT1 spec
- * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM):
- *
- * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
- * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
- * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
- * x^7 + x^4 + x + 1
-*/
-
-static const u64 crc_table[256] = {
- 0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL,
- 0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL,
- 0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL,
- 0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL,
- 0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL,
- 0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL,
- 0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL,
- 0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL,
- 0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL,
- 0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL,
- 0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL,
- 0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL,
- 0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL,
- 0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL,
- 0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL,
- 0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL,
- 0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL,
- 0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL,
- 0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL,
- 0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL,
- 0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL,
- 0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL,
- 0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL,
- 0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL,
- 0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL,
- 0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL,
- 0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL,
- 0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL,
- 0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL,
- 0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL,
- 0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL,
- 0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL,
- 0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL,
- 0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL,
- 0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL,
- 0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL,
- 0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL,
- 0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL,
- 0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL,
- 0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL,
- 0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL,
- 0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL,
- 0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL,
- 0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL,
- 0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL,
- 0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL,
- 0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL,
- 0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL,
- 0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL,
- 0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL,
- 0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL,
- 0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL,
- 0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL,
- 0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL,
- 0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL,
- 0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL,
- 0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL,
- 0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL,
- 0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL,
- 0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL,
- 0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL,
- 0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL,
- 0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL,
- 0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL,
- 0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL,
- 0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL,
- 0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL,
- 0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL,
- 0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL,
- 0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL,
- 0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL,
- 0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL,
- 0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL,
- 0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL,
- 0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL,
- 0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL,
- 0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL,
- 0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL,
- 0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL,
- 0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL,
- 0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL,
- 0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL,
- 0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL,
- 0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL,
- 0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL,
- 0x9AFCE626CE85B507ULL,
+ * struct bch2_checksum_state abstracts the checksum state calculated over different pages.
+ * It allows pages to be merged without the checksum algorithm losing its state.
+ * For native checksum algorithms (like CRC), a default seed value will do;
+ * for hash-like algorithms, a full state needs to be stored.
+ */
+
+struct bch2_checksum_state {
+ union {
+ u64 seed;
+ struct xxh64_state h64state;
+ };
+ unsigned int type;
};
-u64 bch2_crc64_update(u64 crc, const void *_data, size_t len)
+static void bch2_checksum_init(struct bch2_checksum_state *state)
{
- const unsigned char *data = _data;
-
- while (len--) {
- int i = ((int) (crc >> 56) ^ *data++) & 0xFF;
- crc = crc_table[i] ^ (crc << 8);
- }
-
- return crc;
-}
-
-static u64 bch2_checksum_init(unsigned type)
-{
- switch (type) {
- case BCH_CSUM_NONE:
- return 0;
- case BCH_CSUM_CRC32C_NONZERO:
- return U32_MAX;
- case BCH_CSUM_CRC64_NONZERO:
- return U64_MAX;
- case BCH_CSUM_CRC32C:
- return 0;
- case BCH_CSUM_CRC64:
- return 0;
+ switch (state->type) {
+ case BCH_CSUM_none:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_crc64:
+ state->seed = 0;
+ break;
+ case BCH_CSUM_crc32c_nonzero:
+ state->seed = U32_MAX;
+ break;
+ case BCH_CSUM_crc64_nonzero:
+ state->seed = U64_MAX;
+ break;
+ case BCH_CSUM_xxhash:
+ xxh64_reset(&state->h64state, 0);
+ break;
default:
BUG();
}
}
-static u64 bch2_checksum_final(unsigned type, u64 crc)
+static u64 bch2_checksum_final(const struct bch2_checksum_state *state)
{
- switch (type) {
- case BCH_CSUM_NONE:
- return 0;
- case BCH_CSUM_CRC32C_NONZERO:
- return crc ^ U32_MAX;
- case BCH_CSUM_CRC64_NONZERO:
- return crc ^ U64_MAX;
- case BCH_CSUM_CRC32C:
- return crc;
- case BCH_CSUM_CRC64:
- return crc;
+ switch (state->type) {
+ case BCH_CSUM_none:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_crc64:
+ return state->seed;
+ case BCH_CSUM_crc32c_nonzero:
+ return state->seed ^ U32_MAX;
+ case BCH_CSUM_crc64_nonzero:
+ return state->seed ^ U64_MAX;
+ case BCH_CSUM_xxhash:
+ return xxh64_digest(&state->h64state);
default:
BUG();
}
}
-static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t len)
+static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len)
{
- switch (type) {
- case BCH_CSUM_NONE:
- return 0;
- case BCH_CSUM_CRC32C_NONZERO:
- case BCH_CSUM_CRC32C:
- return crc32c(crc, data, len);
- case BCH_CSUM_CRC64_NONZERO:
- case BCH_CSUM_CRC64:
- return bch2_crc64_update(crc, data, len);
+ switch (state->type) {
+ case BCH_CSUM_none:
+ return;
+ case BCH_CSUM_crc32c_nonzero:
+ case BCH_CSUM_crc32c:
+ state->seed = crc32c(state->seed, data, len);
+ break;
+ case BCH_CSUM_crc64_nonzero:
+ case BCH_CSUM_crc64:
+ state->seed = crc64_be(state->seed, data, len);
+ break;
+ case BCH_CSUM_xxhash:
+ xxh64_update(&state->h64state, data, len);
+ break;
default:
BUG();
}
}
-static inline void do_encrypt_sg(struct crypto_skcipher *tfm,
- struct nonce nonce,
- struct scatterlist *sg, size_t len)
+static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
+ struct nonce nonce,
+ struct scatterlist *sg, size_t len)
{
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
- int ret;
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
- skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_sync_tfm(req, tfm);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
- ret = crypto_skcipher_encrypt(req);
- BUG_ON(ret);
+ int ret = crypto_skcipher_encrypt(req);
+ if (ret)
+ pr_err("got error %i from crypto_skcipher_encrypt()", ret);
+
+ return ret;
}
-static inline void do_encrypt(struct crypto_skcipher *tfm,
+static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
void *buf, size_t len)
{
- struct scatterlist sg;
+ if (!is_vmalloc_addr(buf)) {
+ struct scatterlist sg = {};
+
+ sg_mark_end(&sg);
+ sg_set_page(&sg, virt_to_page(buf), len, offset_in_page(buf));
+ return do_encrypt_sg(tfm, nonce, &sg, len);
+ } else {
+ DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
+ size_t sgl_len = 0;
+ int ret;
+
+ darray_init(&sgl);
+
+ while (len) {
+ unsigned offset = offset_in_page(buf);
+ struct scatterlist sg = {
+ .page_link = (unsigned long) vmalloc_to_page(buf),
+ .offset = offset,
+ .length = min(len, PAGE_SIZE - offset),
+ };
- sg_init_one(&sg, buf, len);
- do_encrypt_sg(tfm, nonce, &sg, len);
+ if (darray_push(&sgl, sg)) {
+ sg_mark_end(&darray_last(sgl));
+ ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
+ if (ret)
+ goto err;
+
+ nonce = nonce_add(nonce, sgl_len);
+ sgl_len = 0;
+ sgl.nr = 0;
+ BUG_ON(darray_push(&sgl, sg));
+ }
+
+ buf += sg.length;
+ len -= sg.length;
+ sgl_len += sg.length;
+ }
+
+ sg_mark_end(&darray_last(sgl));
+ ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
+err:
+ darray_exit(&sgl);
+ return ret;
+ }
}
int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
- void *buf, size_t len)
+ void *buf, size_t len)
{
- struct crypto_skcipher *chacha20 =
- crypto_alloc_skcipher("chacha20", 0, 0);
+ struct crypto_sync_skcipher *chacha20 =
+ crypto_alloc_sync_skcipher("chacha20", 0, 0);
int ret;
- if (!chacha20) {
- pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20));
- return PTR_ERR(chacha20);
+ ret = PTR_ERR_OR_ZERO(chacha20);
+ if (ret) {
+ pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret));
+ return ret;
}
- ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key));
+ ret = crypto_skcipher_setkey(&chacha20->base,
+ (void *) key, sizeof(*key));
if (ret) {
- pr_err("crypto_skcipher_setkey() error: %i", ret);
+ pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret));
goto err;
}
- do_encrypt(chacha20, nonce, buf, len);
+ ret = do_encrypt(chacha20, nonce, buf, len);
err:
- crypto_free_skcipher(chacha20);
+ crypto_free_sync_skcipher(chacha20);
return ret;
}
-static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
- struct nonce nonce)
+static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
+ struct nonce nonce)
{
u8 key[POLY1305_KEY_SIZE];
+ int ret;
nonce.d[3] ^= BCH_NONCE_POLY;
memset(key, 0, sizeof(key));
- do_encrypt(c->chacha20, nonce, key, sizeof(key));
+ ret = do_encrypt(c->chacha20, nonce, key, sizeof(key));
+ if (ret)
+ return ret;
desc->tfm = c->poly1305;
- desc->flags = 0;
crypto_shash_init(desc);
crypto_shash_update(desc, key, sizeof(key));
+ return 0;
}
struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
struct nonce nonce, const void *data, size_t len)
{
switch (type) {
- case BCH_CSUM_NONE:
- case BCH_CSUM_CRC32C_NONZERO:
- case BCH_CSUM_CRC64_NONZERO:
- case BCH_CSUM_CRC32C:
- case BCH_CSUM_CRC64: {
- u64 crc = bch2_checksum_init(type);
+ case BCH_CSUM_none:
+ case BCH_CSUM_crc32c_nonzero:
+ case BCH_CSUM_crc64_nonzero:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_xxhash:
+ case BCH_CSUM_crc64: {
+ struct bch2_checksum_state state;
- crc = bch2_checksum_update(type, crc, data, len);
- crc = bch2_checksum_final(type, crc);
+ state.type = type;
- return (struct bch_csum) { .lo = cpu_to_le64(crc) };
+ bch2_checksum_init(&state);
+ bch2_checksum_update(&state, data, len);
+
+ return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
}
- case BCH_CSUM_CHACHA20_POLY1305_80:
- case BCH_CSUM_CHACHA20_POLY1305_128: {
+ case BCH_CSUM_chacha20_poly1305_80:
+ case BCH_CSUM_chacha20_poly1305_128: {
SHASH_DESC_ON_STACK(desc, c->poly1305);
u8 digest[POLY1305_DIGEST_SIZE];
struct bch_csum ret = { 0 };
@@ -284,17 +243,21 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
return ret;
}
default:
- BUG();
+ return (struct bch_csum) {};
}
}
-void bch2_encrypt(struct bch_fs *c, unsigned type,
+int bch2_encrypt(struct bch_fs *c, unsigned type,
struct nonce nonce, void *data, size_t len)
{
if (!bch2_csum_type_is_encryption(type))
- return;
+ return 0;
- do_encrypt(c->chacha20, nonce, data, len);
+ if (bch2_fs_inconsistent_on(!c->chacha20,
+ c, "attempting to encrypt without encryption key"))
+ return -BCH_ERR_no_encryption_key;
+
+ return do_encrypt(c->chacha20, nonce, data, len);
}
static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
@@ -304,33 +267,35 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
struct bio_vec bv;
switch (type) {
- case BCH_CSUM_NONE:
+ case BCH_CSUM_none:
return (struct bch_csum) { 0 };
- case BCH_CSUM_CRC32C_NONZERO:
- case BCH_CSUM_CRC64_NONZERO:
- case BCH_CSUM_CRC32C:
- case BCH_CSUM_CRC64: {
- u64 crc = bch2_checksum_init(type);
+ case BCH_CSUM_crc32c_nonzero:
+ case BCH_CSUM_crc64_nonzero:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_xxhash:
+ case BCH_CSUM_crc64: {
+ struct bch2_checksum_state state;
+
+ state.type = type;
+ bch2_checksum_init(&state);
#ifdef CONFIG_HIGHMEM
__bio_for_each_segment(bv, bio, *iter, *iter) {
- void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
- crc = bch2_checksum_update(type,
- crc, p, bv.bv_len);
- kunmap_atomic(p);
+ void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
+
+ bch2_checksum_update(&state, p, bv.bv_len);
+ kunmap_local(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
- crc = bch2_checksum_update(type, crc,
- page_address(bv.bv_page) + bv.bv_offset,
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
+ bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
#endif
- crc = bch2_checksum_final(type, crc);
- return (struct bch_csum) { .lo = cpu_to_le64(crc) };
+ return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
}
- case BCH_CSUM_CHACHA20_POLY1305_80:
- case BCH_CSUM_CHACHA20_POLY1305_128: {
+ case BCH_CSUM_chacha20_poly1305_80:
+ case BCH_CSUM_chacha20_poly1305_128: {
SHASH_DESC_ON_STACK(desc, c->poly1305);
u8 digest[POLY1305_DIGEST_SIZE];
struct bch_csum ret = { 0 };
@@ -339,13 +304,13 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
#ifdef CONFIG_HIGHMEM
__bio_for_each_segment(bv, bio, *iter, *iter) {
- void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
+ void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
crypto_shash_update(desc, p, bv.bv_len);
- kunmap_atomic(p);
+ kunmap_local(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crypto_shash_update(desc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -356,7 +321,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
return ret;
}
default:
- BUG();
+ return (struct bch_csum) {};
}
}
@@ -368,66 +333,70 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
return __bch2_checksum_bio(c, type, nonce, bio, &iter);
}
-void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
- struct nonce nonce, struct bio *bio)
+int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
+ struct nonce nonce, struct bio *bio)
{
struct bio_vec bv;
struct bvec_iter iter;
- struct scatterlist sgl[16], *sg = sgl;
- size_t bytes = 0;
+ DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
+ size_t sgl_len = 0;
+ int ret = 0;
- if (!bch2_csum_type_is_encryption(type))
- return;
+ if (bch2_fs_inconsistent_on(!c->chacha20,
+ c, "attempting to encrypt without encryption key"))
+ return -BCH_ERR_no_encryption_key;
- sg_init_table(sgl, ARRAY_SIZE(sgl));
+ darray_init(&sgl);
bio_for_each_segment(bv, bio, iter) {
- if (sg == sgl + ARRAY_SIZE(sgl)) {
- sg_mark_end(sg - 1);
- do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
-
- nonce = nonce_add(nonce, bytes);
- bytes = 0;
-
- sg_init_table(sgl, ARRAY_SIZE(sgl));
- sg = sgl;
+ struct scatterlist sg = {
+ .page_link = (unsigned long) bv.bv_page,
+ .offset = bv.bv_offset,
+ .length = bv.bv_len,
+ };
+
+ if (darray_push(&sgl, sg)) {
+ sg_mark_end(&darray_last(sgl));
+ ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
+ if (ret)
+ goto err;
+
+ nonce = nonce_add(nonce, sgl_len);
+ sgl_len = 0;
+ sgl.nr = 0;
+
+ BUG_ON(darray_push(&sgl, sg));
}
- sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
- bytes += bv.bv_len;
+ sgl_len += sg.length;
}
- sg_mark_end(sg - 1);
- do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+ sg_mark_end(&darray_last(sgl));
+ ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
+err:
+ darray_exit(&sgl);
+ return ret;
}
-static inline bool bch2_checksum_mergeable(unsigned type)
+struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
+ struct bch_csum b, size_t b_len)
{
+ struct bch2_checksum_state state;
- switch (type) {
- case BCH_CSUM_NONE:
- case BCH_CSUM_CRC32C:
- case BCH_CSUM_CRC64:
- return true;
- default:
- return false;
- }
-}
+ state.type = type;
+ bch2_checksum_init(&state);
+ state.seed = le64_to_cpu(a.lo);
-static struct bch_csum bch2_checksum_merge(unsigned type,
- struct bch_csum a,
- struct bch_csum b, size_t b_len)
-{
BUG_ON(!bch2_checksum_mergeable(type));
while (b_len) {
- unsigned b = min_t(unsigned, b_len, PAGE_SIZE);
+ unsigned page_len = min_t(unsigned, b_len, PAGE_SIZE);
- a.lo = bch2_checksum_update(type, a.lo,
- page_address(ZERO_PAGE(0)), b);
- b_len -= b;
+ bch2_checksum_update(&state,
+ page_address(ZERO_PAGE(0)), page_len);
+ b_len -= page_len;
}
-
+ a.lo = cpu_to_le64(bch2_checksum_final(&state));
a.lo ^= b.lo;
a.hi ^= b.hi;
return a;
@@ -450,9 +419,9 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
unsigned csum_type;
struct bch_csum csum;
} splits[3] = {
- { crc_a, len_a, new_csum_type },
- { crc_b, len_b, new_csum_type },
- { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type },
+ { crc_a, len_a, new_csum_type, { 0 }},
+ { crc_b, len_b, new_csum_type, { 0 } },
+ { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type, { 0 } },
}, *i;
bool mergeable = crc_old.csum_type == new_csum_type &&
bch2_checksum_mergeable(new_csum_type);
@@ -460,7 +429,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
BUG_ON(len_a + len_b > bio_sectors(bio));
BUG_ON(crc_old.uncompressed_size != bio_sectors(bio));
- BUG_ON(crc_old.compression_type);
+ BUG_ON(crc_is_compressed(crc_old));
BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) !=
bch2_csum_type_is_encryption(new_csum_type));
@@ -482,13 +451,29 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
merged = bch2_checksum_bio(c, crc_old.csum_type,
extent_nonce(version, crc_old), bio);
- if (bch2_crc_cmp(merged, crc_old.csum))
+ if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n"
+ " expected %0llx:%0llx got %0llx:%0llx (old type ",
+ __func__,
+ crc_old.csum.hi,
+ crc_old.csum.lo,
+ merged.hi,
+ merged.lo);
+ bch2_prt_csum_type(&buf, crc_old.csum_type);
+ prt_str(&buf, " new type ");
+ bch2_prt_csum_type(&buf, new_csum_type);
+ prt_str(&buf, ")");
+ WARN_RATELIMIT(1, "%s", buf.buf);
+ printbuf_exit(&buf);
return -EIO;
+ }
for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
if (i->crc)
*i->crc = (struct bch_extent_crc_unpacked) {
.csum_type = i->csum_type,
+ .compression_type = crc_old.compression_type,
.compressed_size = i->len,
.uncompressed_size = i->len,
.offset = 0,
@@ -504,18 +489,51 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
return 0;
}
+/* BCH_SB_FIELD_crypt: */
+
+static int bch2_sb_crypt_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
+{
+ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
+
+ if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
+ prt_printf(err, "wrong size (got %zu should be %zu)",
+ vstruct_bytes(&crypt->field), sizeof(*crypt));
+ return -BCH_ERR_invalid_sb_crypt;
+ }
+
+ if (BCH_CRYPT_KDF_TYPE(crypt)) {
+ prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
+ return -BCH_ERR_invalid_sb_crypt;
+ }
+
+ return 0;
+}
+
+static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
+
+ prt_printf(out, "KFD: %llu\n", BCH_CRYPT_KDF_TYPE(crypt));
+ prt_printf(out, "scrypt n: %llu\n", BCH_KDF_SCRYPT_N(crypt));
+ prt_printf(out, "scrypt r: %llu\n", BCH_KDF_SCRYPT_R(crypt));
+ prt_printf(out, "scrypt p: %llu\n", BCH_KDF_SCRYPT_P(crypt));
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
+ .validate = bch2_sb_crypt_validate,
+ .to_text = bch2_sb_crypt_to_text,
+};
+
#ifdef __KERNEL__
-int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+static int __bch2_request_key(char *key_description, struct bch_key *key)
{
- char key_description[60];
struct key *keyring_key;
const struct user_key_payload *ukp;
int ret;
- snprintf(key_description, sizeof(key_description),
- "bcachefs:%pUb", &sb->user_uuid);
-
- keyring_key = request_key(&key_type_logon, key_description, NULL);
+ keyring_key = request_key(&key_type_user, key_description, NULL);
if (IS_ERR(keyring_key))
return PTR_ERR(keyring_key);
@@ -534,27 +552,83 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
}
#else
#include <keyutils.h>
-#include <uuid/uuid.h>
-int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+static int __bch2_request_key(char *key_description, struct bch_key *key)
{
key_serial_t key_id;
- char key_description[60];
- char uuid[40];
- uuid_unparse_lower(sb->user_uuid.b, uuid);
- sprintf(key_description, "bcachefs:%s", uuid);
+ key_id = request_key("user", key_description, NULL,
+ KEY_SPEC_SESSION_KEYRING);
+ if (key_id >= 0)
+ goto got_key;
key_id = request_key("user", key_description, NULL,
KEY_SPEC_USER_KEYRING);
- if (key_id < 0)
- return -errno;
+ if (key_id >= 0)
+ goto got_key;
+
+ key_id = request_key("user", key_description, NULL,
+ KEY_SPEC_USER_SESSION_KEYRING);
+ if (key_id >= 0)
+ goto got_key;
+
+ return -errno;
+got_key:
if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key))
return -1;
return 0;
}
+
+#include "crypto.h"
+#endif
+
+int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+{
+ struct printbuf key_description = PRINTBUF;
+ int ret;
+
+ prt_printf(&key_description, "bcachefs:");
+ pr_uuid(&key_description, sb->user_uuid.b);
+
+ ret = __bch2_request_key(key_description.buf, key);
+ printbuf_exit(&key_description);
+
+#ifndef __KERNEL__
+ if (ret) {
+ char *passphrase = read_passphrase("Enter passphrase: ");
+ struct bch_encrypted_key sb_key;
+
+ bch2_passphrase_check(sb, passphrase,
+ key, &sb_key);
+ ret = 0;
+ }
+#endif
+
+ /* stash with memfd, pass memfd fd to mount */
+
+ return ret;
+}
+
+#ifndef __KERNEL__
+int bch2_revoke_key(struct bch_sb *sb)
+{
+ key_serial_t key_id;
+ struct printbuf key_description = PRINTBUF;
+
+ prt_printf(&key_description, "bcachefs:");
+ pr_uuid(&key_description, sb->user_uuid.b);
+
+ key_id = request_key("user", key_description.buf, NULL, KEY_SPEC_USER_KEYRING);
+ printbuf_exit(&key_description);
+ if (key_id < 0)
+ return errno;
+
+ keyctl_revoke(key_id);
+
+ return 0;
+}
#endif
int bch2_decrypt_sb_key(struct bch_fs *c,
@@ -571,13 +645,13 @@ int bch2_decrypt_sb_key(struct bch_fs *c,
ret = bch2_request_key(c->disk_sb.sb, &user_key);
if (ret) {
- bch_err(c, "error requesting encryption key: %i", ret);
+ bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
goto err;
}
/* decrypt real key: */
ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
- &sb_key, sizeof(sb_key));
+ &sb_key, sizeof(sb_key));
if (ret)
goto err;
@@ -596,22 +670,26 @@ err:
static int bch2_alloc_ciphers(struct bch_fs *c)
{
- if (!c->chacha20)
- c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0);
- if (IS_ERR(c->chacha20)) {
- bch_err(c, "error requesting chacha20 module: %li",
- PTR_ERR(c->chacha20));
- return PTR_ERR(c->chacha20);
+ if (c->chacha20)
+ return 0;
+
+ struct crypto_sync_skcipher *chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
+ int ret = PTR_ERR_OR_ZERO(chacha20);
+ if (ret) {
+ bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret));
+ return ret;
}
- if (!c->poly1305)
- c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
- if (IS_ERR(c->poly1305)) {
- bch_err(c, "error requesting poly1305 module: %li",
- PTR_ERR(c->poly1305));
- return PTR_ERR(c->poly1305);
+ struct crypto_shash *poly1305 = crypto_alloc_shash("poly1305", 0, 0);
+ ret = PTR_ERR_OR_ZERO(poly1305);
+ if (ret) {
+ bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret));
+ crypto_free_sync_skcipher(chacha20);
+ return ret;
}
+ c->chacha20 = chacha20;
+ c->poly1305 = poly1305;
return 0;
}
@@ -623,7 +701,7 @@ int bch2_disable_encryption(struct bch_fs *c)
mutex_lock(&c->sb_lock);
- crypt = bch2_sb_get_crypt(c->disk_sb.sb);
+ crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
if (!crypt)
goto out;
@@ -636,7 +714,7 @@ int bch2_disable_encryption(struct bch_fs *c)
if (ret)
goto out;
- crypt->key.magic = BCH_KEY_MAGIC;
+ crypt->key.magic = cpu_to_le64(BCH_KEY_MAGIC);
crypt->key.key = key;
SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
@@ -657,20 +735,20 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
mutex_lock(&c->sb_lock);
/* Do we already have an encryption key? */
- if (bch2_sb_get_crypt(c->disk_sb.sb))
+ if (bch2_sb_field_get(c->disk_sb.sb, crypt))
goto err;
ret = bch2_alloc_ciphers(c);
if (ret)
goto err;
- key.magic = BCH_KEY_MAGIC;
+ key.magic = cpu_to_le64(BCH_KEY_MAGIC);
get_random_bytes(&key.key, sizeof(key.key));
if (keyed) {
ret = bch2_request_key(c->disk_sb.sb, &user_key);
if (ret) {
- bch_err(c, "error requesting encryption key: %i", ret);
+ bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
goto err;
}
@@ -680,14 +758,15 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
goto err;
}
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto err;
- crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64));
+ crypt = bch2_sb_field_resize(&c->disk_sb, crypt,
+ sizeof(*crypt) / sizeof(u64));
if (!crypt) {
- ret = -ENOMEM; /* XXX this technically could be -ENOSPC */
+ ret = -BCH_ERR_ENOSPC_sb_crypt;
goto err;
}
@@ -705,11 +784,11 @@ err:
void bch2_fs_encryption_exit(struct bch_fs *c)
{
- if (!IS_ERR_OR_NULL(c->poly1305))
+ if (c->poly1305)
crypto_free_shash(c->poly1305);
- if (!IS_ERR_OR_NULL(c->chacha20))
- crypto_free_skcipher(c->chacha20);
- if (!IS_ERR_OR_NULL(c->sha256))
+ if (c->chacha20)
+ crypto_free_sync_skcipher(c->chacha20);
+ if (c->sha256)
crypto_free_shash(c->sha256);
}
@@ -719,16 +798,15 @@ int bch2_fs_encryption_init(struct bch_fs *c)
struct bch_key key;
int ret = 0;
- pr_verbose_init(c->opts, "");
-
c->sha256 = crypto_alloc_shash("sha256", 0, 0);
- if (IS_ERR(c->sha256)) {
- bch_err(c, "error requesting sha256 module");
- ret = PTR_ERR(c->sha256);
+ ret = PTR_ERR_OR_ZERO(c->sha256);
+ if (ret) {
+ c->sha256 = NULL;
+ bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret));
goto out;
}
- crypt = bch2_sb_get_crypt(c->disk_sb.sb);
+ crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
if (!crypt)
goto out;
@@ -740,12 +818,11 @@ int bch2_fs_encryption_init(struct bch_fs *c)
if (ret)
goto out;
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto out;
out:
memzero_explicit(&key, sizeof(key));
- pr_verbose_init(c->opts, "ret %i", ret);
return ret;
}
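The userspace __bch2_request_key() above now tries the session, user, and user-session keyrings in turn before giving up. A minimal standalone sketch of that fallback pattern with keyutils follows; the "user" key type, keyring search order, and keyctl_read() payload copy match the patch, while the key description and main() harness are illustrative only.

/* Sketch: look up a "user"-type key across several keyrings, as __bch2_request_key() does. */
#include <keyutils.h>
#include <stdio.h>

static key_serial_t lookup_key(const char *desc)
{
	/* same keyrings, same order, as in the patch */
	key_serial_t id = request_key("user", desc, NULL, KEY_SPEC_SESSION_KEYRING);
	if (id >= 0)
		return id;
	id = request_key("user", desc, NULL, KEY_SPEC_USER_KEYRING);
	if (id >= 0)
		return id;
	return request_key("user", desc, NULL, KEY_SPEC_USER_SESSION_KEYRING);
}

int main(int argc, char **argv)
{
	char buf[64];
	key_serial_t id = lookup_key(argc > 1 ? argv[1] : "bcachefs:example");

	if (id < 0) {
		perror("request_key");
		return 1;
	}
	/* keyctl_read() copies the payload out, like the patch does into struct bch_key */
	long n = keyctl_read(id, buf, sizeof(buf));
	printf("found key %d, payload %ld bytes\n", id, n);
	return 0;
}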
diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h
index 031b36f3..43b9d71f 100644
--- a/libbcachefs/checksum.h
+++ b/libbcachefs/checksum.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_CHECKSUM_H
#define _BCACHEFS_CHECKSUM_H
@@ -5,9 +6,24 @@
#include "extents_types.h"
#include "super-io.h"
-#include <crypto/chacha20.h>
+#include <linux/crc64.h>
+#include <crypto/chacha.h>
-u64 bch2_crc64_update(u64, const void *, size_t);
+static inline bool bch2_checksum_mergeable(unsigned type)
+{
+
+ switch (type) {
+ case BCH_CSUM_none:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_crc64:
+ return true;
+ default:
+ return false;
+ }
+}
+
+struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum,
+ struct bch_csum, size_t);
#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28)
#define BCH_NONCE_BTREE cpu_to_le32(2 << 28)
@@ -24,16 +40,42 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
*/
#define csum_vstruct(_c, _type, _nonce, _i) \
({ \
- const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \
- const void *end = vstruct_end(_i); \
+ const void *_start = ((const void *) (_i)) + sizeof((_i)->csum);\
\
- bch2_checksum(_c, _type, _nonce, start, end - start); \
+ bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\
})
+static inline void bch2_csum_to_text(struct printbuf *out,
+ enum bch_csum_type type,
+ struct bch_csum csum)
+{
+ const u8 *p = (u8 *) &csum;
+ unsigned bytes = type < BCH_CSUM_NR ? bch_crc_bytes[type] : 16;
+
+ for (unsigned i = 0; i < bytes; i++)
+ prt_hex_byte(out, p[i]);
+}
+
+static inline void bch2_csum_err_msg(struct printbuf *out,
+ enum bch_csum_type type,
+ struct bch_csum expected,
+ struct bch_csum got)
+{
+ prt_str(out, "checksum error, type ");
+ bch2_prt_csum_type(out, type);
+ prt_str(out, ": got ");
+ bch2_csum_to_text(out, type, got);
+ prt_str(out, " should be ");
+ bch2_csum_to_text(out, type, expected);
+}
+
int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
int bch2_request_key(struct bch_sb *, struct bch_key *);
+#ifndef __KERNEL__
+int bch2_revoke_key(struct bch_sb *);
+#endif
-void bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
+int bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
void *data, size_t);
struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
@@ -45,8 +87,18 @@ int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
struct bch_extent_crc_unpacked *,
unsigned, unsigned, unsigned);
-void bch2_encrypt_bio(struct bch_fs *, unsigned,
- struct nonce, struct bio *);
+int __bch2_encrypt_bio(struct bch_fs *, unsigned,
+ struct nonce, struct bio *);
+
+static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type,
+ struct nonce nonce, struct bio *bio)
+{
+ return bch2_csum_type_is_encryption(type)
+ ? __bch2_encrypt_bio(c, type, nonce, bio)
+ : 0;
+}
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
struct bch_key *);
@@ -57,46 +109,45 @@ int bch2_enable_encryption(struct bch_fs *, bool);
void bch2_fs_encryption_exit(struct bch_fs *);
int bch2_fs_encryption_init(struct bch_fs *);
-static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
+static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opt type,
bool data)
{
switch (type) {
- case BCH_CSUM_OPT_NONE:
- return BCH_CSUM_NONE;
- case BCH_CSUM_OPT_CRC32C:
- return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO;
- case BCH_CSUM_OPT_CRC64:
- return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO;
+ case BCH_CSUM_OPT_none:
+ return BCH_CSUM_none;
+ case BCH_CSUM_OPT_crc32c:
+ return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero;
+ case BCH_CSUM_OPT_crc64:
+ return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero;
+ case BCH_CSUM_OPT_xxhash:
+ return BCH_CSUM_xxhash;
default:
- BUG();
+ BUG();
}
}
static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
- unsigned opt)
+ struct bch_io_opts opts)
{
+ if (opts.nocow)
+ return 0;
+
if (c->sb.encryption_type)
return c->opts.wide_macs
- ? BCH_CSUM_CHACHA20_POLY1305_128
- : BCH_CSUM_CHACHA20_POLY1305_80;
+ ? BCH_CSUM_chacha20_poly1305_128
+ : BCH_CSUM_chacha20_poly1305_80;
- return bch2_csum_opt_to_type(opt, true);
+ return bch2_csum_opt_to_type(opts.data_checksum, true);
}
static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
{
if (c->sb.encryption_type)
- return BCH_CSUM_CHACHA20_POLY1305_128;
+ return BCH_CSUM_chacha20_poly1305_128;
return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
}
-static const unsigned bch2_compression_opt_to_type[] = {
-#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_##t,
- BCH_COMPRESSION_TYPES()
-#undef x
-};
-
static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
unsigned type)
{
@@ -122,9 +173,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
/* for skipping ahead and encrypting/decrypting at an offset: */
static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
{
- EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1));
+ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
- le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE);
+ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
return nonce;
}
@@ -139,13 +190,16 @@ static inline struct nonce null_nonce(void)
static inline struct nonce extent_nonce(struct bversion version,
struct bch_extent_crc_unpacked crc)
{
- unsigned size = crc.compression_type ? crc.uncompressed_size : 0;
+ unsigned compression_type = crc_is_compressed(crc)
+ ? crc.compression_type
+ : 0;
+ unsigned size = compression_type ? crc.uncompressed_size : 0;
struct nonce nonce = (struct nonce) {{
[0] = cpu_to_le32(size << 22),
[1] = cpu_to_le32(version.lo),
[2] = cpu_to_le32(version.lo >> 32),
[3] = cpu_to_le32(version.hi|
- (crc.compression_type << 24))^BCH_NONCE_EXTENT,
+ (compression_type << 24))^BCH_NONCE_EXTENT,
}};
return nonce_add(nonce, crc.nonce << 9);
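nonce_add() advances the first 32-bit word of the ChaCha nonce by whole cipher blocks, which is how extent_nonce() skips ahead by crc.nonce sectors (crc.nonce << 9 bytes). The worked sketch below shows just that arithmetic with the 64-byte CHACHA_BLOCK_SIZE; the struct mirrors the patch, but the plain (host-endian) increment stands in for le32_add_cpu() and the main() harness is illustrative.

/* Sketch: advancing a ChaCha nonce by a byte offset, one block per counter step. */
#include <stdint.h>
#include <stdio.h>
#include <assert.h>

#define CHACHA_BLOCK_SIZE 64

struct nonce { uint32_t d[4]; };	/* little-endian words on disk */

static struct nonce nonce_add(struct nonce nonce, unsigned offset)
{
	assert(!(offset & (CHACHA_BLOCK_SIZE - 1)));	/* must be block aligned */
	nonce.d[0] += offset / CHACHA_BLOCK_SIZE;	/* le32_add_cpu() in the patch */
	return nonce;
}

int main(void)
{
	struct nonce n = {{ 0, 1, 2, 3 }};
	unsigned crc_nonce = 5;		/* extent nonce field, counted in 512-byte sectors */

	/* extent_nonce() does nonce_add(nonce, crc.nonce << 9): 5 sectors = 2560 bytes = 40 blocks */
	n = nonce_add(n, crc_nonce << 9);
	printf("d[0] advanced by %u blocks -> %u\n",
	       (crc_nonce << 9) / CHACHA_BLOCK_SIZE, n.d[0]);
	return 0;
}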
diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c
index c67376f9..1d6b691e 100644
--- a/libbcachefs/clock.c
+++ b/libbcachefs/clock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "clock.h"
@@ -5,36 +6,58 @@
#include <linux/kthread.h>
#include <linux/preempt.h>
-static inline long io_timer_cmp(io_timer_heap *h,
- struct io_timer *l,
- struct io_timer *r)
+static inline bool io_timer_cmp(const void *l, const void *r, void __always_unused *args)
{
- return l->expire - r->expire;
+ struct io_timer **_l = (struct io_timer **)l;
+ struct io_timer **_r = (struct io_timer **)r;
+
+ return (*_l)->expire < (*_r)->expire;
+}
+
+static inline void io_timer_swp(void *l, void *r, void __always_unused *args)
+{
+ struct io_timer **_l = (struct io_timer **)l;
+ struct io_timer **_r = (struct io_timer **)r;
+
+ swap(*_l, *_r);
}
void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
{
- size_t i;
+ const struct min_heap_callbacks callbacks = {
+ .less = io_timer_cmp,
+ .swp = io_timer_swp,
+ };
spin_lock(&clock->timer_lock);
- for (i = 0; i < clock->timers.used; i++)
+
+ if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) {
+ spin_unlock(&clock->timer_lock);
+ timer->fn(timer);
+ return;
+ }
+
+ for (size_t i = 0; i < clock->timers.nr; i++)
if (clock->timers.data[i] == timer)
goto out;
- BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp));
+ BUG_ON(!min_heap_push(&clock->timers, &timer, &callbacks, NULL));
out:
spin_unlock(&clock->timer_lock);
}
void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
{
- size_t i;
+ const struct min_heap_callbacks callbacks = {
+ .less = io_timer_cmp,
+ .swp = io_timer_swp,
+ };
spin_lock(&clock->timer_lock);
- for (i = 0; i < clock->timers.used; i++)
+ for (size_t i = 0; i < clock->timers.nr; i++)
if (clock->timers.data[i] == timer) {
- heap_del(&clock->timers, i, io_timer_cmp);
+ min_heap_del(&clock->timers, i, &callbacks, NULL);
break;
}
@@ -66,33 +89,31 @@ static void io_clock_cpu_timeout(struct timer_list *timer)
wake_up_process(wait->task);
}
-void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until)
+void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until)
{
- struct io_clock_wait wait;
+ struct io_clock_wait wait = {
+ .io_timer.expire = until,
+ .io_timer.fn = io_clock_wait_fn,
+ .io_timer.fn2 = (void *) _RET_IP_,
+ .task = current,
+ };
- /* XXX: calculate sleep time rigorously */
- wait.io_timer.expire = until;
- wait.io_timer.fn = io_clock_wait_fn;
- wait.task = current;
- wait.expired = 0;
bch2_io_timer_add(clock, &wait.io_timer);
-
schedule();
-
bch2_io_timer_del(clock, &wait.io_timer);
}
void bch2_kthread_io_clock_wait(struct io_clock *clock,
- unsigned long io_until,
- unsigned long cpu_timeout)
+ u64 io_until, unsigned long cpu_timeout)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
- struct io_clock_wait wait;
+ struct io_clock_wait wait = {
+ .io_timer.expire = io_until,
+ .io_timer.fn = io_clock_wait_fn,
+ .io_timer.fn2 = (void *) _RET_IP_,
+ .task = current,
+ };
- wait.io_timer.expire = io_until;
- wait.io_timer.fn = io_clock_wait_fn;
- wait.task = current;
- wait.expired = 0;
bch2_io_timer_add(clock, &wait.io_timer);
timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
@@ -100,7 +121,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
- while (1) {
+ do {
set_current_state(TASK_INTERRUPTIBLE);
if (kthread && kthread_should_stop())
break;
@@ -110,51 +131,58 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
schedule();
try_to_freeze();
- }
+ } while (0);
__set_current_state(TASK_RUNNING);
- del_singleshot_timer_sync(&wait.cpu_timer);
+ del_timer_sync(&wait.cpu_timer);
destroy_timer_on_stack(&wait.cpu_timer);
bch2_io_timer_del(clock, &wait.io_timer);
}
-static struct io_timer *get_expired_timer(struct io_clock *clock,
- unsigned long now)
+static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now)
{
struct io_timer *ret = NULL;
-
- spin_lock(&clock->timer_lock);
-
- if (clock->timers.used &&
- time_after_eq(now, clock->timers.data[0]->expire))
- heap_pop(&clock->timers, ret, io_timer_cmp);
-
- spin_unlock(&clock->timer_lock);
+ const struct min_heap_callbacks callbacks = {
+ .less = io_timer_cmp,
+ .swp = io_timer_swp,
+ };
+
+ if (clock->timers.nr &&
+ time_after_eq64(now, clock->timers.data[0]->expire)) {
+ ret = *min_heap_peek(&clock->timers);
+ min_heap_pop(&clock->timers, &callbacks, NULL);
+ }
return ret;
}
-void bch2_increment_clock(struct bch_fs *c, unsigned sectors, int rw)
+void __bch2_increment_clock(struct io_clock *clock, u64 sectors)
{
- struct io_clock *clock = &c->io_clock[rw];
struct io_timer *timer;
- unsigned long now;
+ u64 now = atomic64_add_return(sectors, &clock->now);
- /* Buffer up one megabyte worth of IO in the percpu counter */
- preempt_disable();
+ spin_lock(&clock->timer_lock);
+ while ((timer = get_expired_timer(clock, now)))
+ timer->fn(timer);
+ spin_unlock(&clock->timer_lock);
+}
- if (likely(this_cpu_add_return(*clock->pcpu_buf, sectors) <
- IO_CLOCK_PCPU_SECTORS)) {
- preempt_enable();
- return;
- }
+void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
+{
+ out->atomic++;
+ spin_lock(&clock->timer_lock);
+ u64 now = atomic64_read(&clock->now);
- sectors = this_cpu_xchg(*clock->pcpu_buf, 0);
- preempt_enable();
- now = atomic_long_add_return(sectors, &clock->now);
+ printbuf_tabstop_push(out, 40);
+ prt_printf(out, "current time:\t%llu\n", now);
- while ((timer = get_expired_timer(clock, now)))
- timer->fn(timer);
+ for (unsigned i = 0; i < clock->timers.nr; i++)
+ prt_printf(out, "%ps %ps:\t%llu\n",
+ clock->timers.data[i]->fn,
+ clock->timers.data[i]->fn2,
+ clock->timers.data[i]->expire);
+ spin_unlock(&clock->timer_lock);
+ --out->atomic;
}
void bch2_io_clock_exit(struct io_clock *clock)
@@ -165,15 +193,17 @@ void bch2_io_clock_exit(struct io_clock *clock)
int bch2_io_clock_init(struct io_clock *clock)
{
- atomic_long_set(&clock->now, 0);
+ atomic64_set(&clock->now, 0);
spin_lock_init(&clock->timer_lock);
+ clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
+
clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf);
if (!clock->pcpu_buf)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_io_clock_init;
if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL))
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_io_clock_init;
return 0;
}
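__bch2_increment_clock() advances the atomic "now" and then drains every timer whose expiry is at or before it; get_expired_timer() only pops the heap root while it has actually expired. The userspace analogue below keeps only that drain loop, replacing the kernel min_heap with a small sorted array, so the data structure is simplified and only the control flow mirrors the patch.

/* Sketch: fire all timers whose expiry <= now, earliest first (analogue of get_expired_timer()). */
#include <stdint.h>
#include <stdio.h>

struct io_timer {
	void (*fn)(struct io_timer *);
	uint64_t expire;
};

static void hello(struct io_timer *t)
{
	printf("timer at %llu fired\n", (unsigned long long) t->expire);
}

int main(void)
{
	/* stand-in for the min-heap: kept sorted by expire, smallest first */
	struct io_timer timers[] = { { hello, 10 }, { hello, 20 }, { hello, 35 } };
	unsigned nr = 3, i = 0;
	uint64_t now = 0;

	now += 25;	/* analogue of atomic64_add_return(sectors, &clock->now) */
	while (i < nr && timers[i].expire <= now) {	/* time_after_eq64(now, expire) in the patch */
		struct io_timer *t = &timers[i++];
		t->fn(t);
	}
	printf("%u timers still pending\n", nr - i);
	return 0;
}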
diff --git a/libbcachefs/clock.h b/libbcachefs/clock.h
index 1e2a7dea..82c79c8b 100644
--- a/libbcachefs/clock.h
+++ b/libbcachefs/clock.h
@@ -1,22 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_CLOCK_H
#define _BCACHEFS_CLOCK_H
void bch2_io_timer_add(struct io_clock *, struct io_timer *);
void bch2_io_timer_del(struct io_clock *, struct io_timer *);
-void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
- unsigned long);
-void bch2_increment_clock(struct bch_fs *, unsigned, int);
-
-void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
-
-#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\
-({ \
- long __ret = timeout; \
- might_sleep(); \
- if (!___wait_cond_timeout(condition)) \
- __ret = __wait_event_timeout(wq, condition, timeout); \
- __ret; \
-})
+void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long);
+
+void __bch2_increment_clock(struct io_clock *, u64);
+
+static inline void bch2_increment_clock(struct bch_fs *c, u64 sectors,
+ int rw)
+{
+ struct io_clock *clock = &c->io_clock[rw];
+
+ if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >=
+ IO_CLOCK_PCPU_SECTORS))
+ __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0));
+}
+
+void bch2_io_clock_schedule_timeout(struct io_clock *, u64);
+
+void bch2_io_timers_to_text(struct printbuf *, struct io_clock *);
void bch2_io_clock_exit(struct io_clock *);
int bch2_io_clock_init(struct io_clock *);
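The new inline bch2_increment_clock() deliberately avoids touching the shared atomic and the timer lock on every IO: sectors accumulate in a per-CPU counter and are only flushed once IO_CLOCK_PCPU_SECTORS (128) have built up. The single-threaded sketch below illustrates just that flush condition; real per-CPU counters and atomics are replaced with plain variables, so it is an analogue rather than the kernel mechanism.

/* Sketch: batch small increments locally, flush to the shared clock every 128 sectors. */
#include <stdint.h>
#include <stdio.h>

#define IO_CLOCK_PCPU_SECTORS 128

static uint64_t clock_now;	/* stand-in for atomic64_t clock->now */
static unsigned pcpu_buf;	/* stand-in for this CPU's clock->pcpu_buf slot */

static void increment_clock(unsigned sectors)
{
	pcpu_buf += sectors;
	if (pcpu_buf < IO_CLOCK_PCPU_SECTORS)
		return;			/* cheap path taken most of the time */
	clock_now += pcpu_buf;		/* __bch2_increment_clock(): flush and run expired timers */
	pcpu_buf = 0;			/* this_cpu_xchg(*clock->pcpu_buf, 0) in the patch */
}

int main(void)
{
	for (unsigned i = 0; i < 1000; i++)
		increment_clock(8);	/* e.g. 4 KiB writes, 8 sectors each */
	printf("now=%llu, still buffered=%u\n",
	       (unsigned long long) clock_now, pcpu_buf);
	return 0;
}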
diff --git a/libbcachefs/clock_types.h b/libbcachefs/clock_types.h
index df404b6d..37554e45 100644
--- a/libbcachefs/clock_types.h
+++ b/libbcachefs/clock_types.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_CLOCK_TYPES_H
#define _BCACHEFS_CLOCK_TYPES_H
@@ -16,17 +17,19 @@ typedef void (*io_timer_fn)(struct io_timer *);
struct io_timer {
io_timer_fn fn;
- unsigned long expire;
+ void *fn2;
+ u64 expire;
};
/* Amount to buffer up on a percpu counter */
#define IO_CLOCK_PCPU_SECTORS 128
-typedef HEAP(struct io_timer *) io_timer_heap;
+typedef DEFINE_MIN_HEAP(struct io_timer *, io_timer_heap) io_timer_heap;
struct io_clock {
- atomic_long_t now;
+ atomic64_t now;
u16 __percpu *pcpu_buf;
+ unsigned max_slop;
spinlock_t timer_lock;
io_timer_heap timers;
diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c
index 6379905b..f99ff181 100644
--- a/libbcachefs/compress.c
+++ b/libbcachefs/compress.c
@@ -1,15 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "checksum.h"
#include "compress.h"
+#include "error.h"
#include "extents.h"
-#include "io.h"
+#include "opts.h"
#include "super-io.h"
-#include "lz4.h"
#include <linux/lz4.h>
#include <linux/zlib.h>
#include <linux/zstd.h>
+static inline enum bch_compression_opts bch2_compression_type_to_opt(enum bch_compression_type type)
+{
+ switch (type) {
+ case BCH_COMPRESSION_TYPE_none:
+ case BCH_COMPRESSION_TYPE_incompressible:
+ return BCH_COMPRESSION_OPT_none;
+ case BCH_COMPRESSION_TYPE_lz4_old:
+ case BCH_COMPRESSION_TYPE_lz4:
+ return BCH_COMPRESSION_OPT_lz4;
+ case BCH_COMPRESSION_TYPE_gzip:
+ return BCH_COMPRESSION_OPT_gzip;
+ case BCH_COMPRESSION_TYPE_zstd:
+ return BCH_COMPRESSION_OPT_zstd;
+ default:
+ BUG();
+ }
+}
+
/* Bounce buffer: */
struct bbuf {
void *b;
@@ -17,7 +36,6 @@ struct bbuf {
BB_NONE,
BB_VMAP,
BB_KMALLOC,
- BB_VMALLOC,
BB_MEMPOOL,
} type;
int rw;
@@ -27,27 +45,35 @@ static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw)
{
void *b;
- BUG_ON(size > c->sb.encoded_extent_max << 9);
+ BUG_ON(size > c->opts.encoded_extent_max);
- b = kmalloc(size, GFP_NOIO|__GFP_NOWARN);
+ b = kmalloc(size, GFP_NOFS|__GFP_NOWARN);
if (b)
return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw };
- b = mempool_alloc(&c->compression_bounce[rw], GFP_NOWAIT);
- b = b ? page_address(b) : NULL;
+ b = mempool_alloc(&c->compression_bounce[rw], GFP_NOFS);
if (b)
return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
- b = vmalloc(size);
- if (b)
- return (struct bbuf) { .b = b, .type = BB_VMALLOC, .rw = rw };
+ BUG();
+}
- b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO);
- b = b ? page_address(b) : NULL;
- if (b)
- return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
+static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ void *expected_start = NULL;
- BUG();
+ __bio_for_each_bvec(bv, bio, iter, start) {
+ if (expected_start &&
+ expected_start != page_address(bv.bv_page) + bv.bv_offset)
+ return false;
+
+ expected_start = page_address(bv.bv_page) +
+ bv.bv_offset + bv.bv_len;
+ }
+
+ return true;
}
static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
@@ -59,34 +85,35 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
unsigned nr_pages = 0;
struct page *stack_pages[16];
struct page **pages = NULL;
- bool first = true;
- unsigned prev_end = PAGE_SIZE;
void *data;
- BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max);
+ BUG_ON(start.bi_size > c->opts.encoded_extent_max);
-#ifndef CONFIG_HIGHMEM
- __bio_for_each_contig_segment(bv, bio, iter, start) {
- if (bv.bv_len == start.bi_size)
- return (struct bbuf) {
- .b = page_address(bv.bv_page) + bv.bv_offset,
- .type = BB_NONE, .rw = rw
- };
- }
-#endif
+ if (!PageHighMem(bio_iter_page(bio, start)) &&
+ bio_phys_contig(bio, start))
+ return (struct bbuf) {
+ .b = page_address(bio_iter_page(bio, start)) +
+ bio_iter_offset(bio, start),
+ .type = BB_NONE, .rw = rw
+ };
+
+ /* check if we can map the pages contiguously: */
__bio_for_each_segment(bv, bio, iter, start) {
- if ((!first && bv.bv_offset) ||
- prev_end != PAGE_SIZE)
+ if (iter.bi_size != start.bi_size &&
+ bv.bv_offset)
+ goto bounce;
+
+ if (bv.bv_len < iter.bi_size &&
+ bv.bv_offset + bv.bv_len < PAGE_SIZE)
goto bounce;
- prev_end = bv.bv_offset + bv.bv_len;
nr_pages++;
}
BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages);
pages = nr_pages > ARRAY_SIZE(stack_pages)
- ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO)
+ ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS)
: stack_pages;
if (!pages)
goto bounce;
@@ -129,12 +156,8 @@ static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf)
case BB_KMALLOC:
kfree(buf.b);
break;
- case BB_VMALLOC:
- vfree(buf.b);
- break;
case BB_MEMPOOL:
- mempool_free(virt_to_page(buf.b),
- &c->compression_bounce[buf.rw]);
+ mempool_free(buf.b, &c->compression_bounce[buf.rw]);
break;
}
}
@@ -155,22 +178,30 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
void *workspace;
int ret;
+ enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type);
+ mempool_t *workspace_pool = &c->compress_workspace[opt];
+ if (unlikely(!mempool_initialized(workspace_pool))) {
+ if (fsck_err(c, compression_type_not_marked_in_sb,
+ "compression type %s set but not marked in superblock",
+ __bch2_compression_types[crc.compression_type]))
+ ret = bch2_check_set_has_compressed_data(c, opt);
+ else
+ ret = -BCH_ERR_compression_workspace_not_initialized;
+ if (ret)
+ goto out;
+ }
+
src_data = bio_map_or_bounce(c, src, READ);
switch (crc.compression_type) {
- case BCH_COMPRESSION_LZ4_OLD:
- ret = bch2_lz4_decompress(src_data.b, &src_len,
- dst_data, dst_len);
- if (ret)
- goto err;
- break;
- case BCH_COMPRESSION_LZ4:
+ case BCH_COMPRESSION_TYPE_lz4_old:
+ case BCH_COMPRESSION_TYPE_lz4:
ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
src_len, dst_len, dst_len);
if (ret != dst_len)
goto err;
break;
- case BCH_COMPRESSION_GZIP: {
+ case BCH_COMPRESSION_TYPE_gzip: {
z_stream strm = {
.next_in = src_data.b,
.avail_in = src_len,
@@ -178,34 +209,35 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
.avail_out = dst_len,
};
- workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
+ workspace = mempool_alloc(workspace_pool, GFP_NOFS);
zlib_set_workspace(&strm, workspace);
zlib_inflateInit2(&strm, -MAX_WBITS);
ret = zlib_inflate(&strm, Z_FINISH);
- mempool_free(workspace, &c->decompress_workspace);
+ mempool_free(workspace, workspace_pool);
if (ret != Z_STREAM_END)
goto err;
break;
}
- case BCH_COMPRESSION_ZSTD: {
+ case BCH_COMPRESSION_TYPE_zstd: {
ZSTD_DCtx *ctx;
- size_t len;
+ size_t real_src_len = le32_to_cpup(src_data.b);
- workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
- ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound());
+ if (real_src_len > src_len - 4)
+ goto err;
- src_len = le32_to_cpup(src_data.b);
+ workspace = mempool_alloc(workspace_pool, GFP_NOFS);
+ ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
- len = ZSTD_decompressDCtx(ctx,
+ ret = zstd_decompress_dctx(ctx,
dst_data, dst_len,
- src_data.b + 4, src_len);
+ src_data.b + 4, real_src_len);
- mempool_free(workspace, &c->decompress_workspace);
+ mempool_free(workspace, workspace_pool);
- if (len != dst_len)
+ if (ret != dst_len)
goto err;
break;
}
@@ -213,6 +245,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
BUG();
}
ret = 0;
+fsck_err:
out:
bio_unmap_or_unbounce(c, src_data);
return ret;
@@ -231,8 +264,8 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
BUG_ON(!bio->bi_vcnt);
BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
- if (crc->uncompressed_size > c->sb.encoded_extent_max ||
- crc->compressed_size > c->sb.encoded_extent_max) {
+ if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max ||
+ crc->compressed_size << 9 > c->opts.encoded_extent_max) {
bch_err(c, "error rewriting existing data: extent too big");
return -EIO;
}
@@ -240,16 +273,17 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
data = __bounce_alloc(c, dst_len, WRITE);
if (__bio_uncompress(c, bio, data.b, *crc)) {
- bch_err(c, "error rewriting existing data: decompression error");
+ if (!c->opts.no_data_io)
+ bch_err(c, "error rewriting existing data: decompression error");
bio_unmap_or_unbounce(c, data);
return -EIO;
}
/*
- * might have to free existing pages and retry allocation from mempool -
- * do this _after_ decompressing:
+ * XXX: don't have a good way to assert that the bio was allocated with
+ * enough space, we depend on bch2_move_extent doing the right thing
*/
- bch2_bio_alloc_more_pages_pool(c, bio, crc->live_size << 9);
+ bio->bi_iter.bi_size = crc->live_size << 9;
memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9));
@@ -270,10 +304,10 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
{
struct bbuf dst_data = { NULL };
size_t dst_len = crc.uncompressed_size << 9;
- int ret = -ENOMEM;
+ int ret;
- if (crc.uncompressed_size > c->sb.encoded_extent_max ||
- crc.compressed_size > c->sb.encoded_extent_max)
+ if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max ||
+ crc.compressed_size << 9 > c->opts.encoded_extent_max)
return -EIO;
dst_data = dst_len == dst_iter.bi_size
@@ -284,7 +318,8 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
if (ret)
goto err;
- if (dst_data.type != BB_NONE)
+ if (dst_data.type != BB_NONE &&
+ dst_data.type != BB_VMAP)
memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9));
err:
bio_unmap_or_unbounce(c, dst_data);
@@ -295,22 +330,33 @@ static int attempt_compress(struct bch_fs *c,
void *workspace,
void *dst, size_t dst_len,
void *src, size_t src_len,
- unsigned compression_type)
+ struct bch_compression_opt compression)
{
- switch (compression_type) {
- case BCH_COMPRESSION_LZ4: {
- int len = src_len;
- int ret = LZ4_compress_destSize(
- src, dst,
- &len, dst_len,
- workspace);
-
- if (len < src_len)
- return -len;
+ enum bch_compression_type compression_type =
+ __bch2_compression_opt_to_type[compression.type];
- return ret;
- }
- case BCH_COMPRESSION_GZIP: {
+ switch (compression_type) {
+ case BCH_COMPRESSION_TYPE_lz4:
+ if (compression.level < LZ4HC_MIN_CLEVEL) {
+ int len = src_len;
+ int ret = LZ4_compress_destSize(
+ src, dst,
+ &len, dst_len,
+ workspace);
+ if (len < src_len)
+ return -len;
+
+ return ret;
+ } else {
+ int ret = LZ4_compress_HC(
+ src, dst,
+ src_len, dst_len,
+ compression.level,
+ workspace);
+
+ return ret ?: -1;
+ }
+ case BCH_COMPRESSION_TYPE_gzip: {
z_stream strm = {
.next_in = src,
.avail_in = src_len,
@@ -319,7 +365,11 @@ static int attempt_compress(struct bch_fs *c,
};
zlib_set_workspace(&strm, workspace);
- zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+ zlib_deflateInit2(&strm,
+ compression.level
+ ? clamp_t(unsigned, compression.level,
+ Z_BEST_SPEED, Z_BEST_COMPRESSION)
+ : Z_DEFAULT_COMPRESSION,
Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
Z_DEFAULT_STRATEGY);
@@ -331,15 +381,31 @@ static int attempt_compress(struct bch_fs *c,
return strm.total_out;
}
- case BCH_COMPRESSION_ZSTD: {
- ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace,
- ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams));
+ case BCH_COMPRESSION_TYPE_zstd: {
+ /*
+ * rescale:
+ * zstd max compression level is 22, our max level is 15
+ */
+ unsigned level = min((compression.level * 3) / 2, zstd_max_clevel());
+ ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max);
+ ZSTD_CCtx *ctx = zstd_init_cctx(workspace, c->zstd_workspace_size);
- size_t len = ZSTD_compressCCtx(ctx,
- dst + 4, dst_len - 4,
+ /*
+ * ZSTD requires that when we decompress we pass in the exact
+ * compressed size - rounding it up to the nearest sector
+ * doesn't work, so we use the first 4 bytes of the buffer for
+ * that.
+ *
+ * Additionally, the ZSTD code seems to have a bug where it will
+ * write just past the end of the buffer - so subtract a fudge
+ * factor (7 bytes) from the dst buffer size to account for
+ * that.
+ */
+ size_t len = zstd_compress_cctx(ctx,
+ dst + 4, dst_len - 4 - 7,
src, src_len,
- c->zstd_params);
- if (ZSTD_isError(len))
+ &params);
+ if (zstd_is_error(len))
return 0;
*((__le32 *) dst) = cpu_to_le32(len);
@@ -353,24 +419,39 @@ static int attempt_compress(struct bch_fs *c,
static unsigned __bio_compress(struct bch_fs *c,
struct bio *dst, size_t *dst_len,
struct bio *src, size_t *src_len,
- unsigned compression_type)
+ struct bch_compression_opt compression)
{
struct bbuf src_data = { NULL }, dst_data = { NULL };
void *workspace;
+ enum bch_compression_type compression_type =
+ __bch2_compression_opt_to_type[compression.type];
unsigned pad;
int ret = 0;
- BUG_ON(compression_type >= BCH_COMPRESSION_NR);
- BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type]));
+ /* bch2_compression_decode catches unknown compression types: */
+ BUG_ON(compression.type >= BCH_COMPRESSION_OPT_NR);
+
+ mempool_t *workspace_pool = &c->compress_workspace[compression.type];
+ if (unlikely(!mempool_initialized(workspace_pool))) {
+ if (fsck_err(c, compression_opt_not_marked_in_sb,
+ "compression opt %s set but not marked in superblock",
+ bch2_compression_opts[compression.type])) {
+ ret = bch2_check_set_has_compressed_data(c, compression.type);
+ if (ret) /* memory allocation failure, don't compress */
+ return 0;
+ } else {
+ return 0;
+ }
+ }
/* If it's only one block, don't bother trying to compress: */
- if (bio_sectors(src) <= c->opts.block_size)
- return 0;
+ if (src->bi_iter.bi_size <= c->opts.block_size)
+ return BCH_COMPRESSION_TYPE_incompressible;
dst_data = bio_map_or_bounce(c, dst, WRITE);
src_data = bio_map_or_bounce(c, src, READ);
- workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO);
+ workspace = mempool_alloc(workspace_pool, GFP_NOFS);
*src_len = src->bi_iter.bi_size;
*dst_len = dst->bi_iter.bi_size;
@@ -388,7 +469,7 @@ static unsigned __bio_compress(struct bch_fs *c,
ret = attempt_compress(c, workspace,
dst_data.b, *dst_len,
src_data.b, *src_len,
- compression_type);
+ compression);
if (ret > 0) {
*dst_len = ret;
ret = 0;
@@ -413,7 +494,7 @@ static unsigned __bio_compress(struct bch_fs *c,
*src_len = round_down(*src_len, block_bytes(c));
}
- mempool_free(workspace, &c->compress_workspace[compression_type]);
+ mempool_free(workspace, workspace_pool);
if (ret)
goto err;
@@ -427,41 +508,45 @@ static unsigned __bio_compress(struct bch_fs *c,
memset(dst_data.b + *dst_len, 0, pad);
*dst_len += pad;
- if (dst_data.type != BB_NONE)
+ if (dst_data.type != BB_NONE &&
+ dst_data.type != BB_VMAP)
memcpy_to_bio(dst, dst->bi_iter, dst_data.b);
BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
BUG_ON(*dst_len & (block_bytes(c) - 1));
BUG_ON(*src_len & (block_bytes(c) - 1));
+ ret = compression_type;
out:
bio_unmap_or_unbounce(c, src_data);
bio_unmap_or_unbounce(c, dst_data);
- return compression_type;
+ return ret;
err:
- compression_type = 0;
+ ret = BCH_COMPRESSION_TYPE_incompressible;
+ goto out;
+fsck_err:
+ ret = 0;
goto out;
}
unsigned bch2_bio_compress(struct bch_fs *c,
struct bio *dst, size_t *dst_len,
struct bio *src, size_t *src_len,
- unsigned compression_type)
+ unsigned compression_opt)
{
unsigned orig_dst = dst->bi_iter.bi_size;
unsigned orig_src = src->bi_iter.bi_size;
+ unsigned compression_type;
/* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
- c->sb.encoded_extent_max << 9);
+ c->opts.encoded_extent_max);
/* Don't generate a bigger output than input: */
dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
- if (compression_type == BCH_COMPRESSION_LZ4_OLD)
- compression_type = BCH_COMPRESSION_LZ4;
-
compression_type =
- __bio_compress(c, dst, dst_len, src, src_len, compression_type);
+ __bio_compress(c, dst, dst_len, src, src_len,
+ bch2_compression_decode(compression_opt));
dst->bi_iter.bi_size = orig_dst;
src->bi_iter.bi_size = orig_src;
@@ -470,15 +555,15 @@ unsigned bch2_bio_compress(struct bch_fs *c,
static int __bch2_fs_compress_init(struct bch_fs *, u64);
-#define BCH_FEATURE_NONE 0
+#define BCH_FEATURE_none 0
static const unsigned bch2_compression_opt_to_feature[] = {
-#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
- BCH_COMPRESSION_TYPES()
+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
+ BCH_COMPRESSION_OPTS()
#undef x
};
-#undef BCH_FEATURE_NONE
+#undef BCH_FEATURE_none
static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
{
@@ -508,8 +593,10 @@ static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
}
int bch2_check_set_has_compressed_data(struct bch_fs *c,
- unsigned compression_type)
+ unsigned compression_opt)
{
+ unsigned compression_type = bch2_compression_decode(compression_opt).type;
+
BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature));
return compression_type
@@ -522,7 +609,6 @@ void bch2_fs_compress_exit(struct bch_fs *c)
{
unsigned i;
- mempool_exit(&c->decompress_workspace);
for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++)
mempool_exit(&c->compress_workspace[i]);
mempool_exit(&c->compression_bounce[WRITE]);
@@ -531,96 +617,150 @@ void bch2_fs_compress_exit(struct bch_fs *c)
static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
{
- size_t max_extent = c->sb.encoded_extent_max << 9;
- size_t order = get_order(max_extent);
- size_t decompress_workspace_size = 0;
- bool decompress_workspace_needed;
- ZSTD_parameters params = ZSTD_getParams(0, max_extent, 0);
+ ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
+ c->opts.encoded_extent_max);
+
+ c->zstd_workspace_size = zstd_cctx_workspace_bound(&params.cParams);
+
struct {
- unsigned feature;
- unsigned type;
- size_t compress_workspace;
- size_t decompress_workspace;
+ unsigned feature;
+ enum bch_compression_opts type;
+ size_t compress_workspace;
} compression_types[] = {
- { BCH_FEATURE_LZ4, BCH_COMPRESSION_LZ4, LZ4_MEM_COMPRESS, 0 },
- { BCH_FEATURE_GZIP, BCH_COMPRESSION_GZIP,
- zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
- zlib_inflate_workspacesize(), },
- { BCH_FEATURE_ZSTD, BCH_COMPRESSION_ZSTD,
- ZSTD_CCtxWorkspaceBound(params.cParams),
- ZSTD_DCtxWorkspaceBound() },
+ { BCH_FEATURE_lz4, BCH_COMPRESSION_OPT_lz4,
+ max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) },
+ { BCH_FEATURE_gzip, BCH_COMPRESSION_OPT_gzip,
+ max(zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
+ zlib_inflate_workspacesize()) },
+ { BCH_FEATURE_zstd, BCH_COMPRESSION_OPT_zstd,
+ max(c->zstd_workspace_size,
+ zstd_dctx_workspace_bound()) },
}, *i;
- int ret = 0;
-
- pr_verbose_init(c->opts, "");
-
- c->zstd_params = params;
+ bool have_compressed = false;
for (i = compression_types;
i < compression_types + ARRAY_SIZE(compression_types);
i++)
- if (features & (1 << i->feature))
- goto have_compressed;
+ have_compressed |= (features & (1 << i->feature)) != 0;
- goto out;
-have_compressed:
+ if (!have_compressed)
+ return 0;
- if (!mempool_initialized(&c->compression_bounce[READ])) {
- ret = mempool_init_page_pool(&c->compression_bounce[READ],
- 1, order);
- if (ret)
- goto out;
- }
+ if (!mempool_initialized(&c->compression_bounce[READ]) &&
+ mempool_init_kvmalloc_pool(&c->compression_bounce[READ],
+ 1, c->opts.encoded_extent_max))
+ return -BCH_ERR_ENOMEM_compression_bounce_read_init;
- if (!mempool_initialized(&c->compression_bounce[WRITE])) {
- ret = mempool_init_page_pool(&c->compression_bounce[WRITE],
- 1, order);
- if (ret)
- goto out;
- }
+ if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
+ mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE],
+ 1, c->opts.encoded_extent_max))
+ return -BCH_ERR_ENOMEM_compression_bounce_write_init;
for (i = compression_types;
i < compression_types + ARRAY_SIZE(compression_types);
i++) {
- decompress_workspace_size =
- max(decompress_workspace_size, i->decompress_workspace);
-
if (!(features & (1 << i->feature)))
continue;
- if (i->decompress_workspace)
- decompress_workspace_needed = true;
-
if (mempool_initialized(&c->compress_workspace[i->type]))
continue;
- ret = mempool_init_kvpmalloc_pool(
+ if (mempool_init_kvmalloc_pool(
&c->compress_workspace[i->type],
- 1, i->compress_workspace);
- if (ret)
- goto out;
+ 1, i->compress_workspace))
+ return -BCH_ERR_ENOMEM_compression_workspace_init;
}
- ret = mempool_init_kmalloc_pool(
- &c->decompress_workspace,
- 1, decompress_workspace_size);
- if (ret)
- goto out;
-out:
- pr_verbose_init(c->opts, "ret %i", ret);
- return ret;
+ return 0;
+}
+
+static u64 compression_opt_to_feature(unsigned v)
+{
+ unsigned type = bch2_compression_decode(v).type;
+
+ return BIT_ULL(bch2_compression_opt_to_feature[type]);
}
int bch2_fs_compress_init(struct bch_fs *c)
{
u64 f = c->sb.features;
- if (c->opts.compression)
- f |= 1ULL << bch2_compression_opt_to_feature[c->opts.compression];
-
- if (c->opts.background_compression)
- f |= 1ULL << bch2_compression_opt_to_feature[c->opts.background_compression];
+ f |= compression_opt_to_feature(c->opts.compression);
+ f |= compression_opt_to_feature(c->opts.background_compression);
return __bch2_fs_compress_init(c, f);
+}
+int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res,
+ struct printbuf *err)
+{
+ char *val = kstrdup(_val, GFP_KERNEL);
+ char *p = val, *type_str, *level_str;
+ struct bch_compression_opt opt = { 0 };
+ int ret;
+
+ if (!val)
+ return -ENOMEM;
+
+ type_str = strsep(&p, ":");
+ level_str = p;
+
+ ret = match_string(bch2_compression_opts, -1, type_str);
+ if (ret < 0 && err)
+ prt_str(err, "invalid compression type");
+ if (ret < 0)
+ goto err;
+
+ opt.type = ret;
+
+ if (level_str) {
+ unsigned level;
+
+ ret = kstrtouint(level_str, 10, &level);
+ if (!ret && !opt.type && level)
+ ret = -EINVAL;
+ if (!ret && level > 15)
+ ret = -EINVAL;
+ if (ret < 0 && err)
+ prt_str(err, "invalid compression level");
+ if (ret < 0)
+ goto err;
+
+ opt.level = level;
+ }
+
+ *res = bch2_compression_encode(opt);
+err:
+ kfree(val);
+ return ret;
+}
+
+void bch2_compression_opt_to_text(struct printbuf *out, u64 v)
+{
+ struct bch_compression_opt opt = bch2_compression_decode(v);
+
+ if (opt.type < BCH_COMPRESSION_OPT_NR)
+ prt_str(out, bch2_compression_opts[opt.type]);
+ else
+ prt_printf(out, "(unknown compression opt %u)", opt.type);
+ if (opt.level)
+ prt_printf(out, ":%u", opt.level);
+}
+
+void bch2_opt_compression_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct bch_sb *sb,
+ u64 v)
+{
+ return bch2_compression_opt_to_text(out, v);
+}
+
+int bch2_opt_compression_validate(u64 v, struct printbuf *err)
+{
+ if (!bch2_compression_opt_valid(v)) {
+ prt_printf(err, "invalid compression opt %llu", v);
+ return -BCH_ERR_invalid_sb_opt_compression;
+ }
+
+ return 0;
}
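attempt_compress() stores the exact zstd-compressed length in the first 4 bytes of the output buffer, because the on-disk extent size is rounded up to whole sectors while zstd needs the precise length back at decompression time; __bio_uncompress() reads it back and bounds-checks it against the extent. The sketch below shows only that framing convention, with a memcpy standing in for the zstd calls; buffer names are illustrative and the uint32_t copy assumes a little-endian host where cpu_to_le32() is a no-op.

/* Sketch: the 4-byte length prefix convention used for zstd extents in the patch. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* "compress": here just a copy, standing in for zstd_compress_cctx() */
static size_t fake_compress(void *dst, size_t dst_len, const void *src, size_t src_len)
{
	if (src_len > dst_len)
		return 0;
	memcpy(dst, src, src_len);
	return src_len;
}

int main(void)
{
	char src[100] = "hello extent", dst[512], out[100];

	/* write side: payload starts at dst + 4, exact length goes in the first 4 bytes */
	size_t len = fake_compress(dst + 4, sizeof(dst) - 4, src, sizeof(src));
	uint32_t le_len = (uint32_t) len;		/* cpu_to_le32(len) in the patch */
	memcpy(dst, &le_len, 4);

	/* read side: recover the real compressed size before handing it to the decompressor */
	uint32_t real_src_len;
	memcpy(&real_src_len, dst, 4);			/* le32_to_cpup(src_data.b) */
	if (real_src_len > sizeof(dst) - 4)
		return 1;				/* same bounds check __bio_uncompress() adds */
	memcpy(out, dst + 4, real_src_len);
	printf("round-tripped %u bytes: %s\n", real_src_len, out);
	return 0;
}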
diff --git a/libbcachefs/compress.h b/libbcachefs/compress.h
index 06fff6a5..607fd5e2 100644
--- a/libbcachefs/compress.h
+++ b/libbcachefs/compress.h
@@ -1,8 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_COMPRESS_H
#define _BCACHEFS_COMPRESS_H
#include "extents_types.h"
+static const unsigned __bch2_compression_opt_to_type[] = {
+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
+ BCH_COMPRESSION_OPTS()
+#undef x
+};
+
+struct bch_compression_opt {
+ u8 type:4,
+ level:4;
+};
+
+static inline struct bch_compression_opt __bch2_compression_decode(unsigned v)
+{
+ return (struct bch_compression_opt) {
+ .type = v & 15,
+ .level = v >> 4,
+ };
+}
+
+static inline bool bch2_compression_opt_valid(unsigned v)
+{
+ struct bch_compression_opt opt = __bch2_compression_decode(v);
+
+ return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level);
+}
+
+static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
+{
+ return bch2_compression_opt_valid(v)
+ ? __bch2_compression_decode(v)
+ : (struct bch_compression_opt) { 0 };
+}
+
+static inline unsigned bch2_compression_encode(struct bch_compression_opt opt)
+{
+ return opt.type|(opt.level << 4);
+}
+
+static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
+{
+ return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
+}
+
int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
struct bch_extent_crc_unpacked *);
int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
@@ -14,4 +58,16 @@ int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
void bch2_fs_compress_exit(struct bch_fs *);
int bch2_fs_compress_init(struct bch_fs *);
+void bch2_compression_opt_to_text(struct printbuf *, u64);
+
+int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
+void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
+int bch2_opt_compression_validate(u64, struct printbuf *);
+
+#define bch2_opt_compression (struct bch_opt_fn) { \
+ .parse = bch2_opt_compression_parse, \
+ .to_text = bch2_opt_compression_to_text, \
+ .validate = bch2_opt_compression_validate, \
+}
+
#endif /* _BCACHEFS_COMPRESS_H */
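The new compression option is packed into a single byte: the low 4 bits carry the option type and the high 4 bits the level, which is why bch2_opt_compression_parse() caps levels at 15 and rejects "none" with a non-zero level. The worked sketch below shows the encode/decode round trip for an option like "zstd:7"; the numeric value 3 for the zstd option type is assumed purely for the printout.

/* Sketch: the 4+4 bit packing used by bch2_compression_encode()/_decode(). */
#include <stdio.h>

struct bch_compression_opt {
	unsigned char type:4, level:4;
};

static unsigned encode(struct bch_compression_opt opt)
{
	return opt.type | (opt.level << 4);
}

static struct bch_compression_opt decode(unsigned v)
{
	return (struct bch_compression_opt) { .type = v & 15, .level = v >> 4 };
}

int main(void)
{
	/* "zstd:7" from the option parser, assuming zstd is option type 3 */
	struct bch_compression_opt opt = { .type = 3, .level = 7 };
	unsigned v = encode(opt);

	struct bch_compression_opt back = decode(v);
	printf("encoded %u -> type %u level %u\n", v, back.type, back.level);
	return 0;
}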
diff --git a/libbcachefs/darray.c b/libbcachefs/darray.c
new file mode 100644
index 00000000..e86d36d2
--- /dev/null
+++ b/libbcachefs/darray.c
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/log2.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include "darray.h"
+
+int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
+{
+ if (new_size > d->size) {
+ new_size = roundup_pow_of_two(new_size);
+
+ /*
+ * This is a workaround: kvmalloc() doesn't support > INT_MAX
+ * allocations, but vmalloc() does.
+ * The limit needs to be lifted from kvmalloc, and when it does
+ * we'll go back to just using that.
+ */
+ size_t bytes;
+ if (unlikely(check_mul_overflow(new_size, element_size, &bytes)))
+ return -ENOMEM;
+
+ void *data = likely(bytes < INT_MAX)
+ ? kvmalloc_noprof(bytes, gfp)
+ : vmalloc_noprof(bytes);
+ if (!data)
+ return -ENOMEM;
+
+ if (d->size)
+ memcpy(data, d->data, d->size * element_size);
+ if (d->data != d->preallocated)
+ kvfree(d->data);
+ d->data = data;
+ d->size = new_size;
+ }
+
+ return 0;
+}
diff --git a/libbcachefs/darray.h b/libbcachefs/darray.h
new file mode 100644
index 00000000..8f4c3f06
--- /dev/null
+++ b/libbcachefs/darray.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DARRAY_H
+#define _BCACHEFS_DARRAY_H
+
+/*
+ * Dynamic arrays:
+ *
+ * Inspired by CCAN's darray
+ */
+
+#include <linux/slab.h>
+
+#define DARRAY_PREALLOCATED(_type, _nr) \
+struct { \
+ size_t nr, size; \
+ _type *data; \
+ _type preallocated[_nr]; \
+}
+
+#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
+
+typedef DARRAY(char) darray_char;
+typedef DARRAY(char *) darray_str;
+
+int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
+
+#define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__))
+
+#define __darray_resize(_d, _element_size, _new_size, _gfp) \
+ (unlikely((_new_size) > (_d)->size) \
+ ? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\
+ : 0)
+
+#define darray_resize_gfp(_d, _new_size, _gfp) \
+ __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)
+
+#define darray_resize(_d, _new_size) \
+ darray_resize_gfp(_d, _new_size, GFP_KERNEL)
+
+#define darray_make_room_gfp(_d, _more, _gfp) \
+ darray_resize_gfp((_d), (_d)->nr + (_more), _gfp)
+
+#define darray_make_room(_d, _more) \
+ darray_make_room_gfp(_d, _more, GFP_KERNEL)
+
+#define darray_room(_d) ((_d).size - (_d).nr)
+
+#define darray_top(_d) ((_d).data[(_d).nr])
+
+#define darray_push_gfp(_d, _item, _gfp) \
+({ \
+ int _ret = darray_make_room_gfp((_d), 1, _gfp); \
+ \
+ if (!_ret) \
+ (_d)->data[(_d)->nr++] = (_item); \
+ _ret; \
+})
+
+#define darray_push(_d, _item) darray_push_gfp(_d, _item, GFP_KERNEL)
+
+#define darray_pop(_d) ((_d)->data[--(_d)->nr])
+
+#define darray_first(_d) ((_d).data[0])
+#define darray_last(_d) ((_d).data[(_d).nr - 1])
+
+#define darray_insert_item(_d, pos, _item) \
+({ \
+ size_t _pos = (pos); \
+ int _ret = darray_make_room((_d), 1); \
+ \
+ if (!_ret) \
+ array_insert_item((_d)->data, (_d)->nr, _pos, (_item)); \
+ _ret; \
+})
+
+#define darray_remove_item(_d, _pos) \
+ array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
+
+#define __darray_for_each(_d, _i) \
+ for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++)
+
+#define darray_for_each(_d, _i) \
+ for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++)
+
+#define darray_for_each_reverse(_d, _i) \
+ for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i)
+
+#define darray_init(_d) \
+do { \
+ (_d)->nr = 0; \
+ (_d)->size = ARRAY_SIZE((_d)->preallocated); \
+ (_d)->data = (_d)->size ? (_d)->preallocated : NULL; \
+} while (0)
+
+#define darray_exit(_d) \
+do { \
+ if (!ARRAY_SIZE((_d)->preallocated) || \
+ (_d)->data != (_d)->preallocated) \
+ kvfree((_d)->data); \
+ darray_init(_d); \
+} while (0)
+
+#endif /* _BCACHEFS_DARRAY_H */
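darray is a growable array with a small inline preallocation: pushes use the preallocated slots first and only fall back to a kvmalloc'd buffer, grown in powers of two by __bch2_darray_resize_noprof(), once those run out — exactly how __bch2_encrypt_bio() uses DARRAY_PREALLOCATED(struct scatterlist, 4) above. The usage sketch below is written against the macros defined in this header; it assumes kernel context (GFP_KERNEL, kvfree, pr_info), the function and its purpose are illustrative, and error handling is trimmed to the essentials.

/* Sketch: typical darray usage with a small inline preallocation. */
static int collect_evens(unsigned n)
{
	DARRAY_PREALLOCATED(unsigned, 4) evens;	/* first 4 pushes need no allocation */
	int ret = 0;

	darray_init(&evens);

	for (unsigned i = 0; i < n; i++)
		if (!(i & 1)) {
			ret = darray_push(&evens, i);	/* may grow via kvmalloc */
			if (ret)
				goto out;		/* -ENOMEM */
		}

	darray_for_each(evens, i)
		pr_info("even: %u\n", *i);
out:
	darray_exit(&evens);				/* frees the heap buffer, if any */
	return ret;
}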
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
new file mode 100644
index 00000000..31b2aeb0
--- /dev/null
+++ b/libbcachefs/data_update.c
@@ -0,0 +1,761 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "compress.h"
+#include "data_update.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "extents.h"
+#include "io_write.h"
+#include "keylist.h"
+#include "move.h"
+#include "nocow_locking.h"
+#include "rebalance.h"
+#include "snapshot.h"
+#include "subvolume.h"
+#include "trace.h"
+
+static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ bkey_for_each_ptr(ptrs, ptr)
+ bch2_dev_put(bch2_dev_have_ref(c, ptr->dev));
+}
+
+static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (!bch2_dev_tryget(c, ptr->dev)) {
+ bkey_for_each_ptr(ptrs, ptr2) {
+ if (ptr2 == ptr)
+ break;
+ bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev));
+ }
+ return false;
+ }
+ }
+ return true;
+}
+
+static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+
+ bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
+ }
+}
+
+static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+
+ if (ctxt) {
+ bool locked;
+
+ move_ctxt_wait_event(ctxt,
+ (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
+ list_empty(&ctxt->ios));
+
+ if (!locked)
+ bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
+ } else {
+ if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) {
+ bkey_for_each_ptr(ptrs, ptr2) {
+ if (ptr2 == ptr)
+ break;
+
+ ca = bch2_dev_have_ref(c, ptr2->dev);
+ bucket = PTR_BUCKET_POS(ca, ptr2);
+ bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
+ }
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k)
+{
+ if (trace_move_extent_finish_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ trace_move_extent_finish(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+}
+
+static void trace_move_extent_fail2(struct data_update *m,
+ struct bkey_s_c new,
+ struct bkey_s_c wrote,
+ struct bkey_i *insert,
+ const char *msg)
+{
+ struct bch_fs *c = m->op.c;
+ struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
+ struct printbuf buf = PRINTBUF;
+ unsigned rewrites_found = 0;
+
+ if (!trace_move_extent_fail_enabled())
+ return;
+
+ prt_str(&buf, msg);
+
+ if (insert) {
+ const union bch_extent_entry *entry;
+ struct bch_extent_ptr *ptr;
+ struct extent_ptr_decoded p;
+
+ unsigned ptr_bit = 1;
+ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
+ if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
+ (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
+ !ptr->cached)
+ rewrites_found |= ptr_bit;
+ ptr_bit <<= 1;
+ }
+ }
+
+ prt_str(&buf, "rewrites found:\t");
+ bch2_prt_u64_base2(&buf, rewrites_found);
+ prt_newline(&buf);
+
+ bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);
+
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, new);
+
+ prt_str(&buf, "\nwrote: ");
+ bch2_bkey_val_to_text(&buf, c, wrote);
+
+ if (insert) {
+ prt_str(&buf, "\ninsert: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+ }
+
+ trace_move_extent_fail(c, buf.buf);
+ printbuf_exit(&buf);
+}
+
+static int __bch2_data_update_index_update(struct btree_trans *trans,
+ struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct btree_iter iter;
+ struct data_update *m =
+ container_of(op, struct data_update, op);
+ struct keylist *keys = &op->insert_keys;
+ struct bkey_buf _new, _insert;
+ int ret = 0;
+
+ bch2_bkey_buf_init(&_new);
+ bch2_bkey_buf_init(&_insert);
+ bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
+
+ bch2_trans_iter_init(trans, &iter, m->btree_id,
+ bkey_start_pos(&bch2_keylist_front(keys)->k),
+ BTREE_ITER_slots|BTREE_ITER_intent);
+
+ while (1) {
+ struct bkey_s_c k;
+ struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
+ struct bkey_i *insert = NULL;
+ struct bkey_i_extent *new;
+ const union bch_extent_entry *entry_c;
+ union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct bch_extent_ptr *ptr;
+ const struct bch_extent_ptr *ptr_c;
+ struct bpos next_pos;
+ bool should_check_enospc;
+ s64 i_sectors_delta = 0, disk_sectors_delta = 0;
+ unsigned rewrites_found = 0, durability, ptr_bit;
+
+ bch2_trans_begin(trans);
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ new = bkey_i_to_extent(bch2_keylist_front(keys));
+
+ if (!bch2_extents_match(k, old)) {
+ trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i),
+ NULL, "no match:");
+ goto nowork;
+ }
+
+ bkey_reassemble(_insert.k, k);
+ insert = _insert.k;
+
+ bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
+ new = bkey_i_to_extent(_new.k);
+ bch2_cut_front(iter.pos, &new->k_i);
+
+ bch2_cut_front(iter.pos, insert);
+ bch2_cut_back(new->k.p, insert);
+ bch2_cut_back(insert->k.p, &new->k_i);
+
+ /*
+ * @old: extent that we read from
+ * @insert: key that we're going to update, initialized from
+ * extent currently in btree - same as @old unless we raced with
+ * other updates
+ * @new: extent with new pointers that we'll be adding to @insert
+ *
+	 * First, drop rewrite_ptrs from @new:
+ */
+ ptr_bit = 1;
+ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) {
+ if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
+ (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
+ !ptr->cached) {
+ bch2_extent_ptr_set_cached(c, &m->op.opts,
+ bkey_i_to_s(insert), ptr);
+ rewrites_found |= ptr_bit;
+ }
+ ptr_bit <<= 1;
+ }
+
+ if (m->data_opts.rewrite_ptrs &&
+ !rewrites_found &&
+ bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) {
+ trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
+ goto nowork;
+ }
+
+ /*
+ * A replica that we just wrote might conflict with a replica
+ * that we want to keep, due to racing with another move:
+ */
+restart_drop_conflicting_replicas:
+ extent_for_each_ptr(extent_i_to_s(new), ptr)
+ if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) &&
+ !ptr_c->cached) {
+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr);
+ goto restart_drop_conflicting_replicas;
+ }
+
+ if (!bkey_val_u64s(&new->k)) {
+ trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
+ goto nowork;
+ }
+
+ /* Now, drop pointers that conflict with what we just wrote: */
+ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
+ if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev)))
+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
+
+ durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) +
+ bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));
+
+ /* Now, drop excess replicas: */
+ rcu_read_lock();
+restart_drop_extra_replicas:
+ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
+ unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
+
+ if (!p.ptr.cached &&
+ durability - ptr_durability >= m->op.opts.data_replicas) {
+ durability -= ptr_durability;
+
+ bch2_extent_ptr_set_cached(c, &m->op.opts,
+ bkey_i_to_s(insert), &entry->ptr);
+ goto restart_drop_extra_replicas;
+ }
+ }
+ rcu_read_unlock();
+
+ /* Finally, add the pointers we just wrote: */
+ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
+ bch2_extent_ptr_decoded_append(insert, &p);
+
+ bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
+ bch2_extent_normalize_by_opts(c, &m->op.opts, bkey_i_to_s(insert));
+
+ ret = bch2_sum_sector_overwrites(trans, &iter, insert,
+ &should_check_enospc,
+ &i_sectors_delta,
+ &disk_sectors_delta);
+ if (ret)
+ goto err;
+
+ if (disk_sectors_delta > (s64) op->res.sectors) {
+ ret = bch2_disk_reservation_add(c, &op->res,
+ disk_sectors_delta - op->res.sectors,
+ !should_check_enospc
+ ? BCH_DISK_RESERVATION_NOFAIL : 0);
+ if (ret)
+ goto out;
+ }
+
+ next_pos = insert->k.p;
+
+ /*
+ * Check for nonce offset inconsistency:
+ * This is debug code - we've been seeing this bug rarely, and
+ * it's been hard to reproduce, so this should give us some more
+ * information when it does occur:
+ */
+ int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert),
+ (struct bkey_validate_context) {
+ .btree = m->btree_id,
+ .flags = BCH_VALIDATE_commit,
+ });
+ if (invalid) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "about to insert invalid key in data update path");
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+ prt_str(&buf, "\nk: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+
+ bch2_fatal_error(c);
+ ret = -EIO;
+ goto out;
+ }
+
+ if (trace_data_update_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+ prt_str(&buf, "\nk: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+
+ trace_data_update(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id,
+ k.k->p, bkey_start_pos(&insert->k)) ?:
+ bch2_insert_snapshot_whiteouts(trans, m->btree_id,
+ k.k->p, insert->k.p) ?:
+ bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?:
+ bch2_trans_update(trans, &iter, insert,
+ BTREE_UPDATE_internal_snapshot_node) ?:
+ bch2_trans_commit(trans, &op->res,
+ NULL,
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc|
+ m->data_opts.btree_insert_flags);
+ if (!ret) {
+ bch2_btree_iter_set_pos(&iter, next_pos);
+
+ this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size);
+ trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i));
+ }
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
+ if (ret)
+ break;
+next:
+ while (bkey_ge(iter.pos, bch2_keylist_front(keys)->k.p)) {
+ bch2_keylist_pop_front(keys);
+ if (bch2_keylist_empty(keys))
+ goto out;
+ }
+ continue;
+nowork:
+ if (m->stats) {
+ BUG_ON(k.k->p.offset <= iter.pos.offset);
+ atomic64_inc(&m->stats->keys_raced);
+ atomic64_add(k.k->p.offset - iter.pos.offset,
+ &m->stats->sectors_raced);
+ }
+
+ count_event(c, move_extent_fail);
+
+ bch2_btree_iter_advance(&iter);
+ goto next;
+ }
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_bkey_buf_exit(&_insert, c);
+ bch2_bkey_buf_exit(&_new, c);
+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
+ return ret;
+}
+
+int bch2_data_update_index_update(struct bch_write_op *op)
+{
+ return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op));
+}
+
+void bch2_data_update_read_done(struct data_update *m,
+ struct bch_extent_crc_unpacked crc)
+{
+ /* write bio must own pages: */
+ BUG_ON(!m->op.wbio.bio.bi_vcnt);
+
+ m->op.crc = crc;
+ m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9;
+
+ closure_call(&m->op.cl, bch2_write, NULL, NULL);
+}
+
+void bch2_data_update_exit(struct data_update *update)
+{
+ struct bch_fs *c = update->op.c;
+ struct bkey_s_c k = bkey_i_to_s_c(update->k.k);
+
+ if (c->opts.nocow_enabled)
+ bkey_nocow_unlock(c, k);
+ bkey_put_dev_refs(c, k);
+ bch2_bkey_buf_exit(&update->k, c);
+ bch2_disk_reservation_put(c, &update->op.res);
+ bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
+}
+
+static void bch2_update_unwritten_extent(struct btree_trans *trans,
+ struct data_update *update)
+{
+ struct bch_fs *c = update->op.c;
+ struct bio *bio = &update->op.wbio.bio;
+ struct bkey_i_extent *e;
+ struct write_point *wp;
+ struct closure cl;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ closure_init_stack(&cl);
+ bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys);
+
+ while (bio_sectors(bio)) {
+ unsigned sectors = bio_sectors(bio);
+
+ bch2_trans_begin(trans);
+
+ bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos,
+ BTREE_ITER_slots);
+ ret = lockrestart_do(trans, ({
+ k = bch2_btree_iter_peek_slot(&iter);
+ bkey_err(k);
+ }));
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k)))
+ break;
+
+ e = bkey_extent_init(update->op.insert_keys.top);
+ e->k.p = update->op.pos;
+
+ ret = bch2_alloc_sectors_start_trans(trans,
+ update->op.target,
+ false,
+ update->op.write_point,
+ &update->op.devs_have,
+ update->op.nr_replicas,
+ update->op.nr_replicas,
+ update->op.watermark,
+ 0, &cl, &wp);
+ if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) {
+ bch2_trans_unlock(trans);
+ closure_sync(&cl);
+ continue;
+ }
+
+ bch_err_fn_ratelimited(c, ret);
+
+ if (ret)
+ return;
+
+ sectors = min(sectors, wp->sectors_free);
+
+ bch2_key_resize(&e->k, sectors);
+
+ bch2_open_bucket_get(c, wp, &update->op.open_buckets);
+ bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
+ bch2_alloc_sectors_done(c, wp);
+
+ bio_advance(bio, sectors << 9);
+ update->op.pos.offset += sectors;
+
+ extent_for_each_ptr(extent_i_to_s(e), ptr)
+ ptr->unwritten = true;
+ bch2_keylist_push(&update->op.insert_keys);
+
+ ret = __bch2_data_update_index_update(trans, &update->op);
+
+ bch2_open_buckets_put(c, &update->op.open_buckets);
+
+ if (ret)
+ break;
+ }
+
+ if (closure_nr_remaining(&cl) != 1) {
+ bch2_trans_unlock(trans);
+ closure_sync(&cl);
+ }
+}
+
+void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ printbuf_tabstop_push(out, 20);
+ prt_str(out, "rewrite ptrs:\t");
+ bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
+ prt_newline(out);
+
+ prt_str(out, "kill ptrs:\t");
+ bch2_prt_u64_base2(out, data_opts->kill_ptrs);
+ prt_newline(out);
+
+ prt_str(out, "target:\t");
+ bch2_target_to_text(out, c, data_opts->target);
+ prt_newline(out);
+
+ prt_str(out, "compression:\t");
+ bch2_compression_opt_to_text(out, io_opts->background_compression);
+ prt_newline(out);
+
+ prt_str(out, "opts.replicas:\t");
+ prt_u64(out, io_opts->data_replicas);
+
+ prt_str(out, "extra replicas:\t");
+ prt_u64(out, data_opts->extra_replicas);
+}
+
+void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
+{
+ bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
+ prt_newline(out);
+ bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
+}
+
+int bch2_extent_drop_ptrs(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i *n;
+ int ret;
+
+ n = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ while (data_opts->kill_ptrs) {
+ unsigned i = 0, drop = __fls(data_opts->kill_ptrs);
+
+ bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop);
+ data_opts->kill_ptrs ^= 1U << drop;
+ }
+
+ /*
+ * If the new extent no longer has any pointers, bch2_extent_normalize()
+ * will do the appropriate thing with it (turning it into a
+ * KEY_TYPE_error key, or just a discard if it was a cached extent)
+ */
+ bch2_extent_normalize_by_opts(c, io_opts, bkey_i_to_s(n));
+
+ /*
+ * Since we're not inserting through an extent iterator
+ * (BTREE_ITER_all_snapshots iterators aren't extent iterators),
+ * we aren't using the extent overwrite path to delete, we're
+ * just using the normal key deletion path:
+ */
+ if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_is_extents))
+ n->k.size = 0;
+
+ return bch2_trans_relock(trans) ?:
+ bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+}
+
+int bch2_data_update_init(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct moving_context *ctxt,
+ struct data_update *m,
+ struct write_point_specifier wp,
+ struct bch_io_opts io_opts,
+ struct data_update_opts data_opts,
+ enum btree_id btree_id,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned reserve_sectors = k.k->size * data_opts.extra_replicas;
+ int ret = 0;
+
+ /*
+	 * fs is corrupt: we have a key for a snapshot node that doesn't exist,
+ * and we have to check for this because we go rw before repairing the
+ * snapshots table - just skip it, we can move it later.
+ */
+ if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot)))
+ return -BCH_ERR_data_update_done;
+
+ if (!bkey_get_dev_refs(c, k))
+ return -BCH_ERR_data_update_done;
+
+ if (c->opts.nocow_enabled &&
+ !bkey_nocow_lock(c, ctxt, k)) {
+ bkey_put_dev_refs(c, k);
+ return -BCH_ERR_nocow_lock_blocked;
+ }
+
+ bch2_bkey_buf_init(&m->k);
+ bch2_bkey_buf_reassemble(&m->k, c, k);
+ m->btree_id = btree_id;
+ m->data_opts = data_opts;
+ m->ctxt = ctxt;
+ m->stats = ctxt ? ctxt->stats : NULL;
+
+ bch2_write_op_init(&m->op, c, io_opts);
+ m->op.pos = bkey_start_pos(k.k);
+ m->op.version = k.k->bversion;
+ m->op.target = data_opts.target;
+ m->op.write_point = wp;
+ m->op.nr_replicas = 0;
+ m->op.flags |= BCH_WRITE_PAGES_STABLE|
+ BCH_WRITE_PAGES_OWNED|
+ BCH_WRITE_DATA_ENCODED|
+ BCH_WRITE_MOVE|
+ m->data_opts.write_flags;
+ m->op.compression_opt = io_opts.background_compression;
+ m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
+
+ unsigned durability_have = 0, durability_removing = 0;
+
+ unsigned ptr_bit = 1;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (!p.ptr.cached) {
+ rcu_read_lock();
+ if (ptr_bit & m->data_opts.rewrite_ptrs) {
+ if (crc_is_compressed(p.crc))
+ reserve_sectors += k.k->size;
+
+ m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
+ durability_removing += bch2_extent_ptr_desired_durability(c, &p);
+ } else if (!(ptr_bit & m->data_opts.kill_ptrs)) {
+ bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
+ durability_have += bch2_extent_ptr_durability(c, &p);
+ }
+ rcu_read_unlock();
+ }
+
+ /*
+ * op->csum_type is normally initialized from the fs/file's
+ * current options - but if an extent is encrypted, we require
+ * that it stays encrypted:
+ */
+ if (bch2_csum_type_is_encryption(p.crc.csum_type)) {
+ m->op.nonce = p.crc.nonce + p.crc.offset;
+ m->op.csum_type = p.crc.csum_type;
+ }
+
+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
+ m->op.incompressible = true;
+
+ ptr_bit <<= 1;
+ }
+
+ unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have));
+
+ /*
+ * If current extent durability is less than io_opts.data_replicas,
+ * we're not trying to rereplicate the extent up to data_replicas here -
+ * unless extra_replicas was specified
+ *
+ * Increasing replication is an explicit operation triggered by
+ * rereplicate, currently, so that users don't get an unexpected -ENOSPC
+ */
+ m->op.nr_replicas = min(durability_removing, durability_required) +
+ m->data_opts.extra_replicas;
+
+ /*
+	 * If device(s) were set to durability=0 after data was written to them,
+	 * we can end up with a durability=0 extent, and the normal algorithm
+ * that tries not to increase durability doesn't work:
+ */
+ if (!(durability_have + durability_removing))
+ m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);
+
+ m->op.nr_replicas_required = m->op.nr_replicas;
+
+ /*
+ * It might turn out that we don't need any new replicas, if the
+ * replicas or durability settings have been changed since the extent
+ * was written:
+ */
+ if (!m->op.nr_replicas) {
+ m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
+ m->data_opts.rewrite_ptrs = 0;
+ /* if iter == NULL, it's just a promote */
+ if (iter)
+ ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts);
+ goto out;
+ }
+
+ if (reserve_sectors) {
+ ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
+ m->data_opts.extra_replicas
+ ? 0
+ : BCH_DISK_RESERVATION_NOFAIL);
+ if (ret)
+ goto out;
+ }
+
+ if (bkey_extent_is_unwritten(k)) {
+ bch2_update_unwritten_extent(trans, m);
+ goto out;
+ }
+
+ return 0;
+out:
+ bch2_data_update_exit(m);
+ return ret ?: -BCH_ERR_data_update_done;
+}
+
+void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned ptr_bit = 1;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if ((opts->rewrite_ptrs & ptr_bit) && ptr->cached) {
+ opts->kill_ptrs |= ptr_bit;
+ opts->rewrite_ptrs ^= ptr_bit;
+ }
+
+ ptr_bit <<= 1;
+ }
+}
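
To make the per-pointer bitmask convention used throughout this file explicit: bit N of data_update_opts.rewrite_ptrs / .kill_ptrs refers to the Nth pointer of the extent, in the order bkey_for_each_ptr() visits them, which is what the ptr_bit walks above (and bch2_data_update_opts_normalize()) rely on. A small editor's illustration, not part of the patch:

static unsigned data_update_count_selected_ptrs(struct bkey_s_c k, unsigned mask)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	unsigned ptr_bit = 1, nr = 0;

	bkey_for_each_ptr(ptrs, ptr) {
		if (mask & ptr_bit)		/* this pointer is selected by the mask */
			nr++;
		ptr_bit <<= 1;
	}

	return nr;
}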
diff --git a/libbcachefs/data_update.h b/libbcachefs/data_update.h
new file mode 100644
index 00000000..e4b50723
--- /dev/null
+++ b/libbcachefs/data_update.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _BCACHEFS_DATA_UPDATE_H
+#define _BCACHEFS_DATA_UPDATE_H
+
+#include "bkey_buf.h"
+#include "io_write_types.h"
+
+struct moving_context;
+
+struct data_update_opts {
+ unsigned rewrite_ptrs;
+ unsigned kill_ptrs;
+ u16 target;
+ u8 extra_replicas;
+ unsigned btree_insert_flags;
+ unsigned write_flags;
+};
+
+void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,
+ struct bch_io_opts *, struct data_update_opts *);
+
+struct data_update {
+ /* extent being updated: */
+ enum btree_id btree_id;
+ struct bkey_buf k;
+ struct data_update_opts data_opts;
+ struct moving_context *ctxt;
+ struct bch_move_stats *stats;
+ struct bch_write_op op;
+};
+
+void bch2_data_update_to_text(struct printbuf *, struct data_update *);
+
+int bch2_data_update_index_update(struct bch_write_op *);
+
+void bch2_data_update_read_done(struct data_update *,
+ struct bch_extent_crc_unpacked);
+
+int bch2_extent_drop_ptrs(struct btree_trans *,
+ struct btree_iter *,
+ struct bkey_s_c,
+ struct bch_io_opts *,
+ struct data_update_opts *);
+
+void bch2_data_update_exit(struct data_update *);
+int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
+ struct moving_context *,
+ struct data_update *,
+ struct write_point_specifier,
+ struct bch_io_opts, struct data_update_opts,
+ enum btree_id, struct bkey_s_c);
+void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);
+
+#endif /* _BCACHEFS_DATA_UPDATE_H */
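
For orientation, here is a commented sketch of how these entry points fit together, pieced together from the declarations above and the implementation in data_update.c; the caller shown is hypothetical and error handling is simplified:

/*
 * Editor's sketch (not part of the patch):
 *
 *	struct data_update m;
 *
 *	ret = bch2_data_update_init(trans, iter, ctxt, &m, wp,
 *				    io_opts, data_opts, btree_id, k);
 *	// -BCH_ERR_data_update_done: nothing left to do for this extent
 *	// (e.g. only pointer drops were needed); other errors: give up
 *
 *	// read the existing extent into m.op.wbio.bio, then:
 *	bch2_data_update_read_done(&m, crc);	// kicks off the write
 *
 *	// on completion the write path calls bch2_data_update_index_update()
 *	// to swap the old pointers for the new ones; bch2_data_update_exit(&m)
 *	// then drops device refs, nocow locks and the disk reservation.
 */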
diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c
index 71f649bc..b5de52a5 100644
--- a/libbcachefs/debug.c
+++ b/libbcachefs/debug.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Assorted bcachefs debug code
*
@@ -10,14 +11,15 @@
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
+#include "btree_locking.h"
#include "btree_update.h"
+#include "btree_update_interior.h"
#include "buckets.h"
#include "debug.h"
#include "error.h"
#include "extents.h"
#include "fsck.h"
#include "inode.h"
-#include "io.h"
#include "super.h"
#include <linux/console.h>
@@ -28,60 +30,41 @@
static struct dentry *bch_debug;
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
+static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
+ struct extent_ptr_decoded pick)
{
struct btree *v = c->verify_data;
- struct btree_node *n_ondisk, *n_sorted, *n_inmemory;
- struct bset *sorted, *inmemory;
- struct extent_pick_ptr pick;
- struct bch_dev *ca;
+ struct btree_node *n_ondisk = c->verify_ondisk;
+ struct btree_node *n_sorted = c->verify_data->data;
+ struct bset *sorted, *inmemory = &b->data->keys;
struct bio *bio;
+ bool failed = false, saw_error = false;
- if (c->opts.nochanges)
- return;
-
- btree_node_io_lock(b);
- mutex_lock(&c->verify_lock);
-
- n_ondisk = c->verify_ondisk;
- n_sorted = c->verify_data->data;
- n_inmemory = b->data;
-
- bkey_copy(&v->key, &b->key);
- v->written = 0;
- v->level = b->level;
- v->btree_id = b->btree_id;
- bch2_btree_keys_init(v, &c->expensive_debug_checks);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ if (!ca)
+ return false;
- if (bch2_btree_pick_ptr(c, b, NULL, &pick) <= 0)
- return;
-
- ca = bch_dev_bkey_exists(c, pick.ptr.dev);
- if (!bch2_dev_get_ioref(ca, READ))
- return;
-
- bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
- bio->bi_opf = REQ_OP_READ|REQ_META;
+ bio = bio_alloc_bioset(ca->disk_sb.bdev,
+ buf_pages(n_sorted, btree_buf_bytes(b)),
+ REQ_OP_READ|REQ_META,
+ GFP_NOFS,
+ &c->btree_bio);
bio->bi_iter.bi_sector = pick.ptr.offset;
- bio->bi_iter.bi_size = btree_bytes(c);
- bch2_bio_map(bio, n_sorted);
+ bch2_bio_map(bio, n_sorted, btree_buf_bytes(b));
submit_bio_wait(bio);
bio_put(bio);
percpu_ref_put(&ca->io_ref);
- memcpy(n_ondisk, n_sorted, btree_bytes(c));
+ memcpy(n_ondisk, n_sorted, btree_buf_bytes(b));
- if (bch2_btree_node_read_done(c, v, false))
- goto out;
+ v->written = 0;
+ if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error)
+ return false;
n_sorted = c->verify_data->data;
sorted = &n_sorted->keys;
- inmemory = &n_inmemory->keys;
if (inmemory->u64s != sorted->u64s ||
memcmp(inmemory->start,
@@ -94,13 +77,13 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
console_lock();
printk(KERN_ERR "*** in memory:\n");
- bch2_dump_bset(b, inmemory, 0);
+ bch2_dump_bset(c, b, inmemory, 0);
printk(KERN_ERR "*** read back in:\n");
- bch2_dump_bset(v, sorted, 0);
+ bch2_dump_bset(c, v, sorted, 0);
- while (offset < b->written) {
- if (!offset ) {
+ while (offset < v->written) {
+ if (!offset) {
i = &n_ondisk->keys;
sectors = vstruct_blocks(n_ondisk, c->block_bits) <<
c->block_bits;
@@ -114,64 +97,242 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
}
printk(KERN_ERR "*** on disk block %u:\n", offset);
- bch2_dump_bset(b, i, offset);
+ bch2_dump_bset(c, b, i, offset);
offset += sectors;
}
- printk(KERN_ERR "*** block %u/%u not written\n",
- offset >> c->block_bits, btree_blocks(c));
-
for (j = 0; j < le16_to_cpu(inmemory->u64s); j++)
if (inmemory->_data[j] != sorted->_data[j])
break;
- printk(KERN_ERR "b->written %u\n", b->written);
-
console_unlock();
- panic("verify failed at %u\n", j);
+ bch_err(c, "verify failed at key %u", j);
+
+ failed = true;
+ }
+
+ if (v->written != b->written) {
+ bch_err(c, "written wrong: expected %u, got %u",
+ b->written, v->written);
+ failed = true;
+ }
+
+ return failed;
+}
+
+void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
+{
+ struct bkey_ptrs_c ptrs;
+ struct extent_ptr_decoded p;
+ const union bch_extent_entry *entry;
+ struct btree *v;
+ struct bset *inmemory = &b->data->keys;
+ struct bkey_packed *k;
+ bool failed = false;
+
+ if (c->opts.nochanges)
+ return;
+
+ bch2_btree_node_io_lock(b);
+ mutex_lock(&c->verify_lock);
+
+ if (!c->verify_ondisk) {
+ c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
+ if (!c->verify_ondisk)
+ goto out;
+ }
+
+ if (!c->verify_data) {
+ c->verify_data = __bch2_btree_node_mem_alloc(c);
+ if (!c->verify_data)
+ goto out;
+
+ list_del_init(&c->verify_data->list);
+ }
+
+ BUG_ON(b->nsets != 1);
+
+ for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k))
+ if (k->type == KEY_TYPE_btree_ptr_v2)
+ ((struct bch_btree_ptr_v2 *) bkeyp_val(&b->format, k))->mem_ptr = 0;
+
+ v = c->verify_data;
+ bkey_copy(&v->key, &b->key);
+ v->c.level = b->c.level;
+ v->c.btree_id = b->c.btree_id;
+ bch2_btree_keys_init(v);
+
+ ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key));
+ bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry)
+ failed |= bch2_btree_verify_replica(c, b, p);
+
+ if (failed) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ bch2_fs_fatal_error(c, ": btree node verify failed for: %s\n", buf.buf);
+ printbuf_exit(&buf);
}
out:
mutex_unlock(&c->verify_lock);
- btree_node_io_unlock(b);
+ bch2_btree_node_io_unlock(b);
}
-#endif
+void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
+ const struct btree *b)
+{
+ struct btree_node *n_ondisk = NULL;
+ struct extent_ptr_decoded pick;
+ struct bch_dev *ca;
+ struct bio *bio = NULL;
+ unsigned offset = 0;
+ int ret;
+
+ if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) {
+ prt_printf(out, "error getting device to read from: invalid device\n");
+ return;
+ }
+
+ ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ if (!ca) {
+ prt_printf(out, "error getting device to read from: not online\n");
+ return;
+ }
+
+ n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
+ if (!n_ondisk) {
+ prt_printf(out, "memory allocation failure\n");
+ goto out;
+ }
+
+ bio = bio_alloc_bioset(ca->disk_sb.bdev,
+ buf_pages(n_ondisk, btree_buf_bytes(b)),
+ REQ_OP_READ|REQ_META,
+ GFP_NOFS,
+ &c->btree_bio);
+ bio->bi_iter.bi_sector = pick.ptr.offset;
+ bch2_bio_map(bio, n_ondisk, btree_buf_bytes(b));
+
+ ret = submit_bio_wait(bio);
+ if (ret) {
+ prt_printf(out, "IO error reading btree node: %s\n", bch2_err_str(ret));
+ goto out;
+ }
+
+ while (offset < btree_sectors(c)) {
+ struct bset *i;
+ struct nonce nonce;
+ struct bch_csum csum;
+ struct bkey_packed *k;
+ unsigned sectors;
+
+ if (!offset) {
+ i = &n_ondisk->keys;
+
+ if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) {
+ prt_printf(out, "unknown checksum type at offset %u: %llu\n",
+ offset, BSET_CSUM_TYPE(i));
+ goto out;
+ }
+
+ nonce = btree_nonce(i, offset << 9);
+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk);
+
+ if (bch2_crc_cmp(csum, n_ondisk->csum)) {
+ prt_printf(out, "invalid checksum\n");
+ goto out;
+ }
+
+ bset_encrypt(c, i, offset << 9);
+
+ sectors = vstruct_sectors(n_ondisk, c->block_bits);
+ } else {
+ struct btree_node_entry *bne = (void *) n_ondisk + (offset << 9);
+
+ i = &bne->keys;
+
+ if (i->seq != n_ondisk->keys.seq)
+ break;
+
+ if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) {
+ prt_printf(out, "unknown checksum type at offset %u: %llu\n",
+ offset, BSET_CSUM_TYPE(i));
+ goto out;
+ }
+
+ nonce = btree_nonce(i, offset << 9);
+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+
+ if (bch2_crc_cmp(csum, bne->csum)) {
+ prt_printf(out, "invalid checksum");
+ goto out;
+ }
+
+ bset_encrypt(c, i, offset << 9);
+
+ sectors = vstruct_sectors(bne, c->block_bits);
+ }
+
+ prt_printf(out, " offset %u version %u, journal seq %llu\n",
+ offset,
+ le16_to_cpu(i->version),
+ le64_to_cpu(i->journal_seq));
+ offset += sectors;
+
+ printbuf_indent_add(out, 4);
+
+ for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) {
+ struct bkey u;
+
+ bch2_bkey_val_to_text(out, c, bkey_disassemble(b, k, &u));
+ prt_newline(out);
+ }
+
+ printbuf_indent_sub(out, 4);
+ }
+out:
+ if (bio)
+ bio_put(bio);
+ kvfree(n_ondisk);
+ percpu_ref_put(&ca->io_ref);
+}
#ifdef CONFIG_DEBUG_FS
/* XXX: bch_fs refcounting */
struct dump_iter {
- struct bpos from;
- struct bch_fs *c;
+ struct bch_fs *c;
enum btree_id id;
+ struct bpos from;
+ struct bpos prev_node;
+ u64 iter;
- char buf[PAGE_SIZE];
- size_t bytes; /* what's currently in buf */
+ struct printbuf buf;
char __user *ubuf; /* destination user buffer */
size_t size; /* size of requested read */
ssize_t ret; /* bytes read so far */
};
-static int flush_buf(struct dump_iter *i)
+static ssize_t flush_buf(struct dump_iter *i)
{
- if (i->bytes) {
- size_t bytes = min(i->bytes, i->size);
- int err = copy_to_user(i->ubuf, i->buf, bytes);
-
- if (err)
- return err;
-
- i->ret += bytes;
- i->ubuf += bytes;
- i->size -= bytes;
- i->bytes -= bytes;
- memmove(i->buf, i->buf + bytes, i->bytes);
+ if (i->buf.pos) {
+ size_t bytes = min_t(size_t, i->buf.pos, i->size);
+ int copied = bytes - copy_to_user(i->ubuf, i->buf.buf, bytes);
+
+ i->ret += copied;
+ i->ubuf += copied;
+ i->size -= copied;
+ i->buf.pos -= copied;
+ memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos);
+
+ if (copied != bytes)
+ return -EFAULT;
}
- return 0;
+ return i->size ? 0 : i->ret;
}
static int bch2_dump_open(struct inode *inode, struct file *file)
@@ -185,15 +346,20 @@ static int bch2_dump_open(struct inode *inode, struct file *file)
file->private_data = i;
i->from = POS_MIN;
+ i->iter = 0;
i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]);
i->id = bd->id;
+ i->buf = PRINTBUF;
return 0;
}
static int bch2_dump_release(struct inode *inode, struct file *file)
{
- kfree(file->private_data);
+ struct dump_iter *i = file->private_data;
+
+ printbuf_exit(&i->buf);
+ kfree(i);
return 0;
}
@@ -201,173 +367,532 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
struct dump_iter *i = file->private_data;
- struct btree_iter iter;
- struct bkey_s_c k;
- int err;
i->ubuf = buf;
i->size = size;
i->ret = 0;
- err = flush_buf(i);
- if (err)
- return err;
+ return flush_buf(i) ?:
+ bch2_trans_run(i->c,
+ for_each_btree_key(trans, iter, i->id, i->from,
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k, ({
+ bch2_bkey_val_to_text(&i->buf, i->c, k);
+ prt_newline(&i->buf);
+ bch2_trans_unlock(trans);
+ i->from = bpos_successor(iter.pos);
+ flush_buf(i);
+ }))) ?:
+ i->ret;
+}
- if (!i->size)
- return i->ret;
+static const struct file_operations btree_debug_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_read_btree,
+};
- bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH);
- k = bch2_btree_iter_peek(&iter);
+static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
- while (k.k && !(err = btree_iter_err(k))) {
- bch2_bkey_val_to_text(i->c, bkey_type(0, i->id),
- i->buf, sizeof(i->buf), k);
- i->bytes = strlen(i->buf);
- BUG_ON(i->bytes >= PAGE_SIZE);
- i->buf[i->bytes] = '\n';
- i->bytes++;
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
- k = bch2_btree_iter_next(&iter);
- i->from = iter.pos;
+ ssize_t ret = flush_buf(i);
+ if (ret)
+ return ret;
- err = flush_buf(i);
- if (err)
- break;
+ if (bpos_eq(SPOS_MAX, i->from))
+ return i->ret;
- if (!i->size)
- break;
- }
- bch2_btree_iter_unlock(&iter);
+ return bch2_trans_run(i->c,
+ for_each_btree_node(trans, iter, i->id, i->from, 0, b, ({
+ bch2_btree_node_to_text(&i->buf, i->c, b);
+ i->from = !bpos_eq(SPOS_MAX, b->key.k.p)
+ ? bpos_successor(b->key.k.p)
+ : b->key.k.p;
- return err < 0 ? err : i->ret;
+ drop_locks_do(trans, flush_buf(i));
+ }))) ?: i->ret;
}
-static const struct file_operations btree_debug_ops = {
+static const struct file_operations btree_format_debug_ops = {
.owner = THIS_MODULE,
.open = bch2_dump_open,
.release = bch2_dump_release,
- .read = bch2_read_btree,
+ .read = bch2_read_btree_formats,
};
-static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
+static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
struct dump_iter *i = file->private_data;
- struct btree_iter iter;
- struct btree *b;
- int err;
i->ubuf = buf;
i->size = size;
i->ret = 0;
- err = flush_buf(i);
- if (err)
- return err;
+ return flush_buf(i) ?:
+ bch2_trans_run(i->c,
+ for_each_btree_key(trans, iter, i->id, i->from,
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k, ({
+ struct btree_path_level *l =
+ &btree_iter_path(trans, &iter)->l[0];
+ struct bkey_packed *_k =
+ bch2_btree_node_iter_peek(&l->iter, l->b);
+
+ if (bpos_gt(l->b->key.k.p, i->prev_node)) {
+ bch2_btree_node_to_text(&i->buf, i->c, l->b);
+ i->prev_node = l->b->key.k.p;
+ }
+
+ bch2_bfloat_to_text(&i->buf, l->b, _k);
+ bch2_trans_unlock(trans);
+ i->from = bpos_successor(iter.pos);
+ flush_buf(i);
+ }))) ?:
+ i->ret;
+}
- if (!i->size || !bkey_cmp(POS_MAX, i->from))
- return i->ret;
+static const struct file_operations bfloat_failed_debug_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_read_bfloat_failed,
+};
+
+static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
+ struct btree *b)
+{
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);
+
+ prt_printf(out, "%px ", b);
+ bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level);
+ prt_printf(out, "\n");
+
+ printbuf_indent_add(out, 2);
+
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+ prt_newline(out);
+
+ prt_printf(out, "flags:\t");
+ prt_bitflags(out, bch2_btree_node_flags, b->flags);
+ prt_newline(out);
+
+ prt_printf(out, "pcpu read locks:\t%u\n", b->c.lock.readers != NULL);
+ prt_printf(out, "written:\t%u\n", b->written);
+ prt_printf(out, "writes blocked:\t%u\n", !list_empty_careful(&b->write_blocked));
+ prt_printf(out, "will make reachable:\t%lx\n", b->will_make_reachable);
+
+ prt_printf(out, "journal pin %px:\t%llu\n",
+ &b->writes[0].journal, b->writes[0].journal.seq);
+ prt_printf(out, "journal pin %px:\t%llu\n",
+ &b->writes[1].journal, b->writes[1].journal.seq);
+
+ printbuf_indent_sub(out, 2);
+}
+
+static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ bool done = false;
+ ssize_t ret = 0;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ do {
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ struct btree *b;
+
+ ret = flush_buf(i);
+ if (ret)
+ return ret;
+
+ rcu_read_lock();
+ i->buf.atomic++;
+ tbl = rht_dereference_rcu(c->btree_cache.table.tbl,
+ &c->btree_cache.table);
+ if (i->iter < tbl->size) {
+ rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
+ bch2_cached_btree_node_to_text(&i->buf, c, b);
+ i->iter++;
+ } else {
+ done = true;
+ }
+ --i->buf.atomic;
+ rcu_read_unlock();
+ } while (!done);
+
+ if (i->buf.allocation_failure)
+ ret = -ENOMEM;
+
+ if (!ret)
+ ret = flush_buf(i);
- for_each_btree_node(&iter, i->c, i->id, i->from, 0, b) {
- i->bytes = bch2_print_btree_node(i->c, b, i->buf,
- sizeof(i->buf));
+ return ret ?: i->ret;
+}
+
+static const struct file_operations cached_btree_nodes_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_cached_btree_nodes_read,
+};
+
+typedef int (*list_cmp_fn)(const struct list_head *l, const struct list_head *r);
+
+static void list_sort(struct list_head *head, list_cmp_fn cmp)
+{
+ struct list_head *pos;
+
+ list_for_each(pos, head)
+ while (!list_is_last(pos, head) &&
+ cmp(pos, pos->next) > 0) {
+ struct list_head *pos2, *next = pos->next;
+
+ list_del(next);
+ list_for_each(pos2, head)
+ if (cmp(next, pos2) < 0)
+ goto pos_found;
+ BUG();
+pos_found:
+ list_add_tail(next, pos2);
+ }
+}
+
+static int list_ptr_order_cmp(const struct list_head *l, const struct list_head *r)
+{
+ return cmp_int(l, r);
+}
+
+static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ struct btree_trans *trans;
+ ssize_t ret = 0;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+restart:
+ seqmutex_lock(&c->btree_trans_lock);
+ list_sort(&c->btree_trans_list, list_ptr_order_cmp);
+
+ list_for_each_entry(trans, &c->btree_trans_list, list) {
+ if ((ulong) trans <= i->iter)
+ continue;
+
+ i->iter = (ulong) trans;
+
+ if (!closure_get_not_zero(&trans->ref))
+ continue;
+
+ u32 seq = seqmutex_unlock(&c->btree_trans_lock);
+
+ bch2_btree_trans_to_text(&i->buf, trans);
+
+ prt_printf(&i->buf, "backtrace:\n");
+ printbuf_indent_add(&i->buf, 2);
+ bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL);
+ printbuf_indent_sub(&i->buf, 2);
+ prt_newline(&i->buf);
+
+ closure_put(&trans->ref);
+
+ ret = flush_buf(i);
+ if (ret)
+ goto unlocked;
+
+ if (!seqmutex_relock(&c->btree_trans_lock, seq))
+ goto restart;
+ }
+ seqmutex_unlock(&c->btree_trans_lock);
+unlocked:
+ if (i->buf.allocation_failure)
+ ret = -ENOMEM;
+
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations btree_transactions_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_btree_transactions_read,
+};
+
+static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ bool done = false;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ while (1) {
err = flush_buf(i);
if (err)
- break;
-
- /*
- * can't easily correctly restart a btree node traversal across
- * all nodes, meh
- */
- i->from = bkey_cmp(POS_MAX, b->key.k.p)
- ? bkey_successor(b->key.k.p)
- : b->key.k.p;
+ return err;
if (!i->size)
break;
+
+ if (done)
+ break;
+
+ done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter);
+ i->iter++;
}
- bch2_btree_iter_unlock(&iter);
- return err < 0 ? err : i->ret;
+ if (i->buf.allocation_failure)
+ return -ENOMEM;
+
+ return i->ret;
}
-static const struct file_operations btree_format_debug_ops = {
+static const struct file_operations journal_pins_ops = {
.owner = THIS_MODULE,
.open = bch2_dump_open,
.release = bch2_dump_release,
- .read = bch2_read_btree_formats,
+ .read = bch2_journal_pins_read,
};
-static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
+static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
struct dump_iter *i = file->private_data;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct btree *prev_node = NULL;
+ struct bch_fs *c = i->c;
int err;
i->ubuf = buf;
i->size = size;
i->ret = 0;
+ if (!i->iter) {
+ bch2_btree_updates_to_text(&i->buf, c);
+ i->iter++;
+ }
+
err = flush_buf(i);
if (err)
return err;
- if (!i->size)
- return i->ret;
+ if (i->buf.allocation_failure)
+ return -ENOMEM;
- bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH);
+ return i->ret;
+}
- while ((k = bch2_btree_iter_peek(&iter)).k &&
- !(err = btree_iter_err(k))) {
- struct btree_iter_level *l = &iter.l[0];
- struct bkey_packed *_k =
- bch2_btree_node_iter_peek(&l->iter, l->b);
+static const struct file_operations btree_updates_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_btree_updates_read,
+};
- if (l->b != prev_node) {
- i->bytes = bch2_print_btree_node(i->c, l->b, i->buf,
- sizeof(i->buf));
- err = flush_buf(i);
- if (err)
- break;
- }
- prev_node = l->b;
+static int btree_transaction_stats_open(struct inode *inode, struct file *file)
+{
+ struct bch_fs *c = inode->i_private;
+ struct dump_iter *i;
- i->bytes = bch2_bkey_print_bfloat(l->b, _k, i->buf,
- sizeof(i->buf));
+ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
+ if (!i)
+ return -ENOMEM;
- err = flush_buf(i);
- if (err)
- break;
+ i->iter = 1;
+ i->c = c;
+ i->buf = PRINTBUF;
+ file->private_data = i;
- bch2_btree_iter_next(&iter);
- i->from = iter.pos;
+ return 0;
+}
+
+static int btree_transaction_stats_release(struct inode *inode, struct file *file)
+{
+ struct dump_iter *i = file->private_data;
+
+ printbuf_exit(&i->buf);
+ kfree(i);
+
+ return 0;
+}
+
+static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ while (1) {
+ struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter];
err = flush_buf(i);
if (err)
- break;
+ return err;
if (!i->size)
break;
+
+ if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) ||
+ !bch2_btree_transaction_fns[i->iter])
+ break;
+
+ prt_printf(&i->buf, "%s:\n", bch2_btree_transaction_fns[i->iter]);
+ printbuf_indent_add(&i->buf, 2);
+
+ mutex_lock(&s->lock);
+
+ prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem);
+ prt_printf(&i->buf, "Transaction duration:\n");
+
+ printbuf_indent_add(&i->buf, 2);
+ bch2_time_stats_to_text(&i->buf, &s->duration);
+ printbuf_indent_sub(&i->buf, 2);
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) {
+ prt_printf(&i->buf, "Lock hold times:\n");
+
+ printbuf_indent_add(&i->buf, 2);
+ bch2_time_stats_to_text(&i->buf, &s->lock_hold_times);
+ printbuf_indent_sub(&i->buf, 2);
+ }
+
+ if (s->max_paths_text) {
+ prt_printf(&i->buf, "Maximum allocated btree paths (%u):\n", s->nr_max_paths);
+
+ printbuf_indent_add(&i->buf, 2);
+ prt_str_indented(&i->buf, s->max_paths_text);
+ printbuf_indent_sub(&i->buf, 2);
+ }
+
+ mutex_unlock(&s->lock);
+
+ printbuf_indent_sub(&i->buf, 2);
+ prt_newline(&i->buf);
+ i->iter++;
}
- bch2_btree_iter_unlock(&iter);
- return err < 0 ? err : i->ret;
+ if (i->buf.allocation_failure)
+ return -ENOMEM;
+
+ return i->ret;
}
-static const struct file_operations bfloat_failed_debug_ops = {
+static const struct file_operations btree_transaction_stats_op = {
+ .owner = THIS_MODULE,
+ .open = btree_transaction_stats_open,
+ .release = btree_transaction_stats_release,
+ .read = btree_transaction_stats_read,
+};
+
+/* walk btree transactions until we find a deadlock and print it */
+static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct btree_trans *trans;
+ ulong iter = 0;
+restart:
+ seqmutex_lock(&c->btree_trans_lock);
+ list_sort(&c->btree_trans_list, list_ptr_order_cmp);
+
+ list_for_each_entry(trans, &c->btree_trans_list, list) {
+ if ((ulong) trans <= iter)
+ continue;
+
+ iter = (ulong) trans;
+
+ if (!closure_get_not_zero(&trans->ref))
+ continue;
+
+ u32 seq = seqmutex_unlock(&c->btree_trans_lock);
+
+ bool found = bch2_check_for_deadlock(trans, out) != 0;
+
+ closure_put(&trans->ref);
+
+ if (found)
+ return;
+
+ if (!seqmutex_relock(&c->btree_trans_lock, seq))
+ goto restart;
+ }
+ seqmutex_unlock(&c->btree_trans_lock);
+}
+
+static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ ssize_t ret = 0;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ if (!i->iter) {
+ btree_deadlock_to_text(&i->buf, c);
+ i->iter++;
+ }
+
+ if (i->buf.allocation_failure)
+ ret = -ENOMEM;
+
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations btree_deadlock_ops = {
.owner = THIS_MODULE,
.open = bch2_dump_open,
.release = bch2_dump_release,
- .read = bch2_read_bfloat_failed,
+ .read = bch2_btree_deadlock_read,
};
void bch2_fs_debug_exit(struct bch_fs *c)
{
- if (!IS_ERR_OR_NULL(c->debug))
- debugfs_remove_recursive(c->debug);
+ if (!IS_ERR_OR_NULL(c->fs_debug_dir))
+ debugfs_remove_recursive(c->fs_debug_dir);
+}
+
+static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd)
+{
+ struct dentry *d;
+
+ d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir);
+
+ debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops);
+
+ debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops);
+
+ debugfs_create_file("bfloat-failed", 0400, d, bd,
+ &bfloat_failed_debug_ops);
}
void bch2_fs_debug_init(struct bch_fs *c)
@@ -379,29 +904,37 @@ void bch2_fs_debug_init(struct bch_fs *c)
return;
snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
- c->debug = debugfs_create_dir(name, bch_debug);
- if (IS_ERR_OR_NULL(c->debug))
+ c->fs_debug_dir = debugfs_create_dir(name, bch_debug);
+ if (IS_ERR_OR_NULL(c->fs_debug_dir))
return;
- for (bd = c->btree_debug;
- bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
- bd++) {
- bd->id = bd - c->btree_debug;
- bd->btree = debugfs_create_file(bch2_btree_ids[bd->id],
- 0400, c->debug, bd,
- &btree_debug_ops);
+ debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
+ c->btree_debug, &cached_btree_nodes_ops);
- snprintf(name, sizeof(name), "%s-formats",
- bch2_btree_ids[bd->id]);
+ debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir,
+ c->btree_debug, &btree_transactions_ops);
- bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd,
- &btree_format_debug_ops);
+ debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
+ c->btree_debug, &journal_pins_ops);
- snprintf(name, sizeof(name), "%s-bfloat-failed",
- bch2_btree_ids[bd->id]);
+ debugfs_create_file("btree_updates", 0400, c->fs_debug_dir,
+ c->btree_debug, &btree_updates_ops);
- bd->failed = debugfs_create_file(name, 0400, c->debug, bd,
- &bfloat_failed_debug_ops);
+ debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
+ c, &btree_transaction_stats_op);
+
+ debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir,
+ c->btree_debug, &btree_deadlock_ops);
+
+ c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir);
+ if (IS_ERR_OR_NULL(c->btree_debug_dir))
+ return;
+
+ for (bd = c->btree_debug;
+ bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
+ bd++) {
+ bd->id = bd - c->btree_debug;
+ bch2_fs_debug_btree_init(c, bd);
}
}
@@ -415,8 +948,6 @@ void bch2_debug_exit(void)
int __init bch2_debug_init(void)
{
- int ret = 0;
-
bch_debug = debugfs_create_dir("bcachefs", NULL);
- return ret;
+ return 0;
}
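
The dump_iter/flush_buf helpers added above give debugfs read handlers a common shape: fill a printbuf, let flush_buf() copy as much as fits into the user buffer, and report -ENOMEM if the printbuf ran out of memory. A hypothetical handler following that shape (editor's illustration; the function name and message are made up):

static ssize_t example_debug_read(struct file *file, char __user *buf,
				  size_t size, loff_t *ppos)
{
	struct dump_iter *i = file->private_data;
	ssize_t ret;

	i->ubuf = buf;
	i->size = size;
	i->ret	= 0;

	if (!i->iter) {				/* generate the report once per open */
		prt_printf(&i->buf, "hello from debugfs\n");
		i->iter++;
	}

	if (i->buf.allocation_failure)
		return -ENOMEM;

	ret = flush_buf(i);			/* copies as much as fits into *buf */
	return ret ?: i->ret;
}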
diff --git a/libbcachefs/debug.h b/libbcachefs/debug.h
index b5de1a70..2c37143b 100644
--- a/libbcachefs/debug.h
+++ b/libbcachefs/debug.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DEBUG_H
#define _BCACHEFS_DEBUG_H
@@ -7,44 +8,13 @@ struct bio;
struct btree;
struct bch_fs;
-#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
-BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
-#define BCH_DEBUG_PARAM(name, description) \
- static inline bool name(struct bch_fs *c) \
- { return bch2_##name || c->name; }
-BCH_DEBUG_PARAMS_ALWAYS()
-#undef BCH_DEBUG_PARAM
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-#define BCH_DEBUG_PARAM(name, description) \
- static inline bool name(struct bch_fs *c) \
- { return bch2_##name || c->name; }
-BCH_DEBUG_PARAMS_DEBUG()
-#undef BCH_DEBUG_PARAM
-
void __bch2_btree_verify(struct bch_fs *, struct btree *);
-
-#define bypass_torture_test(d) ((d)->bypass_torture_test)
-
-#else /* DEBUG */
-
-#define BCH_DEBUG_PARAM(name, description) \
- static inline bool name(struct bch_fs *c) { return false; }
-BCH_DEBUG_PARAMS_DEBUG()
-#undef BCH_DEBUG_PARAM
-
-static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {}
-
-#define bypass_torture_test(d) 0
-
-#endif
+void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *,
+ const struct btree *);
static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
{
- if (verify_btree_ondisk(c))
+ if (bch2_verify_btree_ondisk)
__bch2_btree_verify(c, b);
}
diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c
index d979ae0e..41813f9c 100644
--- a/libbcachefs/dirent.c
+++ b/libbcachefs/dirent.c
@@ -1,5 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bkey_buf.h"
#include "bkey_methods.h"
#include "btree_update.h"
#include "extents.h"
@@ -7,24 +9,32 @@
#include "fs.h"
#include "keylist.h"
#include "str_hash.h"
+#include "subvolume.h"
#include <linux/dcache.h>
-unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
+static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
{
- unsigned len = bkey_val_bytes(d.k) -
- offsetof(struct bch_dirent, d_name);
-
- while (len && !d.v->d_name[len - 1])
- --len;
+ if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name))
+ return 0;
- return len;
+ unsigned bkey_u64s = bkey_val_u64s(d.k);
+ unsigned bkey_bytes = bkey_u64s * sizeof(u64);
+ u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1];
+#if CPU_BIG_ENDIAN
+ unsigned trailing_nuls = last_u64 ? __builtin_ctzll(last_u64) / 8 : 64 / 8;
+#else
+ unsigned trailing_nuls = last_u64 ? __builtin_clzll(last_u64) / 8 : 64 / 8;
+#endif
+
+ return bkey_bytes -
+ offsetof(struct bch_dirent, d_name) -
+ trailing_nuls;
}
-static unsigned dirent_val_u64s(unsigned len)
+struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d)
{
- return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len,
- sizeof(u64));
+ return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
}
static u64 bch2_dirent_hash(const struct bch_hash_info *info,
@@ -47,7 +57,7 @@ static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
+ struct qstr name = bch2_dirent_get_name(d);
return bch2_dirent_hash(info, &name);
}
@@ -55,94 +65,107 @@ static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
{
struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
- int len = bch2_dirent_name_bytes(l);
- const struct qstr *r = _r;
+ const struct qstr l_name = bch2_dirent_get_name(l);
+ const struct qstr *r_name = _r;
- return len - r->len ?: memcmp(l.v->d_name, r->name, len);
+ return !qstr_eq(l_name, *r_name);
}
static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
{
struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
- int l_len = bch2_dirent_name_bytes(l);
- int r_len = bch2_dirent_name_bytes(r);
+ const struct qstr l_name = bch2_dirent_get_name(l);
+ const struct qstr r_name = bch2_dirent_get_name(r);
+
+ return !qstr_eq(l_name, r_name);
+}
- return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len);
+static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k)
+{
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+
+ if (d.v->d_type == DT_SUBVOL)
+ return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol;
+ return true;
}
const struct bch_hash_desc bch2_dirent_hash_desc = {
- .btree_id = BTREE_ID_DIRENTS,
- .key_type = BCH_DIRENT,
- .whiteout_type = BCH_DIRENT_WHITEOUT,
+ .btree_id = BTREE_ID_dirents,
+ .key_type = KEY_TYPE_dirent,
.hash_key = dirent_hash_key,
.hash_bkey = dirent_hash_bkey,
.cmp_key = dirent_cmp_key,
.cmp_bkey = dirent_cmp_bkey,
+ .is_visible = dirent_is_visible,
};
-const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
- struct bkey_s_c_dirent d;
- unsigned len;
-
- switch (k.k->type) {
- case BCH_DIRENT:
- if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent))
- return "value too small";
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+ struct qstr d_name = bch2_dirent_get_name(d);
+ int ret = 0;
- d = bkey_s_c_to_dirent(k);
- len = bch2_dirent_name_bytes(d);
+ bkey_fsck_err_on(!d_name.len,
+ c, dirent_empty_name,
+ "empty name");
- if (!len)
- return "empty name";
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len),
+ c, dirent_val_too_big,
+ "value too big (%zu > %u)",
+ bkey_val_u64s(k.k), dirent_val_u64s(d_name.len));
- /*
- * older versions of bcachefs were buggy and creating dirent
- * keys that were bigger than necessary:
- */
- if (bkey_val_u64s(k.k) > dirent_val_u64s(len + 7))
- return "value too big";
-
- if (len > BCH_NAME_MAX)
- return "dirent name too big";
+ /*
+ * Check new keys don't exceed the max length
+ * (older keys may be larger.)
+ */
+ bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX,
+ c, dirent_name_too_long,
+ "dirent name too big (%u > %u)",
+ d_name.len, BCH_NAME_MAX);
+
+ bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len),
+ c, dirent_name_embedded_nul,
+ "dirent has stray data after name's NUL");
+
+ bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) ||
+ (d_name.len == 2 && !memcmp(d_name.name, "..", 2)),
+ c, dirent_name_dot_or_dotdot,
+ "invalid name");
+
+ bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len),
+ c, dirent_name_has_slash,
+ "name with /");
+
+ bkey_fsck_err_on(d.v->d_type != DT_SUBVOL &&
+ le64_to_cpu(d.v->d_inum) == d.k->p.inode,
+ c, dirent_to_itself,
+ "dirent points to own directory");
+fsck_err:
+ return ret;
+}
- if (memchr(d.v->d_name, '/', len))
- return "dirent name has invalid characters";
+void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+ struct qstr d_name = bch2_dirent_get_name(d);
- return NULL;
- case BCH_DIRENT_WHITEOUT:
- return bkey_val_bytes(k.k) != 0
- ? "value size should be zero"
- : NULL;
+ prt_printf(out, "%.*s -> ", d_name.len, d_name.name);
- default:
- return "invalid type";
- }
-}
+ if (d.v->d_type != DT_SUBVOL)
+ prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum));
+ else
+ prt_printf(out, "%u -> %u",
+ le32_to_cpu(d.v->d_parent_subvol),
+ le32_to_cpu(d.v->d_child_subvol));
-void bch2_dirent_to_text(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c k)
-{
- struct bkey_s_c_dirent d;
- size_t n = 0;
-
- switch (k.k->type) {
- case BCH_DIRENT:
- d = bkey_s_c_to_dirent(k);
-
- n += bch_scnmemcpy(buf + n, size - n, d.v->d_name,
- bch2_dirent_name_bytes(d));
- n += scnprintf(buf + n, size - n, " -> %llu", d.v->d_inum);
- break;
- case BCH_DIRENT_WHITEOUT:
- scnprintf(buf, size, "whiteout");
- break;
- }
+ prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type));
}
static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
- u8 type, const struct qstr *name, u64 dst)
+ subvol_inum dir, u8 type,
+ const struct qstr *name, u64 dst)
{
struct bkey_i_dirent *dirent;
unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
@@ -158,7 +181,14 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
bkey_dirent_init(&dirent->k_i);
dirent->k.u64s = u64s;
- dirent->v.d_inum = cpu_to_le64(dst);
+
+ if (type != DT_SUBVOL) {
+ dirent->v.d_inum = cpu_to_le64(dst);
+ } else {
+ dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol);
+ dirent->v.d_child_subvol = cpu_to_le32(dst);
+ }
+
dirent->v.d_type = type;
memcpy(dirent->v.d_name, name->name, name->len);
@@ -172,110 +202,168 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
return dirent;
}
-int __bch2_dirent_create(struct btree_trans *trans,
- u64 dir_inum, const struct bch_hash_info *hash_info,
- u8 type, const struct qstr *name, u64 dst_inum,
- int flags)
+int bch2_dirent_create_snapshot(struct btree_trans *trans,
+ u32 dir_subvol, u64 dir, u32 snapshot,
+ const struct bch_hash_info *hash_info,
+ u8 type, const struct qstr *name, u64 dst_inum,
+ u64 *dir_offset,
+ enum btree_iter_update_trigger_flags flags)
{
+ subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir };
struct bkey_i_dirent *dirent;
int ret;
- dirent = dirent_create_key(trans, type, name, dst_inum);
+ dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum);
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
- return __bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
- dir_inum, &dirent->k_i, flags);
+ dirent->k.p.inode = dir;
+ dirent->k.p.snapshot = snapshot;
+
+ ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
+ dir_inum, snapshot, &dirent->k_i,
+ flags|BTREE_UPDATE_internal_snapshot_node);
+ *dir_offset = dirent->k.p.offset;
+
+ return ret;
}
-int bch2_dirent_create(struct bch_fs *c, u64 dir_inum,
+int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
- u64 *journal_seq, int flags)
+ u64 *dir_offset,
+ enum btree_iter_update_trigger_flags flags)
{
- return bch2_trans_do(c, journal_seq, flags,
- __bch2_dirent_create(&trans, dir_inum, hash_info,
- type, name, dst_inum, flags));
-}
+ struct bkey_i_dirent *dirent;
+ int ret;
-static void dirent_copy_target(struct bkey_i_dirent *dst,
- struct bkey_s_c_dirent src)
-{
- dst->v.d_inum = src.v->d_inum;
- dst->v.d_type = src.v->d_type;
+ dirent = dirent_create_key(trans, dir, type, name, dst_inum);
+ ret = PTR_ERR_OR_ZERO(dirent);
+ if (ret)
+ return ret;
+
+ ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
+ dir, &dirent->k_i, flags);
+ *dir_offset = dirent->k.p.offset;
+
+ return ret;
}
-static struct bpos bch2_dirent_pos(struct bch_inode_info *inode,
- const struct qstr *name)
+int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
+ struct bkey_s_c_dirent d, subvol_inum *target)
{
- return POS(inode->v.i_ino, bch2_dirent_hash(&inode->ei_str_hash, name));
+ struct bch_subvolume s;
+ int ret = 0;
+
+ if (d.v->d_type == DT_SUBVOL &&
+ le32_to_cpu(d.v->d_parent_subvol) != dir.subvol)
+ return 1;
+
+ if (likely(d.v->d_type != DT_SUBVOL)) {
+ target->subvol = dir.subvol;
+ target->inum = le64_to_cpu(d.v->d_inum);
+ } else {
+ target->subvol = le32_to_cpu(d.v->d_child_subvol);
+
+ ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_cached, &s);
+
+ target->inum = le64_to_cpu(s.inode);
+ }
+
+ return ret;
}
int bch2_dirent_rename(struct btree_trans *trans,
- struct bch_inode_info *src_dir, const struct qstr *src_name,
- struct bch_inode_info *dst_dir, const struct qstr *dst_name,
+ subvol_inum src_dir, struct bch_hash_info *src_hash,
+ subvol_inum dst_dir, struct bch_hash_info *dst_hash,
+ const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
+ const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
enum bch_rename_mode mode)
{
- struct btree_iter *src_iter, *dst_iter;
- struct bkey_s_c old_src, old_dst;
+ struct btree_iter src_iter = { NULL };
+ struct btree_iter dst_iter = { NULL };
+ struct bkey_s_c old_src, old_dst = bkey_s_c_null;
struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
- struct bpos dst_pos = bch2_dirent_pos(dst_dir, dst_name);
- int ret;
+ struct bpos dst_pos =
+ POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
+ unsigned src_update_flags = 0;
+ bool delete_src, delete_dst;
+ int ret = 0;
- /*
- * Lookup dst:
- *
- * Note that in BCH_RENAME mode, we're _not_ checking if
- * the target already exists - we're relying on the VFS
- * to do that check for us for correctness:
- */
- dst_iter = mode == BCH_RENAME
- ? bch2_hash_hole(trans, bch2_dirent_hash_desc,
- &dst_dir->ei_str_hash,
- dst_dir->v.i_ino, dst_name)
- : bch2_hash_lookup(trans, bch2_dirent_hash_desc,
- &dst_dir->ei_str_hash,
- dst_dir->v.i_ino, dst_name,
- BTREE_ITER_INTENT);
- if (IS_ERR(dst_iter))
- return PTR_ERR(dst_iter);
- old_dst = bch2_btree_iter_peek_slot(dst_iter);
+ memset(src_inum, 0, sizeof(*src_inum));
+ memset(dst_inum, 0, sizeof(*dst_inum));
/* Lookup src: */
- src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc,
- &src_dir->ei_str_hash,
- src_dir->v.i_ino, src_name,
- BTREE_ITER_INTENT);
- if (IS_ERR(src_iter))
- return PTR_ERR(src_iter);
- old_src = bch2_btree_iter_peek_slot(src_iter);
+ old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
+ src_hash, src_dir, src_name,
+ BTREE_ITER_intent);
+ ret = bkey_err(old_src);
+ if (ret)
+ goto out;
+
+ ret = bch2_dirent_read_target(trans, src_dir,
+ bkey_s_c_to_dirent(old_src), src_inum);
+ if (ret)
+ goto out;
+
+ /* Lookup dst: */
+ if (mode == BCH_RENAME) {
+ /*
+ * Note that we're _not_ checking if the target already exists -
+ * we're relying on the VFS to do that check for us for
+ * correctness:
+ */
+ ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc,
+ dst_hash, dst_dir, dst_name);
+ if (ret)
+ goto out;
+ } else {
+ old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
+ dst_hash, dst_dir, dst_name,
+ BTREE_ITER_intent);
+ ret = bkey_err(old_dst);
+ if (ret)
+ goto out;
+
+ ret = bch2_dirent_read_target(trans, dst_dir,
+ bkey_s_c_to_dirent(old_dst), dst_inum);
+ if (ret)
+ goto out;
+ }
+
+ if (mode != BCH_RENAME_EXCHANGE)
+ *src_offset = dst_iter.pos.offset;
/* Create new dst key: */
- new_dst = dirent_create_key(trans, 0, dst_name, 0);
- if (IS_ERR(new_dst))
- return PTR_ERR(new_dst);
+ new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0);
+ ret = PTR_ERR_OR_ZERO(new_dst);
+ if (ret)
+ goto out;
dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
- new_dst->k.p = dst_iter->pos;
+ new_dst->k.p = dst_iter.pos;
/* Create new src key: */
if (mode == BCH_RENAME_EXCHANGE) {
- new_src = dirent_create_key(trans, 0, src_name, 0);
- if (IS_ERR(new_src))
- return PTR_ERR(new_src);
+ new_src = dirent_create_key(trans, src_dir, 0, src_name, 0);
+ ret = PTR_ERR_OR_ZERO(new_src);
+ if (ret)
+ goto out;
dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
- new_src->k.p = src_iter->pos;
+ new_src->k.p = src_iter.pos;
} else {
new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
- if (IS_ERR(new_src))
- return PTR_ERR(new_src);
+ ret = PTR_ERR_OR_ZERO(new_src);
+ if (ret)
+ goto out;
+
bkey_init(&new_src->k);
- new_src->k.p = src_iter->pos;
+ new_src->k.p = src_iter.pos;
- if (bkey_cmp(dst_pos, src_iter->pos) <= 0 &&
- bkey_cmp(src_iter->pos, dst_iter->pos) < 0) {
+ if (bkey_le(dst_pos, src_iter.pos) &&
+ bkey_lt(src_iter.pos, dst_iter.pos)) {
/*
* We have a hash collision for the new dst key,
* and new_src - the key we're deleting - is between
@@ -288,138 +376,199 @@ int bch2_dirent_rename(struct btree_trans *trans,
* If we're not overwriting, we can just insert
* new_dst at the src position:
*/
- new_dst->k.p = src_iter->pos;
- bch2_trans_update(trans, src_iter, &new_dst->k_i, 0);
- return 0;
+ new_src = new_dst;
+ new_src->k.p = src_iter.pos;
+ goto out_set_src;
} else {
/* If we're overwriting, we can't insert new_dst
* at a different slot because it has to
* overwrite old_dst - just make sure to use a
* whiteout when deleting src:
*/
- new_src->k.type = BCH_DIRENT_WHITEOUT;
+ new_src->k.type = KEY_TYPE_hash_whiteout;
}
} else {
/* Check if we need a whiteout to delete src: */
ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
- &src_dir->ei_str_hash,
- src_iter);
+ src_hash, &src_iter);
if (ret < 0)
- return ret;
+ goto out;
if (ret)
- new_src->k.type = BCH_DIRENT_WHITEOUT;
+ new_src->k.type = KEY_TYPE_hash_whiteout;
}
}
- bch2_trans_update(trans, src_iter, &new_src->k_i, 0);
- bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0);
- return 0;
-}
+ if (new_dst->v.d_type == DT_SUBVOL)
+ new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol);
-int __bch2_dirent_delete(struct btree_trans *trans, u64 dir_inum,
- const struct bch_hash_info *hash_info,
- const struct qstr *name)
-{
- return bch2_hash_delete(trans, bch2_dirent_hash_desc, hash_info,
- dir_inum, name);
+ if ((mode == BCH_RENAME_EXCHANGE) &&
+ new_src->v.d_type == DT_SUBVOL)
+ new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
+
+ ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
+ if (ret)
+ goto out;
+out_set_src:
+ /*
+ * If we're deleting a subvolume we need to really delete the dirent,
+ * not just emit a whiteout in the current snapshot - there can only be
+	 * a single dirent that points to a given subvolume.
+ *
+ * IOW, we don't maintain multiple versions in different snapshots of
+ * dirents that point to subvolumes - dirents that point to subvolumes
+ * are only visible in one particular subvolume so it's not necessary,
+ * and it would be particularly confusing for fsck to have to deal with.
+ */
+ delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL &&
+ new_src->k.p.snapshot != old_src.k->p.snapshot;
+
+ delete_dst = old_dst.k &&
+ bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL &&
+ new_dst->k.p.snapshot != old_dst.k->p.snapshot;
+
+ if (!delete_src || !bkey_deleted(&new_src->k)) {
+ ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
+ if (ret)
+ goto out;
+ }
+
+ if (delete_src) {
+ bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(&src_iter) ?:
+ bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node);
+ if (ret)
+ goto out;
+ }
+
+ if (delete_dst) {
+ bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(&dst_iter) ?:
+ bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node);
+ if (ret)
+ goto out;
+ }
+
+ if (mode == BCH_RENAME_EXCHANGE)
+ *src_offset = new_src->k.p.offset;
+ *dst_offset = new_dst->k.p.offset;
+out:
+ bch2_trans_iter_exit(trans, &src_iter);
+ bch2_trans_iter_exit(trans, &dst_iter);
+ return ret;
}
-int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum,
- const struct bch_hash_info *hash_info,
- const struct qstr *name,
- u64 *journal_seq)
+int bch2_dirent_lookup_trans(struct btree_trans *trans,
+ struct btree_iter *iter,
+ subvol_inum dir,
+ const struct bch_hash_info *hash_info,
+ const struct qstr *name, subvol_inum *inum,
+ unsigned flags)
{
- return bch2_trans_do(c, journal_seq,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOFAIL,
- __bch2_dirent_delete(&trans, dir_inum, hash_info, name));
+ struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
+ hash_info, dir, name, flags);
+ int ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum);
+ if (ret > 0)
+ ret = -ENOENT;
+err:
+ if (ret)
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
}
-u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum,
+u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
const struct bch_hash_info *hash_info,
- const struct qstr *name)
+ const struct qstr *name, subvol_inum *inum)
{
- struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_s_c k;
- u64 inum = 0;
-
- bch2_trans_init(&trans, c);
-
- iter = bch2_hash_lookup(&trans, bch2_dirent_hash_desc,
- hash_info, dir_inum, name, 0);
- if (IS_ERR(iter)) {
- BUG_ON(PTR_ERR(iter) == -EINTR);
- goto out;
- }
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter = { NULL };
- k = bch2_btree_iter_peek_slot(iter);
- inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
-out:
- bch2_trans_exit(&trans);
- return inum;
+ int ret = lockrestart_do(trans,
+ bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
}
-int bch2_empty_dir(struct bch_fs *c, u64 dir_inum)
+int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 snapshot)
{
struct btree_iter iter;
struct bkey_s_c k;
- int ret = 0;
-
- for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(dir_inum, 0), 0, k) {
- if (k.k->p.inode > dir_inum)
- break;
+ int ret;
- if (k.k->type == BCH_DIRENT) {
- ret = -ENOTEMPTY;
+ for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents,
+ SPOS(dir, 0, snapshot),
+ POS(dir, U64_MAX), 0, k, ret)
+ if (k.k->type == KEY_TYPE_dirent) {
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+ if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol)
+ continue;
+ ret = -BCH_ERR_ENOTEMPTY_dir_not_empty;
break;
}
- }
- bch2_btree_iter_unlock(&iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
-int bch2_readdir(struct bch_fs *c, struct file *file,
- struct dir_context *ctx)
+int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
{
- struct bch_inode_info *inode = file_bch_inode(file);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_dirent dirent;
- unsigned len;
+ u32 snapshot;
- if (!dir_emit_dots(file, ctx))
- return 0;
+ return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?:
+ bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot);
+}
- for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
- POS(inode->v.i_ino, ctx->pos), 0, k) {
- if (k.k->type != BCH_DIRENT)
- continue;
+static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subvol_inum target)
+{
+ struct qstr name = bch2_dirent_get_name(d);
+ /*
+ * Although not required by the kernel code, updating ctx->pos is needed
+ * for the bcachefs FUSE driver. Without this update, the FUSE
+ * implementation will be stuck in an infinite loop when reading
+ * directories (via the bcachefs_fuse_readdir callback).
+ * In kernel space, ctx->pos is updated by the VFS code.
+ */
+ ctx->pos = d.k->p.offset;
+ bool ret = dir_emit(ctx, name.name,
+ name.len,
+ target.inum,
+ vfs_d_type(d.v->d_type));
+ if (ret)
+ ctx->pos = d.k->p.offset + 1;
+ return ret;
+}
- dirent = bkey_s_c_to_dirent(k);
+int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
+{
+ struct bkey_buf sk;
+ bch2_bkey_buf_init(&sk);
- if (bkey_cmp(k.k->p, POS(inode->v.i_ino, ctx->pos)) < 0)
- continue;
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_dirents,
+ POS(inum.inum, ctx->pos),
+ POS(inum.inum, U64_MAX),
+ inum.subvol, 0, k, ({
+ if (k.k->type != KEY_TYPE_dirent)
+ continue;
- if (k.k->p.inode > inode->v.i_ino)
- break;
+ /* dir_emit() can fault and block: */
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k);
- len = bch2_dirent_name_bytes(dirent);
+ subvol_inum target;
+ int ret2 = bch2_dirent_read_target(trans, inum, dirent, &target);
+ if (ret2 > 0)
+ continue;
- /*
- * XXX: dir_emit() can fault and block, while we're holding
- * locks
- */
- if (!dir_emit(ctx, dirent.v->d_name, len,
- le64_to_cpu(dirent.v->d_inum),
- dirent.v->d_type))
- break;
+ ret2 ?: drop_locks_do(trans, bch2_dir_emit(ctx, dirent, target));
+ })));
- ctx->pos = k.k->p.offset + 1;
- }
- bch2_btree_iter_unlock(&iter);
+ bch2_bkey_buf_exit(&sk, c);
- return 0;
+ return ret < 0 ? ret : 0;
}
diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h
index 4d92ffba..362b3b2f 100644
--- a/libbcachefs/dirent.h
+++ b/libbcachefs/dirent.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DIRENT_H
#define _BCACHEFS_DIRENT_H
@@ -5,13 +6,15 @@
extern const struct bch_hash_desc bch2_dirent_hash_desc;
-const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_dirent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-#define bch2_bkey_dirent_ops (struct bkey_ops) { \
- .key_invalid = bch2_dirent_invalid, \
+#define bch2_bkey_ops_dirent ((struct bkey_ops) { \
+ .key_validate = bch2_dirent_validate, \
.val_to_text = bch2_dirent_to_text, \
-}
+ .min_val_size = 16, \
+})
struct qstr;
struct file;
@@ -20,19 +23,37 @@ struct bch_fs;
struct bch_hash_info;
struct bch_inode_info;
-unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent);
+struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d);
+
+static inline unsigned dirent_val_u64s(unsigned len)
+{
+ return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len,
+ sizeof(u64));
+}
-int __bch2_dirent_create(struct btree_trans *, u64,
- const struct bch_hash_info *, u8,
- const struct qstr *, u64, int);
-int bch2_dirent_create(struct bch_fs *c, u64, const struct bch_hash_info *,
- u8, const struct qstr *, u64, u64 *, int);
+int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
+ struct bkey_s_c_dirent, subvol_inum *);
-int __bch2_dirent_delete(struct btree_trans *, u64,
- const struct bch_hash_info *,
- const struct qstr *);
-int bch2_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *,
- const struct qstr *, u64 *);
+static inline void dirent_copy_target(struct bkey_i_dirent *dst,
+ struct bkey_s_c_dirent src)
+{
+ dst->v.d_inum = src.v->d_inum;
+ dst->v.d_type = src.v->d_type;
+}
+
+int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
+ const struct bch_hash_info *, u8,
+ const struct qstr *, u64, u64 *,
+ enum btree_iter_update_trigger_flags);
+int bch2_dirent_create(struct btree_trans *, subvol_inum,
+ const struct bch_hash_info *, u8,
+ const struct qstr *, u64, u64 *,
+ enum btree_iter_update_trigger_flags);
+
+static inline unsigned vfs_d_type(unsigned type)
+{
+ return type == DT_SUBVOL ? DT_DIR : type;
+}
enum bch_rename_mode {
BCH_RENAME,
@@ -41,14 +62,21 @@ enum bch_rename_mode {
};
int bch2_dirent_rename(struct btree_trans *,
- struct bch_inode_info *, const struct qstr *,
- struct bch_inode_info *, const struct qstr *,
+ subvol_inum, struct bch_hash_info *,
+ subvol_inum, struct bch_hash_info *,
+ const struct qstr *, subvol_inum *, u64 *,
+ const struct qstr *, subvol_inum *, u64 *,
enum bch_rename_mode);
-u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *,
- const struct qstr *);
+int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
+ subvol_inum, const struct bch_hash_info *,
+ const struct qstr *, subvol_inum *, unsigned);
+u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
+ const struct bch_hash_info *,
+ const struct qstr *, subvol_inum *);
-int bch2_empty_dir(struct bch_fs *, u64);
-int bch2_readdir(struct bch_fs *, struct file *, struct dir_context *);
+int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32);
+int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
+int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
#endif /* _BCACHEFS_DIRENT_H */
diff --git a/libbcachefs/dirent_format.h b/libbcachefs/dirent_format.h
new file mode 100644
index 00000000..5e116b88
--- /dev/null
+++ b/libbcachefs/dirent_format.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DIRENT_FORMAT_H
+#define _BCACHEFS_DIRENT_FORMAT_H
+
+/*
+ * Dirents (and xattrs) have to implement string lookups; since our b-tree
+ * doesn't support arbitrary length strings for the key, we instead index by a
+ * 64 bit hash (currently truncated sha1) of the string, stored in the offset
+ * field of the key - using linear probing to resolve hash collisions. This also
+ * provides us with the readdir cookie POSIX requires.
+ *
+ * Linear probing requires us to use whiteouts for deletions, in the event of a
+ * collision:
+ */
+
+struct bch_dirent {
+ struct bch_val v;
+
+ /* Target inode number: */
+ union {
+ __le64 d_inum;
+ struct { /* DT_SUBVOL */
+ __le32 d_child_subvol;
+ __le32 d_parent_subvol;
+ };
+ };
+
+ /*
+ * Copy of mode bits 12-15 from the target inode - so userspace can get
+ * the filetype without having to do a stat()
+ */
+ __u8 d_type;
+
+ __u8 d_name[];
+} __packed __aligned(8);
+
+#define DT_SUBVOL 16
+#define BCH_DT_MAX 17
+
+#define BCH_NAME_MAX 512
+
+#endif /* _BCACHEFS_DIRENT_FORMAT_H */
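A minimal standalone sketch of how dirent_val_u64s() (dirent.h) sizes a key value from a name length, given the struct bch_dirent layout above - illustration only, not part of the patch; mirror_dirent is a hypothetical stand-in for the real struct:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* hypothetical stand-in with the same layout as struct bch_dirent */
struct mirror_dirent {
	uint64_t	d_inum;		/* the target union: 8 bytes */
	uint8_t		d_type;
	uint8_t		d_name[];	/* name bytes follow */
} __attribute__((packed, aligned(8)));

/* same rounding as dirent_val_u64s() in dirent.h */
static unsigned mirror_dirent_val_u64s(unsigned len)
{
	return DIV_ROUND_UP(offsetof(struct mirror_dirent, d_name) + len,
			    sizeof(uint64_t));
}

int main(void)
{
	/* e.g. a 1 byte name needs 2 u64s of value, a 32 byte name needs 6 */
	for (unsigned len = 1; len <= 32; len *= 2)
		printf("name len %2u -> %u u64s of value\n",
		       len, mirror_dirent_val_u64s(len));
	return 0;
}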
diff --git a/libbcachefs/disk_accounting.c b/libbcachefs/disk_accounting.c
new file mode 100644
index 00000000..77534838
--- /dev/null
+++ b/libbcachefs/disk_accounting.c
@@ -0,0 +1,1010 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bcachefs_ioctl.h"
+#include "btree_cache.h"
+#include "btree_journal_iter.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "compress.h"
+#include "disk_accounting.h"
+#include "error.h"
+#include "journal_io.h"
+#include "replicas.h"
+
+/*
+ * Notes on disk accounting:
+ *
+ * We have two parallel sets of counters to be concerned with, and both must be
+ * kept in sync.
+ *
+ * - Persistent/on disk accounting, stored in the accounting btree and updated
+ * via btree write buffer updates that treat new accounting keys as deltas to
+ * apply to existing values. But reading from a write buffer btree is
+ * expensive, so we also have
+ *
+ * - In memory accounting, where accounting is stored as an array of percpu
+ * counters, indexed by an eytzinger array of disk accounting keys/bpos (which
+ * are the same thing, excepting byte swabbing on big endian).
+ *
+ * Cheap to read, but non persistent.
+ *
+ * Disk accounting updates are generated by transactional triggers; these run as
+ * keys enter and leave the btree, and can compare old and new versions of keys;
+ * the output of these triggers is a set of deltas to the various counters.
+ *
+ * Disk accounting updates are done as btree write buffer updates, where the
+ * counters in the disk accounting key are deltas that will be applied to the
+ * counter in the btree when the key is flushed by the write buffer (or journal
+ * replay).
+ *
+ * To do a disk accounting update:
+ * - initialize a disk_accounting_pos, to specify which counter is being updated
+ * - initialize counter deltas, as an array of 1-3 s64s
+ * - call bch2_disk_accounting_mod()
+ *
+ * This queues up the accounting update to be done at transaction commit time.
+ * Underneath, it's a normal btree write buffer update.
+ *
+ * The transaction commit path is responsible for propagating updates to the in
+ * memory counters, with bch2_accounting_mem_mod().
+ *
+ * The commit path also assigns every disk accounting update a unique version
+ * number, based on the journal sequence number and offset within that journal
+ * buffer; this is used by journal replay to determine which updates have been
+ * done.
+ *
+ * The transaction commit path also ensures that replicas entry accounting
+ * updates are properly marked in the superblock (so that we know whether we can
+ * mount without data being unavailable); it will update the superblock if
+ * bch2_accounting_mem_mod() tells it to.
+ */
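A hedged sketch of the three-step recipe above - illustration only, not part of the patch; it mirrors bch2_mod_dev_cached_sectors() defined later in this file, passing false for the gc argument (the normal, non-gc path):

/* queue a delta against one device's cached-data replicas counter */
static int example_mod_cached_sectors(struct btree_trans *trans,
				      unsigned dev, s64 sectors)
{
	/* 1) which counter is being updated */
	struct disk_accounting_pos acc = {
		.type = BCH_DISK_ACCOUNTING_replicas,
	};
	bch2_replicas_entry_cached(&acc.replicas, dev);

	/* 2) the deltas: a single s64 here */
	/* 3) queue the update; it is applied at transaction commit time */
	return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
}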
+
+static const char * const disk_accounting_type_strs[] = {
+#define x(t, n, ...) [n] = #t,
+ BCH_DISK_ACCOUNTING_TYPES()
+#undef x
+ NULL
+};
+
+static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos,
+ s64 *d, unsigned nr)
+{
+ struct bkey_i_accounting *acc = bkey_accounting_init(k);
+
+ acc->k.p = disk_accounting_pos_to_bpos(pos);
+ set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr);
+
+ memcpy_u64s_small(acc->v.d, d, nr);
+}
+
+int bch2_disk_accounting_mod(struct btree_trans *trans,
+ struct disk_accounting_pos *k,
+ s64 *d, unsigned nr, bool gc)
+{
+ /* Normalize: */
+ switch (k->type) {
+ case BCH_DISK_ACCOUNTING_replicas:
+ bubble_sort(k->replicas.devs, k->replicas.nr_devs, u8_cmp);
+ break;
+ }
+
+ BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS);
+
+ struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
+
+ accounting_key_init(&k_i.k, k, d, nr);
+
+ return likely(!gc)
+ ? bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k)
+ : bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
+}
+
+int bch2_mod_dev_cached_sectors(struct btree_trans *trans,
+ unsigned dev, s64 sectors,
+ bool gc)
+{
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+
+ bch2_replicas_entry_cached(&acc.replicas, dev);
+
+ return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
+}
+
+static inline bool is_zero(char *start, char *end)
+{
+ BUG_ON(start > end);
+
+ for (; start < end; start++)
+ if (*start)
+ return false;
+ return true;
+}
+
+#define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member))
+
+int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, k.k->p);
+ void *end = &acc_k + 1;
+ int ret = 0;
+
+ bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) &&
+ bversion_zero(k.k->bversion),
+ c, accounting_key_version_0,
+ "accounting key with version=0");
+
+ switch (acc_k.type) {
+ case BCH_DISK_ACCOUNTING_nr_inodes:
+ end = field_end(acc_k, nr_inodes);
+ break;
+ case BCH_DISK_ACCOUNTING_persistent_reserved:
+ end = field_end(acc_k, persistent_reserved);
+ break;
+ case BCH_DISK_ACCOUNTING_replicas:
+ bkey_fsck_err_on(!acc_k.replicas.nr_devs,
+ c, accounting_key_replicas_nr_devs_0,
+ "accounting key replicas entry with nr_devs=0");
+
+ bkey_fsck_err_on(acc_k.replicas.nr_required > acc_k.replicas.nr_devs ||
+ (acc_k.replicas.nr_required > 1 &&
+ acc_k.replicas.nr_required == acc_k.replicas.nr_devs),
+ c, accounting_key_replicas_nr_required_bad,
+ "accounting key replicas entry with bad nr_required");
+
+ for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++)
+ bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1],
+ c, accounting_key_replicas_devs_unsorted,
+ "accounting key replicas entry with unsorted devs");
+
+ end = (void *) &acc_k.replicas + replicas_entry_bytes(&acc_k.replicas);
+ break;
+ case BCH_DISK_ACCOUNTING_dev_data_type:
+ end = field_end(acc_k, dev_data_type);
+ break;
+ case BCH_DISK_ACCOUNTING_compression:
+ end = field_end(acc_k, compression);
+ break;
+ case BCH_DISK_ACCOUNTING_snapshot:
+ end = field_end(acc_k, snapshot);
+ break;
+ case BCH_DISK_ACCOUNTING_btree:
+ end = field_end(acc_k, btree);
+ break;
+ case BCH_DISK_ACCOUNTING_rebalance_work:
+ end = field_end(acc_k, rebalance_work);
+ break;
+ }
+
+ bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)),
+ c, accounting_key_junk_at_end,
+ "junk at end of accounting key");
+fsck_err:
+ return ret;
+}
+
+void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k)
+{
+ if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) {
+ prt_printf(out, "unknown type %u", k->type);
+ return;
+ }
+
+ prt_str(out, disk_accounting_type_strs[k->type]);
+ prt_str(out, " ");
+
+ switch (k->type) {
+ case BCH_DISK_ACCOUNTING_nr_inodes:
+ break;
+ case BCH_DISK_ACCOUNTING_persistent_reserved:
+ prt_printf(out, "replicas=%u", k->persistent_reserved.nr_replicas);
+ break;
+ case BCH_DISK_ACCOUNTING_replicas:
+ bch2_replicas_entry_to_text(out, &k->replicas);
+ break;
+ case BCH_DISK_ACCOUNTING_dev_data_type:
+ prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev);
+ bch2_prt_data_type(out, k->dev_data_type.data_type);
+ break;
+ case BCH_DISK_ACCOUNTING_compression:
+ bch2_prt_compression_type(out, k->compression.type);
+ break;
+ case BCH_DISK_ACCOUNTING_snapshot:
+ prt_printf(out, "id=%u", k->snapshot.id);
+ break;
+ case BCH_DISK_ACCOUNTING_btree:
+ prt_str(out, "btree=");
+ bch2_btree_id_to_text(out, k->btree.id);
+ break;
+ }
+}
+
+void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_accounting acc = bkey_s_c_to_accounting(k);
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, k.k->p);
+
+ bch2_accounting_key_to_text(out, &acc_k);
+
+ for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++)
+ prt_printf(out, " %lli", acc.v->d[i]);
+}
+
+void bch2_accounting_swab(struct bkey_s k)
+{
+ for (u64 *p = (u64 *) k.v;
+ p < (u64 *) bkey_val_end(k);
+ p++)
+ *p = swab64(*p);
+}
+
+static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r,
+ struct disk_accounting_pos *acc)
+{
+ unsafe_memcpy(r, &acc->replicas,
+ replicas_entry_bytes(&acc->replicas),
+ "variable length struct");
+}
+
+static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p)
+{
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, p);
+
+ switch (acc_k.type) {
+ case BCH_DISK_ACCOUNTING_replicas:
+ __accounting_to_replicas(r, &acc_k);
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p)
+{
+ struct bch_replicas_padded r;
+ return accounting_to_replicas(&r.e, p)
+ ? bch2_mark_replicas(c, &r.e)
+ : 0;
+}
+
+/*
+ * Ensure accounting keys being updated are present in the superblock, when
+ * applicable (i.e. replicas updates)
+ */
+int bch2_accounting_update_sb(struct btree_trans *trans)
+{
+ for (struct jset_entry *i = trans->journal_entries;
+ i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ i = vstruct_next(i))
+ if (jset_entry_is_key(i) && i->start->k.type == KEY_TYPE_accounting) {
+ int ret = bch2_accounting_update_sb_one(trans->c, i->start->k.p);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+
+ /* raced with another insert, already present: */
+ if (eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, &a.k->p) < acc->k.nr)
+ return 0;
+
+ struct accounting_mem_entry n = {
+ .pos = a.k->p,
+ .bversion = a.k->bversion,
+ .nr_counters = bch2_accounting_counters(a.k),
+ .v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
+ sizeof(u64), GFP_KERNEL),
+ };
+
+ if (!n.v[0])
+ goto err;
+
+ if (acc->gc_running) {
+ n.v[1] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
+ sizeof(u64), GFP_KERNEL);
+ if (!n.v[1])
+ goto err;
+ }
+
+ if (darray_push(&acc->k, n))
+ goto err;
+
+ eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, NULL);
+ return 0;
+err:
+ free_percpu(n.v[1]);
+ free_percpu(n.v[0]);
+ return -BCH_ERR_ENOMEM_disk_accounting;
+}
+
+int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
+ enum bch_accounting_mode mode)
+{
+ struct bch_replicas_padded r;
+
+ if (mode != BCH_ACCOUNTING_read &&
+ accounting_to_replicas(&r.e, a.k->p) &&
+ !bch2_replicas_marked_locked(c, &r.e))
+ return -BCH_ERR_btree_insert_need_mark_replicas;
+
+ percpu_up_read(&c->mark_lock);
+ percpu_down_write(&c->mark_lock);
+ int ret = __bch2_accounting_mem_insert(c, a);
+ percpu_up_write(&c->mark_lock);
+ percpu_down_read(&c->mark_lock);
+ return ret;
+}
+
+static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e)
+{
+ for (unsigned i = 0; i < e->nr_counters; i++)
+ if (percpu_u64_get(e->v[0] + i) ||
+ (e->v[1] &&
+ percpu_u64_get(e->v[1] + i)))
+ return false;
+ return true;
+}
+
+void bch2_accounting_mem_gc(struct bch_fs *c)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+
+ percpu_down_write(&c->mark_lock);
+ struct accounting_mem_entry *dst = acc->k.data;
+
+ darray_for_each(acc->k, src) {
+ if (accounting_mem_entry_is_zero(src)) {
+ free_percpu(src->v[0]);
+ free_percpu(src->v[1]);
+ } else {
+ *dst++ = *src;
+ }
+ }
+
+ acc->k.nr = dst - acc->k.data;
+ eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, NULL);
+ percpu_up_write(&c->mark_lock);
+}
+
+/*
+ * Read out accounting keys for replicas entries, as an array of
+ * bch_replicas_usage entries.
+ *
+ * Note: this may be deprecated/removed at some point in the future and replaced
+ * with something more general, it exists to support the ioctl used by the
+ * 'bcachefs fs usage' command.
+ */
+int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+ int ret = 0;
+
+ darray_init(usage);
+
+ percpu_down_read(&c->mark_lock);
+ darray_for_each(acc->k, i) {
+ struct {
+ struct bch_replicas_usage r;
+ u8 pad[BCH_BKEY_PTRS_MAX];
+ } u;
+
+ if (!accounting_to_replicas(&u.r.r, i->pos))
+ continue;
+
+ u64 sectors;
+ bch2_accounting_mem_read_counters(acc, i - acc->k.data, &sectors, 1, false);
+ u.r.sectors = sectors;
+
+ ret = darray_make_room(usage, replicas_usage_bytes(&u.r));
+ if (ret)
+ break;
+
+ memcpy(&darray_top(*usage), &u.r, replicas_usage_bytes(&u.r));
+ usage->nr += replicas_usage_bytes(&u.r);
+ }
+ percpu_up_read(&c->mark_lock);
+
+ if (ret)
+ darray_exit(usage);
+ return ret;
+}
+
+int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask)
+{
+
+ struct bch_accounting_mem *acc = &c->accounting;
+ int ret = 0;
+
+ darray_init(out_buf);
+
+ percpu_down_read(&c->mark_lock);
+ darray_for_each(acc->k, i) {
+ struct disk_accounting_pos a_p;
+ bpos_to_disk_accounting_pos(&a_p, i->pos);
+
+ if (!(accounting_types_mask & BIT(a_p.type)))
+ continue;
+
+ ret = darray_make_room(out_buf, sizeof(struct bkey_i_accounting) +
+ sizeof(u64) * i->nr_counters);
+ if (ret)
+ break;
+
+ struct bkey_i_accounting *a_out =
+ bkey_accounting_init((void *) &darray_top(*out_buf));
+ set_bkey_val_u64s(&a_out->k, i->nr_counters);
+ a_out->k.p = i->pos;
+ bch2_accounting_mem_read_counters(acc, i - acc->k.data,
+ a_out->v.d, i->nr_counters, false);
+
+ if (!bch2_accounting_key_is_zero(accounting_i_to_s_c(a_out)))
+ out_buf->nr += bkey_bytes(&a_out->k);
+ }
+
+ percpu_up_read(&c->mark_lock);
+
+ if (ret)
+ darray_exit(out_buf);
+ return ret;
+}
+
+void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+
+ percpu_down_read(&c->mark_lock);
+ out->atomic++;
+
+ eytzinger0_for_each(i, acc->k.nr) {
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, acc->k.data[i].pos);
+
+ bch2_accounting_key_to_text(out, &acc_k);
+
+ u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
+ bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
+
+ prt_str(out, ":");
+ for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++)
+ prt_printf(out, " %llu", v[j]);
+ prt_newline(out);
+ }
+
+ --out->atomic;
+ percpu_up_read(&c->mark_lock);
+}
+
+static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc)
+{
+ darray_for_each(acc->k, e) {
+ free_percpu(e->v[gc]);
+ e->v[gc] = NULL;
+ }
+}
+
+int bch2_gc_accounting_start(struct bch_fs *c)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+ int ret = 0;
+
+ percpu_down_write(&c->mark_lock);
+ darray_for_each(acc->k, e) {
+ e->v[1] = __alloc_percpu_gfp(e->nr_counters * sizeof(u64),
+ sizeof(u64), GFP_KERNEL);
+ if (!e->v[1]) {
+ bch2_accounting_free_counters(acc, true);
+ ret = -BCH_ERR_ENOMEM_disk_accounting;
+ break;
+ }
+ }
+
+ acc->gc_running = !ret;
+ percpu_up_write(&c->mark_lock);
+
+ return ret;
+}
+
+int bch2_gc_accounting_done(struct bch_fs *c)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct printbuf buf = PRINTBUF;
+ struct bpos pos = POS_MIN;
+ int ret = 0;
+
+ percpu_down_write(&c->mark_lock);
+ while (1) {
+ unsigned idx = eytzinger0_find_ge(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, &pos);
+
+ if (idx >= acc->k.nr)
+ break;
+
+ struct accounting_mem_entry *e = acc->k.data + idx;
+ pos = bpos_successor(e->pos);
+
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, e->pos);
+
+ if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
+ continue;
+
+ u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS];
+ u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS];
+
+ unsigned nr = e->nr_counters;
+ bch2_accounting_mem_read_counters(acc, idx, dst_v, nr, false);
+ bch2_accounting_mem_read_counters(acc, idx, src_v, nr, true);
+
+ if (memcmp(dst_v, src_v, nr * sizeof(u64))) {
+ printbuf_reset(&buf);
+ prt_str(&buf, "accounting mismatch for ");
+ bch2_accounting_key_to_text(&buf, &acc_k);
+
+ prt_str(&buf, ": got");
+ for (unsigned j = 0; j < nr; j++)
+ prt_printf(&buf, " %llu", dst_v[j]);
+
+ prt_str(&buf, " should be");
+ for (unsigned j = 0; j < nr; j++)
+ prt_printf(&buf, " %llu", src_v[j]);
+
+ for (unsigned j = 0; j < nr; j++)
+ src_v[j] -= dst_v[j];
+
+ if (fsck_err(trans, accounting_mismatch, "%s", buf.buf)) {
+ percpu_up_write(&c->mark_lock);
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false));
+ percpu_down_write(&c->mark_lock);
+ if (ret)
+ goto err;
+
+ if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
+ memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
+ struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
+
+ accounting_key_init(&k_i.k, &acc_k, src_v, nr);
+ bch2_accounting_mem_mod_locked(trans,
+ bkey_i_to_s_c_accounting(&k_i.k),
+ BCH_ACCOUNTING_normal);
+
+ preempt_disable();
+ struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
+ struct bch_fs_usage_base *src = &trans->fs_usage_delta;
+ acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64));
+ preempt_enable();
+ }
+ }
+ }
+ }
+err:
+fsck_err:
+ percpu_up_write(&c->mark_lock);
+ printbuf_exit(&buf);
+ bch2_trans_put(trans);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+
+ if (k.k->type != KEY_TYPE_accounting)
+ return 0;
+
+ percpu_down_read(&c->mark_lock);
+ int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k),
+ BCH_ACCOUNTING_read);
+ percpu_up_read(&c->mark_lock);
+ return ret;
+}
+
+static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
+ struct disk_accounting_pos acc,
+ u64 *v, unsigned nr)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0, invalid_dev = -1;
+
+ switch (acc.type) {
+ case BCH_DISK_ACCOUNTING_replicas: {
+ struct bch_replicas_padded r;
+ __accounting_to_replicas(&r.e, &acc);
+
+ for (unsigned i = 0; i < r.e.nr_devs; i++)
+ if (r.e.devs[i] != BCH_SB_MEMBER_INVALID &&
+ !bch2_dev_exists(c, r.e.devs[i])) {
+ invalid_dev = r.e.devs[i];
+ goto invalid_device;
+ }
+
+ /*
+ * All replicas entry checks except for invalid device are done
+ * in bch2_accounting_validate
+ */
+ BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf));
+
+ if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
+ trans, accounting_replicas_not_marked,
+ "accounting not marked in superblock replicas\n %s",
+ (printbuf_reset(&buf),
+ bch2_accounting_key_to_text(&buf, &acc),
+ buf.buf))) {
+ /*
+ * We're not RW yet and still single threaded, dropping
+ * and retaking lock is ok:
+ */
+ percpu_up_write(&c->mark_lock);
+ ret = bch2_mark_replicas(c, &r.e);
+ if (ret)
+ goto fsck_err;
+ percpu_down_write(&c->mark_lock);
+ }
+ break;
+ }
+
+ case BCH_DISK_ACCOUNTING_dev_data_type:
+ if (!bch2_dev_exists(c, acc.dev_data_type.dev)) {
+ invalid_dev = acc.dev_data_type.dev;
+ goto invalid_device;
+ }
+ break;
+ }
+
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+invalid_device:
+ if (fsck_err(trans, accounting_to_invalid_device,
+ "accounting entry points to invalid device %i\n %s",
+ invalid_dev,
+ (printbuf_reset(&buf),
+ bch2_accounting_key_to_text(&buf, &acc),
+ buf.buf))) {
+ for (unsigned i = 0; i < nr; i++)
+ v[i] = -v[i];
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?:
+ -BCH_ERR_remove_disk_accounting_entry;
+ } else {
+ ret = -BCH_ERR_remove_disk_accounting_entry;
+ }
+ goto fsck_err;
+}
+
+/*
+ * At startup time, initialize the in memory accounting from the btree (and
+ * journal)
+ */
+int bch2_accounting_read(struct bch_fs *c)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct printbuf buf = PRINTBUF;
+
+ /*
+ * We might run more than once if we rewind to start topology repair or
+ * btree node scan - and those might cause us to get different results,
+ * so we can't just skip if we've already run.
+ *
+ * Instead, zero out any accounting we have:
+ */
+ percpu_down_write(&c->mark_lock);
+ darray_for_each(acc->k, e)
+ percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
+ for_each_member_device(c, ca)
+ percpu_memset(ca->usage, 0, sizeof(*ca->usage));
+ percpu_memset(c->usage, 0, sizeof(*c->usage));
+ percpu_up_write(&c->mark_lock);
+
+ struct btree_iter iter;
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
+ iter.flags &= ~BTREE_ITER_with_journal;
+ int ret = for_each_btree_key_continue(trans, iter,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
+ if (k.k->type != KEY_TYPE_accounting)
+ continue;
+
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, k.k->p);
+ if (!bch2_accounting_is_mem(acc_k)) {
+ struct disk_accounting_pos next = { .type = acc_k.type + 1 };
+ bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next));
+ continue;
+ }
+
+ accounting_read_key(trans, k);
+ }));
+ if (ret)
+ goto err;
+
+ struct journal_keys *keys = &c->journal_keys;
+ struct journal_key *dst = keys->data;
+ move_gap(keys, keys->nr);
+
+ darray_for_each(*keys, i) {
+ if (i->k->k.type == KEY_TYPE_accounting) {
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, i->k->k.p);
+ if (!bch2_accounting_is_mem(acc_k))
+ continue;
+
+ struct bkey_s_c k = bkey_i_to_s_c(i->k);
+ unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr,
+ sizeof(acc->k.data[0]),
+ accounting_pos_cmp, &k.k->p);
+
+ bool applied = idx < acc->k.nr &&
+ bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0;
+
+ if (applied)
+ continue;
+
+ if (i + 1 < &darray_top(*keys) &&
+ i[1].k->k.type == KEY_TYPE_accounting &&
+ !journal_key_cmp(i, i + 1)) {
+ WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0);
+
+ i[1].journal_seq = i[0].journal_seq;
+
+ bch2_accounting_accumulate(bkey_i_to_accounting(i[1].k),
+ bkey_s_c_to_accounting(k));
+ continue;
+ }
+
+ ret = accounting_read_key(trans, k);
+ if (ret)
+ goto err;
+ }
+
+ *dst++ = *i;
+ }
+ keys->gap = keys->nr = dst - keys->data;
+
+ percpu_down_write(&c->mark_lock);
+ unsigned i = 0;
+ while (i < acc->k.nr) {
+ unsigned idx = inorder_to_eytzinger0(i, acc->k.nr);
+
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos);
+
+ u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
+ bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false);
+
+ /*
+ * If the entry counters are zeroed, it should be treated as
+ * nonexistent - it might point to an invalid device.
+ *
+ * Remove it, so that if it's re-added it gets re-marked in the
+ * superblock:
+ */
+ ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters)
+ ? -BCH_ERR_remove_disk_accounting_entry
+ : bch2_disk_accounting_validate_late(trans, acc_k,
+ v, acc->k.data[idx].nr_counters);
+
+ if (ret == -BCH_ERR_remove_disk_accounting_entry) {
+ free_percpu(acc->k.data[idx].v[0]);
+ free_percpu(acc->k.data[idx].v[1]);
+ darray_remove_item(&acc->k, &acc->k.data[idx]);
+ eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, NULL);
+ ret = 0;
+ continue;
+ }
+
+ if (ret)
+ goto fsck_err;
+ i++;
+ }
+
+ preempt_disable();
+ struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage);
+
+ for (unsigned i = 0; i < acc->k.nr; i++) {
+ struct disk_accounting_pos k;
+ bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);
+
+ u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
+ bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
+
+ switch (k.type) {
+ case BCH_DISK_ACCOUNTING_persistent_reserved:
+ usage->reserved += v[0] * k.persistent_reserved.nr_replicas;
+ break;
+ case BCH_DISK_ACCOUNTING_replicas:
+ fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]);
+ break;
+ case BCH_DISK_ACCOUNTING_dev_data_type:
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev);
+ if (ca) {
+ struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type];
+ percpu_u64_set(&d->buckets, v[0]);
+ percpu_u64_set(&d->sectors, v[1]);
+ percpu_u64_set(&d->fragmented, v[2]);
+
+ if (k.dev_data_type.data_type == BCH_DATA_sb ||
+ k.dev_data_type.data_type == BCH_DATA_journal)
+ usage->hidden += v[0] * ca->mi.bucket_size;
+ }
+ rcu_read_unlock();
+ break;
+ }
+ }
+ preempt_enable();
+fsck_err:
+ percpu_up_write(&c->mark_lock);
+err:
+ printbuf_exit(&buf);
+ bch2_trans_put(trans);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev)
+{
+ return bch2_trans_run(c,
+ bch2_btree_write_buffer_flush_sync(trans) ?:
+ for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN,
+ BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({
+ struct disk_accounting_pos acc;
+ bpos_to_disk_accounting_pos(&acc, k.k->p);
+
+ acc.type == BCH_DISK_ACCOUNTING_dev_data_type &&
+ acc.dev_data_type.dev == dev
+ ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0)
+ : 0;
+ })) ?:
+ bch2_btree_write_buffer_flush_sync(trans));
+}
+
+int bch2_dev_usage_init(struct bch_dev *ca, bool gc)
+{
+ struct bch_fs *c = ca->fs;
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_dev_data_type,
+ .dev_data_type.dev = ca->dev_idx,
+ .dev_data_type.data_type = BCH_DATA_free,
+ };
+ u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 };
+
+ int ret = bch2_trans_do(c, ({
+ bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc) ?:
+ (!gc ? bch2_trans_commit(trans, NULL, NULL, 0) : 0);
+ }));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+void bch2_verify_accounting_clean(struct bch_fs *c)
+{
+ bool mismatch = false;
+ struct bch_fs_usage_base base = {}, base_inmem = {};
+
+ bch2_trans_run(c,
+ for_each_btree_key(trans, iter,
+ BTREE_ID_accounting, POS_MIN,
+ BTREE_ITER_all_snapshots, k, ({
+ u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
+ struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k);
+ unsigned nr = bch2_accounting_counters(k.k);
+
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, k.k->p);
+
+ if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
+ continue;
+
+ if (acc_k.type == BCH_DISK_ACCOUNTING_inum)
+ continue;
+
+ bch2_accounting_mem_read(c, k.k->p, v, nr);
+
+ if (memcmp(a.v->d, v, nr * sizeof(u64))) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, " !=");
+ for (unsigned j = 0; j < nr; j++)
+ prt_printf(&buf, " %llu", v[j]);
+
+ pr_err("%s", buf.buf);
+ printbuf_exit(&buf);
+ mismatch = true;
+ }
+
+ switch (acc_k.type) {
+ case BCH_DISK_ACCOUNTING_persistent_reserved:
+ base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
+ break;
+ case BCH_DISK_ACCOUNTING_replicas:
+ fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]);
+ break;
+ case BCH_DISK_ACCOUNTING_dev_data_type: {
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
+ if (!ca) {
+ rcu_read_unlock();
+ continue;
+ }
+
+ v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets);
+ v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors);
+ v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented);
+ rcu_read_unlock();
+
+ if (memcmp(a.v->d, v, 3 * sizeof(u64))) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, " in mem");
+ for (unsigned j = 0; j < nr; j++)
+ prt_printf(&buf, " %llu", v[j]);
+
+ pr_err("dev accounting mismatch: %s", buf.buf);
+ printbuf_exit(&buf);
+ mismatch = true;
+ }
+ }
+ }
+
+ 0;
+ })));
+
+ acc_u64s_percpu(&base_inmem.hidden, &c->usage->hidden, sizeof(base_inmem) / sizeof(u64));
+
+#define check(x) \
+ if (base.x != base_inmem.x) { \
+ pr_err("fs_usage_base.%s mismatch: %llu != %llu", #x, base.x, base_inmem.x); \
+ mismatch = true; \
+ }
+
+ //check(hidden);
+ check(btree);
+ check(data);
+ check(cached);
+ check(reserved);
+ check(nr_inodes);
+
+ WARN_ON(mismatch);
+}
+
+void bch2_accounting_gc_free(struct bch_fs *c)
+{
+ lockdep_assert_held(&c->mark_lock);
+
+ struct bch_accounting_mem *acc = &c->accounting;
+
+ bch2_accounting_free_counters(acc, true);
+ acc->gc_running = false;
+}
+
+void bch2_fs_accounting_exit(struct bch_fs *c)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+
+ bch2_accounting_free_counters(acc, false);
+ darray_exit(&acc->k);
+}
diff --git a/libbcachefs/disk_accounting.h b/libbcachefs/disk_accounting.h
new file mode 100644
index 00000000..cb20f723
--- /dev/null
+++ b/libbcachefs/disk_accounting.h
@@ -0,0 +1,272 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_ACCOUNTING_H
+#define _BCACHEFS_DISK_ACCOUNTING_H
+
+#include "btree_update.h"
+#include "eytzinger.h"
+#include "sb-members.h"
+
+static inline void bch2_u64s_neg(u64 *v, unsigned nr)
+{
+ for (unsigned i = 0; i < nr; i++)
+ v[i] = -v[i];
+}
+
+static inline unsigned bch2_accounting_counters(const struct bkey *k)
+{
+ return bkey_val_u64s(k) - offsetof(struct bch_accounting, d) / sizeof(u64);
+}
+
+static inline void bch2_accounting_neg(struct bkey_s_accounting a)
+{
+ bch2_u64s_neg(a.v->d, bch2_accounting_counters(a.k));
+}
+
+static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a)
+{
+ for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++)
+ if (a.v->d[i])
+ return false;
+ return true;
+}
+
+static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst,
+ struct bkey_s_c_accounting src)
+{
+ EBUG_ON(dst->k.u64s != src.k->u64s);
+
+ for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++)
+ dst->v.d[i] += src.v->d[i];
+ if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0)
+ dst->k.bversion = src.k->bversion;
+}
+
+static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
+ enum bch_data_type data_type,
+ s64 sectors)
+{
+ switch (data_type) {
+ case BCH_DATA_btree:
+ fs_usage->btree += sectors;
+ break;
+ case BCH_DATA_user:
+ case BCH_DATA_parity:
+ fs_usage->data += sectors;
+ break;
+ case BCH_DATA_cached:
+ fs_usage->cached += sectors;
+ break;
+ default:
+ break;
+ }
+}
+
+static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p)
+{
+ BUILD_BUG_ON(sizeof(*acc) != sizeof(p));
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ acc->_pad = p;
+#else
+ memcpy_swab(acc, &p, sizeof(p));
+#endif
+}
+
+static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *acc)
+{
+ struct bpos p;
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ p = acc->_pad;
+#else
+ memcpy_swab(&p, acc, sizeof(p));
+#endif
+ return p;
+}
+
+int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *,
+ s64 *, unsigned, bool);
+int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool);
+
+int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *);
+void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+void bch2_accounting_swab(struct bkey_s);
+
+#define bch2_bkey_ops_accounting ((struct bkey_ops) { \
+ .key_validate = bch2_accounting_validate, \
+ .val_to_text = bch2_accounting_to_text, \
+ .swab = bch2_accounting_swab, \
+ .min_val_size = 8, \
+})
+
+int bch2_accounting_update_sb(struct btree_trans *);
+
+static inline int accounting_pos_cmp(const void *_l, const void *_r)
+{
+ const struct bpos *l = _l, *r = _r;
+
+ return bpos_cmp(*l, *r);
+}
+
+enum bch_accounting_mode {
+ BCH_ACCOUNTING_normal,
+ BCH_ACCOUNTING_gc,
+ BCH_ACCOUNTING_read,
+};
+
+int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
+void bch2_accounting_mem_gc(struct bch_fs *);
+
+static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc)
+{
+ return acc.type != BCH_DISK_ACCOUNTING_inum;
+}
+
+/*
+ * Update in memory counters so they match the btree update we're doing; called
+ * from transaction commit path
+ */
+static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
+ struct bkey_s_c_accounting a,
+ enum bch_accounting_mode mode)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_accounting_mem *acc = &c->accounting;
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, a.k->p);
+ bool gc = mode == BCH_ACCOUNTING_gc;
+
+ EBUG_ON(gc && !acc->gc_running);
+
+ if (!bch2_accounting_is_mem(acc_k))
+ return 0;
+
+ if (mode == BCH_ACCOUNTING_normal) {
+ switch (acc_k.type) {
+ case BCH_DISK_ACCOUNTING_persistent_reserved:
+ trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
+ break;
+ case BCH_DISK_ACCOUNTING_replicas:
+ fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]);
+ break;
+ case BCH_DISK_ACCOUNTING_dev_data_type:
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
+ if (ca) {
+ this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]);
+ this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]);
+ this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]);
+ }
+ rcu_read_unlock();
+ break;
+ }
+ }
+
+ unsigned idx;
+
+ while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
+ int ret = bch2_accounting_mem_insert(c, a, mode);
+ if (ret)
+ return ret;
+ }
+
+ struct accounting_mem_entry *e = &acc->k.data[idx];
+
+ EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters);
+
+ for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++)
+ this_cpu_add(e->v[gc][i], a.v->d[i]);
+ return 0;
+}
+
+static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
+{
+ percpu_down_read(&trans->c->mark_lock);
+ int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal);
+ percpu_up_read(&trans->c->mark_lock);
+ return ret;
+}
+
+static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *acc,
+ unsigned idx, u64 *v, unsigned nr, bool gc)
+{
+ memset(v, 0, sizeof(*v) * nr);
+
+ if (unlikely(idx >= acc->k.nr))
+ return;
+
+ struct accounting_mem_entry *e = &acc->k.data[idx];
+
+ nr = min_t(unsigned, nr, e->nr_counters);
+
+ for (unsigned i = 0; i < nr; i++)
+ v[i] = percpu_u64_get(e->v[gc] + i);
+}
+
+static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p,
+ u64 *v, unsigned nr)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+ unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, &p);
+
+ bch2_accounting_mem_read_counters(acc, idx, v, nr, false);
+}
+
+static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset)
+{
+ EBUG_ON(!res->ref);
+
+ return (struct bversion) {
+ .hi = res->seq >> 32,
+ .lo = (res->seq << 32) | (res->offset + offset),
+ };
+}
+
+static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans,
+ struct bkey_i_accounting *a,
+ unsigned commit_flags)
+{
+ a->k.bversion = journal_pos_to_bversion(&trans->journal_res,
+ (u64 *) a - (u64 *) trans->journal_entries);
+
+ EBUG_ON(bversion_zero(a->k.bversion));
+
+ return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))
+ ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal)
+ : 0;
+}
+
+static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans,
+ struct bkey_i_accounting *a_i,
+ unsigned commit_flags)
+{
+ if (likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))) {
+ struct bkey_s_accounting a = accounting_i_to_s(a_i);
+
+ bch2_accounting_neg(a);
+ bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal);
+ bch2_accounting_neg(a);
+ }
+}
+
+int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *);
+int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned);
+void bch2_fs_accounting_to_text(struct printbuf *, struct bch_fs *);
+
+int bch2_gc_accounting_start(struct bch_fs *);
+int bch2_gc_accounting_done(struct bch_fs *);
+
+int bch2_accounting_read(struct bch_fs *);
+
+int bch2_dev_usage_remove(struct bch_fs *, unsigned);
+int bch2_dev_usage_init(struct bch_dev *, bool);
+
+void bch2_verify_accounting_clean(struct bch_fs *c);
+
+void bch2_accounting_gc_free(struct bch_fs *);
+void bch2_fs_accounting_exit(struct bch_fs *);
+
+#endif /* _BCACHEFS_DISK_ACCOUNTING_H */
diff --git a/libbcachefs/disk_accounting_format.h b/libbcachefs/disk_accounting_format.h
new file mode 100644
index 00000000..7b6e6c97
--- /dev/null
+++ b/libbcachefs/disk_accounting_format.h
@@ -0,0 +1,167 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
+#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
+
+#include "replicas_format.h"
+
+/*
+ * Disk accounting - KEY_TYPE_accounting - on disk format:
+ *
+ * Here, the key has considerably more structure than a typical key (bpos); an
+ * accounting key is 'struct disk_accounting_pos', a union overlaid on a bpos.
+ *
+ * More specifically: a key is just a muliword integer (where word endianness
+ * matches native byte order), so we're treating bpos as an opaque 20 byte
+ * integer and mapping 'struct disk_accounting_pos' onto it.
+ *
+ * This is a type-tagged union of all our various subtypes; a disk accounting
+ * key can be device counters, replicas counters, et cetera - it's extensible.
+ *
+ * The value is a list of u64s or s64s; the number of counters is specific to a
+ * given accounting type.
+ *
+ * Unlike with other key types, updates are _deltas_, and the deltas are not
+ * resolved until they are applied to the underlying btree, by btree write
+ * buffer flush or journal replay.
+ *
+ * Journal replay in particular requires special handling. The journal tracks a
+ * range of entries which may not yet have been applied to the btree - it does
+ * not know definitively whether individual entries are dirty and still need to
+ * be applied.
+ *
+ * To handle this, we use the version field of struct bkey, and give every
+ * accounting update a unique version number - a total ordering in time; the
+ * version number is derived from the key's position in the journal. Then
+ * journal replay can compare the version number of the key from the journal
+ * with the version number of the key in the btree to determine if a key needs
+ * to be replayed.
+ *
+ * For this to work, we must maintain this strict time ordering of updates as
+ * they are flushed to the btree, both via write buffer flush and via journal
+ * replay. This has complications for the write buffer code while journal replay
+ * is still in progress; the write buffer cannot flush any accounting keys to
+ * the btree until journal replay has finished replaying its accounting keys, or
+ * the (newer) version number of the keys from the write buffer will cause
+ * updates from journal replay to be lost.
+ */
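+
+/*
+ * An illustrative sketch (not part of the on-disk format itself) of how a
+ * delta is expressed, using the replicas counter type - the same pattern
+ * appears in the stripe trigger in ec.c:
+ *
+ *	struct disk_accounting_pos acc = {
+ *		.type = BCH_DISK_ACCOUNTING_replicas,
+ *	};
+ *	bch2_bkey_to_replicas(&acc.replicas, k);
+ *
+ *	s64 sectors = ...;	// signed: deltas may be negative
+ *	bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
+ *
+ * The delta only reaches the accounting btree when the write buffer flushes
+ * or journal replay applies it, as described above.
+ */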
+
+struct bch_accounting {
+ struct bch_val v;
+ __u64 d[];
+};
+
+#define BCH_ACCOUNTING_MAX_COUNTERS 3
+
+#define BCH_DATA_TYPES() \
+ x(free, 0) \
+ x(sb, 1) \
+ x(journal, 2) \
+ x(btree, 3) \
+ x(user, 4) \
+ x(cached, 5) \
+ x(parity, 6) \
+ x(stripe, 7) \
+ x(need_gc_gens, 8) \
+ x(need_discard, 9) \
+ x(unstriped, 10)
+
+enum bch_data_type {
+#define x(t, n) BCH_DATA_##t,
+ BCH_DATA_TYPES()
+#undef x
+ BCH_DATA_NR
+};
+
+static inline bool data_type_is_empty(enum bch_data_type type)
+{
+ switch (type) {
+ case BCH_DATA_free:
+ case BCH_DATA_need_gc_gens:
+ case BCH_DATA_need_discard:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool data_type_is_hidden(enum bch_data_type type)
+{
+ switch (type) {
+ case BCH_DATA_sb:
+ case BCH_DATA_journal:
+ return true;
+ default:
+ return false;
+ }
+}
+
+#define BCH_DISK_ACCOUNTING_TYPES() \
+ x(nr_inodes, 0) \
+ x(persistent_reserved, 1) \
+ x(replicas, 2) \
+ x(dev_data_type, 3) \
+ x(compression, 4) \
+ x(snapshot, 5) \
+ x(btree, 6) \
+ x(rebalance_work, 7) \
+ x(inum, 8)
+
+enum disk_accounting_type {
+#define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr,
+ BCH_DISK_ACCOUNTING_TYPES()
+#undef x
+ BCH_DISK_ACCOUNTING_TYPE_NR,
+};
+
+struct bch_nr_inodes {
+};
+
+struct bch_persistent_reserved {
+ __u8 nr_replicas;
+};
+
+struct bch_dev_data_type {
+ __u8 dev;
+ __u8 data_type;
+};
+
+struct bch_acct_compression {
+ __u8 type;
+};
+
+struct bch_acct_snapshot {
+ __u32 id;
+} __packed;
+
+struct bch_acct_btree {
+ __u32 id;
+} __packed;
+
+struct bch_acct_inum {
+ __u64 inum;
+} __packed;
+
+struct bch_acct_rebalance_work {
+};
+
+struct disk_accounting_pos {
+ union {
+ struct {
+ __u8 type;
+ union {
+ struct bch_nr_inodes nr_inodes;
+ struct bch_persistent_reserved persistent_reserved;
+ struct bch_replicas_entry_v1 replicas;
+ struct bch_dev_data_type dev_data_type;
+ struct bch_acct_compression compression;
+ struct bch_acct_snapshot snapshot;
+ struct bch_acct_btree btree;
+ struct bch_acct_rebalance_work rebalance_work;
+ struct bch_acct_inum inum;
+ } __packed;
+ } __packed;
+ struct bpos _pad;
+ };
+};
+
+#endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */
diff --git a/libbcachefs/disk_accounting_types.h b/libbcachefs/disk_accounting_types.h
new file mode 100644
index 00000000..b1982131
--- /dev/null
+++ b/libbcachefs/disk_accounting_types.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_ACCOUNTING_TYPES_H
+#define _BCACHEFS_DISK_ACCOUNTING_TYPES_H
+
+#include "darray.h"
+
+struct accounting_mem_entry {
+ struct bpos pos;
+ struct bversion bversion;
+ unsigned nr_counters;
+ u64 __percpu *v[2];
+};
+
+struct bch_accounting_mem {
+ DARRAY(struct accounting_mem_entry) k;
+ bool gc_running;
+};
+
+#endif /* _BCACHEFS_DISK_ACCOUNTING_TYPES_H */
diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c
index 87f3940e..5df8de0b 100644
--- a/libbcachefs/disk_groups.c
+++ b/libbcachefs/disk_groups.c
@@ -1,5 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "disk_groups.h"
+#include "sb-members.h"
#include "super-io.h"
#include <linux/sort.h>
@@ -16,77 +18,107 @@ static int group_cmp(const void *_l, const void *_r)
strncmp(l->label, r->label, sizeof(l->label));
}
-static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
- struct bch_sb_field *f)
+static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_disk_groups *groups =
field_to_type(f, disk_groups);
struct bch_disk_group *g, *sorted = NULL;
- struct bch_sb_field_members *mi;
- struct bch_member *m;
- unsigned i, nr_groups, len;
- const char *err = NULL;
-
- mi = bch2_sb_get_members(sb);
- groups = bch2_sb_get_disk_groups(sb);
- nr_groups = disk_groups_nr(groups);
+ unsigned nr_groups = disk_groups_nr(groups);
+ unsigned i, len;
+ int ret = 0;
- for (m = mi->members;
- m < mi->members + sb->nr_devices;
- m++) {
- unsigned g;
+ for (i = 0; i < sb->nr_devices; i++) {
+ struct bch_member m = bch2_sb_member_get(sb, i);
+ unsigned group_id;
- if (!BCH_MEMBER_GROUP(m))
+ if (!BCH_MEMBER_GROUP(&m))
continue;
- g = BCH_MEMBER_GROUP(m) - 1;
+ group_id = BCH_MEMBER_GROUP(&m) - 1;
- if (g >= nr_groups ||
- BCH_GROUP_DELETED(&groups->entries[g]))
- return "disk has invalid group";
+ if (group_id >= nr_groups) {
+ prt_printf(err, "disk %u has invalid label %u (have %u)",
+ i, group_id, nr_groups);
+ return -BCH_ERR_invalid_sb_disk_groups;
+ }
+
+ if (BCH_GROUP_DELETED(&groups->entries[group_id])) {
+ prt_printf(err, "disk %u has deleted label %u", i, group_id);
+ return -BCH_ERR_invalid_sb_disk_groups;
+ }
}
if (!nr_groups)
- return NULL;
+ return 0;
+
+ for (i = 0; i < nr_groups; i++) {
+ g = groups->entries + i;
- for (g = groups->entries;
- g < groups->entries + nr_groups;
- g++) {
if (BCH_GROUP_DELETED(g))
continue;
len = strnlen(g->label, sizeof(g->label));
if (!len) {
- err = "group with empty label";
- goto err;
+ prt_printf(err, "label %u empty", i);
+ return -BCH_ERR_invalid_sb_disk_groups;
}
}
sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
if (!sorted)
- return "cannot allocate memory";
+ return -BCH_ERR_ENOMEM_disk_groups_validate;
memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);
- for (i = 0; i + 1 < nr_groups; i++)
- if (!BCH_GROUP_DELETED(sorted + i) &&
- !group_cmp(sorted + i, sorted + i + 1)) {
- err = "duplicate groups";
+ for (g = sorted; g + 1 < sorted + nr_groups; g++)
+ if (!BCH_GROUP_DELETED(g) &&
+ !group_cmp(&g[0], &g[1])) {
+ prt_printf(err, "duplicate label %llu.%.*s",
+ BCH_GROUP_PARENT(g),
+ (int) sizeof(g->label), g->label);
+ ret = -BCH_ERR_invalid_sb_disk_groups;
goto err;
}
-
- err = NULL;
err:
kfree(sorted);
- return err;
+ return ret;
+}
+
+void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ out->atomic++;
+ rcu_read_lock();
+
+ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
+ if (!g)
+ goto out;
+
+ for (unsigned i = 0; i < g->nr; i++) {
+ if (i)
+ prt_printf(out, " ");
+
+ if (g->entries[i].deleted) {
+ prt_printf(out, "[deleted]");
+ continue;
+ }
+
+ prt_printf(out, "[parent %d devs", g->entries[i].parent);
+ for_each_member_device_rcu(c, ca, &g->entries[i].devs)
+ prt_printf(out, " %s", ca->name);
+ prt_printf(out, "]");
+ }
+
+out:
+ rcu_read_unlock();
+ out->atomic--;
}
-static size_t bch2_sb_disk_groups_to_text(char *buf, size_t size,
+static void bch2_sb_disk_groups_to_text(struct printbuf *out,
struct bch_sb *sb,
struct bch_sb_field *f)
{
- char *out = buf, *end = buf + size;
struct bch_sb_field_disk_groups *groups =
field_to_type(f, disk_groups);
struct bch_disk_group *g;
@@ -96,18 +128,14 @@ static size_t bch2_sb_disk_groups_to_text(char *buf, size_t size,
g < groups->entries + nr_groups;
g++) {
if (g != groups->entries)
- out += scnprintf(out, end - out, " ");
+ prt_printf(out, " ");
if (BCH_GROUP_DELETED(g))
- out += scnprintf(out, end - out, "[deleted]");
+ prt_printf(out, "[deleted]");
else
- out += scnprintf(out, end - out,
- "[parent %llu name %s]",
- BCH_GROUP_PARENT(g),
- g->label);
+ prt_printf(out, "[parent %llu name %s]",
+ BCH_GROUP_PARENT(g), g->label);
}
-
- return out - buf;
}
const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = {
@@ -117,24 +145,21 @@ const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = {
int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
{
- struct bch_sb_field_members *mi;
struct bch_sb_field_disk_groups *groups;
struct bch_disk_groups_cpu *cpu_g, *old_g;
unsigned i, g, nr_groups;
lockdep_assert_held(&c->sb_lock);
- mi = bch2_sb_get_members(c->disk_sb.sb);
- groups = bch2_sb_get_disk_groups(c->disk_sb.sb);
+ groups = bch2_sb_field_get(c->disk_sb.sb, disk_groups);
nr_groups = disk_groups_nr(groups);
if (!groups)
return 0;
- cpu_g = kzalloc(sizeof(*cpu_g) +
- sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL);
+ cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL);
if (!cpu_g)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_disk_groups_to_cpu;
cpu_g->nr = nr_groups;
@@ -144,17 +169,17 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
dst->deleted = BCH_GROUP_DELETED(src);
dst->parent = BCH_GROUP_PARENT(src);
+ memcpy(dst->label, src->label, sizeof(dst->label));
}
for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
- struct bch_member *m = mi->members + i;
- struct bch_disk_group_cpu *dst =
- &cpu_g->entries[BCH_MEMBER_GROUP(m)];
+ struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i);
+ struct bch_disk_group_cpu *dst;
- if (!bch2_member_exists(m))
+ if (!bch2_member_alive(&m))
continue;
- g = BCH_MEMBER_GROUP(m);
+ g = BCH_MEMBER_GROUP(&m);
while (g) {
dst = &cpu_g->entries[g - 1];
__set_bit(i, dst->devs.d);
@@ -174,26 +199,36 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
{
struct target t = target_decode(target);
+ struct bch_devs_mask *devs;
+
+ rcu_read_lock();
switch (t.type) {
case TARGET_NULL:
- return NULL;
+ devs = NULL;
+ break;
case TARGET_DEV: {
struct bch_dev *ca = t.dev < c->sb.nr_devices
? rcu_dereference(c->devs[t.dev])
: NULL;
- return ca ? &ca->self : NULL;
+ devs = ca ? &ca->self : NULL;
+ break;
}
case TARGET_GROUP: {
struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
- return t.group < g->nr && !g->entries[t.group].deleted
+ devs = g && t.group < g->nr && !g->entries[t.group].deleted
? &g->entries[t.group].devs
: NULL;
+ break;
}
default:
BUG();
}
+
+ rcu_read_unlock();
+
+ return devs;
}
bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
@@ -212,7 +247,7 @@ bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
rcu_read_lock();
g = rcu_dereference(c->disk_groups);
- m = t.group < g->nr && !g->entries[t.group].deleted
+ m = g && t.group < g->nr && !g->entries[t.group].deleted
? &g->entries[t.group].devs
: NULL;
@@ -255,7 +290,7 @@ static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
const char *name, unsigned namelen)
{
struct bch_sb_field_disk_groups *groups =
- bch2_sb_get_disk_groups(sb->sb);
+ bch2_sb_field_get(sb->sb, disk_groups);
unsigned i, nr_groups = disk_groups_nr(groups);
struct bch_disk_group *g;
@@ -273,9 +308,9 @@ static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
sizeof(struct bch_disk_group) * (nr_groups + 1)) /
sizeof(u64);
- groups = bch2_sb_resize_disk_groups(sb, u64s);
+ groups = bch2_sb_field_resize(sb, disk_groups, u64s);
if (!groups)
- return -ENOSPC;
+ return -BCH_ERR_ENOSPC_disk_label_add;
nr_groups = disk_groups_nr(groups);
}
@@ -297,7 +332,7 @@ static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name)
{
struct bch_sb_field_disk_groups *groups =
- bch2_sb_get_disk_groups(sb->sb);
+ bch2_sb_field_get(sb->sb, disk_groups);
int v = -1;
do {
@@ -327,7 +362,7 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
if (*next == '.')
next++;
- groups = bch2_sb_get_disk_groups(sb->sb);
+ groups = bch2_sb_field_get(sb->sb, disk_groups);
v = __bch2_disk_group_find(groups, parent, name, len);
if (v < 0)
@@ -342,12 +377,60 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
return v;
}
-int bch2_disk_path_print(struct bch_sb_handle *sb,
- char *buf, size_t len, unsigned v)
+void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
+{
+ struct bch_disk_groups_cpu *groups;
+ struct bch_disk_group_cpu *g;
+ unsigned nr = 0;
+ u16 path[32];
+
+ out->atomic++;
+ rcu_read_lock();
+ groups = rcu_dereference(c->disk_groups);
+ if (!groups)
+ goto invalid;
+
+ while (1) {
+ if (nr == ARRAY_SIZE(path))
+ goto invalid;
+
+ if (v >= groups->nr)
+ goto invalid;
+
+ g = groups->entries + v;
+
+ if (g->deleted)
+ goto invalid;
+
+ path[nr++] = v;
+
+ if (!g->parent)
+ break;
+
+ v = g->parent - 1;
+ }
+
+ while (nr) {
+ v = path[--nr];
+ g = groups->entries + v;
+
+ prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
+ if (nr)
+ prt_printf(out, ".");
+ }
+out:
+ rcu_read_unlock();
+ out->atomic--;
+ return;
+invalid:
+ prt_printf(out, "invalid label %u", v);
+ goto out;
+}
+
+void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
{
- char *out = buf, *end = out + len;
struct bch_sb_field_disk_groups *groups =
- bch2_sb_get_disk_groups(sb->sb);
+ bch2_sb_field_get(sb, disk_groups);
struct bch_disk_group *g;
unsigned nr = 0;
u16 path[32];
@@ -373,123 +456,161 @@ int bch2_disk_path_print(struct bch_sb_handle *sb,
}
while (nr) {
- unsigned b = 0;
-
v = path[--nr];
g = groups->entries + v;
- if (end != out)
- b = min_t(size_t, end - out,
- strnlen(g->label, sizeof(g->label)));
- memcpy(out, g->label, b);
- if (b < end - out)
- out[b] = '\0';
- out += b;
-
+ prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
if (nr)
- out += scnprintf(out, end - out, ".");
+ prt_printf(out, ".");
}
-
- return out - buf;
+ return;
inval:
- return scnprintf(buf, len, "invalid group %u", v);
+ prt_printf(out, "invalid label %u", v);
}
-int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
+int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
{
struct bch_member *mi;
- int v = -1;
-
- mutex_lock(&c->sb_lock);
+ int ret, v = -1;
if (!strlen(name) || !strcmp(name, "none"))
- goto write_sb;
+ return 0;
v = bch2_disk_path_find_or_create(&c->disk_sb, name);
- if (v < 0) {
- mutex_unlock(&c->sb_lock);
+ if (v < 0)
return v;
- }
-write_sb:
- mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
+ ret = bch2_sb_disk_groups_to_cpu(c);
+ if (ret)
+ return ret;
+
+ mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
SET_BCH_MEMBER_GROUP(mi, v + 1);
+ return 0;
+}
- bch2_write_super(c);
+int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
+{
+ int ret;
+
+ mutex_lock(&c->sb_lock);
+ ret = __bch2_dev_group_set(c, ca, name) ?:
+ bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- return 0;
+ return ret;
}
-int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v)
+int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,
+ struct printbuf *err)
{
struct bch_dev *ca;
int g;
- if (!strlen(buf) || !strcmp(buf, "none")) {
- *v = 0;
+ if (!val)
+ return -EINVAL;
+
+ if (!c)
+ return -BCH_ERR_option_needs_open_fs;
+
+ if (!strlen(val) || !strcmp(val, "none")) {
+ *res = 0;
return 0;
}
/* Is it a device? */
- ca = bch2_dev_lookup(c, buf);
+ ca = bch2_dev_lookup(c, val);
if (!IS_ERR(ca)) {
- *v = dev_to_target(ca->dev_idx);
- percpu_ref_put(&ca->ref);
+ *res = dev_to_target(ca->dev_idx);
+ bch2_dev_put(ca);
return 0;
}
mutex_lock(&c->sb_lock);
- g = bch2_disk_path_find(&c->disk_sb, buf);
+ g = bch2_disk_path_find(&c->disk_sb, val);
mutex_unlock(&c->sb_lock);
if (g >= 0) {
- *v = group_to_target(g);
+ *res = group_to_target(g);
return 0;
}
return -EINVAL;
}
-int bch2_opt_target_print(struct bch_fs *c, char *buf, size_t len, u64 v)
+void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
{
struct target t = target_decode(v);
- int ret;
switch (t.type) {
case TARGET_NULL:
- return scnprintf(buf, len, "none");
+ prt_printf(out, "none");
+ break;
case TARGET_DEV: {
struct bch_dev *ca;
+ out->atomic++;
rcu_read_lock();
ca = t.dev < c->sb.nr_devices
? rcu_dereference(c->devs[t.dev])
: NULL;
if (ca && percpu_ref_tryget(&ca->io_ref)) {
- char b[BDEVNAME_SIZE];
-
- ret = scnprintf(buf, len, "/dev/%s",
- bdevname(ca->disk_sb.bdev, b));
+ prt_printf(out, "/dev/%s", ca->name);
percpu_ref_put(&ca->io_ref);
} else if (ca) {
- ret = scnprintf(buf, len, "offline device %u", t.dev);
+ prt_printf(out, "offline device %u", t.dev);
} else {
- ret = scnprintf(buf, len, "invalid device %u", t.dev);
+ prt_printf(out, "invalid device %u", t.dev);
}
rcu_read_unlock();
+ out->atomic--;
break;
}
case TARGET_GROUP:
- mutex_lock(&c->sb_lock);
- ret = bch2_disk_path_print(&c->disk_sb, buf, len, t.group);
- mutex_unlock(&c->sb_lock);
+ bch2_disk_path_to_text(out, c, t.group);
break;
default:
BUG();
}
+}
- return ret;
+static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
+{
+ struct target t = target_decode(v);
+
+ switch (t.type) {
+ case TARGET_NULL:
+ prt_printf(out, "none");
+ break;
+ case TARGET_DEV: {
+ struct bch_member m = bch2_sb_member_get(sb, t.dev);
+
+ if (bch2_member_exists(sb, t.dev)) {
+ prt_printf(out, "Device ");
+ pr_uuid(out, m.uuid.b);
+ prt_printf(out, " (%u)", t.dev);
+ } else {
+ prt_printf(out, "Bad device %u", t.dev);
+ }
+ break;
+ }
+ case TARGET_GROUP:
+ bch2_disk_path_to_text_sb(out, sb, t.group);
+ break;
+ default:
+ BUG();
+ }
+}
+
+void bch2_opt_target_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct bch_sb *sb,
+ u64 v)
+{
+ if (c)
+ bch2_target_to_text(out, c, v);
+ else
+ bch2_target_to_text_sb(out, sb, v);
}
diff --git a/libbcachefs/disk_groups.h b/libbcachefs/disk_groups.h
index e92c0dc5..441826ff 100644
--- a/libbcachefs/disk_groups.h
+++ b/libbcachefs/disk_groups.h
@@ -1,6 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DISK_GROUPS_H
#define _BCACHEFS_DISK_GROUPS_H
+#include "disk_groups_types.h"
+
extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
@@ -54,20 +57,55 @@ static inline struct target target_decode(unsigned target)
}
const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
+
+static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c,
+ enum bch_data_type data_type,
+ u16 target)
+{
+ struct bch_devs_mask devs = c->rw_devs[data_type];
+ const struct bch_devs_mask *t = bch2_target_to_mask(c, target);
+
+ if (t)
+ bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
+ return devs;
+}
+
+static inline bool bch2_target_accepts_data(struct bch_fs *c,
+ enum bch_data_type data_type,
+ u16 target)
+{
+ struct bch_devs_mask rw_devs = target_rw_devs(c, data_type, target);
+ return !bitmap_empty(rw_devs.d, BCH_SB_MEMBERS_MAX);
+}
+
bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
int bch2_disk_path_find(struct bch_sb_handle *, const char *);
+
+/* Exported for userspace bcachefs-tools: */
int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
-int bch2_disk_path_print(struct bch_sb_handle *, char *, size_t, unsigned);
-int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *);
-int bch2_opt_target_print(struct bch_fs *, char *, size_t, u64);
+void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned);
+void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned);
+
+void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned);
+
+int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
+void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
+
+#define bch2_opt_target (struct bch_opt_fn) { \
+ .parse = bch2_opt_target_parse, \
+ .to_text = bch2_opt_target_to_text, \
+}
int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
+int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
const char *bch2_sb_validate_disk_groups(struct bch_sb *,
struct bch_sb_field *);
+void bch2_disk_groups_to_text(struct printbuf *, struct bch_fs *);
+
#endif /* _BCACHEFS_DISK_GROUPS_H */
diff --git a/libbcachefs/disk_groups_format.h b/libbcachefs/disk_groups_format.h
new file mode 100644
index 00000000..698990bb
--- /dev/null
+++ b/libbcachefs/disk_groups_format.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_GROUPS_FORMAT_H
+#define _BCACHEFS_DISK_GROUPS_FORMAT_H
+
+#define BCH_SB_LABEL_SIZE 32
+
+struct bch_disk_group {
+ __u8 label[BCH_SB_LABEL_SIZE];
+ __le64 flags[2];
+} __packed __aligned(8);
+
+LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
+LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
+LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24)
+
+struct bch_sb_field_disk_groups {
+ struct bch_sb_field field;
+ struct bch_disk_group entries[];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_DISK_GROUPS_FORMAT_H */
diff --git a/libbcachefs/disk_groups_types.h b/libbcachefs/disk_groups_types.h
new file mode 100644
index 00000000..a54ef085
--- /dev/null
+++ b/libbcachefs/disk_groups_types.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_GROUPS_TYPES_H
+#define _BCACHEFS_DISK_GROUPS_TYPES_H
+
+struct bch_disk_group_cpu {
+ bool deleted;
+ u16 parent;
+ u8 label[BCH_SB_LABEL_SIZE];
+ struct bch_devs_mask devs;
+};
+
+struct bch_disk_groups_cpu {
+ struct rcu_head rcu;
+ unsigned nr;
+ struct bch_disk_group_cpu entries[] __counted_by(nr);
+};
+
+#endif /* _BCACHEFS_DISK_GROUPS_TYPES_H */
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
new file mode 100644
index 00000000..250e7389
--- /dev/null
+++ b/libbcachefs/ec.c
@@ -0,0 +1,2510 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* erasure coding */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "backpointers.h"
+#include "bkey_buf.h"
+#include "bset.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "disk_accounting.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "io_read.h"
+#include "io_write.h"
+#include "keylist.h"
+#include "recovery.h"
+#include "replicas.h"
+#include "super-io.h"
+#include "util.h"
+
+#include <linux/sort.h>
+#include <linux/string_choices.h>
+
+#ifdef __KERNEL__
+
+#include <linux/raid/pq.h>
+#include <linux/raid/xor.h>
+
+static void raid5_recov(unsigned disks, unsigned failed_idx,
+ size_t size, void **data)
+{
+ unsigned i = 2, nr;
+
+ BUG_ON(failed_idx >= disks);
+
+ swap(data[0], data[failed_idx]);
+ memcpy(data[0], data[1], size);
+
+ while (i < disks) {
+ nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
+ xor_blocks(nr, size, data[0], data + i);
+ i += nr;
+ }
+
+ swap(data[0], data[failed_idx]);
+}
+
+static void raid_gen(int nd, int np, size_t size, void **v)
+{
+ if (np >= 1)
+ raid5_recov(nd + np, nd, size, v);
+ if (np >= 2)
+ raid6_call.gen_syndrome(nd + np, size, v);
+ BUG_ON(np > 2);
+}
+
+static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
+{
+ switch (nr) {
+ case 0:
+ break;
+ case 1:
+ if (ir[0] < nd + 1)
+ raid5_recov(nd + 1, ir[0], size, v);
+ else
+ raid6_call.gen_syndrome(nd + np, size, v);
+ break;
+ case 2:
+ if (ir[1] < nd) {
+ /* data+data failure. */
+ raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
+ } else if (ir[0] < nd) {
+ /* data + p/q failure */
+
+ if (ir[1] == nd) /* data + p failure */
+ raid6_datap_recov(nd + np, size, ir[0], v);
+ else { /* data + q failure */
+ raid5_recov(nd + 1, ir[0], size, v);
+ raid6_call.gen_syndrome(nd + np, size, v);
+ }
+ } else {
+ raid_gen(nd, np, size, v);
+ }
+ break;
+ default:
+ BUG();
+ }
+}
+
+#else
+
+#include <raid/raid.h>
+
+#endif
+
+struct ec_bio {
+ struct bch_dev *ca;
+ struct ec_stripe_buf *buf;
+ size_t idx;
+ struct bio bio;
+};
+
+/* Stripes btree keys: */
+
+int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
+ bpos_gt(k.k->p, POS(0, U32_MAX)),
+ c, stripe_pos_bad,
+ "stripe at bad pos");
+
+ bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s),
+ c, stripe_val_size_bad,
+ "incorrect value size (%zu < %u)",
+ bkey_val_u64s(k.k), stripe_val_u64s(s));
+
+ bkey_fsck_err_on(s->csum_granularity_bits >= 64,
+ c, stripe_csum_granularity_bad,
+ "invalid csum granularity (%u >= 64)",
+ s->csum_granularity_bits);
+
+ ret = bch2_bkey_ptrs_validate(c, k, from);
+fsck_err:
+ return ret;
+}
+
+void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v;
+ struct bch_stripe s = {};
+
+ memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k)));
+
+ unsigned nr_data = s.nr_blocks - s.nr_redundant;
+
+ prt_printf(out, "algo %u sectors %u blocks %u:%u csum ",
+ s.algorithm,
+ le16_to_cpu(s.sectors),
+ nr_data,
+ s.nr_redundant);
+ bch2_prt_csum_type(out, s.csum_type);
+ prt_str(out, " gran ");
+ if (s.csum_granularity_bits < 64)
+ prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits);
+ else
+ prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits);
+
+ if (s.disk_label) {
+ prt_str(out, " label");
+ bch2_disk_path_to_text(out, c, s.disk_label - 1);
+ }
+
+ for (unsigned i = 0; i < s.nr_blocks; i++) {
+ const struct bch_extent_ptr *ptr = sp->ptrs + i;
+
+ if ((void *) ptr >= bkey_val_end(k))
+ break;
+
+ prt_char(out, ' ');
+ bch2_extent_ptr_to_text(out, c, ptr);
+
+ if (s.csum_type < BCH_CSUM_NR &&
+ i < nr_data &&
+ stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k))
+ prt_printf(out, "#%u", stripe_blockcount_get(sp, i));
+ }
+}
+
+/* Triggers: */
+
+static int __mark_stripe_bucket(struct btree_trans *trans,
+ struct bch_dev *ca,
+ struct bkey_s_c_stripe s,
+ unsigned ptr_idx, bool deleting,
+ struct bpos bucket,
+ struct bch_alloc_v4 *a,
+ enum btree_iter_update_trigger_flags flags)
+{
+ const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
+ unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant;
+ bool parity = ptr_idx >= nr_data;
+ enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
+ s64 sectors = parity ? le16_to_cpu(s.v->sectors) : 0;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ struct bch_fs *c = trans->c;
+ if (deleting)
+ sectors = -sectors;
+
+ if (!deleting) {
+ if (bch2_trans_inconsistent_on(a->stripe ||
+ a->stripe_redundancy, trans,
+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s",
+ bucket.inode, bucket.offset, a->gen,
+ bch2_data_type_str(a->data_type),
+ a->dirty_sectors,
+ a->stripe, s.k->p.offset,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ ret = -BCH_ERR_mark_stripe;
+ goto err;
+ }
+
+ if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans,
+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s",
+ bucket.inode, bucket.offset, a->gen,
+ bch2_data_type_str(a->data_type),
+ a->dirty_sectors,
+ a->cached_sectors,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ ret = -BCH_ERR_mark_stripe;
+ goto err;
+ }
+ } else {
+ if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset ||
+ a->stripe_redundancy != s.v->nr_redundant, trans,
+ "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s",
+ bucket.inode, bucket.offset, a->gen,
+ a->stripe,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ ret = -BCH_ERR_mark_stripe;
+ goto err;
+ }
+
+ if (bch2_trans_inconsistent_on(a->data_type != data_type, trans,
+ "bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s",
+ bucket.inode, bucket.offset, a->gen,
+ bch2_data_type_str(a->data_type),
+ bch2_data_type_str(data_type),
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ ret = -BCH_ERR_mark_stripe;
+ goto err;
+ }
+
+ if (bch2_trans_inconsistent_on(parity &&
+ (a->dirty_sectors != -sectors ||
+ a->cached_sectors), trans,
+ "bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s",
+ bucket.inode, bucket.offset, a->gen,
+ a->dirty_sectors,
+ a->cached_sectors,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ ret = -BCH_ERR_mark_stripe;
+ goto err;
+ }
+ }
+
+ if (sectors) {
+ ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type,
+ a->gen, a->data_type, &a->dirty_sectors);
+ if (ret)
+ goto err;
+ }
+
+ if (!deleting) {
+ a->stripe = s.k->p.offset;
+ a->stripe_redundancy = s.v->nr_redundant;
+ alloc_data_type_set(a, data_type);
+ } else {
+ a->stripe = 0;
+ a->stripe_redundancy = 0;
+ alloc_data_type_set(a, BCH_DATA_user);
+ }
+err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int mark_stripe_bucket(struct btree_trans *trans,
+ struct bkey_s_c_stripe s,
+ unsigned ptr_idx, bool deleting,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct bch_fs *c = trans->c;
+ const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
+ if (unlikely(!ca)) {
+ if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite))
+ ret = -BCH_ERR_mark_stripe;
+ goto err;
+ }
+
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+
+ if (flags & BTREE_TRIGGER_transactional) {
+ struct bkey_i_alloc_v4 *a =
+ bch2_trans_start_alloc_update(trans, bucket, 0);
+ ret = PTR_ERR_OR_ZERO(a) ?:
+ __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags);
+ }
+
+ if (flags & BTREE_TRIGGER_gc) {
+ percpu_down_read(&c->mark_lock);
+ struct bucket *g = gc_bucket(ca, bucket.offset);
+ if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
+ ptr->dev,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ ret = -BCH_ERR_mark_stripe;
+ goto err_unlock;
+ }
+
+ bucket_lock(g);
+ struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
+ ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
+ alloc_to_bucket(g, new);
+ bucket_unlock(g);
+err_unlock:
+ percpu_up_read(&c->mark_lock);
+ if (!ret)
+ ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
+ }
+err:
+ bch2_dev_put(ca);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int mark_stripe_buckets(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ enum btree_iter_update_trigger_flags flags)
+{
+ const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(old).v : NULL;
+ const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(new).v : NULL;
+
+ BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks);
+
+ unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
+
+ for (unsigned i = 0; i < nr_blocks; i++) {
+ if (new_s && old_s &&
+ !memcmp(&new_s->ptrs[i],
+ &old_s->ptrs[i],
+ sizeof(new_s->ptrs[i])))
+ continue;
+
+ if (new_s) {
+ int ret = mark_stripe_bucket(trans,
+ bkey_s_c_to_stripe(new), i, false, flags);
+ if (ret)
+ return ret;
+ }
+
+ if (old_s) {
+ int ret = mark_stripe_bucket(trans,
+ bkey_s_c_to_stripe(old), i, true, flags);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s)
+{
+ m->sectors = le16_to_cpu(s->sectors);
+ m->algorithm = s->algorithm;
+ m->nr_blocks = s->nr_blocks;
+ m->nr_redundant = s->nr_redundant;
+ m->disk_label = s->disk_label;
+ m->blocks_nonempty = 0;
+
+ for (unsigned i = 0; i < s->nr_blocks; i++)
+ m->blocks_nonempty += !!stripe_blockcount_get(s, i);
+}
+
+int bch2_trigger_stripe(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s _new,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct bkey_s_c new = _new.s_c;
+ struct bch_fs *c = trans->c;
+ u64 idx = new.k->p.offset;
+ const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(old).v : NULL;
+ const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(new).v : NULL;
+
+ if (unlikely(flags & BTREE_TRIGGER_check_repair))
+ return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags);
+
+ BUG_ON(new_s && old_s &&
+ (new_s->nr_blocks != old_s->nr_blocks ||
+ new_s->nr_redundant != old_s->nr_redundant));
+
+ if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
+ /*
+ * If the pointers aren't changing, we don't need to do anything:
+ */
+ if (new_s && old_s &&
+ new_s->nr_blocks == old_s->nr_blocks &&
+ new_s->nr_redundant == old_s->nr_redundant &&
+ !memcmp(old_s->ptrs, new_s->ptrs,
+ new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
+ return 0;
+
+ struct gc_stripe *gc = NULL;
+ if (flags & BTREE_TRIGGER_gc) {
+ gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
+ if (!gc) {
+ bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx);
+ return -BCH_ERR_ENOMEM_mark_stripe;
+ }
+
+ /*
+ * This will be wrong when we bring back runtime gc: we should
+ * be unmarking the old key and then marking the new key
+ *
+ * Also: when we bring back runtime gc, locking
+ */
+ gc->alive = true;
+ gc->sectors = le16_to_cpu(new_s->sectors);
+ gc->nr_blocks = new_s->nr_blocks;
+ gc->nr_redundant = new_s->nr_redundant;
+
+ for (unsigned i = 0; i < new_s->nr_blocks; i++)
+ gc->ptrs[i] = new_s->ptrs[i];
+
+ /*
+ * gc recalculates this field from stripe ptr
+ * references:
+ */
+ memset(gc->block_sectors, 0, sizeof(gc->block_sectors));
+ }
+
+ if (new_s) {
+ s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant;
+
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+ bch2_bkey_to_replicas(&acc.replicas, new);
+ int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
+ if (ret)
+ return ret;
+
+ if (gc)
+ memcpy(&gc->r.e, &acc.replicas, replicas_entry_bytes(&acc.replicas));
+ }
+
+ if (old_s) {
+ s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant;
+
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+ bch2_bkey_to_replicas(&acc.replicas, old);
+ int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
+ if (ret)
+ return ret;
+ }
+
+ int ret = mark_stripe_buckets(trans, old, new, flags);
+ if (ret)
+ return ret;
+ }
+
+ if (flags & BTREE_TRIGGER_atomic) {
+ struct stripe *m = genradix_ptr(&c->stripes, idx);
+
+ if (!m) {
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf1, c, old);
+ bch2_bkey_val_to_text(&buf2, c, new);
+ bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
+ "old %s\n"
+ "new %s", idx, buf1.buf, buf2.buf);
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ bch2_inconsistent_error(c);
+ return -1;
+ }
+
+ if (!new_s) {
+ bch2_stripes_heap_del(c, m, idx);
+
+ memset(m, 0, sizeof(*m));
+ } else {
+ stripe_to_mem(m, new_s);
+
+ if (!old_s)
+ bch2_stripes_heap_insert(c, m, idx);
+ else
+ bch2_stripes_heap_update(c, m, idx);
+ }
+ }
+
+ return 0;
+}
+
+/* returns blocknr in stripe that we matched: */
+static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
+ struct bkey_s_c k, unsigned *block)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ for (i = 0; i < nr_data; i++)
+ if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
+ le16_to_cpu(s->sectors))) {
+ *block = i;
+ return ptr;
+ }
+
+ return NULL;
+}
+
+static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_extent: {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const union bch_extent_entry *entry;
+
+ extent_for_each_entry(e, entry)
+ if (extent_entry_type(entry) ==
+ BCH_EXTENT_ENTRY_stripe_ptr &&
+ entry->stripe_ptr.idx == idx)
+ return true;
+
+ break;
+ }
+ }
+
+ return false;
+}
+
+/* Stripe bufs: */
+
+static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
+{
+ if (buf->key.k.type == KEY_TYPE_stripe) {
+ struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key);
+ unsigned i;
+
+ for (i = 0; i < s->v.nr_blocks; i++) {
+ kvfree(buf->data[i]);
+ buf->data[i] = NULL;
+ }
+ }
+}
+
+/* XXX: this is a non-mempoolified memory allocation: */
+static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
+ unsigned offset, unsigned size)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+ unsigned csum_granularity = 1U << v->csum_granularity_bits;
+ unsigned end = offset + size;
+ unsigned i;
+
+ BUG_ON(end > le16_to_cpu(v->sectors));
+
+ offset = round_down(offset, csum_granularity);
+ end = min_t(unsigned, le16_to_cpu(v->sectors),
+ round_up(end, csum_granularity));
+
+ buf->offset = offset;
+ buf->size = end - offset;
+
+ memset(buf->valid, 0xFF, sizeof(buf->valid));
+
+ for (i = 0; i < v->nr_blocks; i++) {
+ buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
+ if (!buf->data[i])
+ goto err;
+ }
+
+ return 0;
+err:
+ ec_stripe_buf_exit(buf);
+ return -BCH_ERR_ENOMEM_stripe_buf;
+}
+
+/* Checksumming: */
+
+static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
+ unsigned block, unsigned offset)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+ unsigned csum_granularity = 1 << v->csum_granularity_bits;
+ unsigned end = buf->offset + buf->size;
+ unsigned len = min(csum_granularity, end - offset);
+
+ BUG_ON(offset >= end);
+ BUG_ON(offset < buf->offset);
+ BUG_ON(offset & (csum_granularity - 1));
+ BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
+ (len & (csum_granularity - 1)));
+
+ return bch2_checksum(NULL, v->csum_type,
+ null_nonce(),
+ buf->data[block] + ((offset - buf->offset) << 9),
+ len << 9);
+}
+
+static void ec_generate_checksums(struct ec_stripe_buf *buf)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+ unsigned i, j, csums_per_device = stripe_csums_per_device(v);
+
+ if (!v->csum_type)
+ return;
+
+ BUG_ON(buf->offset);
+ BUG_ON(buf->size != le16_to_cpu(v->sectors));
+
+ for (i = 0; i < v->nr_blocks; i++)
+ for (j = 0; j < csums_per_device; j++)
+ stripe_csum_set(v, i, j,
+ ec_block_checksum(buf, i, j << v->csum_granularity_bits));
+}
+
+static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+ unsigned csum_granularity = 1 << v->csum_granularity_bits;
+ unsigned i;
+
+ if (!v->csum_type)
+ return;
+
+ for (i = 0; i < v->nr_blocks; i++) {
+ unsigned offset = buf->offset;
+ unsigned end = buf->offset + buf->size;
+
+ if (!test_bit(i, buf->valid))
+ continue;
+
+ while (offset < end) {
+ unsigned j = offset >> v->csum_granularity_bits;
+ unsigned len = min(csum_granularity, end - offset);
+ struct bch_csum want = stripe_csum_get(v, i, j);
+ struct bch_csum got = ec_block_checksum(buf, i, offset);
+
+ if (bch2_crc_cmp(want, got)) {
+ struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev);
+ if (ca) {
+ struct printbuf err = PRINTBUF;
+
+ prt_str(&err, "stripe ");
+ bch2_csum_err_msg(&err, v->csum_type, want, got);
+ prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
+ bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
+ bch_err_ratelimited(ca, "%s", err.buf);
+ printbuf_exit(&err);
+
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+ }
+
+ clear_bit(i, buf->valid);
+ break;
+ }
+
+ offset += len;
+ }
+ }
+}
+
+/* Erasure coding: */
+
+static void ec_generate_ec(struct ec_stripe_buf *buf)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+ unsigned nr_data = v->nr_blocks - v->nr_redundant;
+ unsigned bytes = le16_to_cpu(v->sectors) << 9;
+
+ raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
+}
+
+static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+
+ return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks);
+}
+
+static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+ unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
+ unsigned nr_data = v->nr_blocks - v->nr_redundant;
+ unsigned bytes = buf->size << 9;
+
+ if (ec_nr_failed(buf) > v->nr_redundant) {
+ bch_err_ratelimited(c,
+ "error doing reconstruct read: unable to read enough blocks");
+ return -1;
+ }
+
+ for (i = 0; i < nr_data; i++)
+ if (!test_bit(i, buf->valid))
+ failed[nr_failed++] = i;
+
+ raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
+ return 0;
+}
+
+/* IO: */
+
+static void ec_block_endio(struct bio *bio)
+{
+ struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
+ struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v;
+ struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
+ struct bch_dev *ca = ec_bio->ca;
+ struct closure *cl = bio->bi_private;
+
+ if (bch2_dev_io_err_on(bio->bi_status, ca,
+ bio_data_dir(bio)
+ ? BCH_MEMBER_ERROR_write
+ : BCH_MEMBER_ERROR_read,
+ "erasure coding %s error: %s",
+ str_write_read(bio_data_dir(bio)),
+ bch2_blk_status_to_str(bio->bi_status)))
+ clear_bit(ec_bio->idx, ec_bio->buf->valid);
+
+ int stale = dev_ptr_stale(ca, ptr);
+ if (stale) {
+ bch_err_ratelimited(ca->fs,
+ "error %s stripe: stale/invalid pointer (%i) after io",
+ bio_data_dir(bio) == READ ? "reading from" : "writing to",
+ stale);
+ clear_bit(ec_bio->idx, ec_bio->buf->valid);
+ }
+
+ bio_put(&ec_bio->bio);
+ percpu_ref_put(&ca->io_ref);
+ closure_put(cl);
+}
+
+static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
+ blk_opf_t opf, unsigned idx, struct closure *cl)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+ unsigned offset = 0, bytes = buf->size << 9;
+ struct bch_extent_ptr *ptr = &v->ptrs[idx];
+ enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
+ ? BCH_DATA_user
+ : BCH_DATA_parity;
+ int rw = op_is_write(opf);
+
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw);
+ if (!ca) {
+ clear_bit(idx, buf->valid);
+ return;
+ }
+
+ int stale = dev_ptr_stale(ca, ptr);
+ if (stale) {
+ bch_err_ratelimited(c,
+ "error %s stripe: stale pointer (%i)",
+ rw == READ ? "reading from" : "writing to",
+ stale);
+ clear_bit(idx, buf->valid);
+ return;
+ }
+
+ this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
+
+ while (offset < bytes) {
+ unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
+ DIV_ROUND_UP(bytes, PAGE_SIZE));
+ unsigned b = min_t(size_t, bytes - offset,
+ nr_iovecs << PAGE_SHIFT);
+ struct ec_bio *ec_bio;
+
+ ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
+ nr_iovecs,
+ opf,
+ GFP_KERNEL,
+ &c->ec_bioset),
+ struct ec_bio, bio);
+
+ ec_bio->ca = ca;
+ ec_bio->buf = buf;
+ ec_bio->idx = idx;
+
+ ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
+ ec_bio->bio.bi_end_io = ec_block_endio;
+ ec_bio->bio.bi_private = cl;
+
+ bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
+
+ closure_get(cl);
+ percpu_ref_get(&ca->io_ref);
+
+ submit_bio(&ec_bio->bio);
+
+ offset += b;
+ }
+
+ percpu_ref_put(&ca->io_ref);
+}
+
+static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
+ struct ec_stripe_buf *stripe)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
+ POS(0, idx), BTREE_ITER_slots);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+ if (k.k->type != KEY_TYPE_stripe) {
+ ret = -ENOENT;
+ goto err;
+ }
+ bkey_reassemble(&stripe->key, k);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/* recovery read path: */
+int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
+ struct bkey_s_c orig_k)
+{
+ struct bch_fs *c = trans->c;
+ struct ec_stripe_buf *buf = NULL;
+ struct closure cl;
+ struct bch_stripe *v;
+ unsigned i, offset;
+ const char *msg = NULL;
+ struct printbuf msgbuf = PRINTBUF;
+ int ret = 0;
+
+ closure_init_stack(&cl);
+
+ BUG_ON(!rbio->pick.has_ec);
+
+ buf = kzalloc(sizeof(*buf), GFP_NOFS);
+ if (!buf)
+ return -BCH_ERR_ENOMEM_ec_read_extent;
+
+ ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
+ if (ret) {
+ msg = "stripe not found";
+ goto err;
+ }
+
+ v = &bkey_i_to_stripe(&buf->key)->v;
+
+ if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
+ msg = "pointer doesn't match stripe";
+ goto err;
+ }
+
+ offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
+ if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
+ msg = "read is bigger than stripe";
+ goto err;
+ }
+
+ ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
+ if (ret) {
+ msg = "-ENOMEM";
+ goto err;
+ }
+
+ for (i = 0; i < v->nr_blocks; i++)
+ ec_block_io(c, buf, REQ_OP_READ, i, &cl);
+
+ closure_sync(&cl);
+
+ if (ec_nr_failed(buf) > v->nr_redundant) {
+ msg = "unable to read enough blocks";
+ goto err;
+ }
+
+ ec_validate_checksums(c, buf);
+
+ ret = ec_do_recov(c, buf);
+ if (ret)
+ goto err;
+
+ memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
+ buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
+out:
+ ec_stripe_buf_exit(buf);
+ kfree(buf);
+ return ret;
+err:
+ bch2_bkey_val_to_text(&msgbuf, c, orig_k);
+ bch_err_ratelimited(c,
+ "error doing reconstruct read: %s\n %s", msg, msgbuf.buf);
+ printbuf_exit(&msgbuf);
+ ret = -BCH_ERR_stripe_reconstruct;
+ goto out;
+}
+
+/* stripe bucket accounting: */
+
+static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
+{
+ ec_stripes_heap n, *h = &c->ec_stripes_heap;
+
+ if (idx >= h->size) {
+ if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
+ return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
+
+ mutex_lock(&c->ec_stripes_heap_lock);
+ if (n.size > h->size) {
+ memcpy(n.data, h->data, h->nr * sizeof(h->data[0]));
+ n.nr = h->nr;
+ swap(*h, n);
+ }
+ mutex_unlock(&c->ec_stripes_heap_lock);
+
+ free_heap(&n);
+ }
+
+ if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
+ return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
+
+ if (c->gc_pos.phase != GC_PHASE_not_running &&
+ !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
+ return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
+
+ return 0;
+}
+
+static int ec_stripe_mem_alloc(struct btree_trans *trans,
+ struct btree_iter *iter)
+{
+ return allocate_dropping_locks_errcode(trans,
+ __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
+}
+
+/*
+ * Hash table of open stripes:
+ * Stripes that are being created or modified are kept in a hash table, so that
+ * stripe deletion can skip them.
+ */
+
+static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
+{
+ unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
+ struct ec_stripe_new *s;
+
+ hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash)
+ if (s->idx == idx)
+ return true;
+ return false;
+}
+
+static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx)
+{
+ bool ret = false;
+
+ spin_lock(&c->ec_stripes_new_lock);
+ ret = __bch2_stripe_is_open(c, idx);
+ spin_unlock(&c->ec_stripes_new_lock);
+
+ return ret;
+}
+
+static bool bch2_try_open_stripe(struct bch_fs *c,
+ struct ec_stripe_new *s,
+ u64 idx)
+{
+ bool ret;
+
+ spin_lock(&c->ec_stripes_new_lock);
+ ret = !__bch2_stripe_is_open(c, idx);
+ if (ret) {
+ unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
+
+ s->idx = idx;
+ hlist_add_head(&s->hash, &c->ec_stripes_new[hash]);
+ }
+ spin_unlock(&c->ec_stripes_new_lock);
+
+ return ret;
+}
+
+static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
+{
+ BUG_ON(!s->idx);
+
+ spin_lock(&c->ec_stripes_new_lock);
+ hlist_del_init(&s->hash);
+ spin_unlock(&c->ec_stripes_new_lock);
+
+ s->idx = 0;
+}
+
+/* Heap of all existing stripes, ordered by blocks_nonempty */
+
+static u64 stripe_idx_to_delete(struct bch_fs *c)
+{
+ ec_stripes_heap *h = &c->ec_stripes_heap;
+
+ lockdep_assert_held(&c->ec_stripes_heap_lock);
+
+ if (h->nr &&
+ h->data[0].blocks_nonempty == 0 &&
+ !bch2_stripe_is_open(c, h->data[0].idx))
+ return h->data[0].idx;
+
+ return 0;
+}
+
+static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
+ size_t i)
+{
+ struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
+
+ genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
+}
+
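+/*
+ * min-heap "less" callback: (a > b) < (a < b) evaluates to 1 iff a < b, so the
+ * stripe with the fewest nonempty blocks sorts to the top of the heap.
+ */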
+static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args)
+{
+ struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
+ struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
+
+ return ((_l->blocks_nonempty > _r->blocks_nonempty) <
+ (_l->blocks_nonempty < _r->blocks_nonempty));
+}
+
+static inline void ec_stripes_heap_swap(void *l, void *r, void *h)
+{
+ struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
+ struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
+ ec_stripes_heap *_h = (ec_stripes_heap *)h;
+ size_t i = _l - _h->data;
+ size_t j = _r - _h->data;
+
+ swap(*_l, *_r);
+
+ ec_stripes_heap_set_backpointer(_h, i);
+ ec_stripes_heap_set_backpointer(_h, j);
+}
+
+static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
+{
+ ec_stripes_heap *h = &c->ec_stripes_heap;
+ struct stripe *m = genradix_ptr(&c->stripes, idx);
+
+ BUG_ON(m->heap_idx >= h->nr);
+ BUG_ON(h->data[m->heap_idx].idx != idx);
+}
+
+void bch2_stripes_heap_del(struct bch_fs *c,
+ struct stripe *m, size_t idx)
+{
+ const struct min_heap_callbacks callbacks = {
+ .less = ec_stripes_heap_cmp,
+ .swp = ec_stripes_heap_swap,
+ };
+
+ mutex_lock(&c->ec_stripes_heap_lock);
+ heap_verify_backpointer(c, idx);
+
+ min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap);
+ mutex_unlock(&c->ec_stripes_heap_lock);
+}
+
+void bch2_stripes_heap_insert(struct bch_fs *c,
+ struct stripe *m, size_t idx)
+{
+ const struct min_heap_callbacks callbacks = {
+ .less = ec_stripes_heap_cmp,
+ .swp = ec_stripes_heap_swap,
+ };
+
+ mutex_lock(&c->ec_stripes_heap_lock);
+ BUG_ON(min_heap_full(&c->ec_stripes_heap));
+
+ genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr;
+ min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) {
+ .idx = idx,
+ .blocks_nonempty = m->blocks_nonempty,
+ }),
+ &callbacks,
+ &c->ec_stripes_heap);
+
+ heap_verify_backpointer(c, idx);
+ mutex_unlock(&c->ec_stripes_heap_lock);
+}
+
+void bch2_stripes_heap_update(struct bch_fs *c,
+ struct stripe *m, size_t idx)
+{
+ const struct min_heap_callbacks callbacks = {
+ .less = ec_stripes_heap_cmp,
+ .swp = ec_stripes_heap_swap,
+ };
+ ec_stripes_heap *h = &c->ec_stripes_heap;
+ bool do_deletes;
+ size_t i;
+
+ mutex_lock(&c->ec_stripes_heap_lock);
+ heap_verify_backpointer(c, idx);
+
+ h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
+
+ i = m->heap_idx;
+ min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap);
+ min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap);
+
+ heap_verify_backpointer(c, idx);
+
+ do_deletes = stripe_idx_to_delete(c) != 0;
+ mutex_unlock(&c->ec_stripes_heap_lock);
+
+ if (do_deletes)
+ bch2_do_stripe_deletes(c);
+}
+
+/* stripe deletion */
+
+static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_stripe s;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
+ BTREE_ITER_intent);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_stripe) {
+ bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ s = bkey_s_c_to_stripe(k);
+ for (unsigned i = 0; i < s.v->nr_blocks; i++)
+ if (stripe_blockcount_get(s.v, i)) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
+ printbuf_exit(&buf);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static void ec_stripe_delete_work(struct work_struct *work)
+{
+ struct bch_fs *c =
+ container_of(work, struct bch_fs, ec_stripe_delete_work);
+
+ while (1) {
+ mutex_lock(&c->ec_stripes_heap_lock);
+ u64 idx = stripe_idx_to_delete(c);
+ mutex_unlock(&c->ec_stripes_heap_lock);
+
+ if (!idx)
+ break;
+
+ int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ ec_stripe_delete(trans, idx));
+ bch_err_fn(c, ret);
+ if (ret)
+ break;
+ }
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
+}
+
+void bch2_do_stripe_deletes(struct bch_fs *c)
+{
+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
+ !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
+}
+
+/* stripe creation: */
+
+static int ec_stripe_key_update(struct btree_trans *trans,
+ struct bkey_i_stripe *old,
+ struct bkey_i_stripe *new)
+{
+ struct bch_fs *c = trans->c;
+ bool create = !old;
+
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
+ new->k.p, BTREE_ITER_intent);
+ int ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe),
+ c, "error %s stripe: got existing key type %s",
+ create ? "creating" : "updating",
+ bch2_bkey_types[k.k->type])) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (k.k->type == KEY_TYPE_stripe) {
+ const struct bch_stripe *v = bkey_s_c_to_stripe(k).v;
+
+ BUG_ON(old->v.nr_blocks != new->v.nr_blocks);
+ BUG_ON(old->v.nr_blocks != v->nr_blocks);
+
+ for (unsigned i = 0; i < new->v.nr_blocks; i++) {
+ unsigned sectors = stripe_blockcount_get(v, i);
+
+ if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "stripe changed nonempty block %u", i);
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i));
+ bch2_fs_inconsistent(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /*
+ * If the stripe ptr changed underneath us, it must have
+			 * been dev_remove_stripes() -> invalidate_stripe_to_dev()
+ */
+ if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) {
+ BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID);
+
+ if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]))
+ new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID;
+ }
+
+ stripe_blockcount_set(&new->v, i, sectors);
+ }
+ }
+
+ ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int ec_stripe_update_extent(struct btree_trans *trans,
+ struct bch_dev *ca,
+ struct bpos bucket, u8 gen,
+ struct ec_stripe_buf *s,
+ struct bkey_s_c_backpointer bp)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ const struct bch_extent_ptr *ptr_c;
+ struct bch_extent_ptr *ec_ptr = NULL;
+ struct bch_extent_stripe_ptr stripe_ptr;
+ struct bkey_i *n;
+ int ret, dev, block;
+
+ if (bp.v->level) {
+ struct printbuf buf = PRINTBUF;
+ struct btree_iter node_iter;
+ struct btree *b;
+
+ b = bch2_backpointer_get_node(trans, bp, &node_iter);
+ bch2_trans_iter_exit(trans, &node_iter);
+
+ if (!b)
+ return 0;
+
+ prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
+ bch2_bkey_val_to_text(&buf, c, bp.s_c);
+
+ bch2_fs_inconsistent(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return -EIO;
+ }
+
+ k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+ if (!k.k) {
+ /*
+ * extent no longer exists - we could flush the btree
+ * write buffer and retry to verify, but no need:
+ */
+ return 0;
+ }
+
+ if (extent_has_stripe_ptr(k, s->key.k.p.offset))
+ goto out;
+
+ ptr_c = bkey_matches_stripe(v, k, &block);
+ /*
+ * It doesn't generally make sense to erasure code cached ptrs:
+ * XXX: should we be incrementing a counter?
+ */
+ if (!ptr_c || ptr_c->cached)
+ goto out;
+
+ dev = v->ptrs[block].dev;
+
+ n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto out;
+
+ bkey_reassemble(n, k);
+
+ bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev != dev);
+ ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
+ BUG_ON(!ec_ptr);
+
+ stripe_ptr = (struct bch_extent_stripe_ptr) {
+ .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
+ .block = block,
+ .redundancy = v->nr_redundant,
+ .idx = s->key.k.p.offset,
+ };
+
+ __extent_entry_insert(n,
+ (union bch_extent_entry *) ec_ptr,
+ (union bch_extent_entry *) &stripe_ptr);
+
+ ret = bch2_trans_update(trans, &iter, n, 0);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
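+/*
+ * Walk the backpointers for one of the stripe's data buckets and update each
+ * extent that points into it:
+ */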
+static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
+ unsigned block)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
+ struct bch_extent_ptr ptr = v->ptrs[block];
+ int ret = 0;
+
+ struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
+ if (!ca)
+ return -EIO;
+
+ struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
+
+ ret = for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp_start(ca, bucket_pos),
+ bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k,
+ NULL, NULL,
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc, ({
+ if (bkey_ge(bp_k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket_pos), 0)))
+ break;
+
+ if (bp_k.k->type != KEY_TYPE_backpointer)
+ continue;
+
+ ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s,
+ bkey_s_c_to_backpointer(bp_k));
+ }));
+
+ bch2_dev_put(ca);
+ return ret;
+}
+
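+/*
+ * Flush the btree write buffer so backpointers are current, then update the
+ * extents in every data bucket of the stripe:
+ */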
+static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
+ unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
+ int ret = 0;
+
+ ret = bch2_btree_write_buffer_flush_sync(trans);
+ if (ret)
+ goto err;
+
+ for (i = 0; i < nr_data; i++) {
+ ret = ec_stripe_update_bucket(trans, s, i);
+ if (ret)
+ break;
+ }
+err:
+ bch2_trans_put(trans);
+
+ return ret;
+}
+
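+/*
+ * A data bucket wasn't completely filled: zero out the unused tail, both in
+ * the in-memory stripe buffer (used for the parity computation) and on disk:
+ */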
+static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
+ struct ec_stripe_new *s,
+ unsigned block,
+ struct open_bucket *ob)
+{
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE);
+ if (!ca) {
+ s->err = -BCH_ERR_erofs_no_writes;
+ return;
+ }
+
+ unsigned offset = ca->mi.bucket_size - ob->sectors_free;
+ memset(s->new_stripe.data[block] + (offset << 9),
+ 0,
+ ob->sectors_free << 9);
+
+ int ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
+ ob->bucket * ca->mi.bucket_size + offset,
+ ob->sectors_free,
+ GFP_KERNEL, 0);
+
+ percpu_ref_put(&ca->io_ref);
+
+ if (ret)
+ s->err = ret;
+}
+
+void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
+{
+ if (s->idx)
+ bch2_stripe_close(c, s);
+ kfree(s);
+}
+
+/*
+ * data buckets of new stripe all written: create the stripe
+ */
+static void ec_stripe_create(struct ec_stripe_new *s)
+{
+ struct bch_fs *c = s->c;
+ struct open_bucket *ob;
+ struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
+ unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
+ int ret;
+
+ BUG_ON(s->h->s == s);
+
+ closure_sync(&s->iodone);
+
+ if (!s->err) {
+ for (i = 0; i < nr_data; i++)
+ if (s->blocks[i]) {
+ ob = c->open_buckets + s->blocks[i];
+
+ if (ob->sectors_free)
+ zero_out_rest_of_ec_bucket(c, s, i, ob);
+ }
+ }
+
+ if (s->err) {
+ if (!bch2_err_matches(s->err, EROFS))
+ bch_err(c, "error creating stripe: error writing data buckets");
+ goto err;
+ }
+
+ if (s->have_existing_stripe) {
+ ec_validate_checksums(c, &s->existing_stripe);
+
+ if (ec_do_recov(c, &s->existing_stripe)) {
+ bch_err(c, "error creating stripe: error reading existing stripe");
+ goto err;
+ }
+
+ for (i = 0; i < nr_data; i++)
+ if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i))
+ swap(s->new_stripe.data[i],
+ s->existing_stripe.data[i]);
+
+ ec_stripe_buf_exit(&s->existing_stripe);
+ }
+
+ BUG_ON(!s->allocated);
+ BUG_ON(!s->idx);
+
+ ec_generate_ec(&s->new_stripe);
+
+ ec_generate_checksums(&s->new_stripe);
+
+ /* write p/q: */
+ for (i = nr_data; i < v->nr_blocks; i++)
+ ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
+ closure_sync(&s->iodone);
+
+ if (ec_nr_failed(&s->new_stripe)) {
+ bch_err(c, "error creating stripe: error writing redundancy buckets");
+ goto err;
+ }
+
+ ret = bch2_trans_commit_do(c, &s->res, NULL,
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc,
+ ec_stripe_key_update(trans,
+ s->have_existing_stripe
+ ? bkey_i_to_stripe(&s->existing_stripe.key)
+ : NULL,
+ bkey_i_to_stripe(&s->new_stripe.key)));
+ bch_err_msg(c, ret, "creating stripe key");
+ if (ret)
+ goto err;
+
+ ret = ec_stripe_update_extents(c, &s->new_stripe);
+ bch_err_msg(c, ret, "error updating extents");
+ if (ret)
+ goto err;
+err:
+ bch2_disk_reservation_put(c, &s->res);
+
+ for (i = 0; i < v->nr_blocks; i++)
+ if (s->blocks[i]) {
+ ob = c->open_buckets + s->blocks[i];
+
+ if (i < nr_data) {
+ ob->ec = NULL;
+ __bch2_open_bucket_put(c, ob);
+ } else {
+ bch2_open_bucket_put(c, ob);
+ }
+ }
+
+ mutex_lock(&c->ec_stripe_new_lock);
+ list_del(&s->list);
+ mutex_unlock(&c->ec_stripe_new_lock);
+ wake_up(&c->ec_stripe_new_wait);
+
+ ec_stripe_buf_exit(&s->existing_stripe);
+ ec_stripe_buf_exit(&s->new_stripe);
+ closure_debug_destroy(&s->iodone);
+
+ ec_stripe_new_put(c, s, STRIPE_REF_stripe);
+}
+
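+/* Find a pending stripe whose io ref has dropped to zero - data writes done: */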
+static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
+{
+ struct ec_stripe_new *s;
+
+ mutex_lock(&c->ec_stripe_new_lock);
+ list_for_each_entry(s, &c->ec_stripe_new_list, list)
+ if (!atomic_read(&s->ref[STRIPE_REF_io]))
+ goto out;
+ s = NULL;
+out:
+ mutex_unlock(&c->ec_stripe_new_lock);
+
+ return s;
+}
+
+static void ec_stripe_create_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work,
+ struct bch_fs, ec_stripe_create_work);
+ struct ec_stripe_new *s;
+
+ while ((s = get_pending_stripe(c)))
+ ec_stripe_create(s);
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
+}
+
+void bch2_ec_do_stripe_creates(struct bch_fs *c)
+{
+ bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create);
+
+ if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
+}
+
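+/*
+ * Detach a stripe from its head and move it to the pending list, dropping the
+ * io ref that was keeping it open for new writes:
+ */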
+static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
+{
+ struct ec_stripe_new *s = h->s;
+
+ lockdep_assert_held(&h->lock);
+
+ BUG_ON(!s->allocated && !s->err);
+
+ h->s = NULL;
+ s->pending = true;
+
+ mutex_lock(&c->ec_stripe_new_lock);
+ list_add(&s->list, &c->ec_stripe_new_list);
+ mutex_unlock(&c->ec_stripe_new_lock);
+
+ ec_stripe_new_put(c, s, STRIPE_REF_io);
+}
+
+static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int err)
+{
+ h->s->err = err;
+ ec_stripe_new_set_pending(c, h);
+}
+
+void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
+{
+ struct ec_stripe_new *s = ob->ec;
+
+ s->err = -EIO;
+}
+
+void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
+{
+ struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
+ if (!ob)
+ return NULL;
+
+ BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);
+
+ struct bch_dev *ca = ob_dev(c, ob);
+ unsigned offset = ca->mi.bucket_size - ob->sectors_free;
+
+ return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
+}
+
+static int unsigned_cmp(const void *_l, const void *_r)
+{
+ unsigned l = *((const unsigned *) _l);
+ unsigned r = *((const unsigned *) _r);
+
+ return cmp_int(l, r);
+}
+
+/* pick most common bucket size: */
+static unsigned pick_blocksize(struct bch_fs *c,
+ struct bch_devs_mask *devs)
+{
+ unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX];
+ struct {
+ unsigned nr, size;
+ } cur = { 0, 0 }, best = { 0, 0 };
+
+ for_each_member_device_rcu(c, ca, devs)
+ sizes[nr++] = ca->mi.bucket_size;
+
+ sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);
+
+ for (unsigned i = 0; i < nr; i++) {
+ if (sizes[i] != cur.size) {
+ if (cur.nr > best.nr)
+ best = cur;
+
+ cur.nr = 0;
+ cur.size = sizes[i];
+ }
+
+ cur.nr++;
+ }
+
+ if (cur.nr > best.nr)
+ best = cur;
+
+ return best.size;
+}
+
+static bool may_create_new_stripe(struct bch_fs *c)
+{
+ return false;
+}
+
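+/*
+ * Initialize a new stripe key; if the value wouldn't fit in a bkey, coarsen
+ * the checksum granularity until it does:
+ */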
+static void ec_stripe_key_init(struct bch_fs *c,
+ struct bkey_i *k,
+ unsigned nr_data,
+ unsigned nr_parity,
+ unsigned stripe_size,
+ unsigned disk_label)
+{
+ struct bkey_i_stripe *s = bkey_stripe_init(k);
+ unsigned u64s;
+
+ s->v.sectors = cpu_to_le16(stripe_size);
+ s->v.algorithm = 0;
+ s->v.nr_blocks = nr_data + nr_parity;
+ s->v.nr_redundant = nr_parity;
+ s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9);
+ s->v.csum_type = BCH_CSUM_crc32c;
+ s->v.disk_label = disk_label;
+
+ while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
+ BUG_ON(1 << s->v.csum_granularity_bits >=
+ le16_to_cpu(s->v.sectors) ||
+ s->v.csum_granularity_bits == U8_MAX);
+ s->v.csum_granularity_bits++;
+ }
+
+ set_bkey_val_u64s(&s->k, u64s);
+}
+
+static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
+{
+ struct ec_stripe_new *s;
+
+ lockdep_assert_held(&h->lock);
+
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return NULL;
+
+ mutex_init(&s->lock);
+ closure_init(&s->iodone, NULL);
+ atomic_set(&s->ref[STRIPE_REF_stripe], 1);
+ atomic_set(&s->ref[STRIPE_REF_io], 1);
+ s->c = c;
+ s->h = h;
+ s->nr_data = min_t(unsigned, h->nr_active_devs,
+ BCH_BKEY_PTRS_MAX) - h->redundancy;
+ s->nr_parity = h->redundancy;
+
+ ec_stripe_key_init(c, &s->new_stripe.key,
+ s->nr_data, s->nr_parity,
+ h->blocksize, h->disk_label);
+ return s;
+}
+
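+/*
+ * Recompute the devices this stripe head may allocate from: rw devices in the
+ * target with nonzero durability, restricted to the most common bucket size:
+ */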
+static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h)
+{
+ struct bch_devs_mask devs = h->devs;
+
+ rcu_read_lock();
+ h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
+ ? group_to_target(h->disk_label - 1)
+ : 0);
+ unsigned nr_devs = dev_mask_nr(&h->devs);
+
+ for_each_member_device_rcu(c, ca, &h->devs)
+ if (!ca->mi.durability)
+ __clear_bit(ca->dev_idx, h->devs.d);
+ unsigned nr_devs_with_durability = dev_mask_nr(&h->devs);
+
+ h->blocksize = pick_blocksize(c, &h->devs);
+
+ h->nr_active_devs = 0;
+ for_each_member_device_rcu(c, ca, &h->devs)
+ if (ca->mi.bucket_size == h->blocksize)
+ h->nr_active_devs++;
+
+ rcu_read_unlock();
+
+ /*
+ * If we only have redundancy + 1 devices, we're better off with just
+ * replication:
+ */
+ h->insufficient_devs = h->nr_active_devs < h->redundancy + 2;
+
+ if (h->insufficient_devs) {
+ const char *err;
+
+ if (nr_devs < h->redundancy + 2)
+ err = NULL;
+ else if (nr_devs_with_durability < h->redundancy + 2)
+ err = "cannot use durability=0 devices";
+ else
+ err = "mismatched bucket sizes";
+
+ if (err)
+ bch_err(c, "insufficient devices available to create stripe (have %u, need %u): %s",
+ h->nr_active_devs, h->redundancy + 2, err);
+ }
+
+ struct bch_devs_mask devs_leaving;
+ bitmap_andnot(devs_leaving.d, devs.d, h->devs.d, BCH_SB_MEMBERS_MAX);
+
+ if (h->s && !h->s->allocated && dev_mask_nr(&devs_leaving))
+ ec_stripe_new_cancel(c, h, -EINTR);
+
+ h->rw_devs_change_count = c->rw_devs_change_count;
+}
+
+static struct ec_stripe_head *
+ec_new_stripe_head_alloc(struct bch_fs *c, unsigned disk_label,
+ unsigned algo, unsigned redundancy,
+ enum bch_watermark watermark)
+{
+ struct ec_stripe_head *h;
+
+ h = kzalloc(sizeof(*h), GFP_KERNEL);
+ if (!h)
+ return NULL;
+
+ mutex_init(&h->lock);
+ BUG_ON(!mutex_trylock(&h->lock));
+
+ h->disk_label = disk_label;
+ h->algo = algo;
+ h->redundancy = redundancy;
+ h->watermark = watermark;
+
+ list_add(&h->list, &c->ec_stripe_head_list);
+ return h;
+}
+
+void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
+{
+ if (h->s &&
+ h->s->allocated &&
+ bitmap_weight(h->s->blocks_allocated,
+ h->s->nr_data) == h->s->nr_data)
+ ec_stripe_new_set_pending(c, h);
+
+ mutex_unlock(&h->lock);
+}
+
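+/*
+ * Find or allocate the stripe head for this (disk label, algo, redundancy,
+ * watermark) tuple, returned with h->lock held, or NULL if there aren't
+ * enough usable devices:
+ */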
+static struct ec_stripe_head *
+__bch2_ec_stripe_head_get(struct btree_trans *trans,
+ unsigned disk_label,
+ unsigned algo,
+ unsigned redundancy,
+ enum bch_watermark watermark)
+{
+ struct bch_fs *c = trans->c;
+ struct ec_stripe_head *h;
+ int ret;
+
+ if (!redundancy)
+ return NULL;
+
+ ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (test_bit(BCH_FS_going_ro, &c->flags)) {
+ h = ERR_PTR(-BCH_ERR_erofs_no_writes);
+ goto err;
+ }
+
+ list_for_each_entry(h, &c->ec_stripe_head_list, list)
+ if (h->disk_label == disk_label &&
+ h->algo == algo &&
+ h->redundancy == redundancy &&
+ h->watermark == watermark) {
+ ret = bch2_trans_mutex_lock(trans, &h->lock);
+ if (ret) {
+ h = ERR_PTR(ret);
+ goto err;
+ }
+ goto found;
+ }
+
+ h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark);
+ if (!h) {
+ h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc);
+ goto err;
+ }
+found:
+ if (h->rw_devs_change_count != c->rw_devs_change_count)
+ ec_stripe_head_devs_update(c, h);
+
+ if (h->insufficient_devs) {
+ mutex_unlock(&h->lock);
+ h = NULL;
+ }
+err:
+ mutex_unlock(&c->ec_stripe_head_lock);
+ return h;
+}
+
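+/*
+ * Allocate the parity and data buckets the stripe is still missing, skipping
+ * devices that already hold one of its blocks:
+ */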
+static int new_stripe_alloc_buckets(struct btree_trans *trans,
+ struct ec_stripe_head *h, struct ec_stripe_new *s,
+ enum bch_watermark watermark, struct closure *cl)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_devs_mask devs = h->devs;
+ struct open_bucket *ob;
+ struct open_buckets buckets;
+ struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
+ unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
+ bool have_cache = true;
+ int ret = 0;
+
+ BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity);
+ BUG_ON(v->nr_redundant != s->nr_parity);
+
+ /* We bypass the sector allocator which normally does this: */
+ bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
+
+ for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) {
+ /*
+ * Note: we don't yet repair invalid blocks (failed/removed
+ * devices) when reusing stripes - we still need a codepath to
+ * walk backpointers and update all extents that point to that
+ * block when updating the stripe
+ */
+ if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID)
+ __clear_bit(v->ptrs[i].dev, devs.d);
+
+ if (i < s->nr_data)
+ nr_have_data++;
+ else
+ nr_have_parity++;
+ }
+
+ BUG_ON(nr_have_data > s->nr_data);
+ BUG_ON(nr_have_parity > s->nr_parity);
+
+ buckets.nr = 0;
+ if (nr_have_parity < s->nr_parity) {
+ ret = bch2_bucket_alloc_set_trans(trans, &buckets,
+ &h->parity_stripe,
+ &devs,
+ s->nr_parity,
+ &nr_have_parity,
+ &have_cache, 0,
+ BCH_DATA_parity,
+ watermark,
+ cl);
+
+ open_bucket_for_each(c, &buckets, ob, i) {
+ j = find_next_zero_bit(s->blocks_gotten,
+ s->nr_data + s->nr_parity,
+ s->nr_data);
+ BUG_ON(j >= s->nr_data + s->nr_parity);
+
+ s->blocks[j] = buckets.v[i];
+ v->ptrs[j] = bch2_ob_ptr(c, ob);
+ __set_bit(j, s->blocks_gotten);
+ }
+
+ if (ret)
+ return ret;
+ }
+
+ buckets.nr = 0;
+ if (nr_have_data < s->nr_data) {
+ ret = bch2_bucket_alloc_set_trans(trans, &buckets,
+ &h->block_stripe,
+ &devs,
+ s->nr_data,
+ &nr_have_data,
+ &have_cache, 0,
+ BCH_DATA_user,
+ watermark,
+ cl);
+
+ open_bucket_for_each(c, &buckets, ob, i) {
+ j = find_next_zero_bit(s->blocks_gotten,
+ s->nr_data, 0);
+ BUG_ON(j >= s->nr_data);
+
+ s->blocks[j] = buckets.v[i];
+ v->ptrs[j] = bch2_ob_ptr(c, ob);
+ __set_bit(j, s->blocks_gotten);
+ }
+
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
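+/*
+ * Find an existing stripe with empty blocks to reuse - it must match this
+ * head's disk label, algorithm, redundancy and bucket size:
+ */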
+static s64 get_existing_stripe(struct bch_fs *c,
+ struct ec_stripe_head *head)
+{
+ ec_stripes_heap *h = &c->ec_stripes_heap;
+ struct stripe *m;
+ size_t heap_idx;
+ u64 stripe_idx;
+ s64 ret = -1;
+
+ if (may_create_new_stripe(c))
+ return -1;
+
+ mutex_lock(&c->ec_stripes_heap_lock);
+ for (heap_idx = 0; heap_idx < h->nr; heap_idx++) {
+ /* No blocks worth reusing, stripe will just be deleted: */
+ if (!h->data[heap_idx].blocks_nonempty)
+ continue;
+
+ stripe_idx = h->data[heap_idx].idx;
+
+ m = genradix_ptr(&c->stripes, stripe_idx);
+
+ if (m->disk_label == head->disk_label &&
+ m->algorithm == head->algo &&
+ m->nr_redundant == head->redundancy &&
+ m->sectors == head->blocksize &&
+ m->blocks_nonempty < m->nr_blocks - m->nr_redundant &&
+ bch2_try_open_stripe(c, head->s, stripe_idx)) {
+ ret = stripe_idx;
+ break;
+ }
+ }
+ mutex_unlock(&c->ec_stripes_heap_lock);
+ return ret;
+}
+
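+/*
+ * Set up for reusing an existing stripe: free the buckets we had already
+ * allocated, mark which blocks still hold data, and read the existing stripe
+ * in so parity can be recomputed:
+ */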
+static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s)
+{
+ struct bch_stripe *new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
+ struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v;
+ unsigned i;
+
+ BUG_ON(existing_v->nr_redundant != s->nr_parity);
+ s->nr_data = existing_v->nr_blocks -
+ existing_v->nr_redundant;
+
+ int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors));
+ if (ret) {
+ bch2_stripe_close(c, s);
+ return ret;
+ }
+
+ BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
+
+ /*
+ * Free buckets we initially allocated - they might conflict with
+ * blocks from the stripe we're reusing:
+ */
+ for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) {
+ bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]);
+ s->blocks[i] = 0;
+ }
+ memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten));
+ memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated));
+
+ for (unsigned i = 0; i < existing_v->nr_blocks; i++) {
+ if (stripe_blockcount_get(existing_v, i)) {
+ __set_bit(i, s->blocks_gotten);
+ __set_bit(i, s->blocks_allocated);
+ }
+
+ ec_block_io(c, &s->existing_stripe, READ, i, &s->iodone);
+ }
+
+ bkey_copy(&s->new_stripe.key, &s->existing_stripe.key);
+ s->have_existing_stripe = true;
+
+ return 0;
+}
+
+static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h,
+ struct ec_stripe_new *s)
+{
+ struct bch_fs *c = trans->c;
+ s64 idx;
+ int ret;
+
+ /*
+ * If we can't allocate a new stripe, and there's no stripes with empty
+ * blocks for us to reuse, that means we have to wait on copygc:
+ */
+ idx = get_existing_stripe(c, h);
+ if (idx < 0)
+ return -BCH_ERR_stripe_alloc_blocked;
+
+ ret = get_stripe_key_trans(trans, idx, &s->existing_stripe);
+ bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
+ "reading stripe key: %s", bch2_err_str(ret));
+ if (ret) {
+ bch2_stripe_close(c, s);
+ return ret;
+ }
+
+ return init_new_stripe_from_existing(c, s);
+}
+
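+/*
+ * Creating a brand new stripe: get a disk reservation and find an unused slot
+ * in the stripes btree:
+ */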
+static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h,
+ struct ec_stripe_new *s)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bpos min_pos = POS(0, 1);
+ struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
+ int ret;
+
+ if (!s->res.sectors) {
+ ret = bch2_disk_reservation_get(c, &s->res,
+ h->blocksize,
+ s->nr_parity,
+ BCH_DISK_RESERVATION_NOFAIL);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Allocate stripe slot
+ * XXX: we're going to need a bitrange btree of free stripes
+ */
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
+ BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
+ if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
+ if (start_pos.offset) {
+ start_pos = min_pos;
+ bch2_btree_iter_set_pos(&iter, start_pos);
+ continue;
+ }
+
+ ret = -BCH_ERR_ENOSPC_stripe_create;
+ break;
+ }
+
+ if (bkey_deleted(k.k) &&
+ bch2_try_open_stripe(c, s, k.k->p.offset))
+ break;
+ }
+
+ c->ec_stripe_hint = iter.pos.offset;
+
+ if (ret)
+ goto err;
+
+ ret = ec_stripe_mem_alloc(trans, &iter);
+ if (ret) {
+ bch2_stripe_close(c, s);
+ goto err;
+ }
+
+ s->new_stripe.key.k.p = iter.pos;
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+err:
+ bch2_disk_reservation_put(c, &s->res);
+ goto out;
+}
+
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
+ unsigned target,
+ unsigned algo,
+ unsigned redundancy,
+ enum bch_watermark watermark,
+ struct closure *cl)
+{
+ struct bch_fs *c = trans->c;
+ struct ec_stripe_head *h;
+ bool waiting = false;
+ unsigned disk_label = 0;
+ struct target t = target_decode(target);
+ int ret;
+
+ if (t.type == TARGET_GROUP) {
+ if (t.group > U8_MAX) {
+ bch_err(c, "cannot create a stripe when disk_label > U8_MAX");
+ return NULL;
+ }
+ disk_label = t.group + 1; /* 0 == no label */
+ }
+
+ h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark);
+ if (IS_ERR_OR_NULL(h))
+ return h;
+
+ if (!h->s) {
+ h->s = ec_new_stripe_alloc(c, h);
+ if (!h->s) {
+ ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
+ bch_err(c, "failed to allocate new stripe");
+ goto err;
+ }
+
+ h->nr_created++;
+ }
+
+ struct ec_stripe_new *s = h->s;
+
+ if (s->allocated)
+ goto allocated;
+
+ if (s->have_existing_stripe)
+ goto alloc_existing;
+
+ /* First, try to allocate a full stripe: */
+ ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?:
+ __bch2_ec_stripe_head_reserve(trans, h, s);
+ if (!ret)
+ goto allocate_buf;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret, ENOMEM))
+ goto err;
+
+ /*
+ * Not enough buckets available for a full stripe: we must reuse an
+ * existing stripe:
+ */
+ while (1) {
+ ret = __bch2_ec_stripe_head_reuse(trans, h, s);
+ if (!ret)
+ break;
+ if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
+ goto err;
+
+ if (watermark == BCH_WATERMARK_copygc) {
+ ret = new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?:
+ __bch2_ec_stripe_head_reserve(trans, h, s);
+ if (ret)
+ goto err;
+ goto allocate_buf;
+ }
+
+ /* XXX freelist_wait? */
+ closure_wait(&c->freelist_wait, cl);
+ waiting = true;
+ }
+
+ if (waiting)
+ closure_wake_up(&c->freelist_wait);
+alloc_existing:
+ /*
+ * Retry allocating buckets, with the watermark for this
+ * particular write:
+ */
+ ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl);
+ if (ret)
+ goto err;
+
+allocate_buf:
+ ret = ec_stripe_buf_init(&s->new_stripe, 0, h->blocksize);
+ if (ret)
+ goto err;
+
+ s->allocated = true;
+allocated:
+ BUG_ON(!s->idx);
+ BUG_ON(!s->new_stripe.data[0]);
+ BUG_ON(trans->restarted);
+ return h;
+err:
+ bch2_ec_stripe_head_put(c, h);
+ return ERR_PTR(ret);
+}
+
+/* device removal */
+
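+/*
+ * This bucket's device is being removed: point the owning stripe's pointer at
+ * BCH_SB_MEMBER_INVALID and update replicas accounting to match:
+ */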
+static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a)
+{
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);
+
+ if (!a->stripe)
+ return 0;
+
+ if (a->stripe_sectors) {
+ bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data");
+ return -BCH_ERR_invalidate_stripe_to_dev;
+ }
+
+ struct btree_iter iter;
+ struct bkey_i_stripe *s =
+ bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
+ BTREE_ITER_slots, stripe);
+ int ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ return ret;
+
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+
+ s64 sectors = 0;
+ for (unsigned i = 0; i < s->v.nr_blocks; i++)
+ sectors -= stripe_blockcount_get(&s->v, i);
+
+ bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
+ acc.replicas.data_type = BCH_DATA_user;
+ ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
+ if (ret)
+ goto err;
+
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i));
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->dev == k_a.k->p.inode)
+ ptr->dev = BCH_SB_MEMBER_INVALID;
+
+ sectors = -sectors;
+
+ bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
+ acc.replicas.data_type = BCH_DATA_user;
+ ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
+ if (ret)
+ goto err;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx)
+{
+ return bch2_trans_run(c,
+ for_each_btree_key_max_commit(trans, iter,
+ BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX),
+ BTREE_ITER_intent, k,
+ NULL, NULL, 0, ({
+ bch2_invalidate_stripe_to_dev(trans, k);
+ })));
+}
+
+/* startup/shutdown */
+
+static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct ec_stripe_head *h;
+ struct open_bucket *ob;
+ unsigned i;
+
+ mutex_lock(&c->ec_stripe_head_lock);
+ list_for_each_entry(h, &c->ec_stripe_head_list, list) {
+ mutex_lock(&h->lock);
+ if (!h->s)
+ goto unlock;
+
+ if (!ca)
+ goto found;
+
+ for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) {
+ if (!h->s->blocks[i])
+ continue;
+
+ ob = c->open_buckets + h->s->blocks[i];
+ if (ob->dev == ca->dev_idx)
+ goto found;
+ }
+ goto unlock;
+found:
+ ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes);
+unlock:
+ mutex_unlock(&h->lock);
+ }
+ mutex_unlock(&c->ec_stripe_head_lock);
+}
+
+void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
+{
+ __bch2_ec_stop(c, ca);
+}
+
+void bch2_fs_ec_stop(struct bch_fs *c)
+{
+ __bch2_ec_stop(c, NULL);
+}
+
+static bool bch2_fs_ec_flush_done(struct bch_fs *c)
+{
+ bool ret;
+
+ mutex_lock(&c->ec_stripe_new_lock);
+ ret = list_empty(&c->ec_stripe_new_list);
+ mutex_unlock(&c->ec_stripe_new_lock);
+
+ return ret;
+}
+
+void bch2_fs_ec_flush(struct bch_fs *c)
+{
+ wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
+}
+
+int bch2_stripes_read(struct bch_fs *c)
+{
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_prefetch, k, ({
+ if (k.k->type != KEY_TYPE_stripe)
+ continue;
+
+ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
+ if (ret)
+ break;
+
+ struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
+
+ stripe_to_mem(m, bkey_s_c_to_stripe(k).v);
+
+ bch2_stripes_heap_insert(c, m, k.k->p.offset);
+ 0;
+ })));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ ec_stripes_heap *h = &c->ec_stripes_heap;
+ struct stripe *m;
+ size_t i;
+
+ mutex_lock(&c->ec_stripes_heap_lock);
+ for (i = 0; i < min_t(size_t, h->nr, 50); i++) {
+ m = genradix_ptr(&c->stripes, h->data[i].idx);
+
+ prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
+ h->data[i].blocks_nonempty,
+ m->nr_blocks - m->nr_redundant,
+ m->nr_redundant);
+ if (bch2_stripe_is_open(c, h->data[i].idx))
+ prt_str(out, " open");
+ prt_newline(out);
+ }
+ mutex_unlock(&c->ec_stripes_heap_lock);
+}
+
+static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
+ struct ec_stripe_new *s)
+{
+ prt_printf(out, "\tidx %llu blocks %u+%u allocated %u ref %u %u %s obs",
+ s->idx, s->nr_data, s->nr_parity,
+ bitmap_weight(s->blocks_allocated, s->nr_data),
+ atomic_read(&s->ref[STRIPE_REF_io]),
+ atomic_read(&s->ref[STRIPE_REF_stripe]),
+ bch2_watermarks[s->h->watermark]);
+
+ struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
+ unsigned i;
+ for_each_set_bit(i, s->blocks_gotten, v->nr_blocks)
+ prt_printf(out, " %u", s->blocks[i]);
+ prt_newline(out);
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&s->new_stripe.key));
+ prt_newline(out);
+}
+
+void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct ec_stripe_head *h;
+ struct ec_stripe_new *s;
+
+ mutex_lock(&c->ec_stripe_head_lock);
+ list_for_each_entry(h, &c->ec_stripe_head_list, list) {
+ prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n",
+ h->disk_label, h->algo, h->redundancy,
+ bch2_watermarks[h->watermark],
+ h->nr_created);
+
+ if (h->s)
+ bch2_new_stripe_to_text(out, c, h->s);
+ }
+ mutex_unlock(&c->ec_stripe_head_lock);
+
+ prt_printf(out, "in flight:\n");
+
+ mutex_lock(&c->ec_stripe_new_lock);
+ list_for_each_entry(s, &c->ec_stripe_new_list, list)
+ bch2_new_stripe_to_text(out, c, s);
+ mutex_unlock(&c->ec_stripe_new_lock);
+}
+
+void bch2_fs_ec_exit(struct bch_fs *c)
+{
+ struct ec_stripe_head *h;
+ unsigned i;
+
+ while (1) {
+ mutex_lock(&c->ec_stripe_head_lock);
+ h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list);
+ mutex_unlock(&c->ec_stripe_head_lock);
+
+ if (!h)
+ break;
+
+ if (h->s) {
+ for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++)
+ BUG_ON(h->s->blocks[i]);
+
+ kfree(h->s);
+ }
+ kfree(h);
+ }
+
+ BUG_ON(!list_empty(&c->ec_stripe_new_list));
+
+ free_heap(&c->ec_stripes_heap);
+ genradix_free(&c->stripes);
+ bioset_exit(&c->ec_bioset);
+}
+
+void bch2_fs_ec_init_early(struct bch_fs *c)
+{
+ spin_lock_init(&c->ec_stripes_new_lock);
+ mutex_init(&c->ec_stripes_heap_lock);
+
+ INIT_LIST_HEAD(&c->ec_stripe_head_list);
+ mutex_init(&c->ec_stripe_head_lock);
+
+ INIT_LIST_HEAD(&c->ec_stripe_new_list);
+ mutex_init(&c->ec_stripe_new_lock);
+ init_waitqueue_head(&c->ec_stripe_new_wait);
+
+ INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
+ INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
+}
+
+int bch2_fs_ec_init(struct bch_fs *c)
+{
+ return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
+ BIOSET_NEED_BVECS);
+}
diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h
new file mode 100644
index 00000000..583ca6a2
--- /dev/null
+++ b/libbcachefs/ec.h
@@ -0,0 +1,271 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EC_H
+#define _BCACHEFS_EC_H
+
+#include "ec_types.h"
+#include "buckets_types.h"
+#include "extents_types.h"
+
+int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
+
+#define bch2_bkey_ops_stripe ((struct bkey_ops) { \
+ .key_validate = bch2_stripe_validate, \
+ .val_to_text = bch2_stripe_to_text, \
+ .swab = bch2_ptr_swab, \
+ .trigger = bch2_trigger_stripe, \
+ .min_val_size = 8, \
+})
+
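+/*
+ * A stripe's value is laid out as the fixed bch_stripe header, then nr_blocks
+ * extent pointers, then per-block checksums (one per csum_granularity chunk),
+ * then per-block u16 sector counts; the helpers below compute offsets into
+ * that layout:
+ */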
+static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
+{
+ return DIV_ROUND_UP(le16_to_cpu(s->sectors),
+ 1 << s->csum_granularity_bits);
+}
+
+static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
+ unsigned dev, unsigned csum_idx)
+{
+ EBUG_ON(s->csum_type >= BCH_CSUM_NR);
+
+ unsigned csum_bytes = bch_crc_bytes[s->csum_type];
+
+ return sizeof(struct bch_stripe) +
+ sizeof(struct bch_extent_ptr) * s->nr_blocks +
+ (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
+}
+
+static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s,
+ unsigned idx)
+{
+ return stripe_csum_offset(s, s->nr_blocks, 0) +
+ sizeof(u16) * idx;
+}
+
+static inline unsigned stripe_blockcount_get(const struct bch_stripe *s,
+ unsigned idx)
+{
+ return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx));
+}
+
+static inline void stripe_blockcount_set(struct bch_stripe *s,
+ unsigned idx, unsigned v)
+{
+ __le16 *p = (void *) s + stripe_blockcount_offset(s, idx);
+
+ *p = cpu_to_le16(v);
+}
+
+static inline unsigned stripe_val_u64s(const struct bch_stripe *s)
+{
+ return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks),
+ sizeof(u64));
+}
+
+static inline void *stripe_csum(struct bch_stripe *s,
+ unsigned block, unsigned csum_idx)
+{
+ EBUG_ON(block >= s->nr_blocks);
+ EBUG_ON(csum_idx >= stripe_csums_per_device(s));
+
+ return (void *) s + stripe_csum_offset(s, block, csum_idx);
+}
+
+static inline struct bch_csum stripe_csum_get(struct bch_stripe *s,
+ unsigned block, unsigned csum_idx)
+{
+ struct bch_csum csum = { 0 };
+
+ memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]);
+ return csum;
+}
+
+static inline void stripe_csum_set(struct bch_stripe *s,
+ unsigned block, unsigned csum_idx,
+ struct bch_csum csum)
+{
+ memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]);
+}
+
+static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr,
+ const struct bch_extent_ptr *data_ptr,
+ unsigned sectors)
+{
+ return (data_ptr->dev == stripe_ptr->dev ||
+ data_ptr->dev == BCH_SB_MEMBER_INVALID ||
+ stripe_ptr->dev == BCH_SB_MEMBER_INVALID) &&
+ data_ptr->gen == stripe_ptr->gen &&
+ data_ptr->offset >= stripe_ptr->offset &&
+ data_ptr->offset < stripe_ptr->offset + sectors;
+}
+
+static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s,
+ struct extent_ptr_decoded p)
+{
+ unsigned nr_data = s->nr_blocks - s->nr_redundant;
+
+ BUG_ON(!p.has_ec);
+
+ if (p.ec.block >= nr_data)
+ return false;
+
+ return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr,
+ le16_to_cpu(s->sectors));
+}
+
+static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m,
+ struct extent_ptr_decoded p)
+{
+ unsigned nr_data = m->nr_blocks - m->nr_redundant;
+
+ BUG_ON(!p.has_ec);
+
+ if (p.ec.block >= nr_data)
+ return false;
+
+ return __bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr,
+ m->sectors);
+}
+
+struct bch_read_bio;
+
+struct ec_stripe_buf {
+ /* might not be buffering the entire stripe: */
+ unsigned offset;
+ unsigned size;
+ unsigned long valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
+
+ void *data[BCH_BKEY_PTRS_MAX];
+
+ __BKEY_PADDED(key, 255);
+};
+
+struct ec_stripe_head;
+
+enum ec_stripe_ref {
+ STRIPE_REF_io,
+ STRIPE_REF_stripe,
+ STRIPE_REF_NR
+};
+
+struct ec_stripe_new {
+ struct bch_fs *c;
+ struct ec_stripe_head *h;
+ struct mutex lock;
+ struct list_head list;
+
+ struct hlist_node hash;
+ u64 idx;
+
+ struct closure iodone;
+
+ atomic_t ref[STRIPE_REF_NR];
+
+ int err;
+
+ u8 nr_data;
+ u8 nr_parity;
+ bool allocated;
+ bool pending;
+ bool have_existing_stripe;
+
+ unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
+ unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
+ open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX];
+ struct disk_reservation res;
+
+ struct ec_stripe_buf new_stripe;
+ struct ec_stripe_buf existing_stripe;
+};
+
+struct ec_stripe_head {
+ struct list_head list;
+ struct mutex lock;
+
+ unsigned disk_label;
+ unsigned algo;
+ unsigned redundancy;
+ enum bch_watermark watermark;
+ bool insufficient_devs;
+
+ unsigned long rw_devs_change_count;
+
+ u64 nr_created;
+
+ struct bch_devs_mask devs;
+ unsigned nr_active_devs;
+
+ unsigned blocksize;
+
+ struct dev_stripe_state block_stripe;
+ struct dev_stripe_state parity_stripe;
+
+ struct ec_stripe_new *s;
+};
+
+int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c);
+
+void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
+
+void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
+
+int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
+
+void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
+ unsigned, unsigned, unsigned,
+ enum bch_watermark, struct closure *);
+
+void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t);
+void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t);
+void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t);
+
+void bch2_do_stripe_deletes(struct bch_fs *);
+void bch2_ec_do_stripe_creates(struct bch_fs *);
+void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *);
+
+static inline void ec_stripe_new_get(struct ec_stripe_new *s,
+ enum ec_stripe_ref ref)
+{
+ atomic_inc(&s->ref[ref]);
+}
+
+static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
+ enum ec_stripe_ref ref)
+{
+ BUG_ON(atomic_read(&s->ref[ref]) <= 0);
+
+ if (atomic_dec_and_test(&s->ref[ref]))
+ switch (ref) {
+ case STRIPE_REF_stripe:
+ bch2_ec_stripe_new_free(c, s);
+ break;
+ case STRIPE_REF_io:
+ bch2_ec_do_stripe_creates(c);
+ break;
+ default:
+ BUG();
+ }
+}
+
+int bch2_dev_remove_stripes(struct bch_fs *, unsigned);
+
+void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
+void bch2_fs_ec_stop(struct bch_fs *);
+void bch2_fs_ec_flush(struct bch_fs *);
+
+int bch2_stripes_read(struct bch_fs *);
+
+void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *);
+void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
+
+void bch2_fs_ec_exit(struct bch_fs *);
+void bch2_fs_ec_init_early(struct bch_fs *);
+int bch2_fs_ec_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_EC_H */
diff --git a/libbcachefs/ec_format.h b/libbcachefs/ec_format.h
new file mode 100644
index 00000000..64ef52e0
--- /dev/null
+++ b/libbcachefs/ec_format.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EC_FORMAT_H
+#define _BCACHEFS_EC_FORMAT_H
+
+struct bch_stripe {
+ struct bch_val v;
+ __le16 sectors;
+ __u8 algorithm;
+ __u8 nr_blocks;
+ __u8 nr_redundant;
+
+ __u8 csum_granularity_bits;
+ __u8 csum_type;
+
+ /*
+ * XXX: targets should be 16 bits - fix this if we ever do a stripe_v2
+ *
+ * we can manage with this because this only needs to point to a
+ * disk label, not a target:
+ */
+ __u8 disk_label;
+
+ struct bch_extent_ptr ptrs[];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_EC_FORMAT_H */
diff --git a/libbcachefs/ec_types.h b/libbcachefs/ec_types.h
new file mode 100644
index 00000000..8d1e70e8
--- /dev/null
+++ b/libbcachefs/ec_types.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EC_TYPES_H
+#define _BCACHEFS_EC_TYPES_H
+
+#include "bcachefs_format.h"
+
+struct bch_replicas_padded {
+ struct bch_replicas_entry_v1 e;
+ u8 pad[BCH_BKEY_PTRS_MAX];
+};
+
+struct stripe {
+ size_t heap_idx;
+ u16 sectors;
+ u8 algorithm;
+ u8 nr_blocks;
+ u8 nr_redundant;
+ u8 blocks_nonempty;
+ u8 disk_label;
+};
+
+struct gc_stripe {
+ u16 sectors;
+
+ u8 nr_blocks;
+ u8 nr_redundant;
+
+ unsigned alive:1; /* does a corresponding key exist in stripes btree? */
+ u16 block_sectors[BCH_BKEY_PTRS_MAX];
+ struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX];
+
+ struct bch_replicas_padded r;
+};
+
+struct ec_stripe_heap_entry {
+ size_t idx;
+ unsigned blocks_nonempty;
+};
+
+typedef DEFINE_MIN_HEAP(struct ec_stripe_heap_entry, ec_stripes_heap) ec_stripes_heap;
+
+#endif /* _BCACHEFS_EC_TYPES_H */
diff --git a/libbcachefs/errcode.c b/libbcachefs/errcode.c
new file mode 100644
index 00000000..43557beb
--- /dev/null
+++ b/libbcachefs/errcode.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "errcode.h"
+#include "trace.h"
+
+#include <linux/errname.h>
+
+static const char * const bch2_errcode_strs[] = {
+#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err,
+ BCH_ERRCODES()
+#undef x
+ NULL
+};
+
+static unsigned bch2_errcode_parents[] = {
+#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
+ BCH_ERRCODES()
+#undef x
+};
+
+const char *bch2_err_str(int err)
+{
+ const char *errstr;
+
+ err = abs(err);
+
+ BUG_ON(err >= BCH_ERR_MAX);
+
+ if (err >= BCH_ERR_START)
+ errstr = bch2_errcode_strs[err - BCH_ERR_START];
+ else if (err)
+ errstr = errname(err);
+ else
+ errstr = "(No error)";
+ return errstr ?: "(Invalid error)";
+}
+
+bool __bch2_err_matches(int err, int class)
+{
+ err = abs(err);
+ class = abs(class);
+
+ BUG_ON(err >= BCH_ERR_MAX);
+ BUG_ON(class >= BCH_ERR_MAX);
+
+ while (err >= BCH_ERR_START && err != class)
+ err = bch2_errcode_parents[err - BCH_ERR_START];
+
+ return err == class;
+}
+
+int __bch2_err_class(int bch_err)
+{
+ int std_err = -bch_err;
+ BUG_ON((unsigned) std_err >= BCH_ERR_MAX);
+
+ while (std_err >= BCH_ERR_START && bch2_errcode_parents[std_err - BCH_ERR_START])
+ std_err = bch2_errcode_parents[std_err - BCH_ERR_START];
+
+ trace_error_downcast(bch_err, std_err, _RET_IP_);
+
+ return -std_err;
+}
+
+const char *bch2_blk_status_to_str(blk_status_t status)
+{
+ if (status == BLK_STS_REMOVED)
+ return "device removed";
+ return blk_status_to_str(status);
+}
diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h
new file mode 100644
index 00000000..47387f7d
--- /dev/null
+++ b/libbcachefs/errcode.h
@@ -0,0 +1,316 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ERRCODE_H
+#define _BCACHEFS_ERRCODE_H
+
+#define BCH_ERRCODES() \
+ x(ERANGE, ERANGE_option_too_small) \
+ x(ERANGE, ERANGE_option_too_big) \
+ x(EINVAL, mount_option) \
+ x(BCH_ERR_mount_option, option_name) \
+ x(BCH_ERR_mount_option, option_value) \
+ x(BCH_ERR_mount_option, option_not_bool) \
+ x(ENOMEM, ENOMEM_stripe_buf) \
+ x(ENOMEM, ENOMEM_replicas_table) \
+ x(ENOMEM, ENOMEM_cpu_replicas) \
+ x(ENOMEM, ENOMEM_replicas_gc) \
+ x(ENOMEM, ENOMEM_disk_groups_validate) \
+ x(ENOMEM, ENOMEM_disk_groups_to_cpu) \
+ x(ENOMEM, ENOMEM_mark_snapshot) \
+ x(ENOMEM, ENOMEM_mark_stripe) \
+ x(ENOMEM, ENOMEM_mark_stripe_ptr) \
+ x(ENOMEM, ENOMEM_btree_key_cache_create) \
+ x(ENOMEM, ENOMEM_btree_key_cache_fill) \
+ x(ENOMEM, ENOMEM_btree_key_cache_insert) \
+ x(ENOMEM, ENOMEM_trans_kmalloc) \
+ x(ENOMEM, ENOMEM_trans_log_msg) \
+ x(ENOMEM, ENOMEM_do_encrypt) \
+ x(ENOMEM, ENOMEM_ec_read_extent) \
+ x(ENOMEM, ENOMEM_ec_stripe_mem_alloc) \
+ x(ENOMEM, ENOMEM_ec_new_stripe_alloc) \
+ x(ENOMEM, ENOMEM_fs_btree_cache_init) \
+ x(ENOMEM, ENOMEM_fs_btree_key_cache_init) \
+ x(ENOMEM, ENOMEM_fs_counters_init) \
+ x(ENOMEM, ENOMEM_fs_btree_write_buffer_init) \
+ x(ENOMEM, ENOMEM_io_clock_init) \
+ x(ENOMEM, ENOMEM_blacklist_table_init) \
+ x(ENOMEM, ENOMEM_sb_realloc_injected) \
+ x(ENOMEM, ENOMEM_sb_bio_realloc) \
+ x(ENOMEM, ENOMEM_sb_buf_realloc) \
+ x(ENOMEM, ENOMEM_sb_journal_validate) \
+ x(ENOMEM, ENOMEM_sb_journal_v2_validate) \
+ x(ENOMEM, ENOMEM_journal_entry_add) \
+ x(ENOMEM, ENOMEM_journal_read_buf_realloc) \
+ x(ENOMEM, ENOMEM_btree_interior_update_worker_init)\
+ x(ENOMEM, ENOMEM_btree_interior_update_pool_init) \
+ x(ENOMEM, ENOMEM_bio_read_init) \
+ x(ENOMEM, ENOMEM_bio_read_split_init) \
+ x(ENOMEM, ENOMEM_bio_write_init) \
+ x(ENOMEM, ENOMEM_bio_bounce_pages_init) \
+ x(ENOMEM, ENOMEM_writepage_bioset_init) \
+ x(ENOMEM, ENOMEM_dio_read_bioset_init) \
+ x(ENOMEM, ENOMEM_dio_write_bioset_init) \
+ x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \
+ x(ENOMEM, ENOMEM_promote_table_init) \
+ x(ENOMEM, ENOMEM_compression_bounce_read_init) \
+ x(ENOMEM, ENOMEM_compression_bounce_write_init) \
+ x(ENOMEM, ENOMEM_compression_workspace_init) \
+ x(EIO, compression_workspace_not_initialized) \
+ x(ENOMEM, ENOMEM_bucket_gens) \
+ x(ENOMEM, ENOMEM_buckets_nouse) \
+ x(ENOMEM, ENOMEM_usage_init) \
+ x(ENOMEM, ENOMEM_btree_node_read_all_replicas) \
+ x(ENOMEM, ENOMEM_btree_node_reclaim) \
+ x(ENOMEM, ENOMEM_btree_node_mem_alloc) \
+ x(ENOMEM, ENOMEM_btree_cache_cannibalize_lock) \
+ x(ENOMEM, ENOMEM_buckets_waiting_for_journal_init)\
+ x(ENOMEM, ENOMEM_buckets_waiting_for_journal_set) \
+ x(ENOMEM, ENOMEM_set_nr_journal_buckets) \
+ x(ENOMEM, ENOMEM_dev_journal_init) \
+ x(ENOMEM, ENOMEM_journal_pin_fifo) \
+ x(ENOMEM, ENOMEM_journal_buf) \
+ x(ENOMEM, ENOMEM_gc_start) \
+ x(ENOMEM, ENOMEM_gc_alloc_start) \
+ x(ENOMEM, ENOMEM_gc_reflink_start) \
+ x(ENOMEM, ENOMEM_gc_gens) \
+ x(ENOMEM, ENOMEM_gc_repair_key) \
+ x(ENOMEM, ENOMEM_fsck_extent_ends_at) \
+ x(ENOMEM, ENOMEM_fsck_add_nlink) \
+ x(ENOMEM, ENOMEM_journal_key_insert) \
+ x(ENOMEM, ENOMEM_journal_keys_sort) \
+ x(ENOMEM, ENOMEM_read_superblock_clean) \
+ x(ENOMEM, ENOMEM_fs_alloc) \
+ x(ENOMEM, ENOMEM_fs_name_alloc) \
+ x(ENOMEM, ENOMEM_fs_other_alloc) \
+ x(ENOMEM, ENOMEM_dev_alloc) \
+ x(ENOMEM, ENOMEM_disk_accounting) \
+ x(ENOMEM, ENOMEM_stripe_head_alloc) \
+ x(ENOMEM, ENOMEM_journal_read_bucket) \
+ x(ENOSPC, ENOSPC_disk_reservation) \
+ x(ENOSPC, ENOSPC_bucket_alloc) \
+ x(ENOSPC, ENOSPC_disk_label_add) \
+ x(ENOSPC, ENOSPC_stripe_create) \
+ x(ENOSPC, ENOSPC_inode_create) \
+ x(ENOSPC, ENOSPC_str_hash_create) \
+ x(ENOSPC, ENOSPC_snapshot_create) \
+ x(ENOSPC, ENOSPC_subvolume_create) \
+ x(ENOSPC, ENOSPC_sb) \
+ x(ENOSPC, ENOSPC_sb_journal) \
+ x(ENOSPC, ENOSPC_sb_journal_seq_blacklist) \
+ x(ENOSPC, ENOSPC_sb_quota) \
+ x(ENOSPC, ENOSPC_sb_replicas) \
+ x(ENOSPC, ENOSPC_sb_members) \
+ x(ENOSPC, ENOSPC_sb_members_v2) \
+ x(ENOSPC, ENOSPC_sb_crypt) \
+ x(ENOSPC, ENOSPC_sb_downgrade) \
+ x(ENOSPC, ENOSPC_btree_slot) \
+ x(ENOSPC, ENOSPC_snapshot_tree) \
+ x(ENOENT, ENOENT_bkey_type_mismatch) \
+ x(ENOENT, ENOENT_str_hash_lookup) \
+ x(ENOENT, ENOENT_str_hash_set_must_replace) \
+ x(ENOENT, ENOENT_inode) \
+ x(ENOENT, ENOENT_not_subvol) \
+ x(ENOENT, ENOENT_not_directory) \
+ x(ENOENT, ENOENT_directory_dead) \
+ x(ENOENT, ENOENT_subvolume) \
+ x(ENOENT, ENOENT_snapshot_tree) \
+ x(ENOENT, ENOENT_dirent_doesnt_match_inode) \
+ x(ENOENT, ENOENT_dev_not_found) \
+ x(ENOENT, ENOENT_dev_idx_not_found) \
+ x(ENOENT, ENOENT_inode_no_backpointer) \
+ x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \
+ x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \
+ x(EEXIST, EEXIST_str_hash_set) \
+ x(EEXIST, EEXIST_discard_in_flight_add) \
+ x(EEXIST, EEXIST_subvolume_create) \
+ x(ENOSPC, open_buckets_empty) \
+ x(ENOSPC, freelist_empty) \
+ x(BCH_ERR_freelist_empty, no_buckets_found) \
+ x(0, transaction_restart) \
+ x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \
+ x(BCH_ERR_transaction_restart, transaction_restart_relock) \
+ x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \
+ x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \
+ x(BCH_ERR_transaction_restart, transaction_restart_relock_after_fill) \
+ x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \
+ x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \
+ x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \
+ x(BCH_ERR_transaction_restart, transaction_restart_fill_mem_alloc_fail)\
+ x(BCH_ERR_transaction_restart, transaction_restart_mem_realloced) \
+ x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \
+ x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \
+ x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\
+ x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\
+ x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \
+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \
+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \
+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \
+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\
+ x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \
+ x(BCH_ERR_transaction_restart, transaction_restart_split_race) \
+ x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \
+ x(BCH_ERR_transaction_restart, transaction_restart_nested) \
+ x(BCH_ERR_transaction_restart, transaction_restart_commit) \
+ x(0, no_btree_node) \
+ x(BCH_ERR_no_btree_node, no_btree_node_relock) \
+ x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \
+ x(BCH_ERR_no_btree_node, no_btree_node_drop) \
+ x(BCH_ERR_no_btree_node, no_btree_node_lock_root) \
+ x(BCH_ERR_no_btree_node, no_btree_node_up) \
+ x(BCH_ERR_no_btree_node, no_btree_node_down) \
+ x(BCH_ERR_no_btree_node, no_btree_node_init) \
+ x(BCH_ERR_no_btree_node, no_btree_node_cached) \
+ x(BCH_ERR_no_btree_node, no_btree_node_srcu_reset) \
+ x(0, btree_insert_fail) \
+ x(BCH_ERR_btree_insert_fail, btree_insert_btree_node_full) \
+ x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \
+ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \
+ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \
+ x(0, backpointer_to_overwritten_btree_node) \
+ x(0, journal_reclaim_would_deadlock) \
+ x(EINVAL, fsck) \
+ x(BCH_ERR_fsck, fsck_fix) \
+ x(BCH_ERR_fsck, fsck_delete_bkey) \
+ x(BCH_ERR_fsck, fsck_ignore) \
+ x(BCH_ERR_fsck, fsck_errors_not_fixed) \
+ x(BCH_ERR_fsck, fsck_repair_unimplemented) \
+ x(BCH_ERR_fsck, fsck_repair_impossible) \
+ x(EINVAL, restart_recovery) \
+ x(EINVAL, not_in_recovery) \
+ x(EINVAL, cannot_rewind_recovery) \
+ x(0, data_update_done) \
+ x(EINVAL, device_state_not_allowed) \
+ x(EINVAL, member_info_missing) \
+ x(EINVAL, mismatched_block_size) \
+ x(EINVAL, block_size_too_small) \
+ x(EINVAL, bucket_size_too_small) \
+ x(EINVAL, device_size_too_small) \
+ x(EINVAL, device_size_too_big) \
+ x(EINVAL, device_not_a_member_of_filesystem) \
+ x(EINVAL, device_has_been_removed) \
+ x(EINVAL, device_splitbrain) \
+ x(EINVAL, device_already_online) \
+ x(EINVAL, insufficient_devices_to_start) \
+ x(EINVAL, invalid) \
+ x(EINVAL, internal_fsck_err) \
+ x(EINVAL, opt_parse_error) \
+ x(EINVAL, remove_with_metadata_missing_unimplemented)\
+ x(EINVAL, remove_would_lose_data) \
+ x(EROFS, erofs_trans_commit) \
+ x(EROFS, erofs_no_writes) \
+ x(EROFS, erofs_journal_err) \
+ x(EROFS, erofs_sb_err) \
+ x(EROFS, erofs_unfixed_errors) \
+ x(EROFS, erofs_norecovery) \
+ x(EROFS, erofs_nochanges) \
+ x(EROFS, insufficient_devices) \
+ x(0, operation_blocked) \
+ x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \
+ x(BCH_ERR_operation_blocked, journal_res_get_blocked) \
+ x(BCH_ERR_operation_blocked, journal_preres_get_blocked) \
+ x(BCH_ERR_operation_blocked, bucket_alloc_blocked) \
+ x(BCH_ERR_operation_blocked, stripe_alloc_blocked) \
+ x(BCH_ERR_invalid, invalid_sb) \
+ x(BCH_ERR_invalid_sb, invalid_sb_magic) \
+ x(BCH_ERR_invalid_sb, invalid_sb_version) \
+ x(BCH_ERR_invalid_sb, invalid_sb_features) \
+ x(BCH_ERR_invalid_sb, invalid_sb_too_big) \
+ x(BCH_ERR_invalid_sb, invalid_sb_csum_type) \
+ x(BCH_ERR_invalid_sb, invalid_sb_csum) \
+ x(BCH_ERR_invalid_sb, invalid_sb_block_size) \
+ x(BCH_ERR_invalid_sb, invalid_sb_uuid) \
+ x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \
+ x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \
+ x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \
+ x(BCH_ERR_invalid_sb, invalid_sb_field_size) \
+ x(BCH_ERR_invalid_sb, invalid_sb_layout) \
+ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \
+ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \
+ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \
+ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_sb_max_size_bits) \
+ x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \
+ x(BCH_ERR_invalid_sb, invalid_sb_members) \
+ x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \
+ x(BCH_ERR_invalid_sb, invalid_sb_replicas) \
+ x(BCH_ERR_invalid_sb, invalid_replicas_entry) \
+ x(BCH_ERR_invalid_sb, invalid_sb_journal) \
+ x(BCH_ERR_invalid_sb, invalid_sb_journal_seq_blacklist) \
+ x(BCH_ERR_invalid_sb, invalid_sb_crypt) \
+ x(BCH_ERR_invalid_sb, invalid_sb_clean) \
+ x(BCH_ERR_invalid_sb, invalid_sb_quota) \
+ x(BCH_ERR_invalid_sb, invalid_sb_errors) \
+ x(BCH_ERR_invalid_sb, invalid_sb_opt_compression) \
+ x(BCH_ERR_invalid_sb, invalid_sb_ext) \
+ x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \
+ x(BCH_ERR_invalid, invalid_bkey) \
+ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
+ x(EIO, journal_shutdown) \
+ x(EIO, journal_flush_err) \
+ x(EIO, btree_node_read_err) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \
+ x(EIO, sb_not_downgraded) \
+ x(EIO, btree_node_write_all_failed) \
+ x(EIO, btree_node_read_error) \
+ x(EIO, btree_node_read_validate_error) \
+ x(EIO, btree_need_topology_repair) \
+ x(EIO, bucket_ref_update) \
+ x(EIO, trigger_pointer) \
+ x(EIO, trigger_stripe_pointer) \
+ x(EIO, metadata_bucket_inconsistency) \
+ x(EIO, mark_stripe) \
+ x(EIO, stripe_reconstruct) \
+ x(EIO, key_type_error) \
+ x(EIO, no_device_to_read_from) \
+ x(EIO, missing_indirect_extent) \
+ x(EIO, invalidate_stripe_to_dev) \
+ x(EIO, no_encryption_key) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_bad_node) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible) \
+ x(0, nopromote) \
+ x(BCH_ERR_nopromote, nopromote_may_not) \
+ x(BCH_ERR_nopromote, nopromote_already_promoted) \
+ x(BCH_ERR_nopromote, nopromote_unwritten) \
+ x(BCH_ERR_nopromote, nopromote_congested) \
+ x(BCH_ERR_nopromote, nopromote_in_flight) \
+ x(BCH_ERR_nopromote, nopromote_no_writes) \
+ x(BCH_ERR_nopromote, nopromote_enomem) \
+ x(0, invalid_snapshot_node) \
+ x(0, option_needs_open_fs) \
+ x(0, remove_disk_accounting_entry)
+
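+/*
+ * Private error codes start at BCH_ERR_START; the first column of each entry
+ * above is its parent class, which is how bch2_err_matches() tests membership
+ * in a class and bch2_err_class() maps a private code back to the standard
+ * errno it descends from:
+ */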
+enum bch_errcode {
+ BCH_ERR_START = 2048,
+#define x(class, err) BCH_ERR_##err,
+ BCH_ERRCODES()
+#undef x
+ BCH_ERR_MAX
+};
+
+const char *bch2_err_str(int);
+bool __bch2_err_matches(int, int);
+
+static inline bool _bch2_err_matches(int err, int class)
+{
+ return err < 0 && __bch2_err_matches(err, class);
+}
+
+#define bch2_err_matches(_err, _class) \
+({ \
+ BUILD_BUG_ON(!__builtin_constant_p(_class)); \
+ unlikely(_bch2_err_matches(_err, _class)); \
+})
+
+int __bch2_err_class(int);
+
+static inline long bch2_err_class(long err)
+{
+ return err < 0 ? __bch2_err_class(err) : err;
+}
+
+#define BLK_STS_REMOVED ((__force blk_status_t)128)
+
+const char *bch2_blk_status_to_str(blk_status_t);
+
+#endif /* _BCACHEFS_ERRCODE_H */
diff --git a/libbcachefs/error.c b/libbcachefs/error.c
index 9505b6e6..7af5c594 100644
--- a/libbcachefs/error.c
+++ b/libbcachefs/error.c
@@ -1,20 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "btree_cache.h"
+#include "btree_iter.h"
#include "error.h"
-#include "io.h"
+#include "fs-common.h"
+#include "journal.h"
+#include "recovery_passes.h"
#include "super.h"
+#include "thread_with_file.h"
+
+#define FSCK_ERR_RATELIMIT_NR 10
bool bch2_inconsistent_error(struct bch_fs *c)
{
- set_bit(BCH_FS_ERROR, &c->flags);
+ set_bit(BCH_FS_error, &c->flags);
switch (c->opts.errors) {
- case BCH_ON_ERROR_CONTINUE:
+ case BCH_ON_ERROR_continue:
return false;
- case BCH_ON_ERROR_RO:
+ case BCH_ON_ERROR_fix_safe:
+ case BCH_ON_ERROR_ro:
if (bch2_fs_emergency_read_only(c))
- bch_err(c, "emergency read only");
+ bch_err(c, "inconsistency detected - emergency read only at journal seq %llu",
+ journal_cur_seq(&c->journal));
return true;
- case BCH_ON_ERROR_PANIC:
+ case BCH_ON_ERROR_panic:
panic(bch2_fmt(c, "panic after error"));
return true;
default:
@@ -22,10 +32,22 @@ bool bch2_inconsistent_error(struct bch_fs *c)
}
}
+int bch2_topology_error(struct bch_fs *c)
+{
+ set_bit(BCH_FS_topology_error, &c->flags);
+ if (!test_bit(BCH_FS_recovery_running, &c->flags)) {
+ bch2_inconsistent_error(c);
+ return -BCH_ERR_btree_need_topology_repair;
+ } else {
+ return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?:
+ -BCH_ERR_btree_node_read_validate_error;
+ }
+}
+
void bch2_fatal_error(struct bch_fs *c)
{
if (bch2_fs_emergency_read_only(c))
- bch_err(c, "emergency read only");
+ bch_err(c, "fatal error - emergency read only");
}
void bch2_io_error_work(struct work_struct *work)
@@ -34,126 +56,504 @@ void bch2_io_error_work(struct work_struct *work)
struct bch_fs *c = ca->fs;
bool dev;
- mutex_lock(&c->state_lock);
- dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO,
+ down_write(&c->state_lock);
+ dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro,
BCH_FORCE_IF_DEGRADED);
if (dev
- ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_RO,
+ ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
BCH_FORCE_IF_DEGRADED)
: bch2_fs_emergency_read_only(c))
bch_err(ca,
"too many IO errors, setting %s RO",
dev ? "device" : "filesystem");
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
}
-void bch2_io_error(struct bch_dev *ca)
+void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
{
+ atomic64_inc(&ca->errors[type]);
//queue_work(system_long_wq, &ca->io_error_work);
}
+enum ask_yn {
+ YN_NO,
+ YN_YES,
+ YN_ALLNO,
+ YN_ALLYES,
+};
+
+static enum ask_yn parse_yn_response(char *buf)
+{
+ buf = strim(buf);
+
+ if (strlen(buf) == 1)
+ switch (buf[0]) {
+ case 'n':
+ return YN_NO;
+ case 'y':
+ return YN_YES;
+ case 'N':
+ return YN_ALLNO;
+ case 'Y':
+ return YN_ALLYES;
+ }
+ return -1;
+}
+
#ifdef __KERNEL__
-#define ask_yn() false
+static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans)
+{
+ struct stdio_redirect *stdio = c->stdio;
+
+ if (c->stdio_filter && c->stdio_filter != current)
+ stdio = NULL;
+
+ if (!stdio)
+ return YN_NO;
+
+ if (trans)
+ bch2_trans_unlock(trans);
+
+ unsigned long unlock_long_at = trans ? jiffies + HZ * 2 : 0;
+ darray_char line = {};
+ int ret;
+
+ do {
+ unsigned long t;
+ bch2_print(c, " (y,n, or Y,N for all errors of this type) ");
+rewait:
+ t = unlock_long_at
+ ? max_t(long, unlock_long_at - jiffies, 0)
+ : MAX_SCHEDULE_TIMEOUT;
+
+ int r = bch2_stdio_redirect_readline_timeout(stdio, &line, t);
+ if (r == -ETIME) {
+ bch2_trans_unlock_long(trans);
+ unlock_long_at = 0;
+ goto rewait;
+ }
+
+ if (r < 0) {
+ ret = YN_NO;
+ break;
+ }
+
+ darray_last(line) = '\0';
+ } while ((ret = parse_yn_response(line.data)) < 0);
+
+ darray_exit(&line);
+ return ret;
+}
#else
+
#include "tools-util.h"
+
+static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans)
+{
+ char *buf = NULL;
+ size_t buflen = 0;
+ int ret;
+
+ do {
+ fputs(" (y,n, or Y,N for all errors of this type) ", stdout);
+ fflush(stdout);
+
+ if (getline(&buf, &buflen, stdin) < 0)
+ die("error reading from standard input");
+ } while ((ret = parse_yn_response(buf)) < 0);
+
+ free(buf);
+ return ret;
+}
+
#endif
-enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags,
- const char *fmt, ...)
+static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
{
struct fsck_err_state *s;
- va_list args;
- bool fix = false, print = true, suppressing = false;
- char _buf[sizeof(s->buf)], *buf = _buf;
- mutex_lock(&c->fsck_error_lock);
-
- if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
- goto print;
+ if (!test_bit(BCH_FS_fsck_running, &c->flags))
+ return NULL;
- list_for_each_entry(s, &c->fsck_errors, list)
- if (s->fmt == fmt)
- goto found;
+ list_for_each_entry(s, &c->fsck_error_msgs, list)
+ if (s->fmt == fmt) {
+ /*
+ * move it to the head of the list: repeated fsck errors
+ * are common
+ */
+ list_move(&s->list, &c->fsck_error_msgs);
+ return s;
+ }
- s = kzalloc(sizeof(*s), GFP_KERNEL);
+ s = kzalloc(sizeof(*s), GFP_NOFS);
if (!s) {
- if (!c->fsck_alloc_err)
+ if (!c->fsck_alloc_msgs_err)
bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
- c->fsck_alloc_err = true;
- buf = _buf;
- goto print;
+ c->fsck_alloc_msgs_err = true;
+ return NULL;
}
INIT_LIST_HEAD(&s->list);
s->fmt = fmt;
-found:
- list_move(&s->list, &c->fsck_errors);
- s->nr++;
- suppressing = s->nr == 10;
- print = s->nr <= 10;
- buf = s->buf;
-print:
+ list_add(&s->list, &c->fsck_error_msgs);
+ return s;
+}
+
+/* s/fix?/fixing/ s/recreate?/recreating/ */
+static void prt_actioning(struct printbuf *out, const char *action)
+{
+ unsigned len = strlen(action);
+
+ BUG_ON(action[len - 1] != '?');
+ --len;
+
+ if (action[len - 1] == 'e')
+ --len;
+
+ prt_bytes(out, action, len);
+ prt_str(out, "ing");
+}
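
A minimal standalone sketch of what prt_actioning() produces, using plain C strings instead of struct printbuf (illustration only, not part of the patch):

	/* "fix?" -> "fixing", "recreate?" -> "recreating" */
	#include <stdio.h>
	#include <string.h>

	static void actioning(char *dst, size_t dst_size, const char *action)
	{
		size_t len = strlen(action) - 1;	/* drop the trailing '?' */

		if (action[len - 1] == 'e')		/* "recreate" -> "recreat" */
			--len;

		snprintf(dst, dst_size, "%.*sing", (int) len, action);
	}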
+
+static const u8 fsck_flags_extra[] = {
+#define x(t, n, flags) [BCH_FSCK_ERR_##t] = flags,
+ BCH_SB_ERRS()
+#undef x
+};
+
+static int do_fsck_ask_yn(struct bch_fs *c,
+ struct btree_trans *trans,
+ struct printbuf *question,
+ const char *action)
+{
+ prt_str(question, ", ");
+ prt_str(question, action);
+
+ if (bch2_fs_stdio_redirect(c))
+ bch2_print(c, "%s", question->buf);
+ else
+ bch2_print_string_as_lines(KERN_ERR, question->buf);
+
+ int ask = bch2_fsck_ask_yn(c, trans);
+
+ if (trans) {
+ int ret = bch2_trans_relock(trans);
+ if (ret)
+ return ret;
+ }
+
+ return ask;
+}
+
+int __bch2_fsck_err(struct bch_fs *c,
+ struct btree_trans *trans,
+ enum bch_fsck_flags flags,
+ enum bch_sb_error_id err,
+ const char *fmt, ...)
+{
+ struct fsck_err_state *s = NULL;
+ va_list args;
+ bool print = true, suppressing = false, inconsistent = false, exiting = false;
+ struct printbuf buf = PRINTBUF, *out = &buf;
+ int ret = -BCH_ERR_fsck_ignore;
+ const char *action_orig = "fix?", *action = action_orig;
+
+ might_sleep();
+
+ if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
+ flags |= fsck_flags_extra[err];
+
+ if (!c)
+ c = trans->c;
+
+ /*
+ * Ugly: if there's a transaction in the current task it has to be
+ * passed in to unlock if we prompt for user input.
+ *
+ * But, plumbing a transaction and transaction restarts into
+ * bkey_validate() is problematic.
+ *
+ * So:
+ * - make all bkey errors AUTOFIX, they're simple anyways (we just
+ * delete the key)
+ * - and we don't need to warn if we're not prompting
+ */
+ WARN_ON((flags & FSCK_CAN_FIX) &&
+ !(flags & FSCK_AUTOFIX) &&
+ !trans &&
+ bch2_current_has_btree_trans(c));
+
+ if (test_bit(err, c->sb.errors_silent))
+ return flags & FSCK_CAN_FIX
+ ? -BCH_ERR_fsck_fix
+ : -BCH_ERR_fsck_ignore;
+
+ bch2_sb_error_count(c, err);
+
va_start(args, fmt);
- vscnprintf(buf, sizeof(_buf), fmt, args);
+ prt_vprintf(out, fmt, args);
va_end(args);
- if (c->opts.fix_errors == FSCK_OPT_EXIT) {
- bch_err(c, "%s, exiting", buf);
- mutex_unlock(&c->fsck_error_lock);
- return FSCK_ERR_EXIT;
+ /* Custom fix/continue/recreate/etc.? */
+ if (out->buf[out->pos - 1] == '?') {
+ const char *p = strrchr(out->buf, ',');
+ if (p) {
+ out->pos = p - out->buf;
+ action = kstrdup(p + 2, GFP_KERNEL);
+ if (!action) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ }
}
- if (flags & FSCK_CAN_FIX) {
- if (c->opts.fix_errors == FSCK_OPT_ASK) {
- printk(KERN_ERR "%s: fix?", buf);
- fix = ask_yn();
- } else if (c->opts.fix_errors == FSCK_OPT_YES ||
+ mutex_lock(&c->fsck_error_msgs_lock);
+ s = fsck_err_get(c, fmt);
+ if (s) {
+ /*
+ * We may be called multiple times for the same error on
+ * transaction restart - this memoizes instead of asking the user
+ * multiple times for the same error:
+ */
+ if (s->last_msg && !strcmp(buf.buf, s->last_msg)) {
+ ret = s->ret;
+ goto err_unlock;
+ }
+
+ kfree(s->last_msg);
+ s->last_msg = kstrdup(buf.buf, GFP_KERNEL);
+ if (!s->last_msg) {
+ ret = -ENOMEM;
+ goto err_unlock;
+ }
+
+ if (c->opts.ratelimit_errors &&
+ !(flags & FSCK_NO_RATELIMIT) &&
+ s->nr >= FSCK_ERR_RATELIMIT_NR) {
+ if (s->nr == FSCK_ERR_RATELIMIT_NR)
+ suppressing = true;
+ else
+ print = false;
+ }
+
+ s->nr++;
+ }
+
+#ifdef BCACHEFS_LOG_PREFIX
+ if (!strncmp(fmt, "bcachefs:", 9))
+ prt_printf(out, bch2_log_msg(c, ""));
+#endif
+
+ if ((flags & FSCK_AUTOFIX) &&
+ (c->opts.errors == BCH_ON_ERROR_continue ||
+ c->opts.errors == BCH_ON_ERROR_fix_safe)) {
+ prt_str(out, ", ");
+ if (flags & FSCK_CAN_FIX) {
+ prt_actioning(out, action);
+ ret = -BCH_ERR_fsck_fix;
+ } else {
+ prt_str(out, ", continuing");
+ ret = -BCH_ERR_fsck_ignore;
+ }
+
+ goto print;
+ } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
+ if (c->opts.errors != BCH_ON_ERROR_continue ||
+ !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
+ prt_str(out, ", shutting down");
+ inconsistent = true;
+ ret = -BCH_ERR_fsck_errors_not_fixed;
+ } else if (flags & FSCK_CAN_FIX) {
+ prt_str(out, ", ");
+ prt_actioning(out, action);
+ ret = -BCH_ERR_fsck_fix;
+ } else {
+ prt_str(out, ", continuing");
+ ret = -BCH_ERR_fsck_ignore;
+ }
+ } else if (c->opts.fix_errors == FSCK_FIX_exit) {
+ prt_str(out, ", exiting");
+ ret = -BCH_ERR_fsck_errors_not_fixed;
+ } else if (flags & FSCK_CAN_FIX) {
+ int fix = s && s->fix
+ ? s->fix
+ : c->opts.fix_errors;
+
+ if (fix == FSCK_FIX_ask) {
+ print = false;
+
+ ret = do_fsck_ask_yn(c, trans, out, action);
+ if (ret < 0)
+ goto err_unlock;
+
+ if (ret >= YN_ALLNO && s)
+ s->fix = ret == YN_ALLNO
+ ? FSCK_FIX_no
+ : FSCK_FIX_yes;
+
+ ret = ret & 1
+ ? -BCH_ERR_fsck_fix
+ : -BCH_ERR_fsck_ignore;
+ } else if (fix == FSCK_FIX_yes ||
(c->opts.nochanges &&
!(flags & FSCK_CAN_IGNORE))) {
- if (print)
- bch_err(c, "%s, fixing", buf);
- fix = true;
+ prt_str(out, ", ");
+ prt_actioning(out, action);
+ ret = -BCH_ERR_fsck_fix;
} else {
- if (print)
- bch_err(c, "%s, not fixing", buf);
- fix = false;
+ prt_str(out, ", not ");
+ prt_actioning(out, action);
}
- } else if (flags & FSCK_NEED_FSCK) {
- if (print)
- bch_err(c, "%s (run fsck to correct)", buf);
- } else {
- if (print)
- bch_err(c, "%s (repair unimplemented)", buf);
+ } else if (!(flags & FSCK_CAN_IGNORE)) {
+ prt_str(out, " (repair unimplemented)");
}
- if (suppressing)
+ if (ret == -BCH_ERR_fsck_ignore &&
+ (c->opts.fix_errors == FSCK_FIX_exit ||
+ !(flags & FSCK_CAN_IGNORE)))
+ ret = -BCH_ERR_fsck_errors_not_fixed;
+
+ if (test_bit(BCH_FS_fsck_running, &c->flags) &&
+ (ret != -BCH_ERR_fsck_fix &&
+ ret != -BCH_ERR_fsck_ignore)) {
+ exiting = true;
+ print = true;
+ }
+print:
+ if (print) {
+ if (bch2_fs_stdio_redirect(c))
+ bch2_print(c, "%s\n", out->buf);
+ else
+ bch2_print_string_as_lines(KERN_ERR, out->buf);
+ }
+
+ if (exiting)
+ bch_err(c, "Unable to continue, halting");
+ else if (suppressing)
bch_err(c, "Ratelimiting new instances of previous error");
- mutex_unlock(&c->fsck_error_lock);
+ if (s)
+ s->ret = ret;
- set_bit(fix
- ? BCH_FS_FSCK_FIXED_ERRORS
- : BCH_FS_FSCK_UNFIXED_ERRORS, &c->flags);
+ if (inconsistent)
+ bch2_inconsistent_error(c);
- return fix ? FSCK_ERR_FIX
- : flags & FSCK_CAN_IGNORE ? FSCK_ERR_IGNORE
- : FSCK_ERR_EXIT;
+ /*
+ * We don't yet track whether the filesystem currently has errors, for
+ * log_fsck_err()s: that would require us to track for every error type
+ * which recovery pass corrects it, to get the fsck exit status correct:
+ */
+ if (flags & FSCK_CAN_FIX) {
+ if (ret == -BCH_ERR_fsck_fix) {
+ set_bit(BCH_FS_errors_fixed, &c->flags);
+ } else {
+ set_bit(BCH_FS_errors_not_fixed, &c->flags);
+ set_bit(BCH_FS_error, &c->flags);
+ }
+ }
+err_unlock:
+ mutex_unlock(&c->fsck_error_msgs_lock);
+err:
+ if (action != action_orig)
+ kfree(action);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static const char * const bch2_bkey_validate_contexts[] = {
+#define x(n) #n,
+ BKEY_VALIDATE_CONTEXTS()
+#undef x
+ NULL
+};
+
+int __bch2_bkey_fsck_err(struct bch_fs *c,
+ struct bkey_s_c k,
+ struct bkey_validate_context from,
+ enum bch_sb_error_id err,
+ const char *fmt, ...)
+{
+ if (from.flags & BCH_VALIDATE_silent)
+ return -BCH_ERR_fsck_delete_bkey;
+
+ unsigned fsck_flags = 0;
+ if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) {
+ //if (test_bit(err, c->sb.errors_silent))
+ // return -BCH_ERR_fsck_delete_bkey;
+
+ fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX;
+ }
+ if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
+ fsck_flags |= fsck_flags_extra[err];
+
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "invalid bkey in %s btree=",
+ bch2_bkey_validate_contexts[from.from]);
+ bch2_btree_id_to_text(&buf, from.btree);
+ prt_printf(&buf, " level=%u: ", from.level);
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\n ");
+
+ va_list args;
+ va_start(args, fmt);
+ prt_vprintf(&buf, fmt, args);
+ va_end(args);
+
+ prt_str(&buf, ": delete?");
+
+ int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return ret;
}
void bch2_flush_fsck_errs(struct bch_fs *c)
{
struct fsck_err_state *s, *n;
- mutex_lock(&c->fsck_error_lock);
- set_bit(BCH_FS_FSCK_DONE, &c->flags);
+ mutex_lock(&c->fsck_error_msgs_lock);
- list_for_each_entry_safe(s, n, &c->fsck_errors, list) {
- if (s->nr > 10)
- bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf);
+ list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) {
+ if (s->ratelimited && s->last_msg)
+ bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg);
list_del(&s->list);
+ kfree(s->last_msg);
kfree(s);
}
- mutex_unlock(&c->fsck_error_lock);
+ mutex_unlock(&c->fsck_error_msgs_lock);
+}
+
+int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum)
+{
+ u32 restart_count = trans->restart_count;
+ int ret = 0;
+
+ /* XXX: we don't yet attempt to print paths when we don't know the subvol */
+ if (inum.subvol)
+ ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out));
+ if (!inum.subvol || ret)
+ prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum);
+
+ return trans_was_restarted(trans, restart_count);
+}
+
+int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
+ subvol_inum inum, u64 offset)
+{
+ int ret = bch2_inum_err_msg_trans(trans, out, inum);
+ prt_printf(out, " offset %llu: ", offset);
+ return ret;
+}
+
+void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum)
+{
+ bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum));
+}
+
+void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out,
+ subvol_inum inum, u64 offset)
+{
+ bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset));
}
diff --git a/libbcachefs/error.h b/libbcachefs/error.h
index 588e763f..7acf2a27 100644
--- a/libbcachefs/error.h
+++ b/libbcachefs/error.h
@@ -1,8 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_ERROR_H
#define _BCACHEFS_ERROR_H
#include <linux/list.h>
#include <linux/printk.h>
+#include "bkey_types.h"
+#include "sb-errors.h"
struct bch_dev;
struct bch_fs;
@@ -16,26 +19,6 @@ struct work_struct;
/* Error messages: */
/*
- * Very fatal logic/inconsistency errors: these indicate that we've majorly
- * screwed up at runtime, i.e. it's not likely that it was just caused by the
- * data on disk being inconsistent. These BUG():
- *
- * XXX: audit and convert to inconsistent() checks
- */
-
-#define bch2_fs_bug(c, ...) \
-do { \
- bch_err(c, __VA_ARGS__); \
- BUG(); \
-} while (0)
-
-#define bch2_fs_bug_on(cond, c, ...) \
-do { \
- if (cond) \
- bch2_fs_bug(c, __VA_ARGS__); \
-} while (0)
-
-/*
* Inconsistency errors: The on disk data is inconsistent. If these occur during
* initial recovery, they don't indicate a bug in the running code - we walk all
* the metadata before modifying anything. If they occur at runtime, they
@@ -48,38 +31,45 @@ do { \
bool bch2_inconsistent_error(struct bch_fs *);
+int bch2_topology_error(struct bch_fs *);
+
+#define bch2_fs_topology_error(c, ...) \
+({ \
+ bch_err(c, "btree topology error: " __VA_ARGS__); \
+ bch2_topology_error(c); \
+})
+
#define bch2_fs_inconsistent(c, ...) \
({ \
bch_err(c, __VA_ARGS__); \
bch2_inconsistent_error(c); \
})
-#define bch2_fs_inconsistent_on(cond, c, ...) \
+#define bch2_fs_inconsistent_on(cond, ...) \
({ \
- int _ret = !!(cond); \
- \
+ bool _ret = unlikely(!!(cond)); \
if (_ret) \
- bch2_fs_inconsistent(c, __VA_ARGS__); \
+ bch2_fs_inconsistent(__VA_ARGS__); \
_ret; \
})
/*
- * Later we might want to mark only the particular device inconsistent, not the
- * entire filesystem:
+ * When a transaction update discovers or is causing a fs inconsistency, it's
+ * helpful to also dump the pending updates:
*/
+#define bch2_trans_inconsistent(trans, ...) \
+({ \
+ bch_err(trans->c, __VA_ARGS__); \
+ bch2_dump_trans_updates(trans); \
+ bch2_inconsistent_error(trans->c); \
+})
-#define bch2_dev_inconsistent(ca, ...) \
-do { \
- bch_err(ca, __VA_ARGS__); \
- bch2_inconsistent_error((ca)->fs); \
-} while (0)
-
-#define bch2_dev_inconsistent_on(cond, ca, ...) \
+#define bch2_trans_inconsistent_on(cond, trans, ...) \
({ \
- int _ret = !!(cond); \
+ bool _ret = unlikely(!!(cond)); \
\
if (_ret) \
- bch2_dev_inconsistent(ca, __VA_ARGS__); \
+ bch2_trans_inconsistent(trans, __VA_ARGS__); \
_ret; \
})
@@ -88,79 +78,110 @@ do { \
* be able to repair:
*/
-enum {
- BCH_FSCK_OK = 0,
- BCH_FSCK_ERRORS_NOT_FIXED = 1,
- BCH_FSCK_REPAIR_UNIMPLEMENTED = 2,
- BCH_FSCK_REPAIR_IMPOSSIBLE = 3,
- BCH_FSCK_UNKNOWN_VERSION = 4,
-};
-
-enum fsck_err_opts {
- FSCK_OPT_EXIT,
- FSCK_OPT_YES,
- FSCK_OPT_NO,
- FSCK_OPT_ASK,
-};
-
-enum fsck_err_ret {
- FSCK_ERR_IGNORE = 0,
- FSCK_ERR_FIX = 1,
- FSCK_ERR_EXIT = 2,
-};
-
struct fsck_err_state {
struct list_head list;
const char *fmt;
u64 nr;
- char buf[512];
+ bool ratelimited;
+ int ret;
+ int fix;
+ char *last_msg;
};
-#define FSCK_CAN_FIX (1 << 0)
-#define FSCK_CAN_IGNORE (1 << 1)
-#define FSCK_NEED_FSCK (1 << 2)
+#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err)
+
+__printf(5, 6) __cold
+int __bch2_fsck_err(struct bch_fs *, struct btree_trans *,
+ enum bch_fsck_flags,
+ enum bch_sb_error_id,
+ const char *, ...);
+#define bch2_fsck_err(c, _flags, _err_type, ...) \
+ __bch2_fsck_err(type_is(c, struct bch_fs *) ? (struct bch_fs *) c : NULL,\
+ type_is(c, struct btree_trans *) ? (struct btree_trans *) c : NULL,\
+ _flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__)
-enum fsck_err_ret bch2_fsck_err(struct bch_fs *,
- unsigned, const char *, ...);
void bch2_flush_fsck_errs(struct bch_fs *);
-#define __fsck_err(c, _flags, msg, ...) \
+#define fsck_err_wrap(_do) \
({ \
- int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\
- \
- if (_fix == FSCK_ERR_EXIT) { \
- bch_err(c, "Unable to continue, halting"); \
- ret = BCH_FSCK_ERRORS_NOT_FIXED; \
+ int _ret = _do; \
+ if (_ret != -BCH_ERR_fsck_fix && \
+ _ret != -BCH_ERR_fsck_ignore) { \
+ ret = _ret; \
goto fsck_err; \
} \
\
- _fix; \
+ _ret == -BCH_ERR_fsck_fix; \
})
+#define __fsck_err(...) fsck_err_wrap(bch2_fsck_err(__VA_ARGS__))
+
/* These macros return true if error should be fixed: */
/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
-#define __fsck_err_on(cond, c, _flags, ...) \
- ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false)
+#define __fsck_err_on(cond, c, _flags, _err_type, ...) \
+({ \
+ might_sleep(); \
+ \
+ if (type_is(c, struct bch_fs *)) \
+ WARN_ON(bch2_current_has_btree_trans((struct bch_fs *) c));\
+ \
+ (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false);\
+})
+
+#define mustfix_fsck_err(c, _err_type, ...) \
+ __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
-#define need_fsck_err_on(cond, c, ...) \
- __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
+#define mustfix_fsck_err_on(cond, c, _err_type, ...) \
+ __fsck_err_on(cond, c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
-#define need_fsck_err(c, ...) \
- __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
+#define fsck_err(c, _err_type, ...) \
+ __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-#define mustfix_fsck_err(c, ...) \
- __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__)
+#define fsck_err_on(cond, c, _err_type, ...) \
+ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-#define mustfix_fsck_err_on(cond, c, ...) \
- __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__)
+#define log_fsck_err(c, _err_type, ...) \
+ __fsck_err(c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-#define fsck_err(c, ...) \
- __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
+#define log_fsck_err_on(cond, ...) \
+({ \
+ bool _ret = unlikely(!!(cond)); \
+ if (_ret) \
+ log_fsck_err(__VA_ARGS__); \
+ _ret; \
+})
-#define fsck_err_on(cond, c, ...) \
- __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
+enum bch_validate_flags;
+__printf(5, 6)
+int __bch2_bkey_fsck_err(struct bch_fs *,
+ struct bkey_s_c,
+ struct bkey_validate_context from,
+ enum bch_sb_error_id,
+ const char *, ...);
+
+/*
+ * for now, bkey fsck errors are always handled by deleting the entire key -
+ * this will change at some point
+ */
+#define bkey_fsck_err(c, _err_type, _err_msg, ...) \
+do { \
+ int _ret = __bch2_bkey_fsck_err(c, k, from, \
+ BCH_FSCK_ERR_##_err_type, \
+ _err_msg, ##__VA_ARGS__); \
+ if (_ret != -BCH_ERR_fsck_fix && \
+ _ret != -BCH_ERR_fsck_ignore) \
+ ret = _ret; \
+ ret = -BCH_ERR_fsck_delete_bkey; \
+ goto fsck_err; \
+} while (0)
+
+#define bkey_fsck_err_on(cond, ...) \
+do { \
+ if (unlikely(cond)) \
+ bkey_fsck_err(__VA_ARGS__); \
+} while (0)
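
The fsck_err*() and bkey_fsck_err*() macros above assume the enclosing function declares a local int ret and an fsck_err: label: any result other than fix/ignore is assigned to ret followed by a goto. A hypothetical caller sketch, with the check and the error-type name invented purely for illustration:

	static int check_example(struct bch_fs *c, u64 nr)
	{
		int ret = 0;

		if (fsck_err_on(nr > 100, c,
				example_err_type,	/* placeholder for a real BCH_FSCK_ERR_* suffix */
				"too many entries (%llu)", nr)) {
			/* macro returned true: user or fix_errors policy said fix - repair here */
		}
	fsck_err:
		return ret;
	}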
/*
* Fatal errors: these don't indicate a bug, but we can't continue running in RW
@@ -169,15 +190,15 @@ void bch2_flush_fsck_errs(struct bch_fs *);
void bch2_fatal_error(struct bch_fs *);
-#define bch2_fs_fatal_error(c, ...) \
+#define bch2_fs_fatal_error(c, _msg, ...) \
do { \
- bch_err(c, __VA_ARGS__); \
+ bch_err(c, "%s(): fatal error " _msg, __func__, ##__VA_ARGS__); \
bch2_fatal_error(c); \
} while (0)
#define bch2_fs_fatal_err_on(cond, c, ...) \
({ \
- int _ret = !!(cond); \
+ bool _ret = unlikely(!!(cond)); \
\
if (_ret) \
bch2_fs_fatal_error(c, __VA_ARGS__); \
@@ -193,36 +214,34 @@ do { \
void bch2_io_error_work(struct work_struct *);
/* Does the error handling without logging a message */
-void bch2_io_error(struct bch_dev *);
-
-/* Logs message and handles the error: */
-#define bch2_dev_io_error(ca, fmt, ...) \
-do { \
- printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \
- "IO error on %s for " fmt), \
- (ca)->name, ##__VA_ARGS__); \
- bch2_io_error(ca); \
-} while (0)
+void bch2_io_error(struct bch_dev *, enum bch_member_error_type);
-#define bch2_dev_io_err_on(cond, ca, ...) \
+#define bch2_dev_io_err_on(cond, ca, _type, ...) \
({ \
bool _ret = (cond); \
\
- if (_ret) \
- bch2_dev_io_error(ca, __VA_ARGS__); \
+ if (_ret) { \
+ bch_err_dev_ratelimited(ca, __VA_ARGS__); \
+ bch2_io_error(ca, _type); \
+ } \
_ret; \
})
-/* kill? */
+#define bch2_dev_inum_io_err_on(cond, ca, _type, ...) \
+({ \
+ bool _ret = (cond); \
+ \
+ if (_ret) { \
+ bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \
+ bch2_io_error(ca, _type); \
+ } \
+ _ret; \
+})
-#define __bcache_io_error(c, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt(c, \
- "IO error: " fmt), ##__VA_ARGS__)
+int bch2_inum_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum);
+int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64);
-#define bcache_io_error(c, bio, fmt, ...) \
-do { \
- __bcache_io_error(c, fmt, ##__VA_ARGS__); \
- (bio)->bi_status = BLK_STS_IOERR; \
-} while (0)
+void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum);
+void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64);
#endif /* _BCACHEFS_ERROR_H */
diff --git a/libbcachefs/extent_update.c b/libbcachefs/extent_update.c
new file mode 100644
index 00000000..6aac579a
--- /dev/null
+++ b/libbcachefs/extent_update.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "debug.h"
+#include "extents.h"
+#include "extent_update.h"
+
+/*
+ * This counts the number of iterators to the alloc & ec btrees we'll need
+ * for inserting/removing this extent:
+ */
+static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ unsigned ret = 0, lru = 0;
+
+ bkey_extent_entry_for_each(ptrs, entry) {
+ switch (__extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_ptr:
+ /* Might also be updating LRU btree */
+ if (entry->ptr.cached)
+ lru++;
+
+ fallthrough;
+ case BCH_EXTENT_ENTRY_stripe_ptr:
+ ret++;
+ }
+ }
+
+ /*
+ * Updating keys in the alloc btree may also update keys in the
+ * freespace or discard btrees:
+ */
+ return lru + ret * 2;
+}
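
Worked example with made-up numbers: an extent carrying three BCH_EXTENT_ENTRY_ptr entries, one of them cached, plus one stripe_ptr entry gives ret = 4 and lru = 1, so the function returns 1 + 4 * 2 = 9 - each alloc-btree update may also touch the freespace or discard btrees (the factor of two), and the cached pointer may additionally update the LRU btree.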
+
+static int count_iters_for_insert(struct btree_trans *trans,
+ struct bkey_s_c k,
+ unsigned offset,
+ struct bpos *end,
+ unsigned *nr_iters,
+ unsigned max_iters)
+{
+ int ret = 0, ret2 = 0;
+
+ if (*nr_iters >= max_iters) {
+ *end = bpos_min(*end, k.k->p);
+ ret = 1;
+ }
+
+ switch (k.k->type) {
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v:
+ *nr_iters += bch2_bkey_nr_alloc_ptrs(k);
+
+ if (*nr_iters >= max_iters) {
+ *end = bpos_min(*end, k.k->p);
+ ret = 1;
+ }
+
+ break;
+ case KEY_TYPE_reflink_p: {
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+ u64 idx = REFLINK_P_IDX(p.v);
+ unsigned sectors = bpos_min(*end, p.k->p).offset -
+ bkey_start_offset(p.k);
+ struct btree_iter iter;
+ struct bkey_s_c r_k;
+
+ for_each_btree_key_norestart(trans, iter,
+ BTREE_ID_reflink, POS(0, idx + offset),
+ BTREE_ITER_slots, r_k, ret2) {
+ if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors)))
+ break;
+
+ /* extent_update_to_keys(), for the reflink_v update */
+ *nr_iters += 1;
+
+ *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k);
+
+ if (*nr_iters >= max_iters) {
+ struct bpos pos = bkey_start_pos(k.k);
+ pos.offset += min_t(u64, k.k->size,
+ r_k.k->p.offset - idx);
+
+ *end = bpos_min(*end, pos);
+ ret = 1;
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ break;
+ }
+ }
+
+ return ret2 ?: ret;
+}
+
+#define EXTENT_ITERS_MAX (BTREE_ITER_INITIAL / 3)
+
+int bch2_extent_atomic_end(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *insert,
+ struct bpos *end)
+{
+ struct btree_iter copy;
+ struct bkey_s_c k;
+ unsigned nr_iters = 0;
+ int ret;
+
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ return ret;
+
+ *end = insert->k.p;
+
+ /* extent_update_to_keys(): */
+ nr_iters += 1;
+
+ ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end,
+ &nr_iters, EXTENT_ITERS_MAX / 2);
+ if (ret < 0)
+ return ret;
+
+ bch2_trans_copy_iter(&copy, iter);
+
+ for_each_btree_key_max_continue_norestart(copy, insert->k.p, 0, k, ret) {
+ unsigned offset = 0;
+
+ if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k)))
+ offset = bkey_start_offset(&insert->k) -
+ bkey_start_offset(k.k);
+
+ /* extent_handle_overwrites(): */
+ switch (bch2_extent_overlap(&insert->k, k.k)) {
+ case BCH_EXTENT_OVERLAP_ALL:
+ case BCH_EXTENT_OVERLAP_FRONT:
+ nr_iters += 1;
+ break;
+ case BCH_EXTENT_OVERLAP_BACK:
+ case BCH_EXTENT_OVERLAP_MIDDLE:
+ nr_iters += 2;
+ break;
+ }
+
+ ret = count_iters_for_insert(trans, k, offset, end,
+ &nr_iters, EXTENT_ITERS_MAX);
+ if (ret)
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &copy);
+ return ret < 0 ? ret : 0;
+}
+
+int bch2_extent_trim_atomic(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *k)
+{
+ struct bpos end;
+ int ret;
+
+ ret = bch2_extent_atomic_end(trans, iter, k, &end);
+ if (ret)
+ return ret;
+
+ bch2_cut_back(end, k);
+ return 0;
+}
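
A hedged sketch of the intended calling pattern for the two helpers above: trim the insert to an atomic boundary, then queue the (possibly shortened) update, with the caller looping for any remainder. Iterator setup, transaction restarts and error handling are assumed to be handled by the surrounding code:

	static int extent_update_example(struct btree_trans *trans,
					 struct btree_iter *iter,
					 struct bkey_i *insert)
	{
		int ret = bch2_extent_trim_atomic(trans, iter, insert);
		if (ret)
			return ret;

		/* insert may have been cut back; commit what fits */
		return bch2_trans_update(trans, iter, insert, 0);
	}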
diff --git a/libbcachefs/extent_update.h b/libbcachefs/extent_update.h
new file mode 100644
index 00000000..6f5cf449
--- /dev/null
+++ b/libbcachefs/extent_update.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENT_UPDATE_H
+#define _BCACHEFS_EXTENT_UPDATE_H
+
+#include "bcachefs.h"
+
+int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, struct bpos *);
+int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *);
+
+#endif /* _BCACHEFS_EXTENT_UPDATE_H */
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index fe4bb527..2fc9ace5 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
*
@@ -7,303 +8,475 @@
#include "bcachefs.h"
#include "bkey_methods.h"
+#include "btree_cache.h"
#include "btree_gc.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "btree_iter.h"
#include "buckets.h"
#include "checksum.h"
+#include "compress.h"
#include "debug.h"
-#include "dirent.h"
#include "disk_groups.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
#include "journal.h"
+#include "rebalance.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
+#include "trace.h"
#include "util.h"
-#include "xattr.h"
-#include <trace/events/bcachefs.h>
+static unsigned bch2_crc_field_size_max[] = {
+ [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
+ [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
+ [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX,
+};
+
+static void bch2_extent_crc_pack(union bch_extent_crc *,
+ struct bch_extent_crc_unpacked,
+ enum bch_extent_entry_type);
+
+struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
+ unsigned dev)
+{
+ struct bch_dev_io_failures *i;
+
+ for (i = f->devs; i < f->devs + f->nr; i++)
+ if (i->dev == dev)
+ return i;
+
+ return NULL;
+}
-static void sort_key_next(struct btree_node_iter_large *iter,
- struct btree *b,
- struct btree_node_iter_set *i)
+void bch2_mark_io_failure(struct bch_io_failures *failed,
+ struct extent_ptr_decoded *p)
{
- i->k += __btree_node_offset_to_key(b, i->k)->u64s;
+ struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);
+
+ if (!f) {
+ BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
- if (i->k == i->end)
- *i = iter->data[--iter->used];
+ f = &failed->devs[failed->nr++];
+ f->dev = p->ptr.dev;
+ f->idx = p->idx;
+ f->nr_failed = 1;
+ f->nr_retries = 0;
+ } else if (p->idx != f->idx) {
+ f->idx = p->idx;
+ f->nr_failed = 1;
+ f->nr_retries = 0;
+ } else {
+ f->nr_failed++;
+ }
+}
+
+static inline u64 dev_latency(struct bch_fs *c, unsigned dev)
+{
+ struct bch_dev *ca = bch2_dev_rcu(c, dev);
+ return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
}
/*
- * Returns true if l > r - unless l == r, in which case returns true if l is
- * older than r.
- *
- * Necessary for btree_sort_fixup() - if there are multiple keys that compare
- * equal in different sets, we have to process them newest to oldest.
+ * returns true if p1 is better than p2:
*/
-#define key_sort_cmp(h, l, r) \
-({ \
- bkey_cmp_packed(b, \
- __btree_node_offset_to_key(b, (l).k), \
- __btree_node_offset_to_key(b, (r).k)) \
- \
- ?: (l).k - (r).k; \
-})
-
-static inline bool should_drop_next_key(struct btree_node_iter_large *iter,
- struct btree *b)
+static inline bool ptr_better(struct bch_fs *c,
+ const struct extent_ptr_decoded p1,
+ const struct extent_ptr_decoded p2)
{
- struct btree_node_iter_set *l = iter->data, *r = iter->data + 1;
- struct bkey_packed *k = __btree_node_offset_to_key(b, l->k);
+ if (likely(!p1.idx && !p2.idx)) {
+ u64 l1 = dev_latency(c, p1.ptr.dev);
+ u64 l2 = dev_latency(c, p2.ptr.dev);
- if (bkey_whiteout(k))
- return true;
+ /*
+ * Square the latencies, to bias more in favor of the faster
+ * device - we never want to stop issuing reads to the slower
+ * device altogether, so that we can update our latency numbers:
+ */
+ l1 *= l1;
+ l2 *= l2;
- if (iter->used < 2)
- return false;
+ /* Pick at random, biased in favor of the faster device: */
- if (iter->used > 2 &&
- key_sort_cmp(iter, r[0], r[1]) >= 0)
- r++;
+ return bch2_rand_range(l1 + l2) > l1;
+ }
- /*
- * key_sort_cmp() ensures that when keys compare equal the older key
- * comes first; so if l->k compares equal to r->k then l->k is older and
- * should be dropped.
- */
- return !bkey_cmp_packed(b,
- __btree_node_offset_to_key(b, l->k),
- __btree_node_offset_to_key(b, r->k));
+ if (bch2_force_reconstruct_read)
+ return p1.idx > p2.idx;
+
+ return p1.idx < p2.idx;
}
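
For illustration, with current read latencies of 2 and 6 (arbitrary units) the squared values are 4 and 36, so, assuming bch2_rand_range(n) is uniform on [0, n), ptr_better() prefers the faster device about 35 times out of 40 (~88%), while the slower device still sees enough reads to keep its latency estimate current.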
-struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst,
- struct btree *b,
- struct btree_node_iter_large *iter)
+/*
+ * This picks a non-stale pointer to read from, preferring devices that
+ * haven't already failed this read (@failed may be NULL); a pointer that has
+ * failed is retried via erasure coding reconstruction when available,
+ * otherwise skipped.
+ */
+int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_io_failures *failed,
+ struct extent_ptr_decoded *pick)
{
- struct bkey_packed *out = dst->start;
- struct btree_nr_keys nr;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct bch_dev_io_failures *f;
+ int ret = 0;
+
+ if (k.k->type == KEY_TYPE_error)
+ return -BCH_ERR_key_type_error;
+
+ rcu_read_lock();
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ /*
+ * Unwritten extent: no need to actually read, treat it as a
+ * hole and return 0s:
+ */
+ if (p.ptr.unwritten) {
+ ret = 0;
+ break;
+ }
- memset(&nr, 0, sizeof(nr));
+ /*
+ * If there are any dirty pointers it's an error if we can't
+ * read:
+ */
+ if (!ret && !p.ptr.cached)
+ ret = -BCH_ERR_no_device_to_read_from;
- heap_resort(iter, key_sort_cmp);
+ struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
- while (!bch2_btree_node_iter_large_end(iter)) {
- if (!should_drop_next_key(iter, b)) {
- struct bkey_packed *k =
- __btree_node_offset_to_key(b, iter->data->k);
+ if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
+ continue;
- bkey_copy(out, k);
- btree_keys_account_key_add(&nr, 0, out);
- out = bkey_next(out);
- }
+ f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
+ if (f)
+ p.idx = f->nr_failed < f->nr_retries
+ ? f->idx
+ : f->idx + 1;
+
+ if (!p.idx && (!ca || !bch2_dev_is_readable(ca)))
+ p.idx++;
- sort_key_next(iter, b, iter->data);
- heap_sift_down(iter, 0, key_sort_cmp);
+ if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
+ p.idx++;
+
+ if (p.idx > (unsigned) p.has_ec)
+ continue;
+
+ if (ret > 0 && !ptr_better(c, p, *pick))
+ continue;
+
+ *pick = p;
+ ret = 1;
}
+ rcu_read_unlock();
- dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
- return nr;
+ return ret;
}
-/* Common among btree and extent ptrs */
+/* KEY_TYPE_btree_ptr: */
-const struct bch_extent_ptr *
-bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
+int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
- const struct bch_extent_ptr *ptr;
+ int ret = 0;
- extent_for_each_ptr(e, ptr)
- if (ptr->dev == dev)
- return ptr;
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX,
+ c, btree_ptr_val_too_big,
+ "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
- return NULL;
+ ret = bch2_bkey_ptrs_validate(c, k, from);
+fsck_err:
+ return ret;
}
-bool bch2_extent_drop_device(struct bkey_s_extent e, unsigned dev)
+void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
{
- struct bch_extent_ptr *ptr;
- bool dropped = false;
-
- extent_for_each_ptr_backwards(e, ptr)
- if (ptr->dev == dev) {
- __bch2_extent_drop_ptr(e, ptr);
- dropped = true;
- }
-
- if (dropped)
- bch2_extent_drop_redundant_crcs(e);
- return dropped;
+ bch2_bkey_ptrs_to_text(out, c, k);
}
-const struct bch_extent_ptr *
-bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group)
+int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
- const struct bch_extent_ptr *ptr;
+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
+ int ret = 0;
- extent_for_each_ptr(e, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX,
+ c, btree_ptr_v2_val_too_big,
+ "value too big (%zu > %zu)",
+ bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
- if (ca->mi.group &&
- ca->mi.group - 1 == group)
- return ptr;
- }
+ bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p),
+ c, btree_ptr_v2_min_key_bad,
+ "min_key > key");
- return NULL;
+ if ((from.flags & BCH_VALIDATE_write) &&
+ c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written)
+ bkey_fsck_err_on(!bp.v->sectors_written,
+ c, btree_ptr_v2_written_0,
+ "sectors_written == 0");
+
+ ret = bch2_bkey_ptrs_validate(c, k, from);
+fsck_err:
+ return ret;
}
-const struct bch_extent_ptr *
-bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target)
+void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
{
- const struct bch_extent_ptr *ptr;
+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
- extent_for_each_ptr(e, ptr)
- if (bch2_dev_in_target(c, ptr->dev, target) &&
- (!ptr->cached ||
- !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
- return ptr;
+ prt_printf(out, "seq %llx written %u min_key %s",
+ le64_to_cpu(bp.v->seq),
+ le16_to_cpu(bp.v->sectors_written),
+ BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "");
- return NULL;
+ bch2_bpos_to_text(out, bp.v->min_key);
+ prt_printf(out, " ");
+ bch2_bkey_ptrs_to_text(out, c, k);
}
-unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent e)
+void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
+ unsigned big_endian, int write,
+ struct bkey_s k)
{
- const struct bch_extent_ptr *ptr;
- unsigned nr_ptrs = 0;
+ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k);
- extent_for_each_ptr(e, ptr)
- nr_ptrs++;
+ compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key);
- return nr_ptrs;
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_id_is_extents(btree_id) &&
+ !bkey_eq(bp.v->min_key, POS_MIN))
+ bp.v->min_key = write
+ ? bpos_nosnap_predecessor(bp.v->min_key)
+ : bpos_nosnap_successor(bp.v->min_key);
}
-unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
-{
- struct bkey_s_c_extent e;
- const struct bch_extent_ptr *ptr;
- unsigned nr_ptrs = 0;
+/* KEY_TYPE_extent: */
- switch (k.k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- e = bkey_s_c_to_extent(k);
+bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
+{
+ struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l);
+ struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r);
+ union bch_extent_entry *en_l;
+ const union bch_extent_entry *en_r;
+ struct extent_ptr_decoded lp, rp;
+ bool use_right_ptr;
- extent_for_each_ptr(e, ptr)
- nr_ptrs += !ptr->cached;
- break;
+ en_l = l_ptrs.start;
+ en_r = r_ptrs.start;
+ while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
+ if (extent_entry_type(en_l) != extent_entry_type(en_r))
+ return false;
- case BCH_RESERVATION:
- nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas;
- break;
+ en_l = extent_entry_next(en_l);
+ en_r = extent_entry_next(en_r);
}
- return nr_ptrs;
-}
+ if (en_l < l_ptrs.end || en_r < r_ptrs.end)
+ return false;
-unsigned bch2_extent_ptr_durability(struct bch_fs *c,
- const struct bch_extent_ptr *ptr)
-{
- struct bch_dev *ca;
+ en_l = l_ptrs.start;
+ en_r = r_ptrs.start;
+ lp.crc = bch2_extent_crc_unpack(l.k, NULL);
+ rp.crc = bch2_extent_crc_unpack(r.k, NULL);
+
+ while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) &&
+ __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) {
+ if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size !=
+ rp.ptr.offset + rp.crc.offset ||
+ lp.ptr.dev != rp.ptr.dev ||
+ lp.ptr.gen != rp.ptr.gen ||
+ lp.ptr.unwritten != rp.ptr.unwritten ||
+ lp.has_ec != rp.has_ec)
+ return false;
- if (ptr->cached)
- return 0;
+ /* Extents may not straddle buckets: */
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev);
+ bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr);
+ rcu_read_unlock();
- ca = bch_dev_bkey_exists(c, ptr->dev);
+ if (!same_bucket)
+ return false;
- if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
- return 0;
+ if (lp.has_ec != rp.has_ec ||
+ (lp.has_ec &&
+ (lp.ec.block != rp.ec.block ||
+ lp.ec.redundancy != rp.ec.redundancy ||
+ lp.ec.idx != rp.ec.idx)))
+ return false;
- return ca->mi.durability;
-}
+ if (lp.crc.compression_type != rp.crc.compression_type ||
+ lp.crc.nonce != rp.crc.nonce)
+ return false;
-unsigned bch2_extent_durability(struct bch_fs *c, struct bkey_s_c_extent e)
-{
- const struct bch_extent_ptr *ptr;
- unsigned durability = 0;
+ if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
+ lp.crc.uncompressed_size) {
+ /* can use left extent's crc entry */
+ } else if (lp.crc.live_size <= rp.crc.offset) {
+ /* can use right extent's crc entry */
+ } else {
+ /* check if checksums can be merged: */
+ if (lp.crc.csum_type != rp.crc.csum_type ||
+ lp.crc.nonce != rp.crc.nonce ||
+ crc_is_compressed(lp.crc) ||
+ !bch2_checksum_mergeable(lp.crc.csum_type))
+ return false;
+
+ if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size ||
+ rp.crc.offset)
+ return false;
+
+ if (lp.crc.csum_type &&
+ lp.crc.uncompressed_size +
+ rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
+ return false;
+ }
- extent_for_each_ptr(e, ptr)
- durability += bch2_extent_ptr_durability(c, ptr);
+ en_l = extent_entry_next(en_l);
+ en_r = extent_entry_next(en_r);
+ }
- return durability;
-}
+ en_l = l_ptrs.start;
+ en_r = r_ptrs.start;
+ while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
+ if (extent_entry_is_crc(en_l)) {
+ struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
+ struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
-unsigned bch2_extent_is_compressed(struct bkey_s_c k)
-{
- struct bkey_s_c_extent e;
- const struct bch_extent_ptr *ptr;
- struct bch_extent_crc_unpacked crc;
- unsigned ret = 0;
+ if (crc_l.uncompressed_size + crc_r.uncompressed_size >
+ bch2_crc_field_size_max[extent_entry_type(en_l)])
+ return false;
+ }
- switch (k.k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- e = bkey_s_c_to_extent(k);
-
- extent_for_each_ptr_crc(e, ptr, crc)
- if (!ptr->cached &&
- crc.compression_type != BCH_COMPRESSION_NONE &&
- crc.compressed_size < crc.live_size)
- ret = max_t(unsigned, ret, crc.compressed_size);
+ en_l = extent_entry_next(en_l);
+ en_r = extent_entry_next(en_r);
}
- return ret;
+ use_right_ptr = false;
+ en_l = l_ptrs.start;
+ en_r = r_ptrs.start;
+ while (en_l < l_ptrs.end) {
+ if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr &&
+ use_right_ptr)
+ en_l->ptr = en_r->ptr;
+
+ if (extent_entry_is_crc(en_l)) {
+ struct bch_extent_crc_unpacked crc_l =
+ bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
+ struct bch_extent_crc_unpacked crc_r =
+ bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
+
+ use_right_ptr = false;
+
+ if (crc_l.offset + crc_l.live_size + crc_r.live_size <=
+ crc_l.uncompressed_size) {
+ /* can use left extent's crc entry */
+ } else if (crc_l.live_size <= crc_r.offset) {
+ /* can use right extent's crc entry */
+ crc_r.offset -= crc_l.live_size;
+ bch2_extent_crc_pack(entry_to_crc(en_l), crc_r,
+ extent_entry_type(en_l));
+ use_right_ptr = true;
+ } else {
+ crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
+ crc_l.csum,
+ crc_r.csum,
+ crc_r.uncompressed_size << 9);
+
+ crc_l.uncompressed_size += crc_r.uncompressed_size;
+ crc_l.compressed_size += crc_r.compressed_size;
+ bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
+ extent_entry_type(en_l));
+ }
+ }
+
+ en_l = extent_entry_next(en_l);
+ en_r = extent_entry_next(en_r);
+ }
+
+ bch2_key_resize(l.k, l.k->size + r.k->size);
+ return true;
}
-bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e,
- struct bch_extent_ptr m, u64 offset)
+/* KEY_TYPE_reservation: */
+
+int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
- const struct bch_extent_ptr *ptr;
- struct bch_extent_crc_unpacked crc;
+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+ int ret = 0;
- extent_for_each_ptr_crc(e, ptr, crc)
- if (ptr->dev == m.dev &&
- ptr->gen == m.gen &&
- (s64) ptr->offset + crc.offset - bkey_start_offset(e.k) ==
- (s64) m.offset - offset)
- return ptr;
+ bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX,
+ c, reservation_key_nr_replicas_invalid,
+ "invalid nr_replicas (%u)", r.v->nr_replicas);
+fsck_err:
+ return ret;
+}
- return NULL;
+void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+
+ prt_printf(out, "generation %u replicas %u",
+ le32_to_cpu(r.v->generation),
+ r.v->nr_replicas);
}
-/* Doesn't cleanup redundant crcs */
-void __bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
+bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
{
- EBUG_ON(ptr < &e.v->start->ptr ||
- ptr >= &extent_entry_last(e)->ptr);
- EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
- memmove_u64s_down(ptr, ptr + 1,
- (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1));
- e.k->u64s -= sizeof(*ptr) / sizeof(u64);
+ struct bkey_s_reservation l = bkey_s_to_reservation(_l);
+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r);
+
+ if (l.v->generation != r.v->generation ||
+ l.v->nr_replicas != r.v->nr_replicas)
+ return false;
+
+ bch2_key_resize(l.k, l.k->size + r.k->size);
+ return true;
}
-void bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
+/* Extent checksum entries: */
+
+/* returns true if not equal */
+static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
+ struct bch_extent_crc_unpacked r)
{
- __bch2_extent_drop_ptr(e, ptr);
- bch2_extent_drop_redundant_crcs(e);
+ return (l.csum_type != r.csum_type ||
+ l.compression_type != r.compression_type ||
+ l.compressed_size != r.compressed_size ||
+ l.uncompressed_size != r.uncompressed_size ||
+ l.offset != r.offset ||
+ l.live_size != r.live_size ||
+ l.nonce != r.nonce ||
+ bch2_crc_cmp(l.csum, r.csum));
}
static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
struct bch_extent_crc_unpacked n)
{
- return !u.compression_type &&
+ return !crc_is_compressed(u) &&
u.csum_type &&
u.uncompressed_size > u.live_size &&
bch2_csum_type_is_encryption(u.csum_type) ==
bch2_csum_type_is_encryption(n.csum_type);
}
-bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e,
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
struct bch_extent_crc_unpacked n)
{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct bch_extent_crc_unpacked crc;
const union bch_extent_entry *i;
if (!n.csum_type)
return false;
- extent_for_each_crc(e, crc, i)
+ bkey_for_each_crc(k.k, ptrs, crc, i)
if (can_narrow_crc(crc, n))
return true;
@@ -319,2077 +492,1079 @@ bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e,
* currently live (so that readers won't have to bounce) while we've got the
* checksum we need:
*/
-bool bch2_extent_narrow_crcs(struct bkey_i_extent *e,
- struct bch_extent_crc_unpacked n)
+bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
struct bch_extent_crc_unpacked u;
- struct bch_extent_ptr *ptr;
+ struct extent_ptr_decoded p;
union bch_extent_entry *i;
+ bool ret = false;
/* Find a checksum entry that covers only live data: */
- if (!n.csum_type)
- extent_for_each_crc(extent_i_to_s(e), u, i)
- if (!u.compression_type &&
+ if (!n.csum_type) {
+ bkey_for_each_crc(&k->k, ptrs, u, i)
+ if (!crc_is_compressed(u) &&
u.csum_type &&
u.live_size == u.uncompressed_size) {
n = u;
- break;
+ goto found;
}
-
- if (!bch2_can_narrow_extent_crcs(extent_i_to_s_c(e), n))
return false;
-
- BUG_ON(n.compression_type);
+ }
+found:
+ BUG_ON(crc_is_compressed(n));
BUG_ON(n.offset);
- BUG_ON(n.live_size != e->k.size);
+ BUG_ON(n.live_size != k->k.size);
- bch2_extent_crc_append(e, n);
restart_narrow_pointers:
- extent_for_each_ptr_crc(extent_i_to_s(e), ptr, u)
- if (can_narrow_crc(u, n)) {
- ptr->offset += u.offset;
- extent_ptr_append(e, *ptr);
- __bch2_extent_drop_ptr(extent_i_to_s(e), ptr);
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+
+ bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
+ if (can_narrow_crc(p.crc, n)) {
+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr);
+ p.ptr.offset += p.crc.offset;
+ p.crc = n;
+ bch2_extent_ptr_decoded_append(k, &p);
+ ret = true;
goto restart_narrow_pointers;
}
- bch2_extent_drop_redundant_crcs(extent_i_to_s(e));
- return true;
+ return ret;
}
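
Concrete illustration with made-up sizes: given a key whose existing (uncompressed, checksummed) crc entry covers 128 sectors with only 8 live at offset 16, once a checksum n of exactly those 8 live sectors is available, narrowing drops each matching pointer, advances ptr.offset by the old crc.offset (16), and re-appends the pointer with the new 8-sector crc, so later reads verify just the data they return instead of bouncing the full 128 sectors.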
-/* returns true if not equal */
-static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
- struct bch_extent_crc_unpacked r)
+static void bch2_extent_crc_pack(union bch_extent_crc *dst,
+ struct bch_extent_crc_unpacked src,
+ enum bch_extent_entry_type type)
{
- return (l.csum_type != r.csum_type ||
- l.compression_type != r.compression_type ||
- l.compressed_size != r.compressed_size ||
- l.uncompressed_size != r.uncompressed_size ||
- l.offset != r.offset ||
- l.live_size != r.live_size ||
- l.nonce != r.nonce ||
- bch2_crc_cmp(l.csum, r.csum));
+#define set_common_fields(_dst, _src) \
+ _dst.type = 1 << type; \
+ _dst.csum_type = _src.csum_type, \
+ _dst.compression_type = _src.compression_type, \
+ _dst._compressed_size = _src.compressed_size - 1, \
+ _dst._uncompressed_size = _src.uncompressed_size - 1, \
+ _dst.offset = _src.offset
+
+ switch (type) {
+ case BCH_EXTENT_ENTRY_crc32:
+ set_common_fields(dst->crc32, src);
+ dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo);
+ break;
+ case BCH_EXTENT_ENTRY_crc64:
+ set_common_fields(dst->crc64, src);
+ dst->crc64.nonce = src.nonce;
+ dst->crc64.csum_lo = (u64 __force) src.csum.lo;
+ dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi);
+ break;
+ case BCH_EXTENT_ENTRY_crc128:
+ set_common_fields(dst->crc128, src);
+ dst->crc128.nonce = src.nonce;
+ dst->crc128.csum = src.csum;
+ break;
+ default:
+ BUG();
+ }
+#undef set_common_fields
}
-void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
+void bch2_extent_crc_append(struct bkey_i *k,
+ struct bch_extent_crc_unpacked new)
{
- union bch_extent_entry *entry = e.v->start;
- union bch_extent_crc *crc, *prev = NULL;
- struct bch_extent_crc_unpacked u, prev_u = { 0 };
-
- while (entry != extent_entry_last(e)) {
- union bch_extent_entry *next = extent_entry_next(entry);
- size_t crc_u64s = extent_entry_u64s(entry);
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+ union bch_extent_crc *crc = (void *) ptrs.end;
+ enum bch_extent_entry_type type;
- if (!extent_entry_is_crc(entry))
- goto next;
-
- crc = entry_to_crc(entry);
- u = bch2_extent_crc_unpack(e.k, crc);
-
- if (next == extent_entry_last(e)) {
- /* crc entry with no pointers after it: */
- goto drop;
- }
-
- if (extent_entry_is_crc(next)) {
- /* no pointers before next crc entry: */
- goto drop;
- }
-
- if (prev && !bch2_crc_unpacked_cmp(u, prev_u)) {
- /* identical to previous crc entry: */
- goto drop;
- }
+ if (bch_crc_bytes[new.csum_type] <= 4 &&
+ new.uncompressed_size <= CRC32_SIZE_MAX &&
+ new.nonce <= CRC32_NONCE_MAX)
+ type = BCH_EXTENT_ENTRY_crc32;
+ else if (bch_crc_bytes[new.csum_type] <= 10 &&
+ new.uncompressed_size <= CRC64_SIZE_MAX &&
+ new.nonce <= CRC64_NONCE_MAX)
+ type = BCH_EXTENT_ENTRY_crc64;
+ else if (bch_crc_bytes[new.csum_type] <= 16 &&
+ new.uncompressed_size <= CRC128_SIZE_MAX &&
+ new.nonce <= CRC128_NONCE_MAX)
+ type = BCH_EXTENT_ENTRY_crc128;
+ else
+ BUG();
- if (!prev &&
- !u.csum_type &&
- !u.compression_type) {
- /* null crc entry: */
- union bch_extent_entry *e2;
+ bch2_extent_crc_pack(crc, new, type);
- extent_for_each_entry_from(e, e2, extent_entry_next(entry)) {
- if (!extent_entry_is_ptr(e2))
- break;
+ k->k.u64s += extent_entry_u64s(ptrs.end);
- e2->ptr.offset += u.offset;
- }
- goto drop;
- }
-
- prev = crc;
- prev_u = u;
-next:
- entry = next;
- continue;
-drop:
- memmove_u64s_down(crc, next,
- (u64 *) extent_entry_last(e) - (u64 *) next);
- e.k->u64s -= crc_u64s;
- }
-
- EBUG_ON(bkey_val_u64s(e.k) && !bch2_extent_nr_ptrs(e.c));
+ EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
}
-static bool should_drop_ptr(const struct bch_fs *c,
- struct bkey_s_c_extent e,
- const struct bch_extent_ptr *ptr)
-{
- return ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr);
-}
+/* Generic code for keys with pointers: */
-static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
{
- struct bch_extent_ptr *ptr = &e.v->start->ptr;
- bool dropped = false;
-
- while ((ptr = extent_ptr_next(e, ptr)))
- if (should_drop_ptr(c, e.c, ptr)) {
- __bch2_extent_drop_ptr(e, ptr);
- dropped = true;
- } else
- ptr++;
-
- if (dropped)
- bch2_extent_drop_redundant_crcs(e);
+ return bch2_bkey_devs(k).nr;
}
-bool bch2_ptr_normalize(struct bch_fs *c, struct btree *b, struct bkey_s k)
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
{
- return bch2_extent_normalize(c, k);
+ return k.k->type == KEY_TYPE_reservation
+ ? bkey_s_c_to_reservation(k).v->nr_replicas
+ : bch2_bkey_dirty_devs(k).nr;
}
-void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
+unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k)
{
- switch (k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED: {
- union bch_extent_entry *entry;
- u64 *d = (u64 *) bkeyp_val(f, k);
- unsigned i;
+ unsigned ret = 0;
- for (i = 0; i < bkeyp_val_u64s(f, k); i++)
- d[i] = swab64(d[i]);
+ if (k.k->type == KEY_TYPE_reservation) {
+ ret = bkey_s_c_to_reservation(k).v->nr_replicas;
+ } else {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
- for (entry = (union bch_extent_entry *) d;
- entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k));
- entry = extent_entry_next(entry)) {
- switch (extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_crc32:
- entry->crc32.csum = swab32(entry->crc32.csum);
- break;
- case BCH_EXTENT_ENTRY_crc64:
- entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
- entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
- break;
- case BCH_EXTENT_ENTRY_crc128:
- entry->crc128.csum.hi = (__force __le64)
- swab64((__force u64) entry->crc128.csum.hi);
- entry->crc128.csum.lo = (__force __le64)
- swab64((__force u64) entry->crc128.csum.lo);
- break;
- case BCH_EXTENT_ENTRY_ptr:
- break;
- }
- }
- break;
- }
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ ret += !p.ptr.cached && !crc_is_compressed(p.crc);
}
-}
-
-static const char *extent_ptr_invalid(const struct bch_fs *c,
- struct bkey_s_c_extent e,
- const struct bch_extent_ptr *ptr,
- unsigned size_ondisk,
- bool metadata)
-{
- const struct bch_extent_ptr *ptr2;
- struct bch_dev *ca;
- if (ptr->dev >= c->sb.nr_devices ||
- !c->devs[ptr->dev])
- return "pointer to invalid device";
-
- ca = bch_dev_bkey_exists(c, ptr->dev);
- if (!ca)
- return "pointer to invalid device";
-
- extent_for_each_ptr(e, ptr2)
- if (ptr != ptr2 && ptr->dev == ptr2->dev)
- return "multiple pointers to same device";
-
- if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets))
- return "offset past end of device";
-
- if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket))
- return "offset before first bucket";
-
- if (bucket_remainder(ca, ptr->offset) +
- size_ondisk > ca->mi.bucket_size)
- return "spans multiple buckets";
-
- return NULL;
+ return ret;
}
-static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c_extent e)
+unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k)
{
- char *out = buf, *end = buf + size;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
- struct bch_extent_crc_unpacked crc;
- const struct bch_extent_ptr *ptr;
- struct bch_dev *ca;
- bool first = true;
-
-#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
-
- extent_for_each_entry(e, entry) {
- if (!first)
- p(" ");
+ struct extent_ptr_decoded p;
+ unsigned ret = 0;
- switch (__extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_crc32:
- case BCH_EXTENT_ENTRY_crc64:
- case BCH_EXTENT_ENTRY_crc128:
- crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
-
- p("crc: c_size %u size %u offset %u nonce %u csum %u compress %u",
- crc.compressed_size,
- crc.uncompressed_size,
- crc.offset, crc.nonce,
- crc.csum_type,
- crc.compression_type);
- break;
- case BCH_EXTENT_ENTRY_ptr:
- ptr = entry_to_ptr(entry);
- ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
- ? bch_dev_bkey_exists(c, ptr->dev)
- : NULL;
-
- p("ptr: %u:%llu gen %u%s%s", ptr->dev,
- (u64) ptr->offset, ptr->gen,
- ptr->cached ? " cached" : "",
- ca && ptr_stale(ca, ptr)
- ? " stale" : "");
- break;
- default:
- p("(invalid extent entry %.16llx)", *((u64 *) entry));
- goto out;
- }
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (!p.ptr.cached && crc_is_compressed(p.crc))
+ ret += p.crc.compressed_size;
- first = false;
- }
-out:
- if (bkey_extent_is_cached(e.k))
- p(" cached");
-#undef p
- return out - buf;
+ return ret;
}
-static inline bool dev_latency_better(struct bch_fs *c,
- const struct bch_extent_ptr *ptr1,
- const struct bch_extent_ptr *ptr2)
+bool bch2_bkey_is_incompressible(struct bkey_s_c k)
{
- struct bch_dev *dev1 = bch_dev_bkey_exists(c, ptr1->dev);
- struct bch_dev *dev2 = bch_dev_bkey_exists(c, ptr2->dev);
- u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
- u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
-
- /* Pick at random, biased in favor of the faster device: */
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct bch_extent_crc_unpacked crc;
- return bch2_rand_range(l1 + l2) > l1;
+ bkey_for_each_crc(k.k, ptrs, crc, entry)
+ if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
+ return true;
+ return false;
}
-static int extent_pick_read_device(struct bch_fs *c,
- struct bkey_s_c_extent e,
- struct bch_devs_mask *avoid,
- struct extent_pick_ptr *pick)
+unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
{
- const struct bch_extent_ptr *ptr;
- struct bch_extent_crc_unpacked crc;
- struct bch_dev *ca;
- int ret = 0;
-
- extent_for_each_ptr_crc(e, ptr, crc) {
- ca = bch_dev_bkey_exists(c, ptr->dev);
-
- if (ptr->cached && ptr_stale(ca, ptr))
- continue;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p = { 0 };
+ unsigned replicas = 0;
- if (avoid && test_bit(ptr->dev, avoid->d))
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.ptr.cached)
continue;
- if (ret && !dev_latency_better(c, ptr, &pick->ptr))
- continue;
+ if (p.has_ec)
+ replicas += p.ec.redundancy;
- *pick = (struct extent_pick_ptr) {
- .ptr = *ptr,
- .crc = crc,
- };
+ replicas++;
- ret = 1;
}
- return ret;
+ return replicas;
}
-/* Btree ptrs */
-
-const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
+static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p)
{
- if (bkey_extent_is_cached(k.k))
- return "cached";
-
- if (k.k->size)
- return "nonzero key size";
-
- if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
- return "value too big";
-
- switch (k.k->type) {
- case BCH_EXTENT: {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const union bch_extent_entry *entry;
- const struct bch_extent_ptr *ptr;
- const char *reason;
-
- extent_for_each_entry(e, entry) {
- if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
- return "invalid extent entry type";
-
- if (extent_entry_is_crc(entry))
- return "has crc field";
- }
-
- extent_for_each_ptr(e, ptr) {
- reason = extent_ptr_invalid(c, e, ptr,
- c->opts.btree_node_size,
- true);
- if (reason)
- return reason;
- }
-
- return NULL;
- }
+ if (p->ptr.cached)
+ return 0;
- default:
- return "invalid value type";
- }
+ return p->has_ec
+ ? p->ec.redundancy + 1
+ : ca->mi.durability;
}
-void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k)
+unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
{
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const struct bch_extent_ptr *ptr;
- unsigned seq;
- const char *err;
- char buf[160];
- struct bucket_mark mark;
- struct bch_dev *ca;
- unsigned replicas = 0;
- bool bad;
+ struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
- extent_for_each_ptr(e, ptr) {
- ca = bch_dev_bkey_exists(c, ptr->dev);
- replicas++;
-
- if (!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags))
- continue;
-
- err = "stale";
- if (ptr_stale(ca, ptr))
- goto err;
-
- do {
- seq = read_seqcount_begin(&c->gc_pos_lock);
- mark = ptr_bucket_mark(ca, ptr);
-
- bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
- (mark.data_type != BCH_DATA_BTREE ||
- mark.dirty_sectors < c->opts.btree_node_size);
- } while (read_seqcount_retry(&c->gc_pos_lock, seq));
-
- err = "inconsistent";
- if (bad)
- goto err;
- }
-
- if (!bch2_bkey_replicas_marked(c, BCH_DATA_BTREE, e.s_c)) {
- bch2_bkey_val_to_text(c, btree_node_type(b),
- buf, sizeof(buf), k);
- bch2_fs_bug(c,
- "btree key bad (replicas not marked in superblock):\n%s",
- buf);
- return;
- }
-
- return;
-err:
- bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
- bch2_fs_bug(c, "%s btree pointer %s: bucket %zi "
- "gen %i mark %08x",
- err, buf, PTR_BUCKET_NR(ca, ptr),
- mark.gen, (unsigned) mark.v.counter);
+ return ca ? __extent_ptr_durability(ca, p) : 0;
}
-void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c k)
+unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
{
- char *out = buf, *end = buf + size;
- const char *invalid;
-
-#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
-
- if (bkey_extent_is_data(k.k))
- out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k));
+ struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
- invalid = bch2_btree_ptr_invalid(c, k);
- if (invalid)
- p(" invalid: %s", invalid);
-#undef p
-}
+ if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
+ return 0;
-int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
- struct bch_devs_mask *avoid,
- struct extent_pick_ptr *pick)
-{
- return extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key),
- avoid, pick);
+ return __extent_ptr_durability(ca, p);
}
-/* Extents */
-
-static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
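+/* Sum the durability of every pointer in @k: */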
+unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
{
- u64 len = 0;
-
- if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0)
- return false;
-
- EBUG_ON(bkey_cmp(where, k.k->p) > 0);
-
- len = k.k->p.offset - where.offset;
-
- BUG_ON(len > k.k->size);
-
- /*
- * Don't readjust offset if the key size is now 0, because that could
- * cause offset to point to the next bucket:
- */
- if (!len)
- k.k->type = KEY_TYPE_DELETED;
- else if (bkey_extent_is_data(k.k)) {
- struct bkey_s_extent e = bkey_s_to_extent(k);
- union bch_extent_entry *entry;
- bool seen_crc = false;
-
- extent_for_each_entry(e, entry) {
- switch (extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr:
- if (!seen_crc)
- entry->ptr.offset += e.k->size - len;
- break;
- case BCH_EXTENT_ENTRY_crc32:
- entry->crc32.offset += e.k->size - len;
- break;
- case BCH_EXTENT_ENTRY_crc64:
- entry->crc64.offset += e.k->size - len;
- break;
- case BCH_EXTENT_ENTRY_crc128:
- entry->crc128.offset += e.k->size - len;
- break;
- }
-
- if (extent_entry_is_crc(entry))
- seen_crc = true;
- }
- }
-
- k.k->size = len;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned durability = 0;
- return true;
-}
+ rcu_read_lock();
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ durability += bch2_extent_ptr_durability(c, &p);
+ rcu_read_unlock();
-bool bch2_cut_front(struct bpos where, struct bkey_i *k)
-{
- return __bch2_cut_front(where, bkey_i_to_s(k));
+ return durability;
}
-bool bch2_cut_back(struct bpos where, struct bkey *k)
+static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
{
- u64 len = 0;
-
- if (bkey_cmp(where, k->p) >= 0)
- return false;
-
- EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0);
-
- len = where.offset - bkey_start_offset(k);
-
- BUG_ON(len > k->size);
-
- k->p = where;
- k->size = len;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned durability = 0;
- if (!len)
- k->type = KEY_TYPE_DELETED;
+ rcu_read_lock();
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
+ durability += bch2_extent_ptr_durability(c, &p);
+ rcu_read_unlock();
- return true;
+ return durability;
}
-/**
- * bch_key_resize - adjust size of @k
- *
- * bkey_start_offset(k) will be preserved, modifies where the extent ends
- */
-void bch2_key_resize(struct bkey *k,
- unsigned new_size)
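+/* Remove @entry from @k's value, shifting the entries after it down: */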
+void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
{
- k->p.offset -= k->size;
- k->p.offset += new_size;
- k->size = new_size;
+ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
+ union bch_extent_entry *next = extent_entry_next(entry);
+
+ memmove_u64s(entry, next, (u64 *) end - (u64 *) next);
+ k->k.u64s -= extent_entry_u64s(entry);
}
-/*
- * In extent_sort_fix_overlapping(), insert_fixup_extent(),
- * extent_merge_inline() - we're modifying keys in place that are packed. To do
- * that we have to unpack the key, modify the unpacked key - then this
- * copies/repacks the unpacked to the original as necessary.
- */
-static bool __extent_save(struct btree *b, struct btree_node_iter *iter,
- struct bkey_packed *dst, struct bkey *src)
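+/*
+ * Append a decoded pointer to @k, reusing an existing matching crc entry if
+ * there is one, otherwise appending a new one:
+ */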
+void bch2_extent_ptr_decoded_append(struct bkey_i *k,
+ struct extent_ptr_decoded *p)
{
- struct bkey_format *f = &b->format;
- struct bkey_i *dst_unpacked;
- bool ret;
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+ struct bch_extent_crc_unpacked crc =
+ bch2_extent_crc_unpack(&k->k, NULL);
+ union bch_extent_entry *pos;
- if ((dst_unpacked = packed_to_bkey(dst))) {
- dst_unpacked->k = *src;
- ret = true;
- } else {
- ret = bch2_bkey_pack_key(dst, src, f);
+ if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
+ pos = ptrs.start;
+ goto found;
}
- if (ret && iter)
- bch2_verify_key_order(b, iter, dst);
+ bkey_for_each_crc(&k->k, ptrs, crc, pos)
+ if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
+ pos = extent_entry_next(pos);
+ goto found;
+ }
- return ret;
-}
+ bch2_extent_crc_append(k, p->crc);
+ pos = bkey_val_end(bkey_i_to_s(k));
+found:
+ p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
+ __extent_entry_insert(k, pos, to_entry(&p->ptr));
-static void extent_save(struct btree *b, struct btree_node_iter *iter,
- struct bkey_packed *dst, struct bkey *src)
-{
- BUG_ON(!__extent_save(b, iter, dst, src));
+ if (p->has_ec) {
+ p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
+ __extent_entry_insert(k, pos, to_entry(&p->ec));
+ }
}
-/*
- * If keys compare equal, compare by pointer order:
- *
- * Necessary for sort_fix_overlapping() - if there are multiple keys that
- * compare equal in different sets, we have to process them newest to oldest.
- */
-#define extent_sort_cmp(h, l, r) \
-({ \
- struct bkey _ul = bkey_unpack_key(b, \
- __btree_node_offset_to_key(b, (l).k)); \
- struct bkey _ur = bkey_unpack_key(b, \
- __btree_node_offset_to_key(b, (r).k)); \
- \
- bkey_cmp(bkey_start_pos(&_ul), \
- bkey_start_pos(&_ur)) ?: (r).k - (l).k; \
-})
-
-static inline void extent_sort_sift(struct btree_node_iter_large *iter,
- struct btree *b, size_t i)
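+/* Find the entry preceding @entry, or NULL if it is the first entry: */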
+static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
+ union bch_extent_entry *entry)
{
- heap_sift_down(iter, i, extent_sort_cmp);
-}
+ union bch_extent_entry *i = ptrs.start;
-static inline void extent_sort_next(struct btree_node_iter_large *iter,
- struct btree *b,
- struct btree_node_iter_set *i)
-{
- sort_key_next(iter, b, i);
- heap_sift_down(iter, i - iter->data, extent_sort_cmp);
+ if (i == entry)
+ return NULL;
+
+ while (extent_entry_next(i) != entry)
+ i = extent_entry_next(i);
+ return i;
}
-static void extent_sort_append(struct bch_fs *c,
- struct btree *b,
- struct btree_nr_keys *nr,
- struct bkey_packed *start,
- struct bkey_packed **prev,
- struct bkey_packed *k)
+/*
+ * Drop @ptr from @k, along with any crc or stripe entries that no longer have
+ * a pointer referencing them:
+ */
+void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr)
{
- struct bkey_format *f = &b->format;
- BKEY_PADDED(k) tmp;
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+ union bch_extent_entry *entry = to_entry(ptr), *next;
+ bool drop_crc = true;
- if (bkey_whiteout(k))
+ if (k.k->type == KEY_TYPE_stripe) {
+ ptr->dev = BCH_SB_MEMBER_INVALID;
return;
-
- bch2_bkey_unpack(b, &tmp.k, k);
-
- if (*prev &&
- bch2_extent_merge(c, b, (void *) *prev, &tmp.k))
- return;
-
- if (*prev) {
- bch2_bkey_pack(*prev, (void *) *prev, f);
-
- btree_keys_account_key_add(nr, 0, *prev);
- *prev = bkey_next(*prev);
- } else {
- *prev = start;
}
- bkey_copy(*prev, &tmp.k);
-}
-
-struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
- struct bset *dst,
- struct btree *b,
- struct btree_node_iter_large *iter)
-{
- struct bkey_format *f = &b->format;
- struct btree_node_iter_set *_l = iter->data, *_r;
- struct bkey_packed *prev = NULL, *out, *lk, *rk;
- struct bkey l_unpacked, r_unpacked;
- struct bkey_s l, r;
- struct btree_nr_keys nr;
-
- memset(&nr, 0, sizeof(nr));
-
- heap_resort(iter, extent_sort_cmp);
-
- while (!bch2_btree_node_iter_large_end(iter)) {
- lk = __btree_node_offset_to_key(b, _l->k);
-
- if (iter->used == 1) {
- extent_sort_append(c, b, &nr, dst->start, &prev, lk);
- extent_sort_next(iter, b, _l);
- continue;
- }
-
- _r = iter->data + 1;
- if (iter->used > 2 &&
- extent_sort_cmp(iter, _r[0], _r[1]) >= 0)
- _r++;
-
- rk = __btree_node_offset_to_key(b, _r->k);
-
- l = __bkey_disassemble(b, lk, &l_unpacked);
- r = __bkey_disassemble(b, rk, &r_unpacked);
-
- /* If current key and next key don't overlap, just append */
- if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
- extent_sort_append(c, b, &nr, dst->start, &prev, lk);
- extent_sort_next(iter, b, _l);
- continue;
- }
+ EBUG_ON(ptr < &ptrs.start->ptr ||
+ ptr >= &ptrs.end->ptr);
+ EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
- /* Skip 0 size keys */
- if (!r.k->size) {
- extent_sort_next(iter, b, _r);
- continue;
+ for (next = extent_entry_next(entry);
+ next != ptrs.end;
+ next = extent_entry_next(next)) {
+ if (extent_entry_is_crc(next)) {
+ break;
+ } else if (extent_entry_is_ptr(next)) {
+ drop_crc = false;
+ break;
}
+ }
- /*
- * overlap: keep the newer key and trim the older key so they
- * don't overlap. comparing pointers tells us which one is
- * newer, since the bsets are appended one after the other.
- */
-
- /* can't happen because of comparison func */
- BUG_ON(_l->k < _r->k &&
- !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k)));
-
- if (_l->k > _r->k) {
- /* l wins, trim r */
- if (bkey_cmp(l.k->p, r.k->p) >= 0) {
- sort_key_next(iter, b, _r);
- } else {
- __bch2_cut_front(l.k->p, r);
- extent_save(b, NULL, rk, r.k);
- }
-
- extent_sort_sift(iter, b, _r - iter->data);
- } else if (bkey_cmp(l.k->p, r.k->p) > 0) {
- BKEY_PADDED(k) tmp;
-
- /*
- * r wins, but it overlaps in the middle of l - split l:
- */
- bkey_reassemble(&tmp.k, l.s_c);
- bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k);
-
- __bch2_cut_front(r.k->p, l);
- extent_save(b, NULL, lk, l.k);
-
- extent_sort_sift(iter, b, 0);
+ extent_entry_drop(k, entry);
- extent_sort_append(c, b, &nr, dst->start, &prev,
- bkey_to_packed(&tmp.k));
- } else {
- bch2_cut_back(bkey_start_pos(r.k), l.k);
- extent_save(b, NULL, lk, l.k);
- }
- }
+ while ((entry = extent_entry_prev(ptrs, entry))) {
+ if (extent_entry_is_ptr(entry))
+ break;
- if (prev) {
- bch2_bkey_pack(prev, (void *) prev, f);
- btree_keys_account_key_add(&nr, 0, prev);
- out = bkey_next(prev);
- } else {
- out = dst->start;
+ if ((extent_entry_is_crc(entry) && drop_crc) ||
+ extent_entry_is_stripe_ptr(entry))
+ extent_entry_drop(k, entry);
}
-
- dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
- return nr;
}
-struct extent_insert_state {
- struct btree_insert *trans;
- struct btree_insert_entry *insert;
- struct bpos committed;
- struct bch_fs_usage stats;
-
- /* for deleting: */
- struct bkey_i whiteout;
- bool do_journal;
- bool deleting;
-};
-
-static void bch2_add_sectors(struct extent_insert_state *s,
- struct bkey_s_c k, u64 offset, s64 sectors)
+void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr)
{
- struct bch_fs *c = s->trans->c;
- struct btree *b = s->insert->iter->l[0].b;
+ if (k.k->type != KEY_TYPE_stripe) {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
- EBUG_ON(bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0);
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (p.ptr.dev == ptr->dev && p.has_ec) {
+ ptr->dev = BCH_SB_MEMBER_INVALID;
+ return;
+ }
+ }
- if (!sectors)
- return;
+ bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
- bch2_mark_key(c, k, sectors, false, gc_pos_btree_node(b),
- &s->stats, s->trans->journal_res.seq, 0);
-}
+ bch2_bkey_drop_ptr_noerror(k, ptr);
-static void bch2_subtract_sectors(struct extent_insert_state *s,
- struct bkey_s_c k, u64 offset, s64 sectors)
-{
- bch2_add_sectors(s, k, offset, -sectors);
+ /*
+	 * If we deleted all the dirty pointers and there are still cached
+	 * pointers, we could set the cached pointers to dirty if they're not
+ * stale - but to do that correctly we'd need to grab an open_bucket
+ * reference so that we don't race with bucket reuse:
+ */
+ if (have_dirty &&
+ !bch2_bkey_dirty_devs(k.s_c).nr) {
+ k.k->type = KEY_TYPE_error;
+ set_bkey_val_u64s(k.k, 0);
+ } else if (!bch2_bkey_nr_ptrs(k.s_c)) {
+ k.k->type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(k.k, 0);
+ }
}
-/* These wrappers subtract exactly the sectors that we're removing from @k */
-static void bch2_cut_subtract_back(struct extent_insert_state *s,
- struct bpos where, struct bkey_s k)
+void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
{
- bch2_subtract_sectors(s, k.s_c, where.offset,
- k.k->p.offset - where.offset);
- bch2_cut_back(where, k.k);
+ bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
}
-static void bch2_cut_subtract_front(struct extent_insert_state *s,
- struct bpos where, struct bkey_s k)
+void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
{
- bch2_subtract_sectors(s, k.s_c, bkey_start_offset(k.k),
- where.offset - bkey_start_offset(k.k));
- __bch2_cut_front(where, k);
+ bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev);
}
-static void bch2_drop_subtract(struct extent_insert_state *s, struct bkey_s k)
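+/* Returns @k's pointer to device @dev, or NULL if there is none: */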
+const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
{
- if (k.k->size)
- bch2_subtract_sectors(s, k.s_c,
- bkey_start_offset(k.k), k.k->size);
- k.k->size = 0;
- k.k->type = KEY_TYPE_DELETED;
-}
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-static bool bch2_extent_merge_inline(struct bch_fs *,
- struct btree_iter *,
- struct bkey_packed *,
- struct bkey_packed *,
- bool);
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->dev == dev)
+ return ptr;
-#define MAX_LOCK_HOLD_TIME (5 * NSEC_PER_MSEC)
+ return NULL;
+}
-static enum btree_insert_ret
-extent_insert_should_stop(struct extent_insert_state *s)
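+/* Returns true if @k has a pointer on a device in @target (ignoring stale cached pointers): */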
+bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
{
- struct btree *b = s->insert->iter->l[0].b;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bch_dev *ca;
+ bool ret = false;
- /*
- * Check if we have sufficient space in both the btree node and the
- * journal reservation:
- *
- * Each insert checks for room in the journal entry, but we check for
- * room in the btree node up-front. In the worst case, bkey_cmpxchg()
- * will insert two keys, and one iteration of this room will insert one
- * key, so we need room for three keys.
- */
- if (!bch2_btree_node_insert_fits(s->trans->c, b, s->insert->k->k.u64s))
- return BTREE_INSERT_BTREE_NODE_FULL;
- else if (!journal_res_insert_fits(s->trans, s->insert))
- return BTREE_INSERT_JOURNAL_RES_FULL; /* XXX worth tracing */
- else
- return BTREE_INSERT_OK;
+ rcu_read_lock();
+ bkey_for_each_ptr(ptrs, ptr)
+ if (bch2_dev_in_target(c, ptr->dev, target) &&
+ (ca = bch2_dev_rcu(c, ptr->dev)) &&
+ (!ptr->cached ||
+ !dev_ptr_stale_rcu(ca, ptr))) {
+ ret = true;
+ break;
+ }
+ rcu_read_unlock();
+
+ return ret;
}
-static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
- struct bkey_i *insert)
+bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_extent_ptr m, u64 offset)
{
- struct btree_iter_level *l = &iter->l[0];
- struct bset_tree *t = bset_tree_last(l->b);
- struct bkey_packed *where =
- bch2_btree_node_iter_bset_pos(&l->iter, l->b, t);
- struct bkey_packed *prev = bch2_bkey_prev_filter(l->b, t, where,
- KEY_TYPE_DISCARD);
- struct bkey_packed *next_live_key = where;
- unsigned clobber_u64s;
-
- EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
-
- if (prev)
- where = bkey_next(prev);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
- while (next_live_key != btree_bkey_last(l->b, t) &&
- bkey_deleted(next_live_key))
- next_live_key = bkey_next(next_live_key);
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (p.ptr.dev == m.dev &&
+ p.ptr.gen == m.gen &&
+ (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
+ (s64) m.offset - offset)
+ return true;
- /*
- * Everything between where and next_live_key is now deleted keys, and
- * is overwritten:
- */
- clobber_u64s = (u64 *) next_live_key - (u64 *) where;
-
- if (prev &&
- bch2_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true))
- goto drop_deleted_keys;
-
- if (next_live_key != btree_bkey_last(l->b, t) &&
- bch2_extent_merge_inline(c, iter, bkey_to_packed(insert),
- next_live_key, false))
- goto drop_deleted_keys;
-
- bch2_bset_insert(l->b, &l->iter, where, insert, clobber_u64s);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, where,
- clobber_u64s, where->u64s);
- return;
-drop_deleted_keys:
- bch2_bset_delete(l->b, where, clobber_u64s);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter, t,
- where, clobber_u64s, 0);
+ return false;
}
-static void extent_insert_committed(struct extent_insert_state *s)
+/*
+ * Returns true if two extents refer to the same data:
+ */
+bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
{
- struct bch_fs *c = s->trans->c;
- struct btree_iter *iter = s->insert->iter;
- struct bkey_i *insert = !s->deleting
- ? s->insert->k
- : &s->whiteout;
- BKEY_PADDED(k) split;
-
- EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
- EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0);
- EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0);
-
- if (!bkey_cmp(s->committed, bkey_start_pos(&insert->k)))
- return;
+ if (k1.k->type != k2.k->type)
+ return false;
- if (s->deleting && !s->do_journal) {
- bch2_cut_front(s->committed, insert);
- goto done;
- }
+ if (bkey_extent_is_direct_data(k1.k)) {
+ struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1);
+ struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
+ const union bch_extent_entry *entry1, *entry2;
+ struct extent_ptr_decoded p1, p2;
- EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
+ if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2))
+ return false;
- bkey_copy(&split.k, insert);
+ bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
+ bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+ if (p1.ptr.dev == p2.ptr.dev &&
+ p1.ptr.gen == p2.ptr.gen &&
+
+ /*
+ * This checks that the two pointers point
+ * to the same region on disk - adjusting
+ * for the difference in where the extents
+ * start, since one may have been trimmed:
+ */
+ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&
+
+ /*
+ * This additionally checks that the
+ * extents overlap on disk, since the
+ * previous check may trigger spuriously
+ * when one extent is immediately partially
+ * overwritten with another extent (so that
+ * on disk they are adjacent) and
+ * compression is in use:
+ */
+ ((p1.ptr.offset >= p2.ptr.offset &&
+ p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) ||
+ (p2.ptr.offset >= p1.ptr.offset &&
+ p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size)))
+ return true;
- if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
- bkey_cmp(s->committed, insert->k.p) &&
- bch2_extent_is_compressed(bkey_i_to_s_c(insert))) {
- /* XXX: possibly need to increase our reservation? */
- bch2_cut_subtract_back(s, s->committed,
- bkey_i_to_s(&split.k));
- bch2_cut_front(s->committed, insert);
- bch2_add_sectors(s, bkey_i_to_s_c(insert),
- bkey_start_offset(&insert->k),
- insert->k.size);
+ return false;
} else {
- bch2_cut_back(s->committed, &split.k.k);
- bch2_cut_front(s->committed, insert);
+ /* KEY_TYPE_deleted, etc. */
+ return true;
}
-
- if (debug_check_bkeys(c))
- bch2_bkey_debugcheck(c, iter->l[0].b, bkey_i_to_s_c(&split.k));
-
- bch2_btree_journal_key(s->trans, iter, &split.k);
-
- if (!s->deleting)
- extent_bset_insert(c, iter, &split.k);
-done:
- bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
-
- insert->k.needs_whiteout = false;
- s->do_journal = false;
- s->trans->did_work = true;
}
-static enum btree_insert_ret
-__extent_insert_advance_pos(struct extent_insert_state *s,
- struct bpos next_pos,
- struct bkey_s_c k)
+struct bch_extent_ptr *
+bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2)
{
- struct extent_insert_hook *hook = s->trans->hook;
- enum btree_insert_ret ret;
+ struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2);
+ union bch_extent_entry *entry2;
+ struct extent_ptr_decoded p2;
- if (hook)
- ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k);
- else
- ret = BTREE_INSERT_OK;
-
- if (ret == BTREE_INSERT_OK)
- s->committed = next_pos;
+ bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+ if (p1.ptr.dev == p2.ptr.dev &&
+ p1.ptr.gen == p2.ptr.gen &&
+ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+ return &entry2->ptr;
- return ret;
+ return NULL;
}
-/*
- * Update iter->pos, marking how much of @insert we've processed, and call hook
- * fn:
- */
-static enum btree_insert_ret
-extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k)
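+/* Do we want to keep @ptr as a cached copy, per @opts->promote_target? */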
+static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
+ struct bch_extent_ptr *ptr)
{
- struct btree *b = s->insert->iter->l[0].b;
- struct bpos next_pos = bpos_min(s->insert->k->k.p,
- k.k ? k.k->p : b->key.k.p);
- enum btree_insert_ret ret;
-
- if (race_fault())
- return BTREE_INSERT_NEED_TRAVERSE;
-
- /* hole? */
- if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) {
- ret = __extent_insert_advance_pos(s, bkey_start_pos(k.k),
- bkey_s_c_null);
- if (ret != BTREE_INSERT_OK)
- return ret;
- }
+ if (!opts->promote_target ||
+ !bch2_dev_in_target(c, ptr->dev, opts->promote_target))
+ return false;
- /* avoid redundant calls to hook fn: */
- if (!bkey_cmp(s->committed, next_pos))
- return BTREE_INSERT_OK;
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
- return __extent_insert_advance_pos(s, next_pos, k);
+ return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr);
}
-static enum btree_insert_ret
-extent_insert_check_split_compressed(struct extent_insert_state *s,
- struct bkey_s_c k,
- enum bch_extent_overlap overlap)
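+/*
+ * Mark @ptr as cached - or drop it, if we don't want a cached copy or if it is
+ * part of a stripe (stripes can't contain cached data):
+ */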
+void bch2_extent_ptr_set_cached(struct bch_fs *c,
+ struct bch_io_opts *opts,
+ struct bkey_s k,
+ struct bch_extent_ptr *ptr)
{
- struct bch_fs *c = s->trans->c;
- unsigned sectors;
-
- if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
- (sectors = bch2_extent_is_compressed(k))) {
- int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
-
- if (s->trans->flags & BTREE_INSERT_NOFAIL)
- flags |= BCH_DISK_RESERVATION_NOFAIL;
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+ union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
- switch (bch2_disk_reservation_add(c,
- s->trans->disk_res,
- sectors * bch2_extent_nr_dirty_ptrs(k),
- flags)) {
- case 0:
- break;
- case -ENOSPC:
- return BTREE_INSERT_ENOSPC;
- case -EINTR:
- return BTREE_INSERT_NEED_GC_LOCK;
- default:
- BUG();
- }
+ rcu_read_lock();
+ if (!want_cached_ptr(c, opts, ptr)) {
+ bch2_bkey_drop_ptr_noerror(k, ptr);
+ goto out;
}
- return BTREE_INSERT_OK;
-}
-
-static enum btree_insert_ret
-extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
- struct bset_tree *t, struct bkey_packed *_k, struct bkey_s k,
- enum bch_extent_overlap overlap)
-{
- struct bch_fs *c = s->trans->c;
- struct btree_iter *iter = s->insert->iter;
- struct btree_iter_level *l = &iter->l[0];
- struct btree *b = l->b;
- struct btree_node_iter *node_iter = &l->iter;
- enum btree_insert_ret ret;
-
- switch (overlap) {
- case BCH_EXTENT_OVERLAP_FRONT:
- /* insert overlaps with start of k: */
- bch2_cut_subtract_front(s, insert->k.p, k);
- BUG_ON(bkey_deleted(k.k));
- extent_save(b, node_iter, _k, k.k);
- break;
-
- case BCH_EXTENT_OVERLAP_BACK:
- /* insert overlaps with end of k: */
- bch2_cut_subtract_back(s, bkey_start_pos(&insert->k), k);
- BUG_ON(bkey_deleted(k.k));
- extent_save(b, node_iter, _k, k.k);
-
- /*
- * As the auxiliary tree is indexed by the end of the
- * key and we've just changed the end, update the
- * auxiliary tree.
- */
- bch2_bset_fix_invalidated_key(b, t, _k);
- bch2_btree_node_iter_fix(iter, b, node_iter, t,
- _k, _k->u64s, _k->u64s);
- break;
-
- case BCH_EXTENT_OVERLAP_ALL: {
- struct bpos orig_pos = k.k->p;
-
- /* The insert key completely covers k, invalidate k */
- if (!bkey_whiteout(k.k))
- btree_keys_account_key_drop(&b->nr,
- t - b->set, _k);
-
- bch2_drop_subtract(s, k);
- k.k->p = bkey_start_pos(&insert->k);
- if (!__extent_save(b, node_iter, _k, k.k)) {
- /*
- * Couldn't repack: we aren't necessarily able
- * to repack if the new key is outside the range
- * of the old extent, so we have to split
- * @insert:
- */
- k.k->p = orig_pos;
- extent_save(b, node_iter, _k, k.k);
-
- ret = extent_insert_advance_pos(s, k.s_c);
- if (ret != BTREE_INSERT_OK)
- return ret;
-
- extent_insert_committed(s);
- /*
- * We split and inserted upto at k.k->p - that
- * has to coincide with iter->pos, so that we
- * don't have anything more we have to insert
- * until we recheck our journal reservation:
- */
- EBUG_ON(bkey_cmp(s->committed, k.k->p));
- } else {
- bch2_bset_fix_invalidated_key(b, t, _k);
- bch2_btree_node_iter_fix(iter, b, node_iter, t,
- _k, _k->u64s, _k->u64s);
+ /*
+	 * Stripes can't contain cached data, for now.
+ *
+ * Possibly something we can fix in the future?
+ */
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (&entry->ptr == ptr) {
+ if (p.has_ec)
+ bch2_bkey_drop_ptr_noerror(k, ptr);
+ else
+ ptr->cached = true;
+ goto out;
}
- break;
- }
- case BCH_EXTENT_OVERLAP_MIDDLE: {
- BKEY_PADDED(k) split;
- /*
- * The insert key falls 'in the middle' of k
- * The insert key splits k in 3:
- * - start only in k, preserve
- * - middle common section, invalidate in k
- * - end only in k, preserve
- *
- * We update the old key to preserve the start,
- * insert will be the new common section,
- * we manually insert the end that we are preserving.
- *
- * modify k _before_ doing the insert (which will move
- * what k points to)
- */
- bkey_reassemble(&split.k, k.s_c);
- split.k.k.needs_whiteout |= bset_written(b, bset(b, t));
-
- bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k);
- BUG_ON(bkey_deleted(&split.k.k));
-
- bch2_cut_subtract_front(s, insert->k.p, k);
- BUG_ON(bkey_deleted(k.k));
- extent_save(b, node_iter, _k, k.k);
-
- bch2_add_sectors(s, bkey_i_to_s_c(&split.k),
- bkey_start_offset(&split.k.k),
- split.k.k.size);
- extent_bset_insert(c, iter, &split.k);
- break;
- }
- }
-
- return BTREE_INSERT_OK;
+ BUG();
+out:
+ rcu_read_unlock();
}
-static enum btree_insert_ret
-__bch2_delete_fixup_extent(struct extent_insert_state *s)
+/*
+ * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
+ *
+ * Returns true if @k should be dropped entirely
+ *
+ * For existing keys, only called when btree nodes are being rewritten, not when
+ * they're merely being compacted/resorted in memory.
+ */
+bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
{
- struct bch_fs *c = s->trans->c;
- struct btree_iter *iter = s->insert->iter;
- struct btree_iter_level *l = &iter->l[0];
- struct btree *b = l->b;
- struct btree_node_iter *node_iter = &l->iter;
- struct bkey_packed *_k;
- struct bkey unpacked;
- struct bkey_i *insert = s->insert->k;
- enum btree_insert_ret ret = BTREE_INSERT_OK;
-
- EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
-
- s->whiteout = *insert;
- s->whiteout.k.type = KEY_TYPE_DISCARD;
-
- while (bkey_cmp(s->committed, insert->k.p) < 0 &&
- (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
- (_k = bch2_btree_node_iter_peek_all(node_iter, b))) {
- struct bset_tree *t = bch2_bkey_to_bset(b, _k);
- struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
- enum bch_extent_overlap overlap;
-
- EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
- EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
-
- if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
- break;
-
- if (bkey_whiteout(k.k)) {
- s->committed = bpos_min(insert->k.p, k.k->p);
- goto next;
- }
-
- overlap = bch2_extent_overlap(&insert->k, k.k);
-
- ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
- if (ret)
- break;
-
- ret = extent_insert_advance_pos(s, k.s_c);
- if (ret)
- break;
-
- s->do_journal = true;
+ struct bch_dev *ca;
- if (overlap == BCH_EXTENT_OVERLAP_ALL) {
- btree_keys_account_key_drop(&b->nr,
- t - b->set, _k);
- bch2_subtract_sectors(s, k.s_c,
- bkey_start_offset(k.k), k.k->size);
- _k->type = KEY_TYPE_DISCARD;
- reserve_whiteout(b, t, _k);
- } else if (k.k->needs_whiteout ||
- bset_written(b, bset(b, t))) {
- struct bkey_i discard = *insert;
+ rcu_read_lock();
+ bch2_bkey_drop_ptrs(k, ptr,
+ ptr->cached &&
+ (!(ca = bch2_dev_rcu(c, ptr->dev)) ||
+ dev_ptr_stale_rcu(ca, ptr) > 0));
+ rcu_read_unlock();
- discard.k.type = KEY_TYPE_DISCARD;
+ return bkey_deleted(k.k);
+}
- switch (overlap) {
- case BCH_EXTENT_OVERLAP_FRONT:
- bch2_cut_front(bkey_start_pos(k.k), &discard);
- break;
- case BCH_EXTENT_OVERLAP_BACK:
- bch2_cut_back(k.k->p, &discard.k);
- break;
- default:
- break;
+/*
+ * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc.
+ *
+ * Like bch2_extent_normalize(), but also only keeps a single cached pointer on
+ * the promote target.
+ */
+bool bch2_extent_normalize_by_opts(struct bch_fs *c,
+ struct bch_io_opts *opts,
+ struct bkey_s k)
+{
+ struct bkey_ptrs ptrs;
+ bool have_cached_ptr;
+
+ rcu_read_lock();
+restart_drop_ptrs:
+ ptrs = bch2_bkey_ptrs(k);
+ have_cached_ptr = false;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->cached) {
+ if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) {
+ bch2_bkey_drop_ptr(k, ptr);
+ goto restart_drop_ptrs;
}
-
- discard.k.needs_whiteout = true;
-
- ret = extent_squash(s, insert, t, _k, k, overlap);
- BUG_ON(ret != BTREE_INSERT_OK);
-
- extent_bset_insert(c, iter, &discard);
- } else {
- ret = extent_squash(s, insert, t, _k, k, overlap);
- BUG_ON(ret != BTREE_INSERT_OK);
+ have_cached_ptr = true;
}
-next:
- bch2_cut_front(s->committed, insert);
- bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
- }
+ rcu_read_unlock();
- return ret;
+ return bkey_deleted(k.k);
}
-static enum btree_insert_ret
-__bch2_insert_fixup_extent(struct extent_insert_state *s)
+void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
{
- struct btree_iter *iter = s->insert->iter;
- struct btree_iter_level *l = &iter->l[0];
- struct btree *b = l->b;
- struct btree_node_iter *node_iter = &l->iter;
- struct bkey_packed *_k;
- struct bkey unpacked;
- struct bkey_i *insert = s->insert->k;
- enum btree_insert_ret ret = BTREE_INSERT_OK;
-
- while (bkey_cmp(s->committed, insert->k.p) < 0 &&
- (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
- (_k = bch2_btree_node_iter_peek_all(node_iter, b))) {
- struct bset_tree *t = bch2_bkey_to_bset(b, _k);
- struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
- enum bch_extent_overlap overlap;
-
- EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
- EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
-
- if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
- break;
-
- overlap = bch2_extent_overlap(&insert->k, k.k);
-
- ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
- if (ret)
- break;
-
- if (!k.k->size)
- goto squash;
-
- /*
- * Only call advance pos & call hook for nonzero size extents:
- */
- ret = extent_insert_advance_pos(s, k.s_c);
- if (ret)
- break;
-
- if (k.k->size &&
- (k.k->needs_whiteout || bset_written(b, bset(b, t))))
- insert->k.needs_whiteout = true;
-
- if (overlap == BCH_EXTENT_OVERLAP_ALL &&
- bkey_whiteout(k.k) &&
- k.k->needs_whiteout) {
- unreserve_whiteout(b, t, _k);
- _k->needs_whiteout = false;
- }
-squash:
- ret = extent_squash(s, insert, t, _k, k, overlap);
- if (ret != BTREE_INSERT_OK)
- break;
+ out->atomic++;
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
+ if (!ca) {
+ prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
+ (u64) ptr->offset, ptr->gen,
+ ptr->cached ? " cached" : "");
+ } else {
+ u32 offset;
+ u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+
+ prt_printf(out, "ptr: %u:%llu:%u gen %u",
+ ptr->dev, b, offset, ptr->gen);
+ if (ca->mi.durability != 1)
+ prt_printf(out, " d=%u", ca->mi.durability);
+ if (ptr->cached)
+ prt_str(out, " cached");
+ if (ptr->unwritten)
+ prt_str(out, " unwritten");
+ int stale = dev_ptr_stale_rcu(ca, ptr);
+ if (stale > 0)
+ prt_printf(out, " stale");
+ else if (stale)
+ prt_printf(out, " invalid");
}
-
- return ret;
+ rcu_read_unlock();
+ --out->atomic;
}
-/**
- * bch_extent_insert_fixup - insert a new extent and deal with overlaps
- *
- * this may result in not actually doing the insert, or inserting some subset
- * of the insert key. For cmpxchg operations this is where that logic lives.
- *
- * All subsets of @insert that need to be inserted are inserted using
- * bch2_btree_insert_and_journal(). If @b or @res fills up, this function
- * returns false, setting @iter->pos for the prefix of @insert that actually got
- * inserted.
- *
- * BSET INVARIANTS: this function is responsible for maintaining all the
- * invariants for bsets of extents in memory. things get really hairy with 0
- * size extents
- *
- * within one bset:
- *
- * bkey_start_pos(bkey_next(k)) >= k
- * or bkey_start_offset(bkey_next(k)) >= k->offset
- *
- * i.e. strict ordering, no overlapping extents.
- *
- * multiple bsets (i.e. full btree node):
- *
- * ∀ k, j
- * k.size != 0 ∧ j.size != 0 →
- * ¬ (k > bkey_start_pos(j) ∧ k < j)
- *
- * i.e. no two overlapping keys _of nonzero size_
- *
- * We can't realistically maintain this invariant for zero size keys because of
- * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j
- * there may be another 0 size key between them in another bset, and it will
- * thus overlap with the merged key.
- *
- * In addition, the end of iter->pos indicates how much has been processed.
- * If the end of iter->pos is not the same as the end of insert, then
- * key insertion needs to continue/be retried.
- */
-enum btree_insert_ret
-bch2_insert_fixup_extent(struct btree_insert *trans,
- struct btree_insert_entry *insert)
+void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc)
{
- struct bch_fs *c = trans->c;
- struct btree_iter *iter = insert->iter;
- struct btree_iter_level *l = &iter->l[0];
- struct btree *b = l->b;
- enum btree_insert_ret ret = BTREE_INSERT_OK;
-
- struct extent_insert_state s = {
- .trans = trans,
- .insert = insert,
- .committed = insert->iter->pos,
- .deleting = bkey_whiteout(&insert->k->k),
- };
-
- EBUG_ON(iter->level);
- EBUG_ON(!insert->k->k.size);
-
- /*
- * As we process overlapping extents, we advance @iter->pos both to
- * signal to our caller (btree_insert_key()) how much of @insert->k has
- * been inserted, and also to keep @iter->pos consistent with
- * @insert->k and the node iterator that we're advancing:
- */
- EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
-
- if (!s.deleting &&
- !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
- bch2_add_sectors(&s, bkey_i_to_s_c(insert->k),
- bkey_start_offset(&insert->k->k),
- insert->k->k.size);
-
- ret = !s.deleting
- ? __bch2_insert_fixup_extent(&s)
- : __bch2_delete_fixup_extent(&s);
-
- if (ret == BTREE_INSERT_OK &&
- bkey_cmp(s.committed, insert->k->k.p) < 0)
- ret = extent_insert_advance_pos(&s, bkey_s_c_null);
-
- extent_insert_committed(&s);
-
- if (s.deleting)
- bch2_cut_front(iter->pos, insert->k);
-
- /*
- * Subtract any remaining sectors from @insert, if we bailed out early
- * and didn't fully insert @insert:
- */
- if (!s.deleting &&
- !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
- insert->k->k.size)
- bch2_subtract_sectors(&s, bkey_i_to_s_c(insert->k),
- bkey_start_offset(&insert->k->k),
- insert->k->k.size);
-
- bch2_fs_usage_apply(c, &s.stats, trans->disk_res,
- gc_pos_btree_node(b));
-
- EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
- EBUG_ON(bkey_cmp(iter->pos, s.committed));
- EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) !=
- !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF));
-
- if (insert->k->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF))
- ret = BTREE_INSERT_NEED_TRAVERSE;
-
- WARN_ONCE((ret == BTREE_INSERT_OK) != (insert->k->k.size == 0),
- "ret %u insert->k.size %u", ret, insert->k->k.size);
-
- return ret;
+ prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
+ crc->compressed_size,
+ crc->uncompressed_size,
+ crc->offset, crc->nonce);
+ bch2_prt_csum_type(out, crc->csum_type);
+ prt_printf(out, " %0llx:%0llx ", crc->csum.hi, crc->csum.lo);
+ prt_str(out, " compress ");
+ bch2_prt_compression_type(out, crc->compression_type);
}
-const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
+static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c,
+ const struct bch_extent_rebalance *r)
{
- if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX)
- return "value too big";
+ prt_str(out, "rebalance:");
- if (!k.k->size)
- return "zero key size";
+ prt_printf(out, " replicas=%u", r->data_replicas);
+ if (r->data_replicas_from_inode)
+ prt_str(out, " (inode)");
- switch (k.k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED: {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const union bch_extent_entry *entry;
- struct bch_extent_crc_unpacked crc;
- const struct bch_extent_ptr *ptr;
- unsigned size_ondisk = e.k->size;
- const char *reason;
- unsigned nonce = UINT_MAX;
-
- extent_for_each_entry(e, entry) {
- if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
- return "invalid extent entry type";
-
- if (extent_entry_is_crc(entry)) {
- crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
-
- if (crc.offset + e.k->size >
- crc.uncompressed_size)
- return "checksum offset + key size > uncompressed size";
-
- size_ondisk = crc.compressed_size;
-
- if (!bch2_checksum_type_valid(c, crc.csum_type))
- return "invalid checksum type";
+ prt_str(out, " checksum=");
+ bch2_prt_csum_opt(out, r->data_checksum);
+ if (r->data_checksum_from_inode)
+ prt_str(out, " (inode)");
- if (crc.compression_type >= BCH_COMPRESSION_NR)
- return "invalid compression type";
+ if (r->background_compression || r->background_compression_from_inode) {
+ prt_str(out, " background_compression=");
+ bch2_compression_opt_to_text(out, r->background_compression);
- if (bch2_csum_type_is_encryption(crc.csum_type)) {
- if (nonce == UINT_MAX)
- nonce = crc.offset + crc.nonce;
- else if (nonce != crc.offset + crc.nonce)
- return "incorrect nonce";
- }
- } else {
- ptr = entry_to_ptr(entry);
-
- reason = extent_ptr_invalid(c, e, &entry->ptr,
- size_ondisk, false);
- if (reason)
- return reason;
- }
- }
-
- return NULL;
+ if (r->background_compression_from_inode)
+ prt_str(out, " (inode)");
}
- case BCH_RESERVATION: {
- struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+ if (r->background_target || r->background_target_from_inode) {
+ prt_str(out, " background_target=");
+ if (c)
+ bch2_target_to_text(out, c, r->background_target);
+ else
+ prt_printf(out, "%u", r->background_target);
- if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
- return "incorrect value size";
+ if (r->background_target_from_inode)
+ prt_str(out, " (inode)");
+ }
- if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
- return "invalid nr_replicas";
+ if (r->promote_target || r->promote_target_from_inode) {
+ prt_str(out, " promote_target=");
+ if (c)
+ bch2_target_to_text(out, c, r->promote_target);
+ else
+ prt_printf(out, "%u", r->promote_target);
- return NULL;
+ if (r->promote_target_from_inode)
+ prt_str(out, " (inode)");
}
- default:
- return "invalid value type";
+ if (r->erasure_code || r->erasure_code_from_inode) {
+ prt_printf(out, " ec=%u", r->erasure_code);
+ if (r->erasure_code_from_inode)
+ prt_str(out, " (inode)");
}
}
-static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
- struct bkey_s_c_extent e)
+void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
{
- const struct bch_extent_ptr *ptr;
- struct bch_dev *ca;
- struct bucket_mark mark;
- unsigned seq, stale;
- char buf[160];
- bool bad;
- unsigned replicas = 0;
-
- /*
- * XXX: we should be doing most/all of these checks at startup time,
- * where we check bch2_bkey_invalid() in btree_node_read_done()
- *
- * But note that we can't check for stale pointers or incorrect gc marks
- * until after journal replay is done (it might be an extent that's
- * going to get overwritten during replay)
- */
-
- extent_for_each_ptr(e, ptr) {
- ca = bch_dev_bkey_exists(c, ptr->dev);
- replicas++;
-
- /*
- * If journal replay hasn't finished, we might be seeing keys
- * that will be overwritten by the time journal replay is done:
- */
- if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
- continue;
-
- stale = 0;
-
- do {
- seq = read_seqcount_begin(&c->gc_pos_lock);
- mark = ptr_bucket_mark(ca, ptr);
-
- /* between mark and bucket gen */
- smp_rmb();
-
- stale = ptr_stale(ca, ptr);
-
- bch2_fs_bug_on(stale && !ptr->cached, c,
- "stale dirty pointer");
-
- bch2_fs_bug_on(stale > 96, c,
- "key too stale: %i",
- stale);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ bool first = true;
- if (stale)
- break;
+ if (c)
+ prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k));
- bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
- (mark.data_type != BCH_DATA_USER ||
- !(ptr->cached
- ? mark.cached_sectors
- : mark.dirty_sectors));
- } while (read_seqcount_retry(&c->gc_pos_lock, seq));
+ bkey_extent_entry_for_each(ptrs, entry) {
+ if (!first)
+ prt_printf(out, " ");
- if (bad)
- goto bad_ptr;
- }
+ switch (__extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_ptr:
+ bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry));
+ break;
- if (replicas > BCH_REPLICAS_MAX) {
- bch2_bkey_val_to_text(c, btree_node_type(b), buf,
- sizeof(buf), e.s_c);
- bch2_fs_bug(c,
- "extent key bad (too many replicas: %u): %s",
- replicas, buf);
- return;
- }
+ case BCH_EXTENT_ENTRY_crc32:
+ case BCH_EXTENT_ENTRY_crc64:
+ case BCH_EXTENT_ENTRY_crc128: {
+ struct bch_extent_crc_unpacked crc =
+ bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- if (!bkey_extent_is_cached(e.k) &&
- !bch2_bkey_replicas_marked(c, BCH_DATA_USER, e.s_c)) {
- bch2_bkey_val_to_text(c, btree_node_type(b),
- buf, sizeof(buf), e.s_c);
- bch2_fs_bug(c,
- "extent key bad (replicas not marked in superblock):\n%s",
- buf);
- return;
- }
+ bch2_extent_crc_unpacked_to_text(out, &crc);
+ break;
+ }
+ case BCH_EXTENT_ENTRY_stripe_ptr: {
+ const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;
- return;
+ prt_printf(out, "ec: idx %llu block %u",
+ (u64) ec->idx, ec->block);
+ break;
+ }
+ case BCH_EXTENT_ENTRY_rebalance:
+ bch2_extent_rebalance_to_text(out, c, &entry->rebalance);
+ break;
-bad_ptr:
- bch2_bkey_val_to_text(c, btree_node_type(b), buf,
- sizeof(buf), e.s_c);
- bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu "
- "gen %i type %u", buf,
- PTR_BUCKET_NR(ca, ptr), mark.gen, mark.data_type);
- return;
-}
+ default:
+ prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
+ return;
+ }
-void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
-{
- switch (k.k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- bch2_extent_debugcheck_extent(c, b, bkey_s_c_to_extent(k));
- break;
- case BCH_RESERVATION:
- break;
- default:
- BUG();
+ first = false;
}
}
-void bch2_extent_to_text(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c k)
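+/* Validate a single pointer: duplicate devices, bucket bounds, not spanning buckets: */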
+static int extent_ptr_validate(struct bch_fs *c,
+ struct bkey_s_c k,
+ struct bkey_validate_context from,
+ const struct bch_extent_ptr *ptr,
+ unsigned size_ondisk,
+ bool metadata)
{
- char *out = buf, *end = buf + size;
- const char *invalid;
-
-#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
-
- if (bkey_extent_is_data(k.k))
- out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k));
-
- invalid = bch2_extent_invalid(c, k);
- if (invalid)
- p(" invalid: %s", invalid);
-#undef p
-}
-
-static void bch2_extent_crc_init(union bch_extent_crc *crc,
- struct bch_extent_crc_unpacked new)
-{
-#define common_fields(_crc) \
- .csum_type = _crc.csum_type, \
- .compression_type = _crc.compression_type, \
- ._compressed_size = _crc.compressed_size - 1, \
- ._uncompressed_size = _crc.uncompressed_size - 1, \
- .offset = _crc.offset
-
- if (bch_crc_bytes[new.csum_type] <= 4 &&
- new.uncompressed_size <= CRC32_SIZE_MAX &&
- new.nonce <= CRC32_NONCE_MAX) {
- crc->crc32 = (struct bch_extent_crc32) {
- .type = 1 << BCH_EXTENT_ENTRY_crc32,
- common_fields(new),
- .csum = *((__le32 *) &new.csum.lo),
- };
- return;
- }
-
- if (bch_crc_bytes[new.csum_type] <= 10 &&
- new.uncompressed_size <= CRC64_SIZE_MAX &&
- new.nonce <= CRC64_NONCE_MAX) {
- crc->crc64 = (struct bch_extent_crc64) {
- .type = 1 << BCH_EXTENT_ENTRY_crc64,
- common_fields(new),
- .nonce = new.nonce,
- .csum_lo = new.csum.lo,
- .csum_hi = *((__le16 *) &new.csum.hi),
- };
- return;
- }
+ int ret = 0;
- if (bch_crc_bytes[new.csum_type] <= 16 &&
- new.uncompressed_size <= CRC128_SIZE_MAX &&
- new.nonce <= CRC128_NONCE_MAX) {
- crc->crc128 = (struct bch_extent_crc128) {
- .type = 1 << BCH_EXTENT_ENTRY_crc128,
- common_fields(new),
- .nonce = new.nonce,
- .csum = new.csum,
- };
- return;
+ /* bad pointers are repaired by check_fix_ptrs(): */
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
+ if (!ca) {
+ rcu_read_unlock();
+ return 0;
}
-#undef common_fields
- BUG();
+ u32 bucket_offset;
+ u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
+ unsigned first_bucket = ca->mi.first_bucket;
+ u64 nbuckets = ca->mi.nbuckets;
+ unsigned bucket_size = ca->mi.bucket_size;
+ rcu_read_unlock();
+
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr(ptrs, ptr2)
+ bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev,
+ c, ptr_to_duplicate_device,
+ "multiple pointers to same device (%u)", ptr->dev);
+
+ bkey_fsck_err_on(bucket >= nbuckets,
+ c, ptr_after_last_bucket,
+ "pointer past last bucket (%llu > %llu)", bucket, nbuckets);
+ bkey_fsck_err_on(bucket < first_bucket,
+ c, ptr_before_first_bucket,
+ "pointer before first bucket (%llu < %u)", bucket, first_bucket);
+ bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size,
+ c, ptr_spans_multiple_buckets,
+ "pointer spans multiple buckets (%u + %u > %u)",
+ bucket_offset, size_ondisk, bucket_size);
+fsck_err:
+ return ret;
}
-void bch2_extent_crc_append(struct bkey_i_extent *e,
- struct bch_extent_crc_unpacked new)
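+/* Validate all the pointer, crc and stripe entries of @k: */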
+int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
struct bch_extent_crc_unpacked crc;
- const union bch_extent_entry *i;
-
- BUG_ON(new.compressed_size > new.uncompressed_size);
- BUG_ON(new.live_size != e->k.size);
- BUG_ON(!new.compressed_size || !new.uncompressed_size);
-
- /*
- * Look up the last crc entry, so we can check if we need to add
- * another:
- */
- extent_for_each_crc(extent_i_to_s(e), crc, i)
- ;
-
- if (!bch2_crc_unpacked_cmp(crc, new))
- return;
+ unsigned size_ondisk = k.k->size;
+ unsigned nonce = UINT_MAX;
+ unsigned nr_ptrs = 0;
+ bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false;
+ int ret = 0;
- bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new);
- __extent_entry_push(e);
-}
+ if (bkey_is_btree_ptr(k.k))
+ size_ondisk = btree_sectors(c);
-/*
- * bch_extent_normalize - clean up an extent, dropping stale pointers etc.
- *
- * Returns true if @k should be dropped entirely
- *
- * For existing keys, only called when btree nodes are being rewritten, not when
- * they're merely being compacted/resorted in memory.
- */
-bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
-{
- struct bkey_s_extent e;
+ bkey_extent_entry_for_each(ptrs, entry) {
+ bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX,
+ c, extent_ptrs_invalid_entry,
+ "invalid extent entry type (got %u, max %u)",
+ __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
- switch (k.k->type) {
- case KEY_TYPE_ERROR:
- return false;
+ bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
+ !extent_entry_is_ptr(entry),
+ c, btree_ptr_has_non_ptr,
+ "has non ptr field");
- case KEY_TYPE_DELETED:
- return true;
- case KEY_TYPE_DISCARD:
- return bversion_zero(k.k->version);
- case KEY_TYPE_COOKIE:
- return false;
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_ptr:
+ ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false);
+ if (ret)
+ return ret;
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- e = bkey_s_to_extent(k);
+ bkey_fsck_err_on(entry->ptr.cached && have_ec,
+ c, ptr_cached_and_erasure_coded,
+ "cached, erasure coded ptr");
- bch2_extent_drop_stale(c, e);
+ if (!entry->ptr.unwritten)
+ have_written = true;
+ else
+ have_unwritten = true;
- if (!bkey_val_u64s(e.k)) {
- if (bkey_extent_is_cached(e.k)) {
- k.k->type = KEY_TYPE_DISCARD;
- if (bversion_zero(k.k->version))
- return true;
- } else {
- k.k->type = KEY_TYPE_ERROR;
+ have_ec = false;
+ crc_since_last_ptr = false;
+ nr_ptrs++;
+ break;
+ case BCH_EXTENT_ENTRY_crc32:
+ case BCH_EXTENT_ENTRY_crc64:
+ case BCH_EXTENT_ENTRY_crc128:
+ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
+
+ bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type),
+ c, ptr_crc_csum_type_unknown,
+ "invalid checksum type");
+ bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR,
+ c, ptr_crc_compression_type_unknown,
+ "invalid compression type");
+
+ bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size,
+ c, ptr_crc_uncompressed_size_too_small,
+ "checksum offset + key size > uncompressed size");
+ bkey_fsck_err_on(crc_is_encoded(crc) &&
+ (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
+ (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)),
+ c, ptr_crc_uncompressed_size_too_big,
+ "too large encoded extent");
+ bkey_fsck_err_on(!crc_is_compressed(crc) &&
+ crc.compressed_size != crc.uncompressed_size,
+ c, ptr_crc_uncompressed_size_mismatch,
+ "not compressed but compressed != uncompressed size");
+
+ if (bch2_csum_type_is_encryption(crc.csum_type)) {
+ if (nonce == UINT_MAX)
+ nonce = crc.offset + crc.nonce;
+ else if (nonce != crc.offset + crc.nonce)
+ bkey_fsck_err(c, ptr_crc_nonce_mismatch,
+ "incorrect nonce");
}
- }
- return false;
- case BCH_RESERVATION:
- return false;
- default:
- BUG();
- }
-}
-
-void bch2_extent_mark_replicas_cached(struct bch_fs *c,
- struct bkey_s_extent e,
- unsigned target,
- unsigned nr_desired_replicas)
-{
- struct bch_extent_ptr *ptr;
- int extra = bch2_extent_durability(c, e.c) - nr_desired_replicas;
+ bkey_fsck_err_on(crc_since_last_ptr,
+ c, ptr_crc_redundant,
+ "redundant crc entry");
+ crc_since_last_ptr = true;
- if (target && extra > 0)
- extent_for_each_ptr(e, ptr) {
- int n = bch2_extent_ptr_durability(c, ptr);
-
- if (n && n <= extra &&
- !bch2_dev_in_target(c, ptr->dev, target)) {
- ptr->cached = true;
- extra -= n;
+ size_ondisk = crc.compressed_size;
+ break;
+ case BCH_EXTENT_ENTRY_stripe_ptr:
+ bkey_fsck_err_on(have_ec,
+ c, ptr_stripe_redundant,
+ "redundant stripe entry");
+ have_ec = true;
+ break;
+ case BCH_EXTENT_ENTRY_rebalance: {
+ /*
+ * this shouldn't be a fsck error, for forward
+ * compatibility; the rebalance code should just refetch
+ * the compression opt if it's unknown
+ */
+#if 0
+ const struct bch_extent_rebalance *r = &entry->rebalance;
+
+ if (!bch2_compression_opt_valid(r->compression)) {
+ struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
+ prt_printf(err, "invalid compression opt %u:%u",
+ opt.type, opt.level);
+ return -BCH_ERR_invalid_bkey;
}
+#endif
+ break;
}
-
- if (extra > 0)
- extent_for_each_ptr(e, ptr) {
- int n = bch2_extent_ptr_durability(c, ptr);
-
- if (n && n <= extra) {
- ptr->cached = true;
- extra -= n;
- }
}
+ }
+
+ bkey_fsck_err_on(!nr_ptrs,
+ c, extent_ptrs_no_ptrs,
+ "no ptrs");
+ bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX,
+ c, extent_ptrs_too_many_ptrs,
+ "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX);
+ bkey_fsck_err_on(have_written && have_unwritten,
+ c, extent_ptrs_written_and_unwritten,
+ "extent with unwritten and written ptrs");
+ bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten,
+ c, extent_ptrs_unwritten,
+ "has unwritten ptrs");
+ bkey_fsck_err_on(crc_since_last_ptr,
+ c, extent_ptrs_redundant_crc,
+ "redundant crc entry");
+ bkey_fsck_err_on(have_ec,
+ c, extent_ptrs_redundant_stripe,
+ "redundant stripe entry");
+fsck_err:
+ return ret;
}
-/*
- * This picks a non-stale pointer, preferably from a device other than @avoid.
- * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to
- * other devices, it will still pick a pointer from avoid.
- */
-int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
- struct bch_devs_mask *avoid,
- struct extent_pick_ptr *pick)
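+/* Byte swap the value of @k, entry by entry: */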
+void bch2_ptr_swab(struct bkey_s k)
{
- int ret;
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+ union bch_extent_entry *entry;
+ u64 *d;
- switch (k.k->type) {
- case KEY_TYPE_ERROR:
- return -EIO;
+ for (d = (u64 *) ptrs.start;
+ d != (u64 *) ptrs.end;
+ d++)
+ *d = swab64(*d);
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- ret = extent_pick_read_device(c, bkey_s_c_to_extent(k),
- avoid, pick);
-
- if (!ret && !bkey_extent_is_cached(k.k))
- ret = -EIO;
-
- return ret;
-
- default:
- return 0;
+ for (entry = ptrs.start;
+ entry < ptrs.end;
+ entry = extent_entry_next(entry)) {
+ switch (__extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_ptr:
+ break;
+ case BCH_EXTENT_ENTRY_crc32:
+ entry->crc32.csum = swab32(entry->crc32.csum);
+ break;
+ case BCH_EXTENT_ENTRY_crc64:
+ entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
+ entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
+ break;
+ case BCH_EXTENT_ENTRY_crc128:
+ entry->crc128.csum.hi = (__force __le64)
+ swab64((__force u64) entry->crc128.csum.hi);
+ entry->crc128.csum.lo = (__force __le64)
+ swab64((__force u64) entry->crc128.csum.lo);
+ break;
+ case BCH_EXTENT_ENTRY_stripe_ptr:
+ break;
+ case BCH_EXTENT_ENTRY_rebalance:
+ break;
+ default:
+ /* Bad entry type: will be caught by validate() */
+ return;
+ }
}
}
-enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b,
- struct bkey_i *l, struct bkey_i *r)
-{
- struct bkey_s_extent el, er;
- union bch_extent_entry *en_l, *en_r;
-
- if (key_merging_disabled(c))
- return BCH_MERGE_NOMERGE;
+/* Generic extent code: */
- /*
- * Generic header checks
- * Assumes left and right are in order
- * Left and right must be exactly aligned
- */
-
- if (l->k.u64s != r->k.u64s ||
- l->k.type != r->k.type ||
- bversion_cmp(l->k.version, r->k.version) ||
- bkey_cmp(l->k.p, bkey_start_pos(&r->k)))
- return BCH_MERGE_NOMERGE;
-
- switch (l->k.type) {
- case KEY_TYPE_DISCARD:
- case KEY_TYPE_ERROR:
- /* These types are mergeable, and no val to check */
- break;
+int bch2_cut_front_s(struct bpos where, struct bkey_s k)
+{
+ unsigned new_val_u64s = bkey_val_u64s(k.k);
+ int val_u64s_delta;
+ u64 sub;
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- el = bkey_i_to_s_extent(l);
- er = bkey_i_to_s_extent(r);
+ if (bkey_le(where, bkey_start_pos(k.k)))
+ return 0;
- extent_for_each_entry(el, en_l) {
- struct bch_extent_ptr *lp, *rp;
- struct bch_dev *ca;
+ EBUG_ON(bkey_gt(where, k.k->p));
- en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data);
+ sub = where.offset - bkey_start_offset(k.k);
- if ((extent_entry_type(en_l) !=
- extent_entry_type(en_r)) ||
- extent_entry_is_crc(en_l))
- return BCH_MERGE_NOMERGE;
+ k.k->size -= sub;
- lp = &en_l->ptr;
- rp = &en_r->ptr;
+ if (!k.k->size) {
+ k.k->type = KEY_TYPE_deleted;
+ new_val_u64s = 0;
+ }
- if (lp->offset + el.k->size != rp->offset ||
- lp->dev != rp->dev ||
- lp->gen != rp->gen)
- return BCH_MERGE_NOMERGE;
+ switch (k.k->type) {
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v: {
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+ union bch_extent_entry *entry;
+ bool seen_crc = false;
- /* We don't allow extents to straddle buckets: */
- ca = bch_dev_bkey_exists(c, lp->dev);
+ bkey_extent_entry_for_each(ptrs, entry) {
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_ptr:
+ if (!seen_crc)
+ entry->ptr.offset += sub;
+ break;
+ case BCH_EXTENT_ENTRY_crc32:
+ entry->crc32.offset += sub;
+ break;
+ case BCH_EXTENT_ENTRY_crc64:
+ entry->crc64.offset += sub;
+ break;
+ case BCH_EXTENT_ENTRY_crc128:
+ entry->crc128.offset += sub;
+ break;
+ case BCH_EXTENT_ENTRY_stripe_ptr:
+ break;
+ case BCH_EXTENT_ENTRY_rebalance:
+ break;
+ }
- if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
- return BCH_MERGE_NOMERGE;
+ if (extent_entry_is_crc(entry))
+ seen_crc = true;
}
break;
- case BCH_RESERVATION: {
- struct bkey_i_reservation *li = bkey_i_to_reservation(l);
- struct bkey_i_reservation *ri = bkey_i_to_reservation(r);
+ }
+ case KEY_TYPE_reflink_p: {
+ struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
- if (li->v.generation != ri->v.generation ||
- li->v.nr_replicas != ri->v.nr_replicas)
- return BCH_MERGE_NOMERGE;
+ SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub);
break;
}
- default:
- return BCH_MERGE_NOMERGE;
- }
-
- l->k.needs_whiteout |= r->k.needs_whiteout;
+ case KEY_TYPE_inline_data:
+ case KEY_TYPE_indirect_inline_data: {
+ void *p = bkey_inline_data_p(k);
+ unsigned bytes = bkey_inline_data_bytes(k.k);
- /* Keys with no pointers aren't restricted to one bucket and could
- * overflow KEY_SIZE
- */
- if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) {
- bch2_key_resize(&l->k, KEY_SIZE_MAX);
- bch2_cut_front(l->k.p, r);
- return BCH_MERGE_PARTIAL;
- }
+ sub = min_t(u64, sub << 9, bytes);
- bch2_key_resize(&l->k, l->k.size + r->k.size);
+ memmove(p, p + sub, bytes - sub);
- return BCH_MERGE_MERGE;
-}
-
-static void extent_i_save(struct btree *b, struct bkey_packed *dst,
- struct bkey_i *src)
-{
- struct bkey_format *f = &b->format;
- struct bkey_i *dst_unpacked;
-
- BUG_ON(bkeyp_val_u64s(f, dst) != bkey_val_u64s(&src->k));
-
- /*
- * We don't want the bch2_verify_key_order() call in extent_save(),
- * because we may be out of order with deleted keys that are about to be
- * removed by extent_bset_insert()
- */
-
- if ((dst_unpacked = packed_to_bkey(dst)))
- bkey_copy(dst_unpacked, src);
- else
- BUG_ON(!bch2_bkey_pack(dst, src, f));
-}
-
-static bool extent_merge_one_overlapping(struct btree_iter *iter,
- struct bpos new_pos,
- struct bset_tree *t,
- struct bkey_packed *k, struct bkey uk,
- bool check, bool could_pack)
-{
- struct btree_iter_level *l = &iter->l[0];
-
- BUG_ON(!bkey_deleted(k));
-
- if (check) {
- return !bkey_packed(k) || could_pack;
- } else {
- uk.p = new_pos;
- extent_save(l->b, &l->iter, k, &uk);
- bch2_bset_fix_invalidated_key(l->b, t, k);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter, t,
- k, k->u64s, k->u64s);
- return true;
+ new_val_u64s -= sub >> 3;
+ break;
}
-}
-
-static bool extent_merge_do_overlapping(struct btree_iter *iter,
- struct bkey *m, bool back_merge)
-{
- struct btree_iter_level *l = &iter->l[0];
- struct btree *b = l->b;
- struct btree_node_iter *node_iter = &l->iter;
- struct bset_tree *t;
- struct bkey_packed *k;
- struct bkey uk;
- struct bpos new_pos = back_merge ? m->p : bkey_start_pos(m);
- bool could_pack = bkey_pack_pos((void *) &uk, new_pos, b);
- bool check = true;
-
- /*
- * @m is the new merged extent:
- *
- * The merge took place in the last bset; we know there can't be any 0
- * size extents overlapping with m there because if so they would have
- * been between the two extents we merged.
- *
- * But in the other bsets, we have to check for and fix such extents:
- */
-do_fixup:
- for_each_bset(b, t) {
- if (t == bset_tree_last(b))
- break;
-
- /*
- * if we don't find this bset in the iterator we already got to
- * the end of that bset, so start searching from the end.
- */
- k = bch2_btree_node_iter_bset_pos(node_iter, b, t);
-
- if (k == btree_bkey_last(b, t))
- k = bch2_bkey_prev_all(b, t, k);
- if (!k)
- continue;
-
- if (back_merge) {
- /*
- * Back merge: 0 size extents will be before the key
- * that was just inserted (and thus the iterator
- * position) - walk backwards to find them
- */
- for (;
- k &&
- (uk = bkey_unpack_key(b, k),
- bkey_cmp(uk.p, bkey_start_pos(m)) > 0);
- k = bch2_bkey_prev_all(b, t, k)) {
- if (bkey_cmp(uk.p, m->p) >= 0)
- continue;
-
- if (!extent_merge_one_overlapping(iter, new_pos,
- t, k, uk, check, could_pack))
- return false;
- }
- } else {
- /* Front merge - walk forwards */
- for (;
- k != btree_bkey_last(b, t) &&
- (uk = bkey_unpack_key(b, k),
- bkey_cmp(uk.p, m->p) < 0);
- k = bkey_next(k)) {
- if (bkey_cmp(uk.p,
- bkey_start_pos(m)) <= 0)
- continue;
-
- if (!extent_merge_one_overlapping(iter, new_pos,
- t, k, uk, check, could_pack))
- return false;
- }
- }
}
- if (check) {
- check = false;
- goto do_fixup;
- }
+ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
+ BUG_ON(val_u64s_delta < 0);
- return true;
+ set_bkey_val_u64s(k.k, new_val_u64s);
+ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
+ return -val_u64s_delta;
}
-/*
- * When merging an extent that we're inserting into a btree node, the new merged
- * extent could overlap with an existing 0 size extent - if we don't fix that,
- * it'll break the btree node iterator so this code finds those 0 size extents
- * and shifts them out of the way.
- *
- * Also unpacks and repacks.
- */
-static bool bch2_extent_merge_inline(struct bch_fs *c,
- struct btree_iter *iter,
- struct bkey_packed *l,
- struct bkey_packed *r,
- bool back_merge)
+int bch2_cut_back_s(struct bpos where, struct bkey_s k)
{
- struct btree *b = iter->l[0].b;
- struct btree_node_iter *node_iter = &iter->l[0].iter;
- const struct bkey_format *f = &b->format;
- struct bset_tree *t = bset_tree_last(b);
- struct bkey_packed *m;
- BKEY_PADDED(k) li;
- BKEY_PADDED(k) ri;
- struct bkey_i *mi;
- struct bkey tmp;
-
- /*
- * We need to save copies of both l and r, because we might get a
- * partial merge (which modifies both) and then fails to repack
- */
- bch2_bkey_unpack(b, &li.k, l);
- bch2_bkey_unpack(b, &ri.k, r);
-
- m = back_merge ? l : r;
- mi = back_merge ? &li.k : &ri.k;
-
- /* l & r should be in last bset: */
- EBUG_ON(bch2_bkey_to_bset(b, m) != t);
-
- switch (bch2_extent_merge(c, b, &li.k, &ri.k)) {
- case BCH_MERGE_NOMERGE:
- return false;
- case BCH_MERGE_PARTIAL:
- if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &mi->k, f))
- return false;
-
- if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge))
- return false;
-
- extent_i_save(b, m, mi);
- bch2_bset_fix_invalidated_key(b, t, m);
+ unsigned new_val_u64s = bkey_val_u64s(k.k);
+ int val_u64s_delta;
+ u64 len = 0;
- /*
- * Update iterator to reflect what we just inserted - otherwise,
- * the iter_fix() call is going to put us _before_ the key we
- * just partially merged with:
- */
- if (back_merge)
- bch2_btree_iter_set_pos_same_leaf(iter, li.k.k.p);
+ if (bkey_ge(where, k.k->p))
+ return 0;
- bch2_btree_node_iter_fix(iter, b, node_iter,
- t, m, m->u64s, m->u64s);
+ EBUG_ON(bkey_lt(where, bkey_start_pos(k.k)));
- if (!back_merge)
- bkey_copy(packed_to_bkey(l), &li.k);
- else
- bkey_copy(packed_to_bkey(r), &ri.k);
- return false;
- case BCH_MERGE_MERGE:
- if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &li.k.k, f))
- return false;
+ len = where.offset - bkey_start_offset(k.k);
- if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge))
- return false;
-
- extent_i_save(b, m, &li.k);
- bch2_bset_fix_invalidated_key(b, t, m);
+ k.k->p.offset = where.offset;
+ k.k->size = len;
- bch2_btree_node_iter_fix(iter, b, node_iter,
- t, m, m->u64s, m->u64s);
- return true;
- default:
- BUG();
+ if (!len) {
+ k.k->type = KEY_TYPE_deleted;
+ new_val_u64s = 0;
}
-}
-
-int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
-{
- struct btree_iter iter;
- struct bpos end = pos;
- struct bkey_s_c k;
- int ret = 0;
-
- end.offset += size;
-
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
- BTREE_ITER_SLOTS, k) {
- if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
- break;
- if (!bch2_extent_is_fully_allocated(k)) {
- ret = -ENOSPC;
- break;
- }
+ switch (k.k->type) {
+ case KEY_TYPE_inline_data:
+ case KEY_TYPE_indirect_inline_data:
+ new_val_u64s = (bkey_inline_data_offset(k.k) +
+ min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3;
+ break;
}
- bch2_btree_iter_unlock(&iter);
- return ret;
+ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
+ BUG_ON(val_u64s_delta < 0);
+
+ set_bkey_val_u64s(k.k, new_val_u64s);
+ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
+ return -val_u64s_delta;
}
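
A minimal usage sketch of the two trim helpers above (illustrative only: POS(), the inode number inum and the starting key layout are assumptions; in-tree callers normally go through the bch2_cut_front()/bch2_cut_back() wrappers declared in extents.h below):

	/* k is a bkey_i whose key covers sectors [0, 128): k.k.p.offset == 128, k.k.size == 128 */
	bch2_cut_front(POS(inum, 16), &k);	/* drop [0, 16): start becomes 16, size 112 */
	bch2_cut_back(POS(inum, 96), &k);	/* drop [96, 128): end becomes 96, size 80 */

The _s variants return the (negative) change in value size in u64s; for plain extents this is usually zero, since trimming only adjusts the key and the pointer/crc offsets, while inline-data keys also shed part of their value.
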
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 08ad9647..620b284a 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_EXTENTS_H
#define _BCACHEFS_EXTENTS_H
@@ -6,129 +7,45 @@
#include "extents_types.h"
struct bch_fs;
-struct journal_res;
-struct btree_node_iter;
-struct btree_node_iter_large;
-struct btree_insert;
-struct btree_insert_entry;
-struct extent_insert_hook;
-struct bch_devs_mask;
-union bch_extent_crc;
-
-const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *,
- struct bkey_s_c);
-void bch2_btree_ptr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
-void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);
-
-#define bch2_bkey_btree_ops (struct bkey_ops) { \
- .key_invalid = bch2_btree_ptr_invalid, \
- .key_debugcheck = bch2_btree_ptr_debugcheck, \
- .val_to_text = bch2_btree_ptr_to_text, \
- .swab = bch2_ptr_swab, \
-}
-
-const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
-void bch2_extent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
-bool bch2_ptr_normalize(struct bch_fs *, struct btree *, struct bkey_s);
-enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *,
- struct bkey_i *, struct bkey_i *);
-
-#define bch2_bkey_extent_ops (struct bkey_ops) { \
- .key_invalid = bch2_extent_invalid, \
- .key_debugcheck = bch2_extent_debugcheck, \
- .val_to_text = bch2_extent_to_text, \
- .swab = bch2_ptr_swab, \
- .key_normalize = bch2_ptr_normalize, \
- .key_merge = bch2_extent_merge, \
- .is_extents = true, \
-}
-
-struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *,
- struct btree *,
- struct btree_node_iter_large *);
-struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
- struct bset *,
- struct btree *,
- struct btree_node_iter_large *);
-
-int bch2_btree_pick_ptr(struct bch_fs *, const struct btree *,
- struct bch_devs_mask *avoid,
- struct extent_pick_ptr *);
-
-int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c,
- struct bch_devs_mask *,
- struct extent_pick_ptr *);
+struct btree_trans;
-enum btree_insert_ret
-bch2_insert_fixup_extent(struct btree_insert *,
- struct btree_insert_entry *);
+/* extent entries: */
-bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
-void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent,
- unsigned, unsigned);
-
-const struct bch_extent_ptr *
-bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
-bool bch2_extent_drop_device(struct bkey_s_extent, unsigned);
-const struct bch_extent_ptr *
-bch2_extent_has_group(struct bch_fs *, struct bkey_s_c_extent, unsigned);
-const struct bch_extent_ptr *
-bch2_extent_has_target(struct bch_fs *, struct bkey_s_c_extent, unsigned);
-
-unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
-unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
-unsigned bch2_extent_is_compressed(struct bkey_s_c);
-
-unsigned bch2_extent_ptr_durability(struct bch_fs *,
- const struct bch_extent_ptr *);
-unsigned bch2_extent_durability(struct bch_fs *, struct bkey_s_c_extent);
-
-bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
- struct bch_extent_ptr, u64);
-
-static inline bool bkey_extent_is_data(const struct bkey *k)
-{
- switch (k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- return true;
- default:
- return false;
- }
-}
-
-static inline bool bkey_extent_is_allocation(const struct bkey *k)
-{
- switch (k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- case BCH_RESERVATION:
- return true;
- default:
- return false;
- }
-}
+#define extent_entry_last(_e) \
+ ((typeof(&(_e).v->start[0])) bkey_val_end(_e))
-static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k)
-{
- return bkey_extent_is_allocation(k.k) &&
- !bch2_extent_is_compressed(k);
-}
+#define entry_to_ptr(_entry) \
+({ \
+ EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \
+ \
+ __builtin_choose_expr( \
+ type_is_exact(_entry, const union bch_extent_entry *), \
+ (const struct bch_extent_ptr *) (_entry), \
+ (struct bch_extent_ptr *) (_entry)); \
+})
-static inline bool bkey_extent_is_cached(const struct bkey *k)
-{
- return k->type == BCH_EXTENT_CACHED;
-}
+/* downcast, preserves const */
+#define to_entry(_entry) \
+({ \
+ BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \
+ !type_is(_entry, struct bch_extent_ptr *) && \
+ !type_is(_entry, struct bch_extent_stripe_ptr *)); \
+ \
+ __builtin_choose_expr( \
+ (type_is_exact(_entry, const union bch_extent_crc *) || \
+ type_is_exact(_entry, const struct bch_extent_ptr *) ||\
+ type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\
+ (const union bch_extent_entry *) (_entry), \
+ (union bch_extent_entry *) (_entry)); \
+})
-static inline void bkey_extent_set_cached(struct bkey *k, bool cached)
-{
- EBUG_ON(k->type != BCH_EXTENT &&
- k->type != BCH_EXTENT_CACHED);
+#define extent_entry_next(_entry) \
+ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
- k->type = cached ? BCH_EXTENT_CACHED : BCH_EXTENT;
-}
+#define extent_entry_next_safe(_entry, _end) \
+ (likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX) \
+ ? extent_entry_next(_entry) \
+ : _end)
static inline unsigned
__extent_entry_type(const union bch_extent_entry *e)
@@ -149,14 +66,11 @@ extent_entry_type(const union bch_extent_entry *e)
static inline size_t extent_entry_bytes(const union bch_extent_entry *entry)
{
switch (extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_crc32:
- return sizeof(struct bch_extent_crc32);
- case BCH_EXTENT_ENTRY_crc64:
- return sizeof(struct bch_extent_crc64);
- case BCH_EXTENT_ENTRY_crc128:
- return sizeof(struct bch_extent_crc128);
- case BCH_EXTENT_ENTRY_ptr:
- return sizeof(struct bch_extent_ptr);
+#define x(f, n) \
+ case BCH_EXTENT_ENTRY_##f: \
+ return sizeof(struct bch_extent_##f);
+ BCH_EXTENT_ENTRY_TYPES()
+#undef x
default:
BUG();
}
@@ -167,14 +81,50 @@ static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
return extent_entry_bytes(entry) / sizeof(u64);
}
+static inline void __extent_entry_insert(struct bkey_i *k,
+ union bch_extent_entry *dst,
+ union bch_extent_entry *new)
+{
+ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
+
+ memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new),
+ dst, (u64 *) end - (u64 *) dst);
+ k->k.u64s += extent_entry_u64s(new);
+ memcpy_u64s_small(dst, new, extent_entry_u64s(new));
+}
+
+static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
+{
+ union bch_extent_entry *next = extent_entry_next(entry);
+
+ /* stripes have ptrs, but their layout doesn't work with this code */
+ BUG_ON(k.k->type == KEY_TYPE_stripe);
+
+ memmove_u64s_down(entry, next,
+ (u64 *) bkey_val_end(k) - (u64 *) next);
+ k.k->u64s -= (u64 *) next - (u64 *) entry;
+}
+
static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
{
- return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
+ return __extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
+}
+
+static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e)
+{
+ return __extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
}
static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
{
- return !extent_entry_is_ptr(e);
+ switch (__extent_entry_type(e)) {
+ case BCH_EXTENT_ENTRY_crc32:
+ case BCH_EXTENT_ENTRY_crc64:
+ case BCH_EXTENT_ENTRY_crc128:
+ return true;
+ default:
+ return false;
+ }
}
union bch_extent_crc {
@@ -184,19 +134,6 @@ union bch_extent_crc {
struct bch_extent_crc128 crc128;
};
-/* downcast, preserves const */
-#define to_entry(_entry) \
-({ \
- BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \
- !type_is(_entry, struct bch_extent_ptr *)); \
- \
- __builtin_choose_expr( \
- (type_is_exact(_entry, const union bch_extent_crc *) || \
- type_is_exact(_entry, const struct bch_extent_ptr *)), \
- (const union bch_extent_entry *) (_entry), \
- (union bch_extent_entry *) (_entry)); \
-})
-
#define __entry_to_crc(_entry) \
__builtin_choose_expr( \
type_is_exact(_entry, const union bch_extent_entry *), \
@@ -210,56 +147,6 @@ union bch_extent_crc {
__entry_to_crc(_entry); \
})
-#define entry_to_ptr(_entry) \
-({ \
- EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \
- \
- __builtin_choose_expr( \
- type_is_exact(_entry, const union bch_extent_entry *), \
- (const struct bch_extent_ptr *) (_entry), \
- (struct bch_extent_ptr *) (_entry)); \
-})
-
-/* checksum entries: */
-
-enum bch_extent_crc_type {
- BCH_EXTENT_CRC_NONE,
- BCH_EXTENT_CRC32,
- BCH_EXTENT_CRC64,
- BCH_EXTENT_CRC128,
-};
-
-static inline enum bch_extent_crc_type
-__extent_crc_type(const union bch_extent_crc *crc)
-{
- if (!crc)
- return BCH_EXTENT_CRC_NONE;
-
- switch (extent_entry_type(to_entry(crc))) {
- case BCH_EXTENT_ENTRY_crc32:
- return BCH_EXTENT_CRC32;
- case BCH_EXTENT_ENTRY_crc64:
- return BCH_EXTENT_CRC64;
- case BCH_EXTENT_ENTRY_crc128:
- return BCH_EXTENT_CRC128;
- default:
- BUG();
- }
-}
-
-#define extent_crc_type(_crc) \
-({ \
- BUILD_BUG_ON(!type_is(_crc, struct bch_extent_crc32 *) && \
- !type_is(_crc, struct bch_extent_crc64 *) && \
- !type_is(_crc, struct bch_extent_crc128 *) && \
- !type_is(_crc, union bch_extent_crc *)); \
- \
- type_is(_crc, struct bch_extent_crc32 *) ? BCH_EXTENT_CRC32 \
- : type_is(_crc, struct bch_extent_crc64 *) ? BCH_EXTENT_CRC64 \
- : type_is(_crc, struct bch_extent_crc128 *) ? BCH_EXTENT_CRC128 \
- : __extent_crc_type((union bch_extent_crc *) _crc); \
-})
-
static inline struct bch_extent_crc_unpacked
bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
{
@@ -271,37 +158,34 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
.offset = _crc.offset, \
.live_size = k->size
- switch (extent_crc_type(crc)) {
- case BCH_EXTENT_CRC_NONE:
+ if (!crc)
return (struct bch_extent_crc_unpacked) {
.compressed_size = k->size,
.uncompressed_size = k->size,
.live_size = k->size,
};
- case BCH_EXTENT_CRC32: {
+
+ switch (extent_entry_type(to_entry(crc))) {
+ case BCH_EXTENT_ENTRY_crc32: {
struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
common_fields(crc->crc32),
};
- *((__le32 *) &ret.csum.lo) = crc->crc32.csum;
-
- memcpy(&ret.csum.lo, &crc->crc32.csum,
- sizeof(crc->crc32.csum));
-
+ *((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum;
return ret;
}
- case BCH_EXTENT_CRC64: {
+ case BCH_EXTENT_ENTRY_crc64: {
struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
common_fields(crc->crc64),
.nonce = crc->crc64.nonce,
.csum.lo = (__force __le64) crc->crc64.csum_lo,
};
- *((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi;
+ *((__le16 *) &ret.csum.hi) = (__le16 __force) crc->crc64.csum_hi;
return ret;
}
- case BCH_EXTENT_CRC128: {
+ case BCH_EXTENT_ENTRY_crc128: {
struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
common_fields(crc->crc128),
.nonce = crc->crc128.nonce,
@@ -316,223 +200,557 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
#undef common_fields
}
-/* Extent entry iteration: */
+static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc)
+{
+ return (crc.compression_type != BCH_COMPRESSION_TYPE_none &&
+ crc.compression_type != BCH_COMPRESSION_TYPE_incompressible);
+}
-#define extent_entry_next(_entry) \
- ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
+static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc)
+{
+ return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc);
+}
-#define extent_entry_last(_e) \
- vstruct_idx((_e).v, bkey_val_u64s((_e).k))
+void bch2_extent_crc_unpacked_to_text(struct printbuf *, struct bch_extent_crc_unpacked *);
-/* Iterate over all entries: */
+/* bkey_ptrs: generically over any key type that has ptrs */
-#define extent_for_each_entry_from(_e, _entry, _start) \
- for ((_entry) = _start; \
- (_entry) < extent_entry_last(_e); \
- (_entry) = extent_entry_next(_entry))
+struct bkey_ptrs_c {
+ const union bch_extent_entry *start;
+ const union bch_extent_entry *end;
+};
-#define extent_for_each_entry(_e, _entry) \
- extent_for_each_entry_from(_e, _entry, (_e).v->start)
+struct bkey_ptrs {
+ union bch_extent_entry *start;
+ union bch_extent_entry *end;
+};
-/* Iterate over crcs only: */
+static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_btree_ptr: {
+ struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k);
-#define __extent_crc_next(_e, _p) \
-({ \
- typeof(&(_e).v->start[0]) _entry = _p; \
- \
- while ((_entry) < extent_entry_last(_e) && \
- !extent_entry_is_crc(_entry)) \
- (_entry) = extent_entry_next(_entry); \
- \
- entry_to_crc(_entry < extent_entry_last(_e) ? _entry : NULL); \
-})
+ return (struct bkey_ptrs_c) {
+ to_entry(&e.v->start[0]),
+ to_entry(extent_entry_last(e))
+ };
+ }
+ case KEY_TYPE_extent: {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+
+ return (struct bkey_ptrs_c) {
+ e.v->start,
+ extent_entry_last(e)
+ };
+ }
+ case KEY_TYPE_stripe: {
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-#define __extent_for_each_crc(_e, _crc) \
- for ((_crc) = __extent_crc_next(_e, (_e).v->start); \
- (_crc); \
- (_crc) = __extent_crc_next(_e, extent_entry_next(to_entry(_crc))))
+ return (struct bkey_ptrs_c) {
+ to_entry(&s.v->ptrs[0]),
+ to_entry(&s.v->ptrs[s.v->nr_blocks]),
+ };
+ }
+ case KEY_TYPE_reflink_v: {
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
+
+ return (struct bkey_ptrs_c) {
+ r.v->start,
+ bkey_val_end(r),
+ };
+ }
+ case KEY_TYPE_btree_ptr_v2: {
+ struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k);
-#define extent_crc_next(_e, _crc, _iter) \
+ return (struct bkey_ptrs_c) {
+ to_entry(&e.v->start[0]),
+ to_entry(extent_entry_last(e))
+ };
+ }
+ default:
+ return (struct bkey_ptrs_c) { NULL, NULL };
+ }
+}
+
+static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
+{
+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c);
+
+ return (struct bkey_ptrs) {
+ (void *) p.start,
+ (void *) p.end
+ };
+}
+
+#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \
+ for ((_entry) = (_start); \
+ (_entry) < (_end); \
+ (_entry) = extent_entry_next_safe(_entry, _end))
+
+#define __bkey_ptr_next(_ptr, _end) \
({ \
- extent_for_each_entry_from(_e, _iter, _iter) \
- if (extent_entry_is_crc(_iter)) { \
- (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_iter));\
+ typeof(_end) _entry; \
+ \
+ __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \
+ if (extent_entry_is_ptr(_entry)) \
break; \
- } \
\
- (_iter) < extent_entry_last(_e); \
+ _entry < (_end) ? entry_to_ptr(_entry) : NULL; \
})
-#define extent_for_each_crc(_e, _crc, _iter) \
- for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \
- (_iter) = (_e).v->start; \
- extent_crc_next(_e, _crc, _iter); \
- (_iter) = extent_entry_next(_iter))
+#define bkey_extent_entry_for_each_from(_p, _entry, _start) \
+ __bkey_extent_entry_for_each_from(_start, (_p).end, _entry)
+
+#define bkey_extent_entry_for_each(_p, _entry) \
+ bkey_extent_entry_for_each_from(_p, _entry, _p.start)
+
+#define __bkey_for_each_ptr(_start, _end, _ptr) \
+ for (typeof(_start) (_ptr) = (_start); \
+ ((_ptr) = __bkey_ptr_next(_ptr, _end)); \
+ (_ptr)++)
+
+#define bkey_ptr_next(_p, _ptr) \
+ __bkey_ptr_next(_ptr, (_p).end)
-/* Iterate over pointers, with crcs: */
+#define bkey_for_each_ptr(_p, _ptr) \
+ __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr)
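
A typical use of the generic pointer iterator, mirroring bch2_bkey_dirty_devs() further down in this header (sketch only; k is assumed to be a struct bkey_s_c of any key type carrying pointers):

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	unsigned nr_dirty = 0;

	bkey_for_each_ptr(ptrs, ptr)
		if (!ptr->cached)
			nr_dirty++;

bkey_for_each_ptr() declares the iteration variable itself, so ptr only needs to be a fresh identifier.
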
-#define extent_ptr_crc_next(_e, _ptr, _crc) \
+#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \
({ \
__label__ out; \
- typeof(&(_e).v->start[0]) _entry; \
\
- extent_for_each_entry_from(_e, _entry, to_entry(_ptr)) \
- if (extent_entry_is_crc(_entry)) { \
- (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_entry));\
- } else { \
- _ptr = entry_to_ptr(_entry); \
+ (_ptr).idx = 0; \
+ (_ptr).has_ec = false; \
+ \
+ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \
+ switch (__extent_entry_type(_entry)) { \
+ case BCH_EXTENT_ENTRY_ptr: \
+ (_ptr).ptr = _entry->ptr; \
goto out; \
+ case BCH_EXTENT_ENTRY_crc32: \
+ case BCH_EXTENT_ENTRY_crc64: \
+ case BCH_EXTENT_ENTRY_crc128: \
+ (_ptr).crc = bch2_extent_crc_unpack(_k, \
+ entry_to_crc(_entry)); \
+ break; \
+ case BCH_EXTENT_ENTRY_stripe_ptr: \
+ (_ptr).ec = _entry->stripe_ptr; \
+ (_ptr).has_ec = true; \
+ break; \
+ default: \
+ /* nothing */ \
+ break; \
} \
- \
- _ptr = NULL; \
out: \
- _ptr; \
+ _entry < (_end); \
})
-#define extent_for_each_ptr_crc(_e, _ptr, _crc) \
- for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \
- (_ptr) = &(_e).v->start->ptr; \
- ((_ptr) = extent_ptr_crc_next(_e, _ptr, _crc)); \
- (_ptr)++)
+#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \
+ for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \
+ (_entry) = _start; \
+ __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \
+ (_entry) = extent_entry_next_safe(_entry, _end))
-/* Iterate over pointers only, and from a given position: */
+#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \
+ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \
+ _ptr, _entry)
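
The decoded variant also hands back whichever crc/stripe entries govern each pointer, e.g. to total up compressed sectors (sketch; roughly what bch2_bkey_sectors_compressed() in extents.c does):

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned sectors = 0;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		if (!p.ptr.cached && crc_is_compressed(p.crc))
			sectors += p.crc.compressed_size;
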
-#define extent_ptr_next(_e, _ptr) \
+#define bkey_crc_next(_k, _end, _crc, _iter) \
({ \
- struct bch_extent_crc_unpacked _crc; \
+ __bkey_extent_entry_for_each_from(_iter, _end, _iter) \
+ if (extent_entry_is_crc(_iter)) { \
+ (_crc) = bch2_extent_crc_unpack(_k, \
+ entry_to_crc(_iter)); \
+ break; \
+ } \
\
- extent_ptr_crc_next(_e, _ptr, _crc); \
+ (_iter) < (_end); \
})
+#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \
+ for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \
+ (_iter) = (_start); \
+ bkey_crc_next(_k, _end, _crc, _iter); \
+ (_iter) = extent_entry_next(_iter))
+
+#define bkey_for_each_crc(_k, _p, _crc, _iter) \
+ __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter)
+
+/* Iterate over pointers in KEY_TYPE_extent: */
+
+#define extent_for_each_entry_from(_e, _entry, _start) \
+ __bkey_extent_entry_for_each_from(_start, \
+ extent_entry_last(_e), _entry)
+
+#define extent_for_each_entry(_e, _entry) \
+ extent_for_each_entry_from(_e, _entry, (_e).v->start)
+
+#define extent_ptr_next(_e, _ptr) \
+ __bkey_ptr_next(_ptr, extent_entry_last(_e))
+
#define extent_for_each_ptr(_e, _ptr) \
- for ((_ptr) = &(_e).v->start->ptr; \
- ((_ptr) = extent_ptr_next(_e, _ptr)); \
- (_ptr)++)
+ __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr)
-#define extent_ptr_prev(_e, _ptr) \
-({ \
- typeof(&(_e).v->start->ptr) _p; \
- typeof(&(_e).v->start->ptr) _prev = NULL; \
- \
- extent_for_each_ptr(_e, _p) { \
- if (_p == (_ptr)) \
- break; \
- _prev = _p; \
- } \
- \
- _prev; \
+#define extent_for_each_ptr_decode(_e, _ptr, _entry) \
+ __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \
+ extent_entry_last(_e), _ptr, _entry)
+
+/* utility code common to all keys with pointers: */
+
+struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *,
+ unsigned);
+void bch2_mark_io_failure(struct bch_io_failures *,
+ struct extent_ptr_decoded *);
+int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
+ struct bch_io_failures *,
+ struct extent_ptr_decoded *);
+
+/* KEY_TYPE_btree_ptr: */
+
+int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+
+int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
+ int, struct bkey_s);
+
+#define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \
+ .key_validate = bch2_btree_ptr_validate, \
+ .val_to_text = bch2_btree_ptr_to_text, \
+ .swab = bch2_ptr_swab, \
+ .trigger = bch2_trigger_extent, \
})
-/*
- * Use this when you'll be dropping pointers as you iterate. Quadratic,
- * unfortunately:
- */
-#define extent_for_each_ptr_backwards(_e, _ptr) \
- for ((_ptr) = extent_ptr_prev(_e, NULL); \
- (_ptr); \
- (_ptr) = extent_ptr_prev(_e, _ptr))
+#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \
+ .key_validate = bch2_btree_ptr_v2_validate, \
+ .val_to_text = bch2_btree_ptr_v2_to_text, \
+ .swab = bch2_ptr_swab, \
+ .compat = bch2_btree_ptr_v2_compat, \
+ .trigger = bch2_trigger_extent, \
+ .min_val_size = 40, \
+})
+
+/* KEY_TYPE_extent: */
+
+bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+
+#define bch2_bkey_ops_extent ((struct bkey_ops) { \
+ .key_validate = bch2_bkey_ptrs_validate, \
+ .val_to_text = bch2_bkey_ptrs_to_text, \
+ .swab = bch2_ptr_swab, \
+ .key_normalize = bch2_extent_normalize, \
+ .key_merge = bch2_extent_merge, \
+ .trigger = bch2_trigger_extent, \
+})
+
+/* KEY_TYPE_reservation: */
-void bch2_extent_crc_append(struct bkey_i_extent *,
+int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+
+#define bch2_bkey_ops_reservation ((struct bkey_ops) { \
+ .key_validate = bch2_reservation_validate, \
+ .val_to_text = bch2_reservation_to_text, \
+ .key_merge = bch2_reservation_merge, \
+ .trigger = bch2_trigger_reservation, \
+ .min_val_size = 8, \
+})
+
+/* Extent checksum entries: */
+
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c,
+ struct bch_extent_crc_unpacked);
+bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked);
+void bch2_extent_crc_append(struct bkey_i *,
struct bch_extent_crc_unpacked);
-static inline void __extent_entry_push(struct bkey_i_extent *e)
+/* Generic code for keys with pointers: */
+
+static inline bool bkey_is_btree_ptr(const struct bkey *k)
+{
+ switch (k->type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool bkey_extent_is_direct_data(const struct bkey *k)
+{
+ switch (k->type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool bkey_extent_is_inline_data(const struct bkey *k)
+{
+ return k->type == KEY_TYPE_inline_data ||
+ k->type == KEY_TYPE_indirect_inline_data;
+}
+
+static inline unsigned bkey_inline_data_offset(const struct bkey *k)
+{
+ switch (k->type) {
+ case KEY_TYPE_inline_data:
+ return sizeof(struct bch_inline_data);
+ case KEY_TYPE_indirect_inline_data:
+ return sizeof(struct bch_indirect_inline_data);
+ default:
+ BUG();
+ }
+}
+
+static inline unsigned bkey_inline_data_bytes(const struct bkey *k)
+{
+ return bkey_val_bytes(k) - bkey_inline_data_offset(k);
+}
+
+#define bkey_inline_data_p(_k) (((void *) (_k).v) + bkey_inline_data_offset((_k).k))
+
+static inline bool bkey_extent_is_data(const struct bkey *k)
+{
+ return bkey_extent_is_direct_data(k) ||
+ bkey_extent_is_inline_data(k) ||
+ k->type == KEY_TYPE_reflink_p;
+}
+
+/*
+ * Should extent be counted under inode->i_sectors?
+ */
+static inline bool bkey_extent_is_allocation(const struct bkey *k)
{
- union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e));
+ switch (k->type) {
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reservation:
+ case KEY_TYPE_reflink_p:
+ case KEY_TYPE_reflink_v:
+ case KEY_TYPE_inline_data:
+ case KEY_TYPE_indirect_inline_data:
+ case KEY_TYPE_error:
+ return true;
+ default:
+ return false;
+ }
+}
- EBUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) >
- BKEY_EXTENT_VAL_U64s_MAX);
+static inline bool bkey_extent_is_unwritten(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- e->k.u64s += extent_entry_u64s(entry);
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->unwritten)
+ return true;
+ return false;
}
-static inline void extent_ptr_append(struct bkey_i_extent *e,
- struct bch_extent_ptr ptr)
+static inline bool bkey_extent_is_reservation(struct bkey_s_c k)
{
- ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
- extent_entry_last(extent_i_to_s(e))->ptr = ptr;
- __extent_entry_push(e);
+ return k.k->type == KEY_TYPE_reservation ||
+ bkey_extent_is_unwritten(k);
}
-static inline struct bch_devs_list bch2_extent_devs(struct bkey_s_c_extent e)
+static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
{
struct bch_devs_list ret = (struct bch_devs_list) { 0 };
- const struct bch_extent_ptr *ptr;
+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
- extent_for_each_ptr(e, ptr)
- ret.devs[ret.nr++] = ptr->dev;
+ bkey_for_each_ptr(p, ptr)
+ ret.data[ret.nr++] = ptr->dev;
return ret;
}
-static inline struct bch_devs_list bch2_extent_dirty_devs(struct bkey_s_c_extent e)
+static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
{
struct bch_devs_list ret = (struct bch_devs_list) { 0 };
- const struct bch_extent_ptr *ptr;
+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
- extent_for_each_ptr(e, ptr)
+ bkey_for_each_ptr(p, ptr)
if (!ptr->cached)
- ret.devs[ret.nr++] = ptr->dev;
+ ret.data[ret.nr++] = ptr->dev;
return ret;
}
-static inline struct bch_devs_list bch2_extent_cached_devs(struct bkey_s_c_extent e)
+static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
{
struct bch_devs_list ret = (struct bch_devs_list) { 0 };
- const struct bch_extent_ptr *ptr;
+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
- extent_for_each_ptr(e, ptr)
+ bkey_for_each_ptr(p, ptr)
if (ptr->cached)
- ret.devs[ret.nr++] = ptr->dev;
+ ret.data[ret.nr++] = ptr->dev;
return ret;
}
-static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
+unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
+bool bch2_bkey_is_incompressible(struct bkey_s_c);
+unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
+
+unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
+unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_decoded *);
+unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *);
+unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
+
+const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned);
+
+static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev)
{
- switch (k.k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- return bch2_extent_devs(bkey_s_c_to_extent(k));
- default:
- return (struct bch_devs_list) { .nr = 0 };
- }
+ return (void *) bch2_bkey_has_device_c(k.s_c, dev);
}
-static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
+bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
+
+void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
+
+static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr)
{
- switch (k.k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- return bch2_extent_dirty_devs(bkey_s_c_to_extent(k));
+ struct bch_extent_ptr *dest;
+
+ EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev));
+
+ switch (k->k.type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
+ case KEY_TYPE_extent:
+ EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
+
+ ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
+ dest = (struct bch_extent_ptr *)((void *) &k->v + bkey_val_bytes(&k->k));
+ *dest = ptr;
+ k->k.u64s++;
+ break;
default:
- return (struct bch_devs_list) { .nr = 0 };
+ BUG();
}
}
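
A hedged usage sketch (dev_idx, bucket_offset and bucket_gen are placeholder variables, and k is assumed to already be a bkey_i of one of the three key types above with room for another pointer):

	struct bch_extent_ptr ptr = {
		.dev	= dev_idx,
		.offset	= bucket_offset,
		.gen	= bucket_gen,
	};

	bch2_bkey_append_ptr(k, ptr);

The helper fills in ptr.type and bumps k->k.u64s itself, so callers only supply the location fields.
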
-static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
+void bch2_extent_ptr_decoded_append(struct bkey_i *,
+ struct extent_ptr_decoded *);
+void bch2_bkey_drop_ptr_noerror(struct bkey_s, struct bch_extent_ptr *);
+void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *);
+
+void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
+void bch2_bkey_drop_device(struct bkey_s, unsigned);
+
+#define bch2_bkey_drop_ptrs_noerror(_k, _ptr, _cond) \
+do { \
+ __label__ _again; \
+ struct bkey_ptrs _ptrs; \
+_again: \
+ _ptrs = bch2_bkey_ptrs(_k); \
+ \
+ bkey_for_each_ptr(_ptrs, _ptr) \
+ if (_cond) { \
+ bch2_bkey_drop_ptr_noerror(_k, _ptr); \
+ goto _again; \
+ } \
+} while (0)
+
+#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \
+do { \
+ __label__ _again; \
+ struct bkey_ptrs _ptrs; \
+_again: \
+ _ptrs = bch2_bkey_ptrs(_k); \
+ \
+ bkey_for_each_ptr(_ptrs, _ptr) \
+ if (_cond) { \
+ bch2_bkey_drop_ptr(_k, _ptr); \
+ goto _again; \
+ } \
+} while (0)
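
Both macros restart the scan from the beginning after every drop, because removing an entry shifts the rest of the value down and invalidates the iterator. A minimal usage sketch (the condition is illustrative; k is a struct bkey_s, and bch2_extent_normalize() declared below uses essentially this pattern to drop stale cached pointers):

	/* drop every cached pointer from k */
	bch2_bkey_drop_ptrs(k, ptr, ptr->cached);
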
+
+bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
+ struct bch_extent_ptr, u64);
+bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c);
+struct bch_extent_ptr *
+bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
+
+void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_io_opts *,
+ struct bkey_s, struct bch_extent_ptr *);
+
+bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_io_opts *, struct bkey_s);
+bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
+
+void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *);
+void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+
+static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1,
+ struct bch_extent_ptr ptr2)
{
- switch (k.k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- return bch2_extent_cached_devs(bkey_s_c_to_extent(k));
- default:
- return (struct bch_devs_list) { .nr = 0 };
- }
+ return (ptr1.cached == ptr2.cached &&
+ ptr1.unwritten == ptr2.unwritten &&
+ ptr1.offset == ptr2.offset &&
+		ptr1.dev	== ptr2.dev);
}
-bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent,
- struct bch_extent_crc_unpacked);
-bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked);
-void bch2_extent_drop_redundant_crcs(struct bkey_s_extent);
+void bch2_ptr_swab(struct bkey_s);
+
+/* Generic extent code: */
+
+enum bch_extent_overlap {
+ BCH_EXTENT_OVERLAP_ALL = 0,
+ BCH_EXTENT_OVERLAP_BACK = 1,
+ BCH_EXTENT_OVERLAP_FRONT = 2,
+ BCH_EXTENT_OVERLAP_MIDDLE = 3,
+};
+
+/* Returns how k overlaps with m */
+static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
+ const struct bkey *m)
+{
+ int cmp1 = bkey_lt(k->p, m->p);
+ int cmp2 = bkey_gt(bkey_start_pos(k), bkey_start_pos(m));
-void __bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
-void bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
+ return (cmp1 << 1) + cmp2;
+}
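
Spelling out the encoding: cmp1 is set when k ends before m does, cmp2 when k starts after m does, so (cmp1 << 1) + cmp2 yields 0 (ALL: k covers m entirely), 1 (BACK: k starts inside m and reaches or passes its end), 2 (FRONT: k starts at or before m and stops inside it) and 3 (MIDDLE: k lies strictly inside m), matching the enum above.
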
+
+int bch2_cut_front_s(struct bpos, struct bkey_s);
+int bch2_cut_back_s(struct bpos, struct bkey_s);
+
+static inline void bch2_cut_front(struct bpos where, struct bkey_i *k)
+{
+ bch2_cut_front_s(where, bkey_i_to_s(k));
+}
-bool bch2_cut_front(struct bpos, struct bkey_i *);
-bool bch2_cut_back(struct bpos, struct bkey *);
-void bch2_key_resize(struct bkey *, unsigned);
+static inline void bch2_cut_back(struct bpos where, struct bkey_i *k)
+{
+ bch2_cut_back_s(where, bkey_i_to_s(k));
+}
-int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64);
+/**
+ * bch2_key_resize - adjust size of @k
+ *
+ * bkey_start_offset(k) will be preserved, modifies where the extent ends
+ */
+static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
+{
+ k->p.offset -= k->size;
+ k->p.offset += new_size;
+ k->size = new_size;
+}
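
Worked through with small numbers (illustrative only): a key with p.offset == 100 and size == 20 starts at sector 80; after bch2_key_resize(&k, 10) it has p.offset == 90 and size == 10, so bkey_start_offset() is still 80 and only the end has moved.
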
#endif /* _BCACHEFS_EXTENTS_H */
diff --git a/libbcachefs/extents_format.h b/libbcachefs/extents_format.h
new file mode 100644
index 00000000..c198dfc3
--- /dev/null
+++ b/libbcachefs/extents_format.h
@@ -0,0 +1,284 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENTS_FORMAT_H
+#define _BCACHEFS_EXTENTS_FORMAT_H
+
+/*
+ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
+ * preceded by checksum/compression information (bch_extent_crc32 or
+ * bch_extent_crc64).
+ *
+ * One major determining factor in the format of extents is how we handle and
+ * represent extents that have been partially overwritten and thus trimmed:
+ *
+ * If an extent is not checksummed or compressed, when the extent is trimmed we
+ * don't have to remember the extent we originally allocated and wrote: we can
+ * merely adjust ptr->offset to point to the start of the data that is currently
+ * live. The size field in struct bkey records the current (live) size of the
+ * extent, and is also used to mean "size of region on disk that we point to" in
+ * this case.
+ *
+ * Thus an extent that is not checksummed or compressed will consist only of a
+ * list of bch_extent_ptrs, with none of the fields in
+ * bch_extent_crc32/bch_extent_crc64.
+ *
+ * When an extent is checksummed or compressed, it's not possible to read only
+ * the data that is currently live: we have to read the entire extent that was
+ * originally written, and then return only the part of the extent that is
+ * currently live.
+ *
+ * Thus, in addition to the current size of the extent in struct bkey, we need
+ * to store the size of the originally allocated space - this is the
+ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
+ * when the extent is trimmed, instead of modifying the offset field of the
+ * pointer, we keep a second smaller offset field - "offset into the original
+ * extent of the currently live region".
+ *
+ * The other major determining factor is replication and data migration:
+ *
+ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
+ * write, we will initially write all the replicas in the same format, with the
+ * same checksum type and compression format - however, when copygc runs later (or
+ * tiering/cache promotion, anything that moves data), it is not in general
+ * going to rewrite all the pointers at once - one of the replicas may be in a
+ * bucket on one device that has very little fragmentation while another lives
+ * in a bucket that has become heavily fragmented, and thus is being rewritten
+ * sooner than the rest.
+ *
+ * Thus it will only move a subset of the pointers (or in the case of
+ * tiering/cache promotion perhaps add a single pointer without dropping any
+ * current pointers), and if the extent has been partially overwritten it must
+ * write only the currently live portion (or copygc would not be able to reduce
+ * fragmentation!) - which necessitates a different bch_extent_crc format for
+ * the new pointer.
+ *
+ * But in the interests of space efficiency, we don't want to store one
+ * bch_extent_crc for each pointer if we don't have to.
+ *
+ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
+ * bch_extent_ptrs appended arbitrarily one after the other. We determine the
+ * type of a given entry with a scheme similar to utf8 (except we're encoding a
+ * type, not a size), encoding the type in the position of the first set bit:
+ *
+ * bch_extent_ptr    - 0b1
+ * bch_extent_crc32  - 0b10
+ * bch_extent_crc64  - 0b100
+ *
+ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
+ * bch_extent_crc64 is the least constrained).
+ *
+ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
+ * until the next bch_extent_crc32/64.
+ *
+ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
+ * is neither checksummed nor compressed.
+ */
+
+#define BCH_EXTENT_ENTRY_TYPES() \
+ x(ptr, 0) \
+ x(crc32, 1) \
+ x(crc64, 2) \
+ x(crc128, 3) \
+ x(stripe_ptr, 4) \
+ x(rebalance, 5)
+#define BCH_EXTENT_ENTRY_MAX 6
+
+enum bch_extent_entry_type {
+#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
+ BCH_EXTENT_ENTRY_TYPES()
+#undef x
+};
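
Since the type lives in the position of the lowest set bit, decoding an entry's type is a find-first-set over its leading type bits. A minimal sketch of the decode step (illustrative; this mirrors what __extent_entry_type() in extents.h does, and entry_type is a hypothetical name):

	static inline unsigned entry_type(const union bch_extent_entry *e)
	{
		/* e->type aliases the low bits of the entry's first word */
		return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX;
	}

Writers set the bit from the enum value, e.g. bch2_bkey_append_ptr() in extents.h stores ptr.type = 1 << BCH_EXTENT_ENTRY_ptr.
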
+
+/* Compressed/uncompressed sizes are stored biased by 1: */
+struct bch_extent_crc32 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u32 type:2,
+ _compressed_size:7,
+ _uncompressed_size:7,
+ offset:7,
+ _unused:1,
+ csum_type:4,
+ compression_type:4;
+ __u32 csum;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u32 csum;
+ __u32 compression_type:4,
+ csum_type:4,
+ _unused:1,
+ offset:7,
+ _uncompressed_size:7,
+ _compressed_size:7,
+ type:2;
+#endif
+} __packed __aligned(8);
+
+#define CRC32_SIZE_MAX (1U << 7)
+#define CRC32_NONCE_MAX 0
+
+struct bch_extent_crc64 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:3,
+ _compressed_size:9,
+ _uncompressed_size:9,
+ offset:9,
+ nonce:10,
+ csum_type:4,
+ compression_type:4,
+ csum_hi:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 csum_hi:16,
+ compression_type:4,
+ csum_type:4,
+ nonce:10,
+ offset:9,
+ _uncompressed_size:9,
+ _compressed_size:9,
+ type:3;
+#endif
+ __u64 csum_lo;
+} __packed __aligned(8);
+
+#define CRC64_SIZE_MAX (1U << 9)
+#define CRC64_NONCE_MAX ((1U << 10) - 1)
+
+struct bch_extent_crc128 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:4,
+ _compressed_size:13,
+ _uncompressed_size:13,
+ offset:13,
+ nonce:13,
+ csum_type:4,
+ compression_type:4;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 compression_type:4,
+ csum_type:4,
+ nonce:13,
+ offset:13,
+ _uncompressed_size:13,
+ _compressed_size:13,
+ type:4;
+#endif
+ struct bch_csum csum;
+} __packed __aligned(8);
+
+#define CRC128_SIZE_MAX (1U << 13)
+#define CRC128_NONCE_MAX ((1U << 13) - 1)
+
+/*
+ * @unwritten - pointer hasn't been written to, just reserved
+ */
+struct bch_extent_ptr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:1,
+ cached:1,
+ unused:1,
+ unwritten:1,
+ offset:44, /* 8 petabytes */
+ dev:8,
+ gen:8;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 gen:8,
+ dev:8,
+ offset:44,
+ unwritten:1,
+ unused:1,
+ cached:1,
+ type:1;
+#endif
+} __packed __aligned(8);
+
+struct bch_extent_stripe_ptr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:5,
+ block:8,
+ redundancy:4,
+ idx:47;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 idx:47,
+ redundancy:4,
+ block:8,
+ type:5;
+#endif
+};
+
+/* bch_extent_rebalance: */
+#include "rebalance_format.h"
+
+union bch_extent_entry {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
+ unsigned long type;
+#elif __BITS_PER_LONG == 32
+ struct {
+ unsigned long pad;
+ unsigned long type;
+ };
+#else
+#error edit for your odd byteorder.
+#endif
+
+#define x(f, n) struct bch_extent_##f f;
+ BCH_EXTENT_ENTRY_TYPES()
+#undef x
+};
+
+struct bch_btree_ptr {
+ struct bch_val v;
+
+ __u64 _data[0];
+ struct bch_extent_ptr start[];
+} __packed __aligned(8);
+
+struct bch_btree_ptr_v2 {
+ struct bch_val v;
+
+ __u64 mem_ptr;
+ __le64 seq;
+ __le16 sectors_written;
+ __le16 flags;
+ struct bpos min_key;
+ __u64 _data[0];
+ struct bch_extent_ptr start[];
+} __packed __aligned(8);
+
+LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
+
+struct bch_extent {
+ struct bch_val v;
+
+ __u64 _data[0];
+ union bch_extent_entry start[];
+} __packed __aligned(8);
+
+/* Maximum size (in u64s) a single pointer could be: */
+#define BKEY_EXTENT_PTR_U64s_MAX\
+ ((sizeof(struct bch_extent_crc128) + \
+ sizeof(struct bch_extent_ptr)) / sizeof(__u64))
+
+/* Maximum possible size of an entire extent value: */
+#define BKEY_EXTENT_VAL_U64s_MAX \
+ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
+
+/* Maximum possible size of an entire extent, key + value: */
+#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
+
+/* Btree pointers don't carry around checksums: */
+#define BKEY_BTREE_PTR_VAL_U64s_MAX \
+ ((sizeof(struct bch_btree_ptr_v2) + \
+ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
+#define BKEY_BTREE_PTR_U64s_MAX \
+ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
+
+struct bch_reservation {
+ struct bch_val v;
+
+ __le32 generation;
+ __u8 nr_replicas;
+ __u8 pad[3];
+} __packed __aligned(8);
+
+struct bch_inline_data {
+ struct bch_val v;
+ u8 data[];
+};
+
+#endif /* _BCACHEFS_EXTENTS_FORMAT_H */
diff --git a/libbcachefs/extents_types.h b/libbcachefs/extents_types.h
index 76139f93..43d6c341 100644
--- a/libbcachefs/extents_types.h
+++ b/libbcachefs/extents_types.h
@@ -1,26 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_EXTENTS_TYPES_H
#define _BCACHEFS_EXTENTS_TYPES_H
#include "bcachefs_format.h"
struct bch_extent_crc_unpacked {
+ u32 compressed_size;
+ u32 uncompressed_size;
+ u32 live_size;
+
u8 csum_type;
u8 compression_type;
- u16 compressed_size;
- u16 uncompressed_size;
-
u16 offset;
- u16 live_size;
u16 nonce;
struct bch_csum csum;
};
-struct extent_pick_ptr {
- struct bch_extent_ptr ptr;
+struct extent_ptr_decoded {
+ unsigned idx;
+ bool has_ec;
struct bch_extent_crc_unpacked crc;
+ struct bch_extent_ptr ptr;
+ struct bch_extent_stripe_ptr ec;
+};
+
+struct bch_io_failures {
+ u8 nr;
+ struct bch_dev_io_failures {
+ u8 dev;
+ u8 idx;
+ u8 nr_failed;
+ u8 nr_retries;
+ } devs[BCH_REPLICAS_MAX];
};
#endif /* _BCACHEFS_EXTENTS_TYPES_H */
diff --git a/libbcachefs/eytzinger.c b/libbcachefs/eytzinger.c
new file mode 100644
index 00000000..2eaffe37
--- /dev/null
+++ b/libbcachefs/eytzinger.c
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "eytzinger.h"
+
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required alignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be aligned as well if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
+{
+ unsigned char lsbits = (unsigned char)size;
+
+ (void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+ return (lsbits & (align - 1)) == 0;
+}
+
+/**
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory. This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is still valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static void swap_words_32(void *a, void *b, size_t n)
+{
+ do {
+ u32 t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+ } while (n);
+}
+
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory. This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible. If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI). Are there any cases the kernel needs to worry about?
+ */
+static void swap_words_64(void *a, void *b, size_t n)
+{
+ do {
+#ifdef CONFIG_64BIT
+ u64 t = *(u64 *)(a + (n -= 8));
+ *(u64 *)(a + n) = *(u64 *)(b + n);
+ *(u64 *)(b + n) = t;
+#else
+ /* Use two 32-bit transfers to avoid base+index+4 addressing */
+ u32 t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+
+ t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+#endif
+ } while (n);
+}
+
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static void swap_bytes(void *a, void *b, size_t n)
+{
+ do {
+ char t = ((char *)a)[--n];
+ ((char *)a)[n] = ((char *)b)[n];
+ ((char *)b)[n] = t;
+ } while (n);
+}
+
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define SWAP_WORDS_64 (swap_r_func_t)0
+#define SWAP_WORDS_32 (swap_r_func_t)1
+#define SWAP_BYTES (swap_r_func_t)2
+#define SWAP_WRAPPER (swap_r_func_t)3
+
+struct wrapper {
+ cmp_func_t cmp;
+ swap_func_t swap_func;
+};
+
+/*
+ * The function pointer is last to make tail calls most efficient if the
+ * compiler decides not to inline this function.
+ */
+static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
+{
+ if (swap_func == SWAP_WRAPPER) {
+ ((const struct wrapper *)priv)->swap_func(a, b, (int)size);
+ return;
+ }
+
+ if (swap_func == SWAP_WORDS_64)
+ swap_words_64(a, b, size);
+ else if (swap_func == SWAP_WORDS_32)
+ swap_words_32(a, b, size);
+ else if (swap_func == SWAP_BYTES)
+ swap_bytes(a, b, size);
+ else
+ swap_func(a, b, (int)size, priv);
+}
+
+#define _CMP_WRAPPER ((cmp_r_func_t)0L)
+
+static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv)
+{
+ if (cmp == _CMP_WRAPPER)
+ return ((const struct wrapper *)priv)->cmp(a, b);
+ return cmp(a, b, priv);
+}
+
+static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
+ cmp_r_func_t cmp_func, const void *priv,
+ size_t l, size_t r)
+{
+ return do_cmp(base + inorder_to_eytzinger0(l, n) * size,
+ base + inorder_to_eytzinger0(r, n) * size,
+ cmp_func, priv);
+}
+
+static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
+ swap_r_func_t swap_func, const void *priv,
+ size_t l, size_t r)
+{
+ do_swap(base + inorder_to_eytzinger0(l, n) * size,
+ base + inorder_to_eytzinger0(r, n) * size,
+ size, swap_func, priv);
+}
+
+void eytzinger0_sort_r(void *base, size_t n, size_t size,
+ cmp_r_func_t cmp_func,
+ swap_r_func_t swap_func,
+ const void *priv)
+{
+ int i, j, k;
+
+ /* called from 'sort' without swap function, let's pick the default */
+ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func)
+ swap_func = NULL;
+
+ if (!swap_func) {
+ if (is_aligned(base, size, 8))
+ swap_func = SWAP_WORDS_64;
+ else if (is_aligned(base, size, 4))
+ swap_func = SWAP_WORDS_32;
+ else
+ swap_func = SWAP_BYTES;
+ }
+
+ /* heapify */
+ for (i = n / 2 - 1; i >= 0; --i) {
+ /* Find the sift-down path all the way to the leaves. */
+ for (j = i; k = j * 2 + 1, k + 1 < n;)
+ j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
+
+ /* Special case for the last leaf with no sibling. */
+ if (j * 2 + 2 == n)
+ j = j * 2 + 1;
+
+ /* Backtrack to the correct location. */
+ while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0)
+ j = (j - 1) / 2;
+
+ /* Shift the element into its correct place. */
+ for (k = j; j != i;) {
+ j = (j - 1) / 2;
+ eytzinger0_do_swap(base, n, size, swap_func, priv, j, k);
+ }
+ }
+
+ /* sort */
+ for (i = n - 1; i > 0; --i) {
+ eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
+
+ /* Find the sift-down path all the way to the leaves. */
+ for (j = 0; k = j * 2 + 1, k + 1 < i;)
+ j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
+
+ /* Special case for the last leaf with no sibling. */
+ if (j * 2 + 2 == i)
+ j = j * 2 + 1;
+
+ /* Backtrack to the correct location. */
+ while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0)
+ j = (j - 1) / 2;
+
+ /* Shift the element into its correct place. */
+ for (k = j; j;) {
+ j = (j - 1) / 2;
+ eytzinger0_do_swap(base, n, size, swap_func, priv, j, k);
+ }
+ }
+}
+
+void eytzinger0_sort(void *base, size_t n, size_t size,
+ cmp_func_t cmp_func,
+ swap_func_t swap_func)
+{
+ struct wrapper w = {
+ .cmp = cmp_func,
+ .swap_func = swap_func,
+ };
+
+ return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
+}
+
+#if 0
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/ktime.h>
+
+static u64 cmp_count;
+
+static int mycmp(const void *a, const void *b)
+{
+ u32 _a = *(u32 *)a;
+ u32 _b = *(u32 *)b;
+
+ cmp_count++;
+ if (_a < _b)
+ return -1;
+ else if (_a > _b)
+ return 1;
+ else
+ return 0;
+}
+
+static int test(void)
+{
+ size_t N, i;
+ ktime_t start, end;
+ s64 delta;
+ u32 *arr;
+
+ for (N = 10000; N <= 100000; N += 10000) {
+ arr = kmalloc_array(N, sizeof(u32), GFP_KERNEL);
+ cmp_count = 0;
+
+ for (i = 0; i < N; i++)
+ arr[i] = get_random_u32();
+
+ start = ktime_get();
+ eytzinger0_sort(arr, N, sizeof(u32), mycmp, NULL);
+ end = ktime_get();
+
+ delta = ktime_us_delta(end, start);
+ printk(KERN_INFO "time: %lld\n", delta);
+ printk(KERN_INFO "comparisons: %lld\n", cmp_count);
+
+ u32 prev = 0;
+
+ eytzinger0_for_each(i, N) {
+ if (prev > arr[i])
+ goto err;
+ prev = arr[i];
+ }
+
+ kfree(arr);
+ }
+ return 0;
+
+err:
+ kfree(arr);
+ return -1;
+}
+#endif
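
The heapify and sort loops above use the bottom-up sift-down strategy: walk the path of larger children all the way down to a leaf first, then backtrack to where the sifted element belongs and rotate that path by one position. A minimal userspace sketch of the same strategy on an ordinary 0-based array (illustrative only - this is not the eytzinger-indexed code above, and the names are hypothetical):

#include <stdio.h>
#include <stddef.h>

/* bottom-up sift-down for a max-heap rooted at @i, heap size @n */
static void sift_down(int *a, size_t i, size_t n)
{
	size_t j, k;

	/* find the sift-down path all the way to the leaves */
	for (j = i; (k = 2 * j + 1) + 1 < n;)
		j = a[k] > a[k + 1] ? k : k + 1;

	/* special case for the last leaf with no sibling */
	if (2 * j + 2 == n)
		j = 2 * j + 1;

	/* backtrack to the first ancestor larger than a[i] */
	while (j != i && a[i] >= a[j])
		j = (j - 1) / 2;

	/* rotate a[i] into place along the path */
	for (k = j; j != i;) {
		j = (j - 1) / 2;
		int t = a[j]; a[j] = a[k]; a[k] = t;
	}
}

static void bottom_up_heapsort(int *a, size_t n)
{
	if (n < 2)
		return;
	for (size_t i = n / 2; i-- > 0;)
		sift_down(a, i, n);		/* heapify */
	for (size_t i = n - 1; i > 0; i--) {
		int t = a[0]; a[0] = a[i]; a[i] = t;
		sift_down(a, 0, i);		/* re-sift the reduced heap */
	}
}

int main(void)
{
	int a[] = { 5, 1, 4, 1, 5, 9, 2, 6 };

	bottom_up_heapsort(a, 8);
	for (size_t i = 0; i < 8; i++)
		printf("%d ", a[i]);
	printf("\n");	/* 1 1 2 4 5 5 6 9 */
	return 0;
}

The point of descending first and backtracking afterwards is that most comparisons happen on the cheap leaf-ward walk, and only a short backtrack decides the final position - the same reason the kernel's lib/sort.c uses this variant.
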
diff --git a/libbcachefs/eytzinger.h b/libbcachefs/eytzinger.h
index 66fa227c..0541192d 100644
--- a/libbcachefs/eytzinger.h
+++ b/libbcachefs/eytzinger.h
@@ -1,30 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _EYTZINGER_H
#define _EYTZINGER_H
#include <linux/bitops.h>
#include <linux/log2.h>
-#include "util.h"
+#ifdef EYTZINGER_DEBUG
+#define EYTZINGER_BUG_ON(cond) BUG_ON(cond)
+#else
+#define EYTZINGER_BUG_ON(cond)
+#endif
/*
* Traversal for trees in eytzinger layout - a full binary tree laid out in an
- * array
- */
-
-/*
- * One based indexing version:
+ * array.
+ *
+ * Consider using an eytzinger tree any time you would otherwise be doing binary
+ * search over an array. Binary search is a worst case scenario for branch
+ * prediction and prefetching, but in an eytzinger tree every node's children
+ * are adjacent in memory, thus we can prefetch children before knowing the
+ * result of the comparison, assuming multiple nodes fit on a cacheline.
*
- * With one based indexing each level of the tree starts at a power of two -
- * good for cacheline alignment:
+ * Two variants are provided, for one based indexing and zero based indexing.
*
- * Size parameter is treated as if we were using 0 based indexing, however:
- * valid nodes, and inorder indices, are in the range [1..size) - that is, there
- * are actually size - 1 elements
+ * Zero based indexing is more convenient, but one based indexing has better
+ * alignment and thus better performance, because each new level of the tree
+ * starts at a power of two - so if element 0 is cacheline aligned, each new
+ * level will be as well.
*/
static inline unsigned eytzinger1_child(unsigned i, unsigned child)
{
- EBUG_ON(child > 1);
+ EYTZINGER_BUG_ON(child > 1);
return (i << 1) + child;
}
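
To make the layout concrete: with one based indexing, node i's children are 2i and 2i+1, so a sorted array rearranged into eytzinger order can be searched with a tight descent loop that prefetches the (adjacent) children before the comparison resolves. A self-contained userspace sketch of that idea (hypothetical helper names, not the helpers in this header):

#include <stdio.h>
#include <stddef.h>

/* fill eytz[1..n] from sorted[0..n-1] by in-order traversal of the implicit tree */
static size_t eytz_fill(const int *sorted, int *eytz, size_t i, size_t k, size_t n)
{
	if (k <= n) {
		i = eytz_fill(sorted, eytz, i, 2 * k, n);
		eytz[k] = sorted[i++];
		i = eytz_fill(sorted, eytz, i, 2 * k + 1, n);
	}
	return i;
}

/* lower bound: 1-based index of the first element >= x, or 0 if none */
static size_t eytz_search(const int *eytz, size_t n, int x)
{
	size_t k = 1;

	while (k <= n) {
		/* children 2k and 2k+1 are adjacent: one prefetch covers both */
		__builtin_prefetch(eytz + 2 * k);
		k = 2 * k + (eytz[k] < x);
	}
	/* drop the trailing "right turns" plus the final left turn */
	return k >> (__builtin_ctzll(~k) + 1);
}

int main(void)
{
	int sorted[] = { 1, 3, 5, 7, 9, 11, 13 };
	int eytz[16];	/* index 0 unused; oversized so the prefetch stays in bounds */
	size_t idx;

	eytz_fill(sorted, eytz, 0, 1, 7);
	idx = eytz_search(eytz, 7, 6);
	printf("%zu -> %d\n", idx, idx ? eytz[idx] : -1);	/* first element >= 6 is 7 */
	return 0;
}
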
@@ -41,12 +48,12 @@ static inline unsigned eytzinger1_right_child(unsigned i)
static inline unsigned eytzinger1_first(unsigned size)
{
- return rounddown_pow_of_two(size - 1);
+ return size ? rounddown_pow_of_two(size) : 0;
}
static inline unsigned eytzinger1_last(unsigned size)
{
- return rounddown_pow_of_two(size) - 1;
+ return rounddown_pow_of_two(size + 1) - 1;
}
/*
@@ -61,13 +68,13 @@ static inline unsigned eytzinger1_last(unsigned size)
static inline unsigned eytzinger1_next(unsigned i, unsigned size)
{
- EBUG_ON(i >= size);
+ EYTZINGER_BUG_ON(i > size);
- if (eytzinger1_right_child(i) < size) {
+ if (eytzinger1_right_child(i) <= size) {
i = eytzinger1_right_child(i);
- i <<= __fls(size) - __fls(i);
- i >>= i >= size;
+ i <<= __fls(size + 1) - __fls(i);
+ i >>= i > size;
} else {
i >>= ffz(i) + 1;
}
@@ -77,14 +84,14 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)
static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
{
- EBUG_ON(i >= size);
+ EYTZINGER_BUG_ON(i > size);
- if (eytzinger1_left_child(i) < size) {
+ if (eytzinger1_left_child(i) <= size) {
i = eytzinger1_left_child(i) + 1;
- i <<= __fls(size) - __fls(i);
+ i <<= __fls(size + 1) - __fls(i);
i -= 1;
- i >>= i >= size;
+ i >>= i > size;
} else {
i >>= __ffs(i) + 1;
}
@@ -94,17 +101,19 @@ static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
static inline unsigned eytzinger1_extra(unsigned size)
{
- return (size - rounddown_pow_of_two(size - 1)) << 1;
+ return size
+ ? (size + 1 - rounddown_pow_of_two(size)) << 1
+ : 0;
}
static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
unsigned extra)
{
unsigned b = __fls(i);
- unsigned shift = __fls(size - 1) - b;
+ unsigned shift = __fls(size) - b;
int s;
- EBUG_ON(!i || i >= size);
+ EYTZINGER_BUG_ON(!i || i > size);
i ^= 1U << b;
i <<= 1;
@@ -129,7 +138,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
unsigned shift;
int s;
- EBUG_ON(!i || i >= size);
+ EYTZINGER_BUG_ON(!i || i > size);
/*
* sign bit trick:
@@ -143,7 +152,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
shift = __ffs(i);
i >>= shift + 1;
- i |= 1U << (__fls(size - 1) - shift);
+ i |= 1U << (__fls(size) - shift);
return i;
}
@@ -159,7 +168,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
}
#define eytzinger1_for_each(_i, _size) \
- for ((_i) = eytzinger1_first((_size)); \
+ for (unsigned (_i) = eytzinger1_first((_size)); \
(_i) != 0; \
(_i) = eytzinger1_next((_i), (_size)))
@@ -167,7 +176,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
static inline unsigned eytzinger0_child(unsigned i, unsigned child)
{
- EBUG_ON(child > 1);
+ EYTZINGER_BUG_ON(child > 1);
return (i << 1) + 1 + child;
}
@@ -184,39 +193,39 @@ static inline unsigned eytzinger0_right_child(unsigned i)
static inline unsigned eytzinger0_first(unsigned size)
{
- return eytzinger1_first(size + 1) - 1;
+ return eytzinger1_first(size) - 1;
}
static inline unsigned eytzinger0_last(unsigned size)
{
- return eytzinger1_last(size + 1) - 1;
+ return eytzinger1_last(size) - 1;
}
static inline unsigned eytzinger0_next(unsigned i, unsigned size)
{
- return eytzinger1_next(i + 1, size + 1) - 1;
+ return eytzinger1_next(i + 1, size) - 1;
}
static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
{
- return eytzinger1_prev(i + 1, size + 1) - 1;
+ return eytzinger1_prev(i + 1, size) - 1;
}
static inline unsigned eytzinger0_extra(unsigned size)
{
- return eytzinger1_extra(size + 1);
+ return eytzinger1_extra(size);
}
static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
unsigned extra)
{
- return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1;
+ return __eytzinger1_to_inorder(i + 1, size, extra) - 1;
}
static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size,
unsigned extra)
{
- return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1;
+ return __inorder_to_eytzinger1(i + 1, size, extra) - 1;
}
static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size)
@@ -230,15 +239,13 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
}
#define eytzinger0_for_each(_i, _size) \
- for ((_i) = eytzinger0_first((_size)); \
+ for (unsigned (_i) = eytzinger0_first((_size)); \
(_i) != -1; \
(_i) = eytzinger0_next((_i), (_size)))
-typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
-
/* return greatest node <= @search, or -1 if not found */
-static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
- eytzinger_cmp_fn cmp, const void *search)
+static inline int eytzinger0_find_le(void *base, size_t nr, size_t size,
+ cmp_func_t cmp, const void *search)
{
unsigned i, n = 0;
@@ -247,36 +254,66 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
do {
i = n;
- n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
+ n = eytzinger0_child(i, cmp(base + i * size, search) <= 0);
} while (n < nr);
if (n & 1) {
- /* @i was greater than @search, return previous node: */
-
- if (i == eytzinger0_first(nr))
- return -1;
-
+ /*
+ * @i was greater than @search, return previous node:
+ *
+ * if @i was leftmost/smallest element,
+ * eytzinger0_prev(eytzinger0_first())) returns -1, as expected
+ */
return eytzinger0_prev(i, nr);
} else {
return i;
}
}
-static inline size_t eytzinger0_find(void *base, size_t nr, size_t size,
- eytzinger_cmp_fn cmp, const void *search)
+static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size,
+ cmp_func_t cmp, const void *search)
{
- size_t i = 0;
- int res;
+ ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
- while (i < nr &&
- (res = cmp(search, base + i * size, size)))
- i = eytzinger0_child(i, res > 0);
+ /*
+ * if eytzinger0_find_le() returned -1 - no element was <= search - we
+ * want to return the first element; next/prev identities mean this works
+ * as expected
+ *
+ * similarly if find_le() returns last element, we should return -1;
+ * identities mean this all works out:
+ */
+ return eytzinger0_next(idx, nr);
+}
- return i;
+static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size,
+ cmp_func_t cmp, const void *search)
+{
+ ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
+
+ if (idx < nr && !cmp(base + idx * size, search))
+ return idx;
+
+ return eytzinger0_next(idx, nr);
}
-void eytzinger0_sort(void *, size_t, size_t,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t));
+#define eytzinger0_find(base, nr, size, _cmp, search) \
+({ \
+ void *_base = (base); \
+ const void *_search = (search); \
+ size_t _nr = (nr); \
+ size_t _size = (size); \
+ size_t _i = 0; \
+ int _res; \
+ \
+ while (_i < _nr && \
+ (_res = _cmp(_search, _base + _i * _size))) \
+ _i = eytzinger0_child(_i, _res > 0); \
+ _i; \
+})
+
+void eytzinger0_sort_r(void *, size_t, size_t,
+ cmp_r_func_t, swap_r_func_t, const void *);
+void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t);
#endif /* _EYTZINGER_H */
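
Putting the helpers together: eytzinger0_sort() leaves the array in eytzinger order (eytzinger0_for_each() then visits it in sorted order), after which find_le/find_gt/find_ge can answer ordered queries on it. A hedged kernel-style sketch, assuming this header; cmp_u32() and the surrounding function are illustrative only:

static int cmp_u32(const void *l, const void *r)
{
	u32 a = *(const u32 *)l, b = *(const u32 *)r;

	return a < b ? -1 : a > b ? 1 : 0;
}

static void eytzinger_example(u32 *v, size_t nr)
{
	u32 search = 42;

	/* sorts v into *eytzinger* order, not linear order: */
	eytzinger0_sort(v, nr, sizeof(v[0]), cmp_u32, NULL);

	int le = eytzinger0_find_le(v, nr, sizeof(v[0]), cmp_u32, &search);
	int ge = eytzinger0_find_ge(v, nr, sizeof(v[0]), cmp_u32, &search);

	/* results are eytzinger indices, or -1 if no such element exists */
	if (le >= 0)
		pr_info("greatest <= %u: %u\n", search, v[le]);
	if (ge >= 0)
		pr_info("least >= %u: %u\n", search, v[ge]);

	/* visit every element in increasing order */
	eytzinger0_for_each(i, nr)
		pr_info("%u\n", v[i]);
}

Note that the updated eytzinger0_for_each() declares its own iterator, which is what lets the loop above be written without declaring i first.
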
diff --git a/libbcachefs/fifo.h b/libbcachefs/fifo.h
index 789ae663..d8153fe2 100644
--- a/libbcachefs/fifo.h
+++ b/libbcachefs/fifo.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FIFO_H
#define _BCACHEFS_FIFO_H
@@ -12,7 +13,9 @@ struct { \
#define DECLARE_FIFO(type, name) FIFO(type) name
#define fifo_buf_size(fifo) \
- (roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]))
+ ((fifo)->size \
+ ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \
+ : 0)
#define init_fifo(fifo, _size, _gfp) \
({ \
@@ -21,12 +24,12 @@ struct { \
(fifo)->mask = (fifo)->size \
? roundup_pow_of_two((fifo)->size) - 1 \
: 0; \
- (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \
+ (fifo)->data = kvmalloc(fifo_buf_size(fifo), (_gfp)); \
})
#define free_fifo(fifo) \
do { \
- kvpfree((fifo)->data, fifo_buf_size(fifo)); \
+ kvfree((fifo)->data); \
(fifo)->data = NULL; \
} while (0)
@@ -62,7 +65,7 @@ do { \
(((p) - (fifo)->data)))
#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask)
-#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask]
+#define fifo_idx_entry(fifo, i) ((fifo)->data[((fifo)->front + (i)) & (fifo)->mask])
#define fifo_push_back_ref(f) \
(fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask])
@@ -98,7 +101,7 @@ do { \
({ \
bool _r = !fifo_empty((fifo)); \
if (_r) \
- (i) = (fifo)->data[--(fifo)->back & (fifo)->mask] \
+ (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \
_r; \
})
@@ -108,17 +111,17 @@ do { \
#define fifo_peek(fifo) fifo_peek_front(fifo)
#define fifo_for_each_entry(_entry, _fifo, _iter) \
- for (((void) (&(_iter) == &(_fifo)->front)), \
- _iter = (_fifo)->front; \
+ for (typecheck(typeof((_fifo)->front), _iter), \
+ (_iter) = (_fifo)->front; \
((_iter != (_fifo)->back) && \
(_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \
- _iter++)
+ (_iter)++)
#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \
- for (((void) (&(_iter) == &(_fifo)->front)), \
- _iter = (_fifo)->front; \
+ for (typecheck(typeof((_fifo)->front), _iter), \
+ (_iter) = (_fifo)->front; \
((_iter != (_fifo)->back) && \
(_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \
- _iter++)
+ (_iter)++)
#endif /* _BCACHEFS_FIFO_H */
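
The FIFO here is a power-of-two ring: init_fifo() rounds the requested size up and derives a mask, and front/back are free-running counters that are masked on every access, which is why fifo_idx_entry() and the iteration macros can index without bounds checks. A hedged usage sketch, assuming this header (the function and values are illustrative only):

static void fifo_example(void)
{
	DECLARE_FIFO(u64, fifo);
	size_t iter;
	u64 entry;

	if (!init_fifo(&fifo, 8, GFP_KERNEL))
		return;

	/* fifo_push_back_ref() hands out the next back slot, or NULL when full */
	for (u64 i = 0; i < 8; i++) {
		u64 *slot = fifo_push_back_ref(&fifo);

		if (!slot)
			break;
		*slot = i * i;
	}

	/* walk the entries oldest to newest */
	fifo_for_each_entry(entry, &fifo, iter)
		pr_info("entry %llu\n", (unsigned long long)entry);

	free_fifo(&fifo);
}
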
diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c
new file mode 100644
index 00000000..dcaa47f6
--- /dev/null
+++ b/libbcachefs/fs-common.c
@@ -0,0 +1,631 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "acl.h"
+#include "btree_update.h"
+#include "dirent.h"
+#include "fs-common.h"
+#include "inode.h"
+#include "subvolume.h"
+#include "xattr.h"
+
+#include <linux/posix_acl.h>
+
+static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode)
+{
+ return S_ISDIR(inode->bi_mode) && !inode->bi_subvol;
+}
+
+int bch2_create_trans(struct btree_trans *trans,
+ subvol_inum dir,
+ struct bch_inode_unpacked *dir_u,
+ struct bch_inode_unpacked *new_inode,
+ const struct qstr *name,
+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+ struct posix_acl *default_acl,
+ struct posix_acl *acl,
+ subvol_inum snapshot_src,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter dir_iter = { NULL };
+ struct btree_iter inode_iter = { NULL };
+ subvol_inum new_inum = dir;
+ u64 now = bch2_current_time(c);
+ u64 cpu = raw_smp_processor_id();
+ u64 dir_target;
+ u32 snapshot;
+ unsigned dir_type = mode_to_type(mode);
+ int ret;
+
+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir,
+ BTREE_ITER_intent|BTREE_ITER_with_updates);
+ if (ret)
+ goto err;
+
+ if (!(flags & BCH_CREATE_SNAPSHOT)) {
+ /* Normal create path - allocate a new inode: */
+ bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
+
+ if (flags & BCH_CREATE_TMPFILE)
+ new_inode->bi_flags |= BCH_INODE_unlinked;
+
+ ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
+ if (ret)
+ goto err;
+
+ snapshot_src = (subvol_inum) { 0 };
+ } else {
+ /*
+ * Creating a snapshot - we're not allocating a new inode, but
+ * we do have to look up the root inode of the subvolume we're
+ * snapshotting and update it (in the new snapshot):
+ */
+
+ if (!snapshot_src.inum) {
+ /* Inode wasn't specified, just snapshot: */
+ struct bch_subvolume s;
+
+ ret = bch2_subvolume_get(trans, snapshot_src.subvol, true,
+ BTREE_ITER_cached, &s);
+ if (ret)
+ goto err;
+
+ snapshot_src.inum = le64_to_cpu(s.inode);
+ }
+
+ ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src,
+ BTREE_ITER_intent);
+ if (ret)
+ goto err;
+
+ if (new_inode->bi_subvol != snapshot_src.subvol) {
+ /* Not a subvolume root: */
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /*
+ * If we're not root, we have to own the subvolume being
+ * snapshotted:
+ */
+ if (uid && new_inode->bi_uid != uid) {
+ ret = -EPERM;
+ goto err;
+ }
+
+ flags |= BCH_CREATE_SUBVOL;
+ }
+
+ new_inum.inum = new_inode->bi_inum;
+ dir_target = new_inode->bi_inum;
+
+ if (flags & BCH_CREATE_SUBVOL) {
+ u32 new_subvol, dir_snapshot;
+
+ ret = bch2_subvolume_create(trans, new_inode->bi_inum,
+ dir.subvol,
+ snapshot_src.subvol,
+ &new_subvol, &snapshot,
+ (flags & BCH_CREATE_SNAPSHOT_RO) != 0);
+ if (ret)
+ goto err;
+
+ new_inode->bi_parent_subvol = dir.subvol;
+ new_inode->bi_subvol = new_subvol;
+ new_inum.subvol = new_subvol;
+ dir_target = new_subvol;
+ dir_type = DT_SUBVOL;
+
+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot);
+ ret = bch2_btree_iter_traverse(&dir_iter);
+ if (ret)
+ goto err;
+ }
+
+ if (!(flags & BCH_CREATE_SNAPSHOT)) {
+ if (default_acl) {
+ ret = bch2_set_acl_trans(trans, new_inum, new_inode,
+ default_acl, ACL_TYPE_DEFAULT);
+ if (ret)
+ goto err;
+ }
+
+ if (acl) {
+ ret = bch2_set_acl_trans(trans, new_inum, new_inode,
+ acl, ACL_TYPE_ACCESS);
+ if (ret)
+ goto err;
+ }
+ }
+
+ if (!(flags & BCH_CREATE_TMPFILE)) {
+ struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u);
+ u64 dir_offset;
+
+ if (is_subdir_for_nlink(new_inode))
+ dir_u->bi_nlink++;
+ dir_u->bi_mtime = dir_u->bi_ctime = now;
+
+ ret = bch2_inode_write(trans, &dir_iter, dir_u);
+ if (ret)
+ goto err;
+
+ ret = bch2_dirent_create(trans, dir, &dir_hash,
+ dir_type,
+ name,
+ dir_target,
+ &dir_offset,
+ STR_HASH_must_create|BTREE_ITER_with_updates);
+ if (ret)
+ goto err;
+
+ new_inode->bi_dir = dir_u->bi_inum;
+ new_inode->bi_dir_offset = dir_offset;
+ }
+
+ inode_iter.flags &= ~BTREE_ITER_all_snapshots;
+ bch2_btree_iter_set_snapshot(&inode_iter, snapshot);
+
+ ret = bch2_btree_iter_traverse(&inode_iter) ?:
+ bch2_inode_write(trans, &inode_iter, new_inode);
+err:
+ bch2_trans_iter_exit(trans, &inode_iter);
+ bch2_trans_iter_exit(trans, &dir_iter);
+ return ret;
+}
+
+int bch2_link_trans(struct btree_trans *trans,
+ subvol_inum dir, struct bch_inode_unpacked *dir_u,
+ subvol_inum inum, struct bch_inode_unpacked *inode_u,
+ const struct qstr *name)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter dir_iter = { NULL };
+ struct btree_iter inode_iter = { NULL };
+ struct bch_hash_info dir_hash;
+ u64 now = bch2_current_time(c);
+ u64 dir_offset = 0;
+ int ret;
+
+ if (dir.subvol != inum.subvol)
+ return -EXDEV;
+
+ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent);
+ if (ret)
+ return ret;
+
+ inode_u->bi_ctime = now;
+ ret = bch2_inode_nlink_inc(inode_u);
+ if (ret)
+ goto err;
+
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
+ if (ret)
+ goto err;
+
+ if (bch2_reinherit_attrs(inode_u, dir_u)) {
+ ret = -EXDEV;
+ goto err;
+ }
+
+ dir_u->bi_mtime = dir_u->bi_ctime = now;
+
+ dir_hash = bch2_hash_info_init(c, dir_u);
+
+ ret = bch2_dirent_create(trans, dir, &dir_hash,
+ mode_to_type(inode_u->bi_mode),
+ name, inum.inum, &dir_offset,
+ STR_HASH_must_create);
+ if (ret)
+ goto err;
+
+ inode_u->bi_dir = dir.inum;
+ inode_u->bi_dir_offset = dir_offset;
+
+ ret = bch2_inode_write(trans, &dir_iter, dir_u) ?:
+ bch2_inode_write(trans, &inode_iter, inode_u);
+err:
+ bch2_trans_iter_exit(trans, &dir_iter);
+ bch2_trans_iter_exit(trans, &inode_iter);
+ return ret;
+}
+
+int bch2_unlink_trans(struct btree_trans *trans,
+ subvol_inum dir,
+ struct bch_inode_unpacked *dir_u,
+ struct bch_inode_unpacked *inode_u,
+ const struct qstr *name,
+ bool deleting_subvol)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter dir_iter = { NULL };
+ struct btree_iter dirent_iter = { NULL };
+ struct btree_iter inode_iter = { NULL };
+ struct bch_hash_info dir_hash;
+ subvol_inum inum;
+ u64 now = bch2_current_time(c);
+ struct bkey_s_c k;
+ int ret;
+
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
+ if (ret)
+ goto err;
+
+ dir_hash = bch2_hash_info_init(c, dir_u);
+
+ ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
+ name, &inum, BTREE_ITER_intent);
+ if (ret)
+ goto err;
+
+ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum,
+ BTREE_ITER_intent);
+ if (ret)
+ goto err;
+
+ if (!deleting_subvol && S_ISDIR(inode_u->bi_mode)) {
+ ret = bch2_empty_dir_trans(trans, inum);
+ if (ret)
+ goto err;
+ }
+
+ if (deleting_subvol && !inode_u->bi_subvol) {
+ ret = -BCH_ERR_ENOENT_not_subvol;
+ goto err;
+ }
+
+ if (inode_u->bi_subvol) {
+ /* Recursive subvolume destroy not allowed (yet?) */
+ ret = bch2_subvol_has_children(trans, inode_u->bi_subvol);
+ if (ret)
+ goto err;
+ }
+
+ if (deleting_subvol || inode_u->bi_subvol) {
+ ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol);
+ if (ret)
+ goto err;
+
+ k = bch2_btree_iter_peek_slot(&dirent_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ /*
+ * If we're deleting a subvolume, we need to really delete the
+ * dirent, not just emit a whiteout in the current snapshot:
+ */
+ bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(&dirent_iter);
+ if (ret)
+ goto err;
+ } else {
+ bch2_inode_nlink_dec(trans, inode_u);
+ }
+
+ if (inode_u->bi_dir == dirent_iter.pos.inode &&
+ inode_u->bi_dir_offset == dirent_iter.pos.offset) {
+ inode_u->bi_dir = 0;
+ inode_u->bi_dir_offset = 0;
+ }
+
+ dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
+ dir_u->bi_nlink -= is_subdir_for_nlink(inode_u);
+
+ ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+ &dir_hash, &dirent_iter,
+ BTREE_UPDATE_internal_snapshot_node) ?:
+ bch2_inode_write(trans, &dir_iter, dir_u) ?:
+ bch2_inode_write(trans, &inode_iter, inode_u);
+err:
+ bch2_trans_iter_exit(trans, &inode_iter);
+ bch2_trans_iter_exit(trans, &dirent_iter);
+ bch2_trans_iter_exit(trans, &dir_iter);
+ return ret;
+}
+
+bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
+ struct bch_inode_unpacked *src_u)
+{
+ u64 src, dst;
+ unsigned id;
+ bool ret = false;
+
+ for (id = 0; id < Inode_opt_nr; id++) {
+ /* Skip attributes that were explicitly set on this inode */
+ if (dst_u->bi_fields_set & (1 << id))
+ continue;
+
+ src = bch2_inode_opt_get(src_u, id);
+ dst = bch2_inode_opt_get(dst_u, id);
+
+ if (src == dst)
+ continue;
+
+ bch2_inode_opt_set(dst_u, id, src);
+ ret = true;
+ }
+
+ return ret;
+}
+
+static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent)
+{
+ struct btree_iter iter;
+ struct bkey_i_subvolume *s =
+ bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, subvol),
+ BTREE_ITER_cached, subvolume);
+ int ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ return ret;
+
+ s->v.fs_path_parent = cpu_to_le32(new_parent);
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+}
+
+int bch2_rename_trans(struct btree_trans *trans,
+ subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u,
+ subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u,
+ struct bch_inode_unpacked *src_inode_u,
+ struct bch_inode_unpacked *dst_inode_u,
+ const struct qstr *src_name,
+ const struct qstr *dst_name,
+ enum bch_rename_mode mode)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter src_dir_iter = { NULL };
+ struct btree_iter dst_dir_iter = { NULL };
+ struct btree_iter src_inode_iter = { NULL };
+ struct btree_iter dst_inode_iter = { NULL };
+ struct bch_hash_info src_hash, dst_hash;
+ subvol_inum src_inum, dst_inum;
+ u64 src_offset, dst_offset;
+ u64 now = bch2_current_time(c);
+ int ret;
+
+ ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir,
+ BTREE_ITER_intent);
+ if (ret)
+ goto err;
+
+ src_hash = bch2_hash_info_init(c, src_dir_u);
+
+ if (dst_dir.inum != src_dir.inum ||
+ dst_dir.subvol != src_dir.subvol) {
+ ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir,
+ BTREE_ITER_intent);
+ if (ret)
+ goto err;
+
+ dst_hash = bch2_hash_info_init(c, dst_dir_u);
+ } else {
+ dst_dir_u = src_dir_u;
+ dst_hash = src_hash;
+ }
+
+ ret = bch2_dirent_rename(trans,
+ src_dir, &src_hash,
+ dst_dir, &dst_hash,
+ src_name, &src_inum, &src_offset,
+ dst_name, &dst_inum, &dst_offset,
+ mode);
+ if (ret)
+ goto err;
+
+ ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum,
+ BTREE_ITER_intent);
+ if (ret)
+ goto err;
+
+ if (dst_inum.inum) {
+ ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum,
+ BTREE_ITER_intent);
+ if (ret)
+ goto err;
+ }
+
+ if (src_inode_u->bi_subvol &&
+ dst_dir.subvol != src_inode_u->bi_parent_subvol) {
+ ret = subvol_update_parent(trans, src_inode_u->bi_subvol, dst_dir.subvol);
+ if (ret)
+ goto err;
+ }
+
+ if (mode == BCH_RENAME_EXCHANGE &&
+ dst_inode_u->bi_subvol &&
+ src_dir.subvol != dst_inode_u->bi_parent_subvol) {
+ ret = subvol_update_parent(trans, dst_inode_u->bi_subvol, src_dir.subvol);
+ if (ret)
+ goto err;
+ }
+
+ /* Can't move across subvolumes, unless it's a subvolume root: */
+ if (src_dir.subvol != dst_dir.subvol &&
+ (!src_inode_u->bi_subvol ||
+ (dst_inum.inum && !dst_inode_u->bi_subvol))) {
+ ret = -EXDEV;
+ goto err;
+ }
+
+ if (src_inode_u->bi_parent_subvol)
+ src_inode_u->bi_parent_subvol = dst_dir.subvol;
+
+ if ((mode == BCH_RENAME_EXCHANGE) &&
+ dst_inode_u->bi_parent_subvol)
+ dst_inode_u->bi_parent_subvol = src_dir.subvol;
+
+ src_inode_u->bi_dir = dst_dir_u->bi_inum;
+ src_inode_u->bi_dir_offset = dst_offset;
+
+ if (mode == BCH_RENAME_EXCHANGE) {
+ dst_inode_u->bi_dir = src_dir_u->bi_inum;
+ dst_inode_u->bi_dir_offset = src_offset;
+ }
+
+ if (mode == BCH_RENAME_OVERWRITE &&
+ dst_inode_u->bi_dir == dst_dir_u->bi_inum &&
+ dst_inode_u->bi_dir_offset == src_offset) {
+ dst_inode_u->bi_dir = 0;
+ dst_inode_u->bi_dir_offset = 0;
+ }
+
+ if (mode == BCH_RENAME_OVERWRITE) {
+ if (S_ISDIR(src_inode_u->bi_mode) !=
+ S_ISDIR(dst_inode_u->bi_mode)) {
+ ret = -ENOTDIR;
+ goto err;
+ }
+
+ if (S_ISDIR(dst_inode_u->bi_mode)) {
+ ret = bch2_empty_dir_trans(trans, dst_inum);
+ if (ret)
+ goto err;
+ }
+ }
+
+ if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) &&
+ S_ISDIR(src_inode_u->bi_mode)) {
+ ret = -EXDEV;
+ goto err;
+ }
+
+ if (mode == BCH_RENAME_EXCHANGE &&
+ bch2_reinherit_attrs(dst_inode_u, src_dir_u) &&
+ S_ISDIR(dst_inode_u->bi_mode)) {
+ ret = -EXDEV;
+ goto err;
+ }
+
+ if (is_subdir_for_nlink(src_inode_u)) {
+ src_dir_u->bi_nlink--;
+ dst_dir_u->bi_nlink++;
+ }
+
+ if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) {
+ dst_dir_u->bi_nlink--;
+ src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE;
+ }
+
+ if (mode == BCH_RENAME_OVERWRITE)
+ bch2_inode_nlink_dec(trans, dst_inode_u);
+
+ src_dir_u->bi_mtime = now;
+ src_dir_u->bi_ctime = now;
+
+ if (src_dir.inum != dst_dir.inum) {
+ dst_dir_u->bi_mtime = now;
+ dst_dir_u->bi_ctime = now;
+ }
+
+ src_inode_u->bi_ctime = now;
+
+ if (dst_inum.inum)
+ dst_inode_u->bi_ctime = now;
+
+ ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?:
+ (src_dir.inum != dst_dir.inum
+ ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u)
+ : 0) ?:
+ bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?:
+ (dst_inum.inum
+ ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u)
+ : 0);
+err:
+ bch2_trans_iter_exit(trans, &dst_inode_iter);
+ bch2_trans_iter_exit(trans, &src_inode_iter);
+ bch2_trans_iter_exit(trans, &dst_dir_iter);
+ bch2_trans_iter_exit(trans, &src_dir_iter);
+ return ret;
+}
+
+static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n)
+{
+ bch2_printbuf_make_room(out, n);
+
+ unsigned can_print = min(n, printbuf_remaining(out));
+
+ b += n;
+
+ for (unsigned i = 0; i < can_print; i++)
+ out->buf[out->pos++] = *((char *) --b);
+
+ printbuf_nul_terminate(out);
+}
+
+static inline void reverse_bytes(void *b, size_t n)
+{
+ char *e = b + n, *s = b;
+
+ while (s < e) {
+ --e;
+ swap(*s, *e);
+ s++;
+ }
+}
+
+/* XXX: we don't yet attempt to print paths when we don't know the subvol */
+int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printbuf *path)
+{
+ unsigned orig_pos = path->pos;
+ int ret = 0;
+
+ while (!(inum.subvol == BCACHEFS_ROOT_SUBVOL &&
+ inum.inum == BCACHEFS_ROOT_INO)) {
+ struct bch_inode_unpacked inode;
+ ret = bch2_inode_find_by_inum_trans(trans, inum, &inode);
+ if (ret)
+ goto err;
+
+ if (!inode.bi_dir && !inode.bi_dir_offset) {
+ ret = -BCH_ERR_ENOENT_inode_no_backpointer;
+ goto err;
+ }
+
+ u32 snapshot;
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ struct btree_iter d_iter;
+ struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter,
+ BTREE_ID_dirents, SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot),
+ 0, dirent);
+ ret = bkey_err(d.s_c);
+ if (ret)
+ goto err;
+
+ struct qstr dirent_name = bch2_dirent_get_name(d);
+ prt_bytes_reversed(path, dirent_name.name, dirent_name.len);
+
+ prt_char(path, '/');
+
+ if (d.v->d_type == DT_SUBVOL)
+ inum.subvol = le32_to_cpu(d.v->d_parent_subvol);
+ inum.inum = d.k->p.inode;
+
+ bch2_trans_iter_exit(trans, &d_iter);
+ }
+
+ if (orig_pos == path->pos)
+ prt_char(path, '/');
+
+ ret = path->allocation_failure ? -ENOMEM : 0;
+ if (ret)
+ goto err;
+
+ reverse_bytes(path->buf + orig_pos, path->pos - orig_pos);
+ return 0;
+err:
+ return ret;
+}
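
bch2_inum_to_path() builds the path from the leaf inode towards the root by following bi_dir/bi_dir_offset backpointers, so the components are discovered in reverse order; each name is emitted byte-reversed (prt_bytes_reversed()) and the finished buffer is flipped once at the end (reverse_bytes()), which avoids keeping a separate stack of components. A self-contained userspace sketch of just that buffer trick (names and buffer size are illustrative):

#include <stdio.h>
#include <string.h>

/* append the bytes of @name to @buf in reverse order */
static void append_reversed(char *buf, size_t *pos, const char *name)
{
	for (size_t len = strlen(name); len--;)
		buf[(*pos)++] = name[len];
}

static void reverse(char *b, size_t n)
{
	char *s = b, *e = b + n;

	while (s < e) {
		char t = *s;
		*s++ = *--e;
		*e = t;
	}
}

int main(void)
{
	/* components as discovered while walking backpointers, leaf to root */
	const char *components[] = { "file.txt", "subdir", "dir" };
	char path[64];
	size_t pos = 0;

	for (size_t i = 0; i < 3; i++) {
		append_reversed(path, &pos, components[i]);
		path[pos++] = '/';
	}

	/* one pass over the whole buffer yields the root-to-leaf path */
	reverse(path, pos);
	path[pos] = '\0';
	printf("%s\n", path);	/* "/dir/subdir/file.txt" */
	return 0;
}
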
diff --git a/libbcachefs/fs-common.h b/libbcachefs/fs-common.h
new file mode 100644
index 00000000..2b59210b
--- /dev/null
+++ b/libbcachefs/fs-common.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_COMMON_H
+#define _BCACHEFS_FS_COMMON_H
+
+#include "dirent.h"
+
+struct posix_acl;
+
+#define BCH_CREATE_TMPFILE (1U << 0)
+#define BCH_CREATE_SUBVOL (1U << 1)
+#define BCH_CREATE_SNAPSHOT (1U << 2)
+#define BCH_CREATE_SNAPSHOT_RO (1U << 3)
+
+int bch2_create_trans(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *,
+ struct bch_inode_unpacked *,
+ const struct qstr *,
+ uid_t, gid_t, umode_t, dev_t,
+ struct posix_acl *,
+ struct posix_acl *,
+ subvol_inum, unsigned);
+
+int bch2_link_trans(struct btree_trans *,
+ subvol_inum, struct bch_inode_unpacked *,
+ subvol_inum, struct bch_inode_unpacked *,
+ const struct qstr *);
+
+int bch2_unlink_trans(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *,
+ struct bch_inode_unpacked *,
+ const struct qstr *, bool);
+
+int bch2_rename_trans(struct btree_trans *,
+ subvol_inum, struct bch_inode_unpacked *,
+ subvol_inum, struct bch_inode_unpacked *,
+ struct bch_inode_unpacked *,
+ struct bch_inode_unpacked *,
+ const struct qstr *,
+ const struct qstr *,
+ enum bch_rename_mode);
+
+bool bch2_reinherit_attrs(struct bch_inode_unpacked *,
+ struct bch_inode_unpacked *);
+
+int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *);
+
+#endif /* _BCACHEFS_FS_COMMON_H */
diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c
new file mode 100644
index 00000000..ff8b8df5
--- /dev/null
+++ b/libbcachefs/fs-io-buffered.c
@@ -0,0 +1,1101 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "fs-io.h"
+#include "fs-io-buffered.h"
+#include "fs-io-direct.h"
+#include "fs-io-pagecache.h"
+#include "io_read.h"
+#include "io_write.h"
+
+#include <linux/backing-dev.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+
+static inline bool bio_full(struct bio *bio, unsigned len)
+{
+ if (bio->bi_vcnt >= bio->bi_max_vecs)
+ return true;
+ if (bio->bi_iter.bi_size > UINT_MAX - len)
+ return true;
+ return false;
+}
+
+/* readpage(s): */
+
+static void bch2_readpages_end_io(struct bio *bio)
+{
+ struct folio_iter fi;
+
+ bio_for_each_folio_all(fi, bio)
+ folio_end_read(fi.folio, bio->bi_status == BLK_STS_OK);
+
+ bio_put(bio);
+}
+
+struct readpages_iter {
+ struct address_space *mapping;
+ unsigned idx;
+ folios folios;
+};
+
+static int readpages_iter_init(struct readpages_iter *iter,
+ struct readahead_control *ractl)
+{
+ struct folio *folio;
+
+ *iter = (struct readpages_iter) { ractl->mapping };
+
+ while ((folio = __readahead_folio(ractl))) {
+ if (!bch2_folio_create(folio, GFP_KERNEL) ||
+ darray_push(&iter->folios, folio)) {
+ bch2_folio_release(folio);
+ ractl->_nr_pages += folio_nr_pages(folio);
+ ractl->_index -= folio_nr_pages(folio);
+ return iter->folios.nr ? 0 : -ENOMEM;
+ }
+
+ folio_put(folio);
+ }
+
+ return 0;
+}
+
+static inline struct folio *readpage_iter_peek(struct readpages_iter *iter)
+{
+ if (iter->idx >= iter->folios.nr)
+ return NULL;
+ return iter->folios.data[iter->idx];
+}
+
+static inline void readpage_iter_advance(struct readpages_iter *iter)
+{
+ iter->idx++;
+}
+
+static bool extent_partial_reads_expensive(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bch_extent_crc_unpacked crc;
+ const union bch_extent_entry *i;
+
+ bkey_for_each_crc(k.k, ptrs, crc, i)
+ if (crc.csum_type || crc.compression_type)
+ return true;
+ return false;
+}
+
+static int readpage_bio_extend(struct btree_trans *trans,
+ struct readpages_iter *iter,
+ struct bio *bio,
+ unsigned sectors_this_extent,
+ bool get_more)
+{
+ /* Don't hold btree locks while allocating memory: */
+ bch2_trans_unlock(trans);
+
+ while (bio_sectors(bio) < sectors_this_extent &&
+ bio->bi_vcnt < bio->bi_max_vecs) {
+ struct folio *folio = readpage_iter_peek(iter);
+ int ret;
+
+ if (folio) {
+ readpage_iter_advance(iter);
+ } else {
+ pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;
+
+ if (!get_more)
+ break;
+
+ folio = xa_load(&iter->mapping->i_pages, folio_offset);
+ if (folio && !xa_is_value(folio))
+ break;
+
+ folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0);
+ if (!folio)
+ break;
+
+ if (!__bch2_folio_create(folio, GFP_KERNEL)) {
+ folio_put(folio);
+ break;
+ }
+
+ ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL);
+ if (ret) {
+ __bch2_folio_release(folio);
+ folio_put(folio);
+ break;
+ }
+
+ folio_put(folio);
+ }
+
+ BUG_ON(folio_sector(folio) != bio_end_sector(bio));
+
+ BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0));
+ }
+
+ return bch2_trans_relock(trans);
+}
+
+static void bchfs_read(struct btree_trans *trans,
+ struct bch_read_bio *rbio,
+ subvol_inum inum,
+ struct readpages_iter *readpages_iter)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_buf sk;
+ int flags = BCH_READ_RETRY_IF_STALE|
+ BCH_READ_MAY_PROMOTE;
+ int ret = 0;
+
+ rbio->c = c;
+ rbio->start_time = local_clock();
+ rbio->subvol = inum.subvol;
+
+ bch2_bkey_buf_init(&sk);
+ bch2_trans_begin(trans);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ POS(inum.inum, rbio->bio.bi_iter.bi_sector),
+ BTREE_ITER_slots);
+ while (1) {
+ struct bkey_s_c k;
+ unsigned bytes, sectors;
+ s64 offset_into_extent;
+ enum btree_id data_btree = BTREE_ID_extents;
+
+ bch2_trans_begin(trans);
+
+ u32 snapshot;
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
+
+ bch2_btree_iter_set_pos(&iter,
+ POS(inum.inum, rbio->bio.bi_iter.bi_sector));
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ offset_into_extent = iter.pos.offset -
+ bkey_start_offset(k.k);
+ sectors = k.k->size - offset_into_extent;
+
+ bch2_bkey_buf_reassemble(&sk, c, k);
+
+ ret = bch2_read_indirect_extent(trans, &data_btree,
+ &offset_into_extent, &sk);
+ if (ret)
+ goto err;
+
+ k = bkey_i_to_s_c(sk.k);
+
+ sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
+
+ if (readpages_iter) {
+ ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
+ extent_partial_reads_expensive(k));
+ if (ret)
+ goto err;
+ }
+
+ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
+ swap(rbio->bio.bi_iter.bi_size, bytes);
+
+ if (rbio->bio.bi_iter.bi_size == bytes)
+ flags |= BCH_READ_LAST_FRAGMENT;
+
+ bch2_bio_page_state_set(&rbio->bio, k);
+
+ bch2_read_extent(trans, rbio, iter.pos,
+ data_btree, k, offset_into_extent, flags);
+
+ if (flags & BCH_READ_LAST_FRAGMENT)
+ break;
+
+ swap(rbio->bio.bi_iter.bi_size, bytes);
+ bio_advance(&rbio->bio, bytes);
+err:
+ if (ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9);
+ prt_printf(&buf, "read error %i from btree lookup", ret);
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+
+ rbio->bio.bi_status = BLK_STS_IOERR;
+ bio_endio(&rbio->bio);
+ }
+
+ bch2_bkey_buf_exit(&sk, c);
+}
+
+void bch2_readahead(struct readahead_control *ractl)
+{
+ struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_io_opts opts;
+ struct folio *folio;
+ struct readpages_iter readpages_iter;
+ struct blk_plug plug;
+
+ bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+ int ret = readpages_iter_init(&readpages_iter, ractl);
+ if (ret)
+ return;
+
+ /*
+ * Besides being a general performance optimization, plugging helps with
+ * avoiding btree transaction srcu warnings - submitting a bio can
+ * block, and we don't want to do that with the transaction locked.
+ *
+ * However, plugged bios are submitted when we schedule; we ideally
+ * would have our own scheduler hook to call unlock_long() before
+ * scheduling.
+ */
+ blk_start_plug(&plug);
+ bch2_pagecache_add_get(inode);
+
+ struct btree_trans *trans = bch2_trans_get(c);
+ while ((folio = readpage_iter_peek(&readpages_iter))) {
+ unsigned n = min_t(unsigned,
+ readpages_iter.folios.nr -
+ readpages_iter.idx,
+ BIO_MAX_VECS);
+ struct bch_read_bio *rbio =
+ rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
+ GFP_KERNEL, &c->bio_read),
+ opts);
+
+ readpage_iter_advance(&readpages_iter);
+
+ rbio->bio.bi_iter.bi_sector = folio_sector(folio);
+ rbio->bio.bi_end_io = bch2_readpages_end_io;
+ BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
+
+ bchfs_read(trans, rbio, inode_inum(inode),
+ &readpages_iter);
+ bch2_trans_unlock(trans);
+ }
+ bch2_trans_put(trans);
+
+ bch2_pagecache_add_put(inode);
+ blk_finish_plug(&plug);
+ darray_exit(&readpages_iter.folios);
+}
+
+static void bch2_read_single_folio_end_io(struct bio *bio)
+{
+ complete(bio->bi_private);
+}
+
+int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
+{
+ struct bch_inode_info *inode = to_bch_ei(mapping->host);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_read_bio *rbio;
+ struct bch_io_opts opts;
+ struct blk_plug plug;
+ int ret;
+ DECLARE_COMPLETION_ONSTACK(done);
+
+ BUG_ON(folio_test_uptodate(folio));
+ BUG_ON(folio_test_dirty(folio));
+
+ if (!bch2_folio_create(folio, GFP_KERNEL))
+ return -ENOMEM;
+
+ bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+ rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
+ opts);
+ rbio->bio.bi_private = &done;
+ rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
+
+ rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
+ rbio->bio.bi_iter.bi_sector = folio_sector(folio);
+ BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
+
+ blk_start_plug(&plug);
+ bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0));
+ blk_finish_plug(&plug);
+ wait_for_completion(&done);
+
+ ret = blk_status_to_errno(rbio->bio.bi_status);
+ bio_put(&rbio->bio);
+
+ if (ret < 0)
+ return ret;
+
+ folio_mark_uptodate(folio);
+ return 0;
+}
+
+int bch2_read_folio(struct file *file, struct folio *folio)
+{
+ int ret;
+
+ ret = bch2_read_single_folio(folio, folio->mapping);
+ folio_unlock(folio);
+ return bch2_err_class(ret);
+}
+
+/* writepages: */
+
+struct bch_writepage_io {
+ struct bch_inode_info *inode;
+
+ /* must be last: */
+ struct bch_write_op op;
+};
+
+struct bch_writepage_state {
+ struct bch_writepage_io *io;
+ struct bch_io_opts opts;
+ struct bch_folio_sector *tmp;
+ unsigned tmp_sectors;
+};
+
+static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
+ struct bch_inode_info *inode)
+{
+ struct bch_writepage_state ret = { 0 };
+
+ bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
+ return ret;
+}
+
+/*
+ * Determine when a writepage io is full. We have to limit writepage bios to a
+ * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to
+ * what the bounce path in bch2_write_extent() can handle. In theory we could
+ * loosen this restriction for non-bounce I/O, but we don't have that context
+ * here. Ideally, we can up this limit and make it configurable in the future
+ * when the bounce path can be enhanced to accommodate larger source bios.
+ */
+static inline bool bch_io_full(struct bch_writepage_io *io, unsigned len)
+{
+ struct bio *bio = &io->op.wbio.bio;
+ return bio_full(bio, len) ||
+ (bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE);
+}
+
+static void bch2_writepage_io_done(struct bch_write_op *op)
+{
+ struct bch_writepage_io *io =
+ container_of(op, struct bch_writepage_io, op);
+ struct bch_fs *c = io->op.c;
+ struct bio *bio = &io->op.wbio.bio;
+ struct folio_iter fi;
+ unsigned i;
+
+ if (io->op.error) {
+ set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
+
+ bio_for_each_folio_all(fi, bio) {
+ struct bch_folio *s;
+
+ mapping_set_error(fi.folio->mapping, -EIO);
+
+ s = __bch2_folio(fi.folio);
+ spin_lock(&s->lock);
+ for (i = 0; i < folio_sectors(fi.folio); i++)
+ s->s[i].nr_replicas = 0;
+ spin_unlock(&s->lock);
+ }
+ }
+
+ if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
+ bio_for_each_folio_all(fi, bio) {
+ struct bch_folio *s;
+
+ s = __bch2_folio(fi.folio);
+ spin_lock(&s->lock);
+ for (i = 0; i < folio_sectors(fi.folio); i++)
+ s->s[i].nr_replicas = 0;
+ spin_unlock(&s->lock);
+ }
+ }
+
+ /*
+ * racing with fallocate can cause us to add fewer sectors than
+ * expected - but we shouldn't add more sectors than expected:
+ */
+ WARN_ON_ONCE(io->op.i_sectors_delta > 0);
+
+ /*
+ * (error (due to going RO) halfway through a page can screw that up
+ * slightly)
+ * XXX wtf?
+ BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
+ */
+
+ /*
+ * The writeback flag is effectively our ref on the inode -
+ * fixup i_blocks before calling folio_end_writeback:
+ */
+ bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
+
+ bio_for_each_folio_all(fi, bio) {
+ struct bch_folio *s = __bch2_folio(fi.folio);
+
+ if (atomic_dec_and_test(&s->write_count))
+ folio_end_writeback(fi.folio);
+ }
+
+ bio_put(&io->op.wbio.bio);
+}
+
+static void bch2_writepage_do_io(struct bch_writepage_state *w)
+{
+ struct bch_writepage_io *io = w->io;
+
+ w->io = NULL;
+ closure_call(&io->op.cl, bch2_write, NULL, NULL);
+}
+
+/*
+ * Get a bch_writepage_io and add @page to it - appending to an existing one if
+ * possible, else allocating a new one:
+ */
+static void bch2_writepage_io_alloc(struct bch_fs *c,
+ struct writeback_control *wbc,
+ struct bch_writepage_state *w,
+ struct bch_inode_info *inode,
+ u64 sector,
+ unsigned nr_replicas)
+{
+ struct bch_write_op *op;
+
+ w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
+ REQ_OP_WRITE,
+ GFP_KERNEL,
+ &c->writepage_bioset),
+ struct bch_writepage_io, op.wbio.bio);
+
+ w->io->inode = inode;
+ op = &w->io->op;
+ bch2_write_op_init(op, c, w->opts);
+ op->target = w->opts.foreground_target;
+ op->nr_replicas = nr_replicas;
+ op->res.nr_replicas = nr_replicas;
+ op->write_point = writepoint_hashed(inode->ei_last_dirtied);
+ op->subvol = inode->ei_inum.subvol;
+ op->pos = POS(inode->v.i_ino, sector);
+ op->end_io = bch2_writepage_io_done;
+ op->devs_need_flush = &inode->ei_devs_need_flush;
+ op->wbio.bio.bi_iter.bi_sector = sector;
+ op->wbio.bio.bi_opf = wbc_to_write_flags(wbc);
+}
+
+static int __bch2_writepage(struct folio *folio,
+ struct writeback_control *wbc,
+ void *data)
+{
+ struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_writepage_state *w = data;
+ struct bch_folio *s;
+ unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX;
+ loff_t i_size = i_size_read(&inode->v);
+ int ret;
+
+ EBUG_ON(!folio_test_uptodate(folio));
+
+ /* Is the folio fully inside i_size? */
+ if (folio_end_pos(folio) <= i_size)
+ goto do_io;
+
+ /* Is the folio fully outside i_size? (truncate in progress) */
+ if (folio_pos(folio) >= i_size) {
+ folio_unlock(folio);
+ return 0;
+ }
+
+ /*
+ * The folio straddles i_size. It must be zeroed out on each and every
+ * writepage invocation because it may be mmapped. "A file is mapped
+ * in multiples of the folio size. For a file that is not a multiple of
+ * the folio size, the remaining memory is zeroed when mapped, and
+ * writes to that region are not written out to the file."
+ */
+ folio_zero_segment(folio,
+ i_size - folio_pos(folio),
+ folio_size(folio));
+do_io:
+ f_sectors = folio_sectors(folio);
+ s = bch2_folio(folio);
+
+ if (f_sectors > w->tmp_sectors) {
+ kfree(w->tmp);
+ w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL);
+ w->tmp_sectors = f_sectors;
+ }
+
+ /*
+ * Things get really hairy with errors during writeback:
+ */
+ ret = bch2_get_folio_disk_reservation(c, inode, folio, false);
+ BUG_ON(ret);
+
+ /* Before unlocking the page, get copy of reservations: */
+ spin_lock(&s->lock);
+ memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors);
+
+ for (i = 0; i < f_sectors; i++) {
+ if (s->s[i].state < SECTOR_dirty)
+ continue;
+
+ nr_replicas_this_write =
+ min_t(unsigned, nr_replicas_this_write,
+ s->s[i].nr_replicas +
+ s->s[i].replicas_reserved);
+ }
+
+ for (i = 0; i < f_sectors; i++) {
+ if (s->s[i].state < SECTOR_dirty)
+ continue;
+
+ s->s[i].nr_replicas = w->opts.compression
+ ? 0 : nr_replicas_this_write;
+
+ s->s[i].replicas_reserved = 0;
+ bch2_folio_sector_set(folio, s, i, SECTOR_allocated);
+ }
+ spin_unlock(&s->lock);
+
+ BUG_ON(atomic_read(&s->write_count));
+ atomic_set(&s->write_count, 1);
+
+ BUG_ON(folio_test_writeback(folio));
+ folio_start_writeback(folio);
+
+ folio_unlock(folio);
+
+ offset = 0;
+ while (1) {
+ unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
+ u64 sector;
+
+ while (offset < f_sectors &&
+ w->tmp[offset].state < SECTOR_dirty)
+ offset++;
+
+ if (offset == f_sectors)
+ break;
+
+ while (offset + sectors < f_sectors &&
+ w->tmp[offset + sectors].state >= SECTOR_dirty) {
+ reserved_sectors += w->tmp[offset + sectors].replicas_reserved;
+ dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty;
+ sectors++;
+ }
+ BUG_ON(!sectors);
+
+ sector = folio_sector(folio) + offset;
+
+ if (w->io &&
+ (w->io->op.res.nr_replicas != nr_replicas_this_write ||
+ bch_io_full(w->io, sectors << 9) ||
+ bio_end_sector(&w->io->op.wbio.bio) != sector))
+ bch2_writepage_do_io(w);
+
+ if (!w->io)
+ bch2_writepage_io_alloc(c, wbc, w, inode, sector,
+ nr_replicas_this_write);
+
+ atomic_inc(&s->write_count);
+
+ BUG_ON(inode != w->io->inode);
+ BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
+ sectors << 9, offset << 9));
+
+ /* Check for writing past i_size: */
+ WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
+ round_up(i_size, block_bytes(c)) &&
+ !test_bit(BCH_FS_emergency_ro, &c->flags),
+ "writing past i_size: %llu > %llu (unrounded %llu)\n",
+ bio_end_sector(&w->io->op.wbio.bio) << 9,
+ round_up(i_size, block_bytes(c)),
+ i_size);
+
+ w->io->op.res.sectors += reserved_sectors;
+ w->io->op.i_sectors_delta -= dirty_sectors;
+ w->io->op.new_i_size = i_size;
+
+ offset += sectors;
+ }
+
+ if (atomic_dec_and_test(&s->write_count))
+ folio_end_writeback(folio);
+
+ return 0;
+}
+
+int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+ struct bch_fs *c = mapping->host->i_sb->s_fs_info;
+ struct bch_writepage_state w =
+ bch_writepage_state_init(c, to_bch_ei(mapping->host));
+ struct blk_plug plug;
+ int ret;
+
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
+ if (w.io)
+ bch2_writepage_do_io(&w);
+ blk_finish_plug(&plug);
+ kfree(w.tmp);
+ return bch2_err_class(ret);
+}
+
+/* buffered writes: */
+
+int bch2_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
+{
+ struct bch_inode_info *inode = to_bch_ei(mapping->host);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch2_folio_reservation *res;
+ struct folio *folio;
+ unsigned offset;
+ int ret = -ENOMEM;
+
+ res = kmalloc(sizeof(*res), GFP_KERNEL);
+ if (!res)
+ return -ENOMEM;
+
+ bch2_folio_reservation_init(c, inode, res);
+ *fsdata = res;
+
+ bch2_pagecache_add_get(inode);
+
+ folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
+ FGP_WRITEBEGIN | fgf_set_order(len),
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio))
+ goto err_unlock;
+
+ offset = pos - folio_pos(folio);
+ len = min_t(size_t, len, folio_end_pos(folio) - pos);
+
+ if (folio_test_uptodate(folio))
+ goto out;
+
+ /* If we're writing entire folio, don't need to read it in first: */
+ if (!offset && len == folio_size(folio))
+ goto out;
+
+ if (!offset && pos + len >= inode->v.i_size) {
+ folio_zero_segment(folio, len, folio_size(folio));
+ flush_dcache_folio(folio);
+ goto out;
+ }
+
+ if (folio_pos(folio) >= inode->v.i_size) {
+ folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio));
+ flush_dcache_folio(folio);
+ goto out;
+ }
+readpage:
+ ret = bch2_read_single_folio(folio, mapping);
+ if (ret)
+ goto err;
+out:
+ ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
+ if (ret)
+ goto err;
+
+ ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len);
+ if (ret) {
+ if (!folio_test_uptodate(folio)) {
+ /*
+ * If the folio hasn't been read in, we won't know if we
+ * actually need a reservation - we don't actually need
+ * to read here, we just need to check if the folio is
+ * fully backed by uncompressed data:
+ */
+ goto readpage;
+ }
+
+ goto err;
+ }
+
+ *foliop = folio;
+ return 0;
+err:
+ folio_unlock(folio);
+ folio_put(folio);
+err_unlock:
+ bch2_pagecache_add_put(inode);
+ kfree(res);
+ *fsdata = NULL;
+ return bch2_err_class(ret);
+}
+
+int bch2_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
+{
+ struct bch_inode_info *inode = to_bch_ei(mapping->host);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch2_folio_reservation *res = fsdata;
+ unsigned offset = pos - folio_pos(folio);
+
+ lockdep_assert_held(&inode->v.i_rwsem);
+ BUG_ON(offset + copied > folio_size(folio));
+
+ if (unlikely(copied < len && !folio_test_uptodate(folio))) {
+ /*
+ * The folio needs to be read in, but that would destroy
+ * our partial write - simplest thing is to just force
+ * userspace to redo the write:
+ */
+ folio_zero_range(folio, 0, folio_size(folio));
+ flush_dcache_folio(folio);
+ copied = 0;
+ }
+
+ spin_lock(&inode->v.i_lock);
+ if (pos + copied > inode->v.i_size)
+ i_size_write(&inode->v, pos + copied);
+ spin_unlock(&inode->v.i_lock);
+
+ if (copied) {
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
+
+ bch2_set_folio_dirty(c, inode, folio, res, offset, copied);
+
+ inode->ei_last_dirtied = (unsigned long) current;
+ }
+
+ folio_unlock(folio);
+ folio_put(folio);
+ bch2_pagecache_add_put(inode);
+
+ bch2_folio_reservation_put(c, inode, res);
+ kfree(res);
+
+ return copied;
+}
+
+static noinline void folios_trunc(folios *fs, struct folio **fi)
+{
+ while (fs->data + fs->nr > fi) {
+ struct folio *f = darray_pop(fs);
+
+ folio_unlock(f);
+ folio_put(f);
+ }
+}
+
+static int __bch2_buffered_write(struct bch_inode_info *inode,
+ struct address_space *mapping,
+ struct iov_iter *iter,
+ loff_t pos, unsigned len)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch2_folio_reservation res;
+ folios fs;
+ struct folio *f;
+ unsigned copied = 0, f_offset, f_copied;
+ u64 end = pos + len, f_pos, f_len;
+ loff_t last_folio_pos = inode->v.i_size;
+ int ret = 0;
+
+ BUG_ON(!len);
+
+ bch2_folio_reservation_init(c, inode, &res);
+ darray_init(&fs);
+
+ ret = bch2_filemap_get_contig_folios_d(mapping, pos, end,
+ FGP_WRITEBEGIN | fgf_set_order(len),
+ mapping_gfp_mask(mapping), &fs);
+ if (ret)
+ goto out;
+
+ BUG_ON(!fs.nr);
+
+ f = darray_first(fs);
+ if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
+ ret = bch2_read_single_folio(f, mapping);
+ if (ret)
+ goto out;
+ }
+
+ f = darray_last(fs);
+ end = min(end, folio_end_pos(f));
+ last_folio_pos = folio_pos(f);
+ if (end != folio_end_pos(f) && !folio_test_uptodate(f)) {
+ if (end >= inode->v.i_size) {
+ folio_zero_range(f, 0, folio_size(f));
+ } else {
+ ret = bch2_read_single_folio(f, mapping);
+ if (ret)
+ goto out;
+ }
+ }
+
+ ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr);
+ if (ret)
+ goto out;
+
+ f_pos = pos;
+ f_offset = pos - folio_pos(darray_first(fs));
+ darray_for_each(fs, fi) {
+ ssize_t f_reserved;
+
+ f = *fi;
+ f_len = min(end, folio_end_pos(f)) - f_pos;
+ f_reserved = bch2_folio_reservation_get_partial(c, inode, f, &res, f_offset, f_len);
+
+ if (unlikely(f_reserved != f_len)) {
+ if (f_reserved < 0) {
+ if (f == darray_first(fs)) {
+ ret = f_reserved;
+ goto out;
+ }
+
+ folios_trunc(&fs, fi);
+ end = min(end, folio_end_pos(darray_last(fs)));
+ } else {
+ if (!folio_test_uptodate(f)) {
+ ret = bch2_read_single_folio(f, mapping);
+ if (ret)
+ goto out;
+ }
+
+ folios_trunc(&fs, fi + 1);
+ end = f_pos + f_reserved;
+ }
+
+ break;
+ }
+
+ f_pos = folio_end_pos(f);
+ f_offset = 0;
+ }
+
+ if (mapping_writably_mapped(mapping))
+ darray_for_each(fs, fi)
+ flush_dcache_folio(*fi);
+
+ f_pos = pos;
+ f_offset = pos - folio_pos(darray_first(fs));
+ darray_for_each(fs, fi) {
+ f = *fi;
+ f_len = min(end, folio_end_pos(f)) - f_pos;
+ f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter);
+ if (!f_copied) {
+ folios_trunc(&fs, fi);
+ break;
+ }
+
+ if (!folio_test_uptodate(f) &&
+ f_copied != folio_size(f) &&
+ pos + copied + f_copied < inode->v.i_size) {
+ iov_iter_revert(iter, f_copied);
+ folio_zero_range(f, 0, folio_size(f));
+ folios_trunc(&fs, fi);
+ break;
+ }
+
+ flush_dcache_folio(f);
+ copied += f_copied;
+
+ if (f_copied != f_len) {
+ folios_trunc(&fs, fi + 1);
+ break;
+ }
+
+ f_pos = folio_end_pos(f);
+ f_offset = 0;
+ }
+
+ if (!copied)
+ goto out;
+
+ end = pos + copied;
+
+ spin_lock(&inode->v.i_lock);
+ if (end > inode->v.i_size)
+ i_size_write(&inode->v, end);
+ spin_unlock(&inode->v.i_lock);
+
+ f_pos = pos;
+ f_offset = pos - folio_pos(darray_first(fs));
+ darray_for_each(fs, fi) {
+ f = *fi;
+ f_len = min(end, folio_end_pos(f)) - f_pos;
+
+ if (!folio_test_uptodate(f))
+ folio_mark_uptodate(f);
+
+ bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len);
+
+ f_pos = folio_end_pos(f);
+ f_offset = 0;
+ }
+
+ inode->ei_last_dirtied = (unsigned long) current;
+out:
+ darray_for_each(fs, fi) {
+ folio_unlock(*fi);
+ folio_put(*fi);
+ }
+
+ /*
+ * If the last folio added to the mapping starts beyond current EOF, we
+ * performed a short write but left around at least one post-EOF folio.
+ * Clean up the mapping before we return.
+ */
+ if (last_folio_pos >= inode->v.i_size)
+ truncate_pagecache(&inode->v, inode->v.i_size);
+
+ darray_exit(&fs);
+ bch2_folio_reservation_put(c, inode, &res);
+
+ return copied ?: ret;
+}
+
+static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ loff_t pos = iocb->ki_pos;
+ ssize_t written = 0;
+ int ret = 0;
+
+ bch2_pagecache_add_get(inode);
+
+ do {
+ unsigned offset = pos & (PAGE_SIZE - 1);
+ unsigned bytes = iov_iter_count(iter);
+again:
+ /*
+ * Bring in the user page that we will copy from _first_.
+ * Otherwise there's a nasty deadlock on copying from the
+ * same page as we're writing to, without it being marked
+ * up-to-date.
+ *
+ * Not only is this an optimisation, but it is also required
+ * to check that the address is actually valid, when atomic
+ * usercopies are used, below.
+ */
+ if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
+ bytes = min_t(unsigned long, iov_iter_count(iter),
+ PAGE_SIZE - offset);
+
+ if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
+ ret = -EFAULT;
+ break;
+ }
+ }
+
+ if (unlikely(fatal_signal_pending(current))) {
+ ret = -EINTR;
+ break;
+ }
+
+ ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
+ if (unlikely(ret < 0))
+ break;
+
+ cond_resched();
+
+ if (unlikely(ret == 0)) {
+ /*
+ * If we were unable to copy any data at all, we must
+ * fall back to a single segment length write.
+ *
+ * If we didn't fall back here, we could livelock
+ * because not all segments in the iov can be copied at
+ * once without a pagefault.
+ */
+ bytes = min_t(unsigned long, PAGE_SIZE - offset,
+ iov_iter_single_seg_count(iter));
+ goto again;
+ }
+ pos += ret;
+ written += ret;
+ ret = 0;
+
+ balance_dirty_pages_ratelimited(mapping);
+ } while (iov_iter_count(iter));
+
+ bch2_pagecache_add_put(inode);
+
+ return written ? written : ret;
+}
+
+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ ssize_t ret;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = bch2_direct_write(iocb, from);
+ goto out;
+ }
+
+ inode_lock(&inode->v);
+
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto unlock;
+
+ ret = file_remove_privs(file);
+ if (ret)
+ goto unlock;
+
+ ret = file_update_time(file);
+ if (ret)
+ goto unlock;
+
+ ret = bch2_buffered_write(iocb, from);
+ if (likely(ret > 0))
+ iocb->ki_pos += ret;
+unlock:
+ inode_unlock(&inode->v);
+
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+out:
+ return bch2_err_class(ret);
+}
+
+void bch2_fs_fs_io_buffered_exit(struct bch_fs *c)
+{
+ bioset_exit(&c->writepage_bioset);
+}
+
+int bch2_fs_fs_io_buffered_init(struct bch_fs *c)
+{
+ if (bioset_init(&c->writepage_bioset,
+ 4, offsetof(struct bch_writepage_io, op.wbio.bio),
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_writepage_bioset_init;
+
+ return 0;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/libbcachefs/fs-io-buffered.h b/libbcachefs/fs-io-buffered.h
new file mode 100644
index 00000000..3207ebbb
--- /dev/null
+++ b/libbcachefs/fs-io-buffered.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_BUFFERED_H
+#define _BCACHEFS_FS_IO_BUFFERED_H
+
+#ifndef NO_BCACHEFS_FS
+
+int bch2_read_single_folio(struct folio *, struct address_space *);
+int bch2_read_folio(struct file *, struct folio *);
+
+int bch2_writepages(struct address_space *, struct writeback_control *);
+void bch2_readahead(struct readahead_control *);
+
+int bch2_write_begin(struct file *, struct address_space *, loff_t pos,
+ unsigned len, struct folio **, void **);
+int bch2_write_end(struct file *, struct address_space *, loff_t,
+ unsigned len, unsigned copied, struct folio *, void *);
+
+ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
+
+void bch2_fs_fs_io_buffered_exit(struct bch_fs *);
+int bch2_fs_fs_io_buffered_init(struct bch_fs *);
+#else
+static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {}
+static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; }
+#endif
+
+#endif /* _BCACHEFS_FS_IO_BUFFERED_H */
diff --git a/libbcachefs/fs-io-direct.c b/libbcachefs/fs-io-direct.c
new file mode 100644
index 00000000..b0367b9d
--- /dev/null
+++ b/libbcachefs/fs-io-direct.c
@@ -0,0 +1,725 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "fs.h"
+#include "fs-io.h"
+#include "fs-io-direct.h"
+#include "fs-io-pagecache.h"
+#include "io_read.h"
+#include "io_write.h"
+
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <linux/prefetch.h>
+#include <linux/task_io_accounting_ops.h>
+
+/* O_DIRECT reads */
+
+struct dio_read {
+ struct closure cl;
+ struct kiocb *req;
+ long ret;
+ bool should_dirty;
+ struct bch_read_bio rbio;
+};
+
+static void bio_check_or_release(struct bio *bio, bool check_dirty)
+{
+ if (check_dirty) {
+ bio_check_pages_dirty(bio);
+ } else {
+ bio_release_pages(bio, false);
+ bio_put(bio);
+ }
+}
+
+static CLOSURE_CALLBACK(bch2_dio_read_complete)
+{
+ closure_type(dio, struct dio_read, cl);
+
+ dio->req->ki_complete(dio->req, dio->ret);
+ bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
+}
+
+static void bch2_direct_IO_read_endio(struct bio *bio)
+{
+ struct dio_read *dio = bio->bi_private;
+
+ if (bio->bi_status)
+ dio->ret = blk_status_to_errno(bio->bi_status);
+
+ closure_put(&dio->cl);
+}
+
+static void bch2_direct_IO_read_split_endio(struct bio *bio)
+{
+ struct dio_read *dio = bio->bi_private;
+ bool should_dirty = dio->should_dirty;
+
+ bch2_direct_IO_read_endio(bio);
+ bio_check_or_release(bio, should_dirty);
+}
+
+static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
+{
+ struct file *file = req->ki_filp;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_io_opts opts;
+ struct dio_read *dio;
+ struct bio *bio;
+ struct blk_plug plug;
+ loff_t offset = req->ki_pos;
+ bool sync = is_sync_kiocb(req);
+ size_t shorten;
+ ssize_t ret;
+
+ bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+ /* bios must be 512 byte aligned: */
+ if ((offset|iter->count) & (SECTOR_SIZE - 1))
+ return -EINVAL;
+
+ ret = min_t(loff_t, iter->count,
+ max_t(loff_t, 0, i_size_read(&inode->v) - offset));
+
+ if (!ret)
+ return ret;
+
+ shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
+ if (shorten >= iter->count)
+ shorten = 0;
+ iter->count -= shorten;
+
+ bio = bio_alloc_bioset(NULL,
+ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+ REQ_OP_READ,
+ GFP_KERNEL,
+ &c->dio_read_bioset);
+
+ bio->bi_end_io = bch2_direct_IO_read_endio;
+
+ dio = container_of(bio, struct dio_read, rbio.bio);
+ closure_init(&dio->cl, NULL);
+
+ /*
+ * this is a _really_ horrible hack just to avoid an atomic sub at the
+ * end:
+ */
+ if (!sync) {
+ set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
+ atomic_set(&dio->cl.remaining,
+ CLOSURE_REMAINING_INITIALIZER -
+ CLOSURE_RUNNING +
+ CLOSURE_DESTRUCTOR);
+ } else {
+ atomic_set(&dio->cl.remaining,
+ CLOSURE_REMAINING_INITIALIZER + 1);
+ dio->cl.closure_get_happened = true;
+ }
+
+ dio->req = req;
+ dio->ret = ret;
+ /*
+ * This is one of the sketchier things I've encountered: we have to skip
+ * the dirtying of requests that are internal to the kernel (i.e. from
+ * loopback), because we'll deadlock on page_lock.
+ */
+ dio->should_dirty = iter_is_iovec(iter);
+
+ blk_start_plug(&plug);
+
+ goto start;
+ while (iter->count) {
+ bio = bio_alloc_bioset(NULL,
+ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+ REQ_OP_READ,
+ GFP_KERNEL,
+ &c->bio_read);
+ bio->bi_end_io = bch2_direct_IO_read_split_endio;
+start:
+ bio->bi_opf = REQ_OP_READ|REQ_SYNC;
+ bio->bi_iter.bi_sector = offset >> 9;
+ bio->bi_private = dio;
+
+ ret = bio_iov_iter_get_pages(bio, iter);
+ if (ret < 0) {
+ /* XXX: fault inject this path */
+ bio->bi_status = BLK_STS_RESOURCE;
+ bio_endio(bio);
+ break;
+ }
+
+ offset += bio->bi_iter.bi_size;
+
+ if (dio->should_dirty)
+ bio_set_pages_dirty(bio);
+
+ if (iter->count)
+ closure_get(&dio->cl);
+
+ bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
+ }
+
+ blk_finish_plug(&plug);
+
+ iter->count += shorten;
+
+ if (sync) {
+ closure_sync(&dio->cl);
+ closure_debug_destroy(&dio->cl);
+ ret = dio->ret;
+ bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
+ return ret;
+ } else {
+ return -EIOCBQUEUED;
+ }
+}
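The remaining-count setup above pre-biases the closure's reference count so that the final per-bio put is what fires completion, instead of taking an extra reference and doing an atomic subtract after the submission loop. The following stand-alone sketch is not part of this patch: it shows the same pre-biasing idea with plain C11 atomics rather than the kernel closure API, and every name in it is made up.

/*
 * Stand-alone illustration (userspace, hypothetical names) of the
 * refcount pre-biasing trick used above.
 */
#include <stdatomic.h>
#include <stdio.h>

struct fake_request {
	atomic_int	remaining;	/* outstanding split bios */
	int		done;
};

static void fake_complete(struct fake_request *r)
{
	r->done = 1;
}

static void fake_put(struct fake_request *r)
{
	/* atomic_fetch_sub() returns the old value: 1 means we hit zero */
	if (atomic_fetch_sub(&r->remaining, 1) == 1)
		fake_complete(r);
}

int main(void)
{
	struct fake_request r = { .done = 0 };
	int nr_bios = 3;

	/*
	 * Pre-bias: we know exactly nr_bios puts are coming, so no
	 * trailing "drop the submitter's ref" atomic op is needed
	 * after the submission loop.
	 */
	atomic_init(&r.remaining, nr_bios);

	for (int i = 0; i < nr_bios; i++)
		fake_put(&r);		/* stands in for a bio completing */

	printf("done = %d\n", r.done);	/* prints: done = 1 */
	return 0;
}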
+
+ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct address_space *mapping = file->f_mapping;
+ size_t count = iov_iter_count(iter);
+ ssize_t ret = 0;
+
+ if (!count)
+ return 0; /* skip atime */
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ struct blk_plug plug;
+
+ if (unlikely(mapping->nrpages)) {
+ ret = filemap_write_and_wait_range(mapping,
+ iocb->ki_pos,
+ iocb->ki_pos + count - 1);
+ if (ret < 0)
+ goto out;
+ }
+
+ file_accessed(file);
+
+ blk_start_plug(&plug);
+ ret = bch2_direct_IO_read(iocb, iter);
+ blk_finish_plug(&plug);
+
+ if (ret >= 0)
+ iocb->ki_pos += ret;
+ } else {
+ bch2_pagecache_add_get(inode);
+ ret = filemap_read(iocb, iter, ret);
+ bch2_pagecache_add_put(inode);
+ }
+out:
+ return bch2_err_class(ret);
+}
+
+/* O_DIRECT writes */
+
+struct dio_write {
+ struct kiocb *req;
+ struct address_space *mapping;
+ struct bch_inode_info *inode;
+ struct mm_struct *mm;
+ const struct iovec *iov;
+ unsigned loop:1,
+ have_mm_ref:1,
+ extending:1,
+ sync:1,
+ flush:1;
+ struct quota_res quota_res;
+ u64 written;
+
+ struct iov_iter iter;
+ struct iovec inline_vecs[2];
+
+ /* must be last: */
+ struct bch_write_op op;
+};
+
+static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
+ u64 offset, u64 size,
+ unsigned nr_replicas, bool compressed)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 end = offset + size;
+ u32 snapshot;
+ bool ret = true;
+ int err;
+retry:
+ bch2_trans_begin(trans);
+
+ err = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (err)
+ goto err;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
+ SPOS(inum.inum, offset, snapshot),
+ BTREE_ITER_slots, k, err) {
+ if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
+ break;
+
+ if (k.k->p.snapshot != snapshot ||
+ nr_replicas > bch2_bkey_replicas(c, k) ||
+ (!compressed && bch2_bkey_sectors_compressed(k))) {
+ ret = false;
+ break;
+ }
+ }
+
+ offset = iter.pos.offset;
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ if (bch2_err_matches(err, BCH_ERR_transaction_restart))
+ goto retry;
+ bch2_trans_put(trans);
+
+ return err ? false : ret;
+}
+
+static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
+{
+ struct bch_fs *c = dio->op.c;
+ struct bch_inode_info *inode = dio->inode;
+ struct bio *bio = &dio->op.wbio.bio;
+
+ return bch2_check_range_allocated(c, inode_inum(inode),
+ dio->op.pos.offset, bio_sectors(bio),
+ dio->op.opts.data_replicas,
+ dio->op.opts.compression != 0);
+}
+
+static void bch2_dio_write_loop_async(struct bch_write_op *);
+static __always_inline long bch2_dio_write_done(struct dio_write *dio);
+
+/*
+ * We're going to return -EIOCBQUEUED, but we haven't finished consuming the
+ * iov_iter yet, so we need to stash a copy of the iovec: it might be on the
+ * caller's stack, and we're not guaranteed that it will live for the duration of
+ * the IO:
+ */
+static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
+{
+ struct iovec *iov = dio->inline_vecs;
+
+ /*
+ * iov_iter has a single embedded iovec - nothing to do:
+ */
+ if (iter_is_ubuf(&dio->iter))
+ return 0;
+
+ /*
+ * We don't currently handle non-iovec iov_iters here - return an error,
+ * and we'll fall back to doing the IO synchronously:
+ */
+ if (!iter_is_iovec(&dio->iter))
+ return -1;
+
+ if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
+ dio->iov = iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
+ GFP_KERNEL);
+ if (unlikely(!iov))
+ return -ENOMEM;
+ }
+
+ memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov));
+ dio->iter.__iov = iov;
+ return 0;
+}
+
+static CLOSURE_CALLBACK(bch2_dio_write_flush_done)
+{
+ closure_type(dio, struct dio_write, op.cl);
+ struct bch_fs *c = dio->op.c;
+
+ closure_debug_destroy(cl);
+
+ dio->op.error = bch2_journal_error(&c->journal);
+
+ bch2_dio_write_done(dio);
+}
+
+static noinline void bch2_dio_write_flush(struct dio_write *dio)
+{
+ struct bch_fs *c = dio->op.c;
+ struct bch_inode_unpacked inode;
+ int ret;
+
+ dio->flush = 0;
+
+ closure_init(&dio->op.cl, NULL);
+
+ if (!dio->op.error) {
+ ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
+ if (ret) {
+ dio->op.error = ret;
+ } else {
+ bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq,
+ &dio->op.cl);
+ bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl);
+ }
+ }
+
+ if (dio->sync) {
+ closure_sync(&dio->op.cl);
+ closure_debug_destroy(&dio->op.cl);
+ } else {
+ continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL);
+ }
+}
+
+static __always_inline long bch2_dio_write_done(struct dio_write *dio)
+{
+ struct bch_fs *c = dio->op.c;
+ struct kiocb *req = dio->req;
+ struct bch_inode_info *inode = dio->inode;
+ bool sync = dio->sync;
+ long ret;
+
+ if (unlikely(dio->flush)) {
+ bch2_dio_write_flush(dio);
+ if (!sync)
+ return -EIOCBQUEUED;
+ }
+
+ bch2_pagecache_block_put(inode);
+
+ kfree(dio->iov);
+
+ if (dio->have_mm_ref)
+ mmdrop(dio->mm);
+
+ ret = dio->op.error ?: ((long) dio->written << 9);
+ bio_put(&dio->op.wbio.bio);
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
+
+ /* inode->i_dio_count is our ref on inode and thus bch_fs */
+ inode_dio_end(&inode->v);
+
+ if (ret < 0)
+ ret = bch2_err_class(ret);
+
+ if (!sync) {
+ req->ki_complete(req, ret);
+ ret = -EIOCBQUEUED;
+ }
+ return ret;
+}
+
+static __always_inline void bch2_dio_write_end(struct dio_write *dio)
+{
+ struct bch_fs *c = dio->op.c;
+ struct kiocb *req = dio->req;
+ struct bch_inode_info *inode = dio->inode;
+ struct bio *bio = &dio->op.wbio.bio;
+
+ req->ki_pos += (u64) dio->op.written << 9;
+ dio->written += dio->op.written;
+
+ if (dio->extending) {
+ spin_lock(&inode->v.i_lock);
+ if (req->ki_pos > inode->v.i_size)
+ i_size_write(&inode->v, req->ki_pos);
+ spin_unlock(&inode->v.i_lock);
+ }
+
+ if (dio->op.i_sectors_delta || dio->quota_res.sectors) {
+ mutex_lock(&inode->ei_quota_lock);
+ __bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
+ __bch2_quota_reservation_put(c, inode, &dio->quota_res);
+ mutex_unlock(&inode->ei_quota_lock);
+ }
+
+ bio_release_pages(bio, false);
+
+ if (unlikely(dio->op.error))
+ set_bit(EI_INODE_ERROR, &inode->ei_flags);
+}
+
+static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
+{
+ struct bch_fs *c = dio->op.c;
+ struct kiocb *req = dio->req;
+ struct address_space *mapping = dio->mapping;
+ struct bch_inode_info *inode = dio->inode;
+ struct bch_io_opts opts;
+ struct bio *bio = &dio->op.wbio.bio;
+ unsigned unaligned, iter_count;
+ bool sync = dio->sync, dropped_locks;
+ long ret;
+
+ bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+ while (1) {
+ iter_count = dio->iter.count;
+
+ EBUG_ON(current->faults_disabled_mapping);
+ current->faults_disabled_mapping = mapping;
+
+ ret = bio_iov_iter_get_pages(bio, &dio->iter);
+
+ dropped_locks = fdm_dropped_locks();
+
+ current->faults_disabled_mapping = NULL;
+
+ /*
+ * If the fault handler returned an error but also signalled
+ * that it dropped & retook ei_pagecache_lock, we just need to
+ * re-shoot down the page cache and retry:
+ */
+ if (dropped_locks && ret)
+ ret = 0;
+
+ if (unlikely(ret < 0))
+ goto err;
+
+ if (unlikely(dropped_locks)) {
+ ret = bch2_write_invalidate_inode_pages_range(mapping,
+ req->ki_pos,
+ req->ki_pos + iter_count - 1);
+ if (unlikely(ret))
+ goto err;
+
+ if (!bio->bi_iter.bi_size)
+ continue;
+ }
+
+ unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
+ bio->bi_iter.bi_size -= unaligned;
+ iov_iter_revert(&dio->iter, unaligned);
+
+ if (!bio->bi_iter.bi_size) {
+ /*
+ * bio_iov_iter_get_pages was only able to get <
+ * blocksize worth of pages:
+ */
+ ret = -EFAULT;
+ goto err;
+ }
+
+ bch2_write_op_init(&dio->op, c, opts);
+ dio->op.end_io = sync
+ ? NULL
+ : bch2_dio_write_loop_async;
+ dio->op.target = dio->op.opts.foreground_target;
+ dio->op.write_point = writepoint_hashed((unsigned long) current);
+ dio->op.nr_replicas = dio->op.opts.data_replicas;
+ dio->op.subvol = inode->ei_inum.subvol;
+ dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
+ dio->op.devs_need_flush = &inode->ei_devs_need_flush;
+
+ if (sync)
+ dio->op.flags |= BCH_WRITE_SYNC;
+ dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;
+
+ ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
+ bio_sectors(bio), true);
+ if (unlikely(ret))
+ goto err;
+
+ ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
+ dio->op.opts.data_replicas, 0);
+ if (unlikely(ret) &&
+ !bch2_dio_write_check_allocated(dio))
+ goto err;
+
+ task_io_account_write(bio->bi_iter.bi_size);
+
+ if (unlikely(dio->iter.count) &&
+ !dio->sync &&
+ !dio->loop) {
+ /*
+ * Rest of write will be submitted asynchronously -
+ * unless copying the iov fails:
+ */
+ if (likely(!bch2_dio_write_copy_iov(dio))) {
+ /*
+ * aio guarantees that mm_struct outlives the
+ * request, but io_uring does not
+ */
+ if (dio->mm) {
+ mmgrab(dio->mm);
+ dio->have_mm_ref = true;
+ }
+ } else {
+ dio->sync = sync = true;
+ }
+ }
+
+ dio->loop = true;
+ closure_call(&dio->op.cl, bch2_write, NULL, NULL);
+
+ if (!sync)
+ return -EIOCBQUEUED;
+
+ bch2_dio_write_end(dio);
+
+ if (likely(!dio->iter.count) || dio->op.error)
+ break;
+
+ bio_reset(bio, NULL, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
+ }
+out:
+ return bch2_dio_write_done(dio);
+err:
+ dio->op.error = ret;
+
+ bio_release_pages(bio, false);
+
+ bch2_quota_reservation_put(c, inode, &dio->quota_res);
+ goto out;
+}
+
+static noinline __cold void bch2_dio_write_continue(struct dio_write *dio)
+{
+ struct mm_struct *mm = dio->have_mm_ref ? dio->mm : NULL;
+
+ bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);
+
+ if (mm) {
+ if (unlikely(!mmget_not_zero(mm))) {
+ /* process exited */
+ dio->op.error = -ESRCH;
+ bch2_dio_write_done(dio);
+ return;
+ }
+
+ kthread_use_mm(mm);
+ }
+ bch2_dio_write_loop(dio);
+ if (mm) {
+ kthread_unuse_mm(mm);
+ mmput(mm);
+ }
+}
+
+static void bch2_dio_write_loop_async(struct bch_write_op *op)
+{
+ struct dio_write *dio = container_of(op, struct dio_write, op);
+
+ bch2_dio_write_end(dio);
+
+ if (likely(!dio->iter.count) || dio->op.error)
+ bch2_dio_write_done(dio);
+ else
+ bch2_dio_write_continue(dio);
+}
+
+ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
+{
+ struct file *file = req->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct dio_write *dio;
+ struct bio *bio;
+ bool locked = true, extending;
+ ssize_t ret;
+
+ prefetch(&c->opts);
+ prefetch((void *) &c->opts + 64);
+ prefetch(&inode->ei_inode);
+ prefetch((void *) &inode->ei_inode + 64);
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write))
+ return -EROFS;
+
+ inode_lock(&inode->v);
+
+ ret = generic_write_checks(req, iter);
+ if (unlikely(ret <= 0))
+ goto err_put_write_ref;
+
+ ret = file_remove_privs(file);
+ if (unlikely(ret))
+ goto err_put_write_ref;
+
+ ret = file_update_time(file);
+ if (unlikely(ret))
+ goto err_put_write_ref;
+
+ if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) {
+ ret = -EINVAL;
+ goto err_put_write_ref;
+ }
+
+ inode_dio_begin(&inode->v);
+ bch2_pagecache_block_get(inode);
+
+ extending = req->ki_pos + iter->count > inode->v.i_size;
+ if (!extending) {
+ inode_unlock(&inode->v);
+ locked = false;
+ }
+
+ bio = bio_alloc_bioset(NULL,
+ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+ REQ_OP_WRITE | REQ_SYNC | REQ_IDLE,
+ GFP_KERNEL,
+ &c->dio_write_bioset);
+ dio = container_of(bio, struct dio_write, op.wbio.bio);
+ dio->req = req;
+ dio->mapping = mapping;
+ dio->inode = inode;
+ dio->mm = current->mm;
+ dio->iov = NULL;
+ dio->loop = false;
+ dio->have_mm_ref = false;
+ dio->extending = extending;
+ dio->sync = is_sync_kiocb(req) || extending;
+ dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled;
+ dio->quota_res.sectors = 0;
+ dio->written = 0;
+ dio->iter = *iter;
+ dio->op.c = c;
+
+ if (unlikely(mapping->nrpages)) {
+ ret = bch2_write_invalidate_inode_pages_range(mapping,
+ req->ki_pos,
+ req->ki_pos + iter->count - 1);
+ if (unlikely(ret))
+ goto err_put_bio;
+ }
+
+ ret = bch2_dio_write_loop(dio);
+out:
+ if (locked)
+ inode_unlock(&inode->v);
+ return ret;
+err_put_bio:
+ bch2_pagecache_block_put(inode);
+ bio_put(bio);
+ inode_dio_end(&inode->v);
+err_put_write_ref:
+ bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
+ goto out;
+}
+
+void bch2_fs_fs_io_direct_exit(struct bch_fs *c)
+{
+ bioset_exit(&c->dio_write_bioset);
+ bioset_exit(&c->dio_read_bioset);
+}
+
+int bch2_fs_fs_io_direct_init(struct bch_fs *c)
+{
+ if (bioset_init(&c->dio_read_bioset,
+ 4, offsetof(struct dio_read, rbio.bio),
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_dio_read_bioset_init;
+
+ if (bioset_init(&c->dio_write_bioset,
+ 4, offsetof(struct dio_write, op.wbio.bio),
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_dio_write_bioset_init;
+
+ return 0;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/libbcachefs/fs-io-direct.h b/libbcachefs/fs-io-direct.h
new file mode 100644
index 00000000..814621ec
--- /dev/null
+++ b/libbcachefs/fs-io-direct.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_DIRECT_H
+#define _BCACHEFS_FS_IO_DIRECT_H
+
+#ifndef NO_BCACHEFS_FS
+ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *);
+ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
+
+void bch2_fs_fs_io_direct_exit(struct bch_fs *);
+int bch2_fs_fs_io_direct_init(struct bch_fs *);
+#else
+static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {}
+static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; }
+#endif
+
+#endif /* _BCACHEFS_FS_IO_DIRECT_H */
diff --git a/libbcachefs/fs-io-pagecache.c b/libbcachefs/fs-io-pagecache.c
new file mode 100644
index 00000000..e072900e
--- /dev/null
+++ b/libbcachefs/fs-io-pagecache.c
@@ -0,0 +1,823 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "btree_iter.h"
+#include "extents.h"
+#include "fs-io.h"
+#include "fs-io-pagecache.h"
+#include "subvolume.h"
+
+#include <linux/pagevec.h>
+#include <linux/writeback.h>
+
+int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
+ loff_t start, u64 end,
+ fgf_t fgp_flags, gfp_t gfp,
+ folios *fs)
+{
+ struct folio *f;
+ u64 pos = start;
+ int ret = 0;
+
+ while (pos < end) {
+ if ((u64) pos >= (u64) start + (1ULL << 20))
+ fgp_flags &= ~FGP_CREAT;
+
+ ret = darray_make_room_gfp(fs, 1, gfp & GFP_KERNEL);
+ if (ret)
+ break;
+
+ f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
+ if (IS_ERR(f))
+ break;
+
+ BUG_ON(fs->nr && folio_pos(f) != pos);
+
+ pos = folio_end_pos(f);
+ darray_push(fs, f);
+ }
+
+ if (!fs->nr && !ret && (fgp_flags & FGP_CREAT))
+ ret = -ENOMEM;
+
+ return fs->nr ? 0 : ret;
+}
+
+/* pagecache_block must be held */
+int bch2_write_invalidate_inode_pages_range(struct address_space *mapping,
+ loff_t start, loff_t end)
+{
+ int ret;
+
+ /*
+ * XXX: the way this is currently implemented, we can spin if a process
+ * is continually redirtying a specific page
+ */
+ do {
+ if (!mapping->nrpages)
+ return 0;
+
+ ret = filemap_write_and_wait_range(mapping, start, end);
+ if (ret)
+ break;
+
+ if (!mapping->nrpages)
+ return 0;
+
+ ret = invalidate_inode_pages2_range(mapping,
+ start >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
+ } while (ret == -EBUSY);
+
+ return ret;
+}
+
+#if 0
+/* Useful for debug tracing: */
+static const char * const bch2_folio_sector_states[] = {
+#define x(n) #n,
+ BCH_FOLIO_SECTOR_STATE()
+#undef x
+ NULL
+};
+#endif
+
+static inline enum bch_folio_sector_state
+folio_sector_dirty(enum bch_folio_sector_state state)
+{
+ switch (state) {
+ case SECTOR_unallocated:
+ return SECTOR_dirty;
+ case SECTOR_reserved:
+ return SECTOR_dirty_reserved;
+ default:
+ return state;
+ }
+}
+
+static inline enum bch_folio_sector_state
+folio_sector_undirty(enum bch_folio_sector_state state)
+{
+ switch (state) {
+ case SECTOR_dirty:
+ return SECTOR_unallocated;
+ case SECTOR_dirty_reserved:
+ return SECTOR_reserved;
+ default:
+ return state;
+ }
+}
+
+static inline enum bch_folio_sector_state
+folio_sector_reserve(enum bch_folio_sector_state state)
+{
+ switch (state) {
+ case SECTOR_unallocated:
+ return SECTOR_reserved;
+ case SECTOR_dirty:
+ return SECTOR_dirty_reserved;
+ default:
+ return state;
+ }
+}
+
+/* for newly allocated folios: */
+struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp)
+{
+ struct bch_folio *s;
+
+ s = kzalloc(sizeof(*s) +
+ sizeof(struct bch_folio_sector) *
+ folio_sectors(folio), gfp);
+ if (!s)
+ return NULL;
+
+ spin_lock_init(&s->lock);
+ folio_attach_private(folio, s);
+ return s;
+}
+
+struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp)
+{
+ return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp);
+}
+
+static unsigned bkey_to_sector_state(struct bkey_s_c k)
+{
+ if (bkey_extent_is_reservation(k))
+ return SECTOR_reserved;
+ if (bkey_extent_is_allocation(k.k))
+ return SECTOR_allocated;
+ return SECTOR_unallocated;
+}
+
+static void __bch2_folio_set(struct folio *folio,
+ unsigned pg_offset, unsigned pg_len,
+ unsigned nr_ptrs, unsigned state)
+{
+ struct bch_folio *s = bch2_folio(folio);
+ unsigned i, sectors = folio_sectors(folio);
+
+ BUG_ON(pg_offset >= sectors);
+ BUG_ON(pg_offset + pg_len > sectors);
+
+ spin_lock(&s->lock);
+
+ for (i = pg_offset; i < pg_offset + pg_len; i++) {
+ s->s[i].nr_replicas = nr_ptrs;
+ bch2_folio_sector_set(folio, s, i, state);
+ }
+
+ if (i == sectors)
+ s->uptodate = true;
+
+ spin_unlock(&s->lock);
+}
+
+/*
+ * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the
+ * extents btree:
+ */
+int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
+ struct folio **fs, unsigned nr_folios)
+{
+ u64 offset = folio_sector(fs[0]);
+ bool need_set = false;
+
+ for (unsigned folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
+ struct bch_folio *s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ need_set |= !s->uptodate;
+ }
+
+ if (!need_set)
+ return 0;
+
+ unsigned folio_idx = 0;
+
+ return bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
+ POS(inum.inum, offset),
+ POS(inum.inum, U64_MAX),
+ inum.subvol, BTREE_ITER_slots, k, ({
+ unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
+ unsigned state = bkey_to_sector_state(k);
+
+ while (folio_idx < nr_folios) {
+ struct folio *folio = fs[folio_idx];
+ u64 folio_start = folio_sector(folio);
+ u64 folio_end = folio_end_sector(folio);
+ unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
+ folio_start;
+ unsigned folio_len = min(k.k->p.offset, folio_end) -
+ folio_offset - folio_start;
+
+ BUG_ON(k.k->p.offset < folio_start);
+ BUG_ON(bkey_start_offset(k.k) > folio_end);
+
+ if (!bch2_folio(folio)->uptodate)
+ __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
+
+ if (k.k->p.offset < folio_end)
+ break;
+ folio_idx++;
+ }
+
+ if (folio_idx == nr_folios)
+ break;
+ 0;
+ })));
+}
+
+void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
+{
+ struct bvec_iter iter;
+ struct folio_vec fv;
+ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
+ ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
+ unsigned state = bkey_to_sector_state(k);
+
+ bio_for_each_folio(fv, bio, iter)
+ __bch2_folio_set(fv.fv_folio,
+ fv.fv_offset >> 9,
+ fv.fv_len >> 9,
+ nr_ptrs, state);
+}
+
+void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
+ u64 start, u64 end)
+{
+ pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
+ struct folio_batch fbatch;
+ unsigned i, j;
+
+ if (end <= start)
+ return;
+
+ folio_batch_init(&fbatch);
+
+ while (filemap_get_folios(inode->v.i_mapping,
+ &index, end_index, &fbatch)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+ u64 folio_start = folio_sector(folio);
+ u64 folio_end = folio_end_sector(folio);
+ unsigned folio_offset = max(start, folio_start) - folio_start;
+ unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
+ struct bch_folio *s;
+
+ BUG_ON(end <= folio_start);
+
+ folio_lock(folio);
+ s = bch2_folio(folio);
+
+ if (s) {
+ spin_lock(&s->lock);
+ for (j = folio_offset; j < folio_offset + folio_len; j++)
+ s->s[j].nr_replicas = 0;
+ spin_unlock(&s->lock);
+ }
+
+ folio_unlock(folio);
+ }
+ folio_batch_release(&fbatch);
+ cond_resched();
+ }
+}
+
+int bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
+ u64 *start, u64 end,
+ bool nonblocking)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ pgoff_t index = *start >> PAGE_SECTORS_SHIFT;
+ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
+ struct folio_batch fbatch;
+ s64 i_sectors_delta = 0;
+ int ret = 0;
+
+ if (end <= *start)
+ return 0;
+
+ folio_batch_init(&fbatch);
+
+ while (filemap_get_folios(inode->v.i_mapping,
+ &index, end_index, &fbatch)) {
+ for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+
+ if (!nonblocking)
+ folio_lock(folio);
+ else if (!folio_trylock(folio)) {
+ folio_batch_release(&fbatch);
+ ret = -EAGAIN;
+ break;
+ }
+
+ u64 folio_start = folio_sector(folio);
+ u64 folio_end = folio_end_sector(folio);
+
+ BUG_ON(end <= folio_start);
+
+ *start = min(end, folio_end);
+
+ struct bch_folio *s = bch2_folio(folio);
+ if (s) {
+ unsigned folio_offset = max(*start, folio_start) - folio_start;
+ unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
+
+ spin_lock(&s->lock);
+ for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) {
+ i_sectors_delta -= s->s[j].state == SECTOR_dirty;
+ bch2_folio_sector_set(folio, s, j,
+ folio_sector_reserve(s->s[j].state));
+ }
+ spin_unlock(&s->lock);
+ }
+
+ folio_unlock(folio);
+ }
+ folio_batch_release(&fbatch);
+ cond_resched();
+ }
+
+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+ return ret;
+}
+
+static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
+ unsigned nr_replicas)
+{
+ return max(0, (int) nr_replicas -
+ s->nr_replicas -
+ s->replicas_reserved);
+}
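As a quick worked illustration of sectors_to_reserve() (hypothetical numbers, not part of this patch): if the inode wants 2 data replicas, a sector that already has 1 fully allocated replica and nothing reserved needs 1 more reservation sector, while a sector with 2 replicas, or with 2 sectors already reserved, needs none; the max() clamps the result at zero. A minimal stand-alone equivalent:

#include <stdio.h>

struct demo_sector { int nr_replicas, replicas_reserved; };

static int demo_to_reserve(struct demo_sector s, int want)
{
	int n = want - s.nr_replicas - s.replicas_reserved;
	return n > 0 ? n : 0;
}

int main(void)
{
	printf("%d\n", demo_to_reserve((struct demo_sector){1, 0}, 2)); /* 1 */
	printf("%d\n", demo_to_reserve((struct demo_sector){2, 0}, 2)); /* 0 */
	printf("%d\n", demo_to_reserve((struct demo_sector){0, 2}, 2)); /* 0 */
	return 0;
}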
+
+int bch2_get_folio_disk_reservation(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct folio *folio, bool check_enospc)
+{
+ struct bch_folio *s = bch2_folio_create(folio, 0);
+ unsigned nr_replicas = inode_nr_replicas(c, inode);
+ struct disk_reservation disk_res = { 0 };
+ unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0;
+ int ret;
+
+ if (!s)
+ return -ENOMEM;
+
+ for (i = 0; i < sectors; i++)
+ disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
+
+ if (!disk_res_sectors)
+ return 0;
+
+ ret = bch2_disk_reservation_get(c, &disk_res,
+ disk_res_sectors, 1,
+ !check_enospc
+ ? BCH_DISK_RESERVATION_NOFAIL
+ : 0);
+ if (unlikely(ret))
+ return ret;
+
+ for (i = 0; i < sectors; i++)
+ s->s[i].replicas_reserved +=
+ sectors_to_reserve(&s->s[i], nr_replicas);
+
+ return 0;
+}
+
+void bch2_folio_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct bch2_folio_reservation *res)
+{
+ bch2_disk_reservation_put(c, &res->disk);
+ bch2_quota_reservation_put(c, inode, &res->quota);
+}
+
+static int __bch2_folio_reservation_get(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct folio *folio,
+ struct bch2_folio_reservation *res,
+ size_t offset, size_t len,
+ bool partial)
+{
+ struct bch_folio *s = bch2_folio_create(folio, 0);
+ unsigned i, disk_sectors = 0, quota_sectors = 0;
+ struct disk_reservation disk_res = {};
+ size_t reserved = len;
+ int ret;
+
+ if (!s)
+ return -ENOMEM;
+
+ BUG_ON(!s->uptodate);
+
+ for (i = round_down(offset, block_bytes(c)) >> 9;
+ i < round_up(offset + len, block_bytes(c)) >> 9;
+ i++) {
+ disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas);
+ quota_sectors += s->s[i].state == SECTOR_unallocated;
+ }
+
+ if (disk_sectors) {
+ ret = bch2_disk_reservation_add(c, &disk_res, disk_sectors,
+ partial ? BCH_DISK_RESERVATION_PARTIAL : 0);
+ if (unlikely(ret))
+ return ret;
+
+ if (unlikely(disk_res.sectors != disk_sectors)) {
+ disk_sectors = quota_sectors = 0;
+
+ for (i = round_down(offset, block_bytes(c)) >> 9;
+ i < round_up(offset + len, block_bytes(c)) >> 9;
+ i++) {
+ disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas);
+ if (disk_sectors > disk_res.sectors) {
+ /*
+ * Make sure to get a reservation that's
+ * aligned to the filesystem blocksize:
+ */
+ unsigned reserved_offset = round_down(i << 9, block_bytes(c));
+ reserved = clamp(reserved_offset, offset, offset + len) - offset;
+
+ if (!reserved) {
+ bch2_disk_reservation_put(c, &disk_res);
+ return -BCH_ERR_ENOSPC_disk_reservation;
+ }
+ break;
+ }
+ quota_sectors += s->s[i].state == SECTOR_unallocated;
+ }
+ }
+ }
+
+ if (quota_sectors) {
+ ret = bch2_quota_reservation_add(c, inode, &res->quota, quota_sectors, true);
+ if (unlikely(ret)) {
+ bch2_disk_reservation_put(c, &disk_res);
+ return ret;
+ }
+ }
+
+ res->disk.sectors += disk_res.sectors;
+ return partial ? reserved : 0;
+}
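A worked example of the partial-reservation trimming above may make the arithmetic clearer; the numbers are hypothetical and assume a 4096-byte filesystem block (8 sectors per block):

/*
 * Illustrative only:
 *
 *   offset = 1000, len = 10000
 *     first sector walked:  round_down(1000, 4096)  >> 9 = 0
 *     last sector walked:   round_up(11000, 4096)   >> 9 = 24
 *
 * Suppose every one of the 24 sectors needs 1 reservation sector
 * (disk_sectors = 24) but bch2_disk_reservation_add() only granted 16.
 * The second walk stops at i = 16, where the running total first
 * exceeds 16:
 *
 *   reserved_offset = round_down(16 << 9, 4096)      = 8192
 *   reserved        = clamp(8192, 1000, 11000) - 1000 = 7192
 *
 * so the caller is told it may write 7192 bytes starting at offset
 * 1000, ending exactly on the block boundary at byte 8192 of the folio.
 */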
+
+int bch2_folio_reservation_get(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct folio *folio,
+ struct bch2_folio_reservation *res,
+ size_t offset, size_t len)
+{
+ return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, false);
+}
+
+ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct folio *folio,
+ struct bch2_folio_reservation *res,
+ size_t offset, size_t len)
+{
+ return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, true);
+}
+
+static void bch2_clear_folio_bits(struct folio *folio)
+{
+ struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_folio *s = bch2_folio(folio);
+ struct disk_reservation disk_res = { 0 };
+ int i, sectors = folio_sectors(folio), dirty_sectors = 0;
+
+ if (!s)
+ return;
+
+ EBUG_ON(!folio_test_locked(folio));
+ EBUG_ON(folio_test_writeback(folio));
+
+ for (i = 0; i < sectors; i++) {
+ disk_res.sectors += s->s[i].replicas_reserved;
+ s->s[i].replicas_reserved = 0;
+
+ dirty_sectors -= s->s[i].state == SECTOR_dirty;
+ bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state));
+ }
+
+ bch2_disk_reservation_put(c, &disk_res);
+
+ bch2_i_sectors_acct(c, inode, NULL, dirty_sectors);
+
+ bch2_folio_release(folio);
+}
+
+void bch2_set_folio_dirty(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct folio *folio,
+ struct bch2_folio_reservation *res,
+ unsigned offset, unsigned len)
+{
+ struct bch_folio *s = bch2_folio(folio);
+ unsigned i, dirty_sectors = 0;
+
+ WARN_ON((u64) folio_pos(folio) + offset + len >
+ round_up((u64) i_size_read(&inode->v), block_bytes(c)));
+
+ BUG_ON(!s->uptodate);
+
+ spin_lock(&s->lock);
+
+ for (i = round_down(offset, block_bytes(c)) >> 9;
+ i < round_up(offset + len, block_bytes(c)) >> 9;
+ i++) {
+ unsigned sectors = sectors_to_reserve(&s->s[i],
+ res->disk.nr_replicas);
+
+ /*
+ * This can happen if we race with the error path in
+ * bch2_writepage_io_done():
+ */
+ sectors = min_t(unsigned, sectors, res->disk.sectors);
+
+ s->s[i].replicas_reserved += sectors;
+ res->disk.sectors -= sectors;
+
+ dirty_sectors += s->s[i].state == SECTOR_unallocated;
+
+ bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
+ }
+
+ spin_unlock(&s->lock);
+
+ bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors);
+
+ if (!folio_test_dirty(folio))
+ filemap_dirty_folio(inode->v.i_mapping, folio);
+}
+
+vm_fault_t bch2_page_fault(struct vm_fault *vmf)
+{
+ struct file *file = vmf->vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ struct address_space *fdm = faults_disabled_mapping();
+ struct bch_inode_info *inode = file_bch_inode(file);
+ vm_fault_t ret;
+
+ if (fdm == mapping)
+ return VM_FAULT_SIGBUS;
+
+ /* Lock ordering: */
+ if (fdm > mapping) {
+ struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
+
+ if (bch2_pagecache_add_tryget(inode))
+ goto got_lock;
+
+ bch2_pagecache_block_put(fdm_host);
+
+ bch2_pagecache_add_get(inode);
+ bch2_pagecache_add_put(inode);
+
+ bch2_pagecache_block_get(fdm_host);
+
+ /* Signal that lock has been dropped: */
+ set_fdm_dropped_locks();
+ return VM_FAULT_SIGBUS;
+ }
+
+ bch2_pagecache_add_get(inode);
+got_lock:
+ ret = filemap_fault(vmf);
+ bch2_pagecache_add_put(inode);
+
+ return ret;
+}
+
+vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
+{
+ struct folio *folio = page_folio(vmf->page);
+ struct file *file = vmf->vma->vm_file;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct address_space *mapping = file->f_mapping;
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch2_folio_reservation res;
+ unsigned len;
+ loff_t isize;
+ vm_fault_t ret;
+
+ bch2_folio_reservation_init(c, inode, &res);
+
+ sb_start_pagefault(inode->v.i_sb);
+ file_update_time(file);
+
+ /*
+ * Not strictly necessary, but helps avoid dio writes livelocking in
+ * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get
+ * a bch2_write_invalidate_inode_pages_range() that works without dropping
+ * the page lock before invalidating the page
+ */
+ bch2_pagecache_add_get(inode);
+
+ folio_lock(folio);
+ isize = i_size_read(&inode->v);
+
+ if (folio->mapping != mapping || folio_pos(folio) >= isize) {
+ folio_unlock(folio);
+ ret = VM_FAULT_NOPAGE;
+ goto out;
+ }
+
+ len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio));
+
+ if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
+ bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
+ folio_unlock(folio);
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+
+ bch2_set_folio_dirty(c, inode, folio, &res, 0, len);
+ bch2_folio_reservation_put(c, inode, &res);
+
+ folio_wait_stable(folio);
+ ret = VM_FAULT_LOCKED;
+out:
+ bch2_pagecache_add_put(inode);
+ sb_end_pagefault(inode->v.i_sb);
+
+ return ret;
+}
+
+void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
+{
+ if (offset || length < folio_size(folio))
+ return;
+
+ bch2_clear_folio_bits(folio);
+}
+
+bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
+{
+ if (folio_test_dirty(folio) || folio_test_writeback(folio))
+ return false;
+
+ bch2_clear_folio_bits(folio);
+ return true;
+}
+
+/* fseek: */
+
+static int folio_data_offset(struct folio *folio, loff_t pos,
+ unsigned min_replicas)
+{
+ struct bch_folio *s = bch2_folio(folio);
+ unsigned i, sectors = folio_sectors(folio);
+
+ if (s)
+ for (i = folio_pos_to_s(folio, pos); i < sectors; i++)
+ if (s->s[i].state >= SECTOR_dirty &&
+ s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas)
+ return i << SECTOR_SHIFT;
+
+ return -1;
+}
+
+loff_t bch2_seek_pagecache_data(struct inode *vinode,
+ loff_t start_offset,
+ loff_t end_offset,
+ unsigned min_replicas,
+ bool nonblock)
+{
+ struct folio_batch fbatch;
+ pgoff_t start_index = start_offset >> PAGE_SHIFT;
+ pgoff_t end_index = end_offset >> PAGE_SHIFT;
+ pgoff_t index = start_index;
+ unsigned i;
+ loff_t ret;
+ int offset;
+
+ folio_batch_init(&fbatch);
+
+ while (filemap_get_folios(vinode->i_mapping,
+ &index, end_index, &fbatch)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+
+ if (!nonblock) {
+ folio_lock(folio);
+ } else if (!folio_trylock(folio)) {
+ folio_batch_release(&fbatch);
+ return -EAGAIN;
+ }
+
+ offset = folio_data_offset(folio,
+ max(folio_pos(folio), start_offset),
+ min_replicas);
+ if (offset >= 0) {
+ ret = clamp(folio_pos(folio) + offset,
+ start_offset, end_offset);
+ folio_unlock(folio);
+ folio_batch_release(&fbatch);
+ return ret;
+ }
+ folio_unlock(folio);
+ }
+ folio_batch_release(&fbatch);
+ cond_resched();
+ }
+
+ return end_offset;
+}
+
+/*
+ * Search for a hole in a folio.
+ *
+ * The filemap layer returns -ENOENT if no folio exists, so reuse the same error
+ * code to indicate a pagecache hole exists at the returned offset. Otherwise
+ * return 0 if the folio is filled with data, or an error code. This function
+ * can return -EAGAIN if nonblock is specified.
+ */
+static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
+ unsigned min_replicas, bool nonblock)
+{
+ struct folio *folio;
+ struct bch_folio *s;
+ unsigned i, sectors;
+ int ret = -ENOENT;
+
+ folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
+ FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+
+ s = bch2_folio(folio);
+ if (!s)
+ goto unlock;
+
+ sectors = folio_sectors(folio);
+ for (i = folio_pos_to_s(folio, *offset); i < sectors; i++)
+ if (s->s[i].state < SECTOR_dirty ||
+ s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) {
+ *offset = max(*offset,
+ folio_pos(folio) + (i << SECTOR_SHIFT));
+ goto unlock;
+ }
+
+ *offset = folio_end_pos(folio);
+ ret = 0;
+unlock:
+ folio_unlock(folio);
+ folio_put(folio);
+ return ret;
+}
+
+loff_t bch2_seek_pagecache_hole(struct inode *vinode,
+ loff_t start_offset,
+ loff_t end_offset,
+ unsigned min_replicas,
+ bool nonblock)
+{
+ struct address_space *mapping = vinode->i_mapping;
+ loff_t offset = start_offset;
+ loff_t ret = 0;
+
+ while (!ret && offset < end_offset)
+ ret = folio_hole_offset(mapping, &offset, min_replicas, nonblock);
+
+ if (ret && ret != -ENOENT)
+ return ret;
+ return min(offset, end_offset);
+}
+
+int bch2_clamp_data_hole(struct inode *inode,
+ u64 *hole_start,
+ u64 *hole_end,
+ unsigned min_replicas,
+ bool nonblock)
+{
+ loff_t ret;
+
+ ret = bch2_seek_pagecache_hole(inode,
+ *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
+ if (ret < 0)
+ return ret;
+
+ *hole_start = ret;
+
+ if (*hole_start == *hole_end)
+ return 0;
+
+ ret = bch2_seek_pagecache_data(inode,
+ *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
+ if (ret < 0)
+ return ret;
+
+ *hole_end = ret;
+ return 0;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/libbcachefs/fs-io-pagecache.h b/libbcachefs/fs-io-pagecache.h
new file mode 100644
index 00000000..fad911cf
--- /dev/null
+++ b/libbcachefs/fs-io-pagecache.h
@@ -0,0 +1,176 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_PAGECACHE_H
+#define _BCACHEFS_FS_IO_PAGECACHE_H
+
+#include <linux/pagemap.h>
+
+typedef DARRAY(struct folio *) folios;
+
+int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t,
+ u64, fgf_t, gfp_t, folios *);
+int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t);
+
+/*
+ * Use u64 for the end pos and sector helpers because if the folio covers the
+ * max supported range of the mapping, the start offset of the next folio
+ * overflows loff_t. This breaks much of the range-based processing in the
+ * buffered write path.
+ */
+static inline u64 folio_end_pos(struct folio *folio)
+{
+ return folio_pos(folio) + folio_size(folio);
+}
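The following stand-alone snippet (not part of this header; the 4096-byte folio size and all names are assumptions) illustrates the overflow the comment above is guarding against:

#include <stdio.h>
#include <limits.h>

int main(void)
{
	long long pos = LLONG_MAX - 4095;	/* folio_pos() of the topmost 4K folio */
	/* folio_end_pos(): fine as a u64, would overflow a signed loff_t */
	unsigned long long end = (unsigned long long) pos + 4096;

	printf("%llx\n", end);			/* prints 8000000000000000 */
	return 0;
}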
+
+static inline size_t folio_sectors(struct folio *folio)
+{
+ return PAGE_SECTORS << folio_order(folio);
+}
+
+static inline loff_t folio_sector(struct folio *folio)
+{
+ return folio_pos(folio) >> 9;
+}
+
+static inline u64 folio_end_sector(struct folio *folio)
+{
+ return folio_end_pos(folio) >> 9;
+}
+
+#define BCH_FOLIO_SECTOR_STATE() \
+ x(unallocated) \
+ x(reserved) \
+ x(dirty) \
+ x(dirty_reserved) \
+ x(allocated)
+
+enum bch_folio_sector_state {
+#define x(n) SECTOR_##n,
+ BCH_FOLIO_SECTOR_STATE()
+#undef x
+};
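For readers unfamiliar with the x-macro pattern, here is a stand-alone sketch (not part of this header, with a made-up DEMO_STATES list) of how one list expands into both an enum like the one above and a debug name table like the #if 0 block in fs-io-pagecache.c:

#include <stdio.h>

#define DEMO_STATES()	\
	x(unallocated)	\
	x(reserved)	\
	x(dirty)

enum demo_state {
#define x(n)	DEMO_##n,
	DEMO_STATES()
#undef x
};

static const char * const demo_state_names[] = {
#define x(n)	#n,
	DEMO_STATES()
#undef x
	NULL
};

int main(void)
{
	printf("%s\n", demo_state_names[DEMO_dirty]);	/* prints "dirty" */
	return 0;
}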
+
+struct bch_folio_sector {
+ /* Uncompressed, fully allocated replicas (or on disk reservation): */
+ u8 nr_replicas:4,
+ /* Owns a PAGE_SECTORS * replicas_reserved sized in-memory reservation: */
+ replicas_reserved:4;
+ u8 state;
+};
+
+struct bch_folio {
+ spinlock_t lock;
+ atomic_t write_count;
+ /*
+ * Is the sector state up to date with the btree?
+ * (Not the data itself)
+ */
+ bool uptodate;
+ struct bch_folio_sector s[];
+};
+
+/* Helper for when we need to add debug instrumentation: */
+static inline void bch2_folio_sector_set(struct folio *folio,
+ struct bch_folio *s,
+ unsigned i, unsigned n)
+{
+ s->s[i].state = n;
+}
+
+/* file offset (to folio offset) to bch_folio_sector index */
+static inline int folio_pos_to_s(struct folio *folio, loff_t pos)
+{
+ u64 f_offset = pos - folio_pos(folio);
+
+ BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio));
+ return f_offset >> SECTOR_SHIFT;
+}
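For example (illustrative numbers only): for a folio at byte position 65536, a file offset of 70144 maps to sector index (70144 - 65536) >> 9 = 9, i.e. the tenth 512-byte sector of that folio.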
+
+/* for newly allocated folios: */
+static inline void __bch2_folio_release(struct folio *folio)
+{
+ kfree(folio_detach_private(folio));
+}
+
+static inline void bch2_folio_release(struct folio *folio)
+{
+ EBUG_ON(!folio_test_locked(folio));
+ __bch2_folio_release(folio);
+}
+
+static inline struct bch_folio *__bch2_folio(struct folio *folio)
+{
+ return folio_get_private(folio);
+}
+
+static inline struct bch_folio *bch2_folio(struct folio *folio)
+{
+ EBUG_ON(!folio_test_locked(folio));
+
+ return __bch2_folio(folio);
+}
+
+struct bch_folio *__bch2_folio_create(struct folio *, gfp_t);
+struct bch_folio *bch2_folio_create(struct folio *, gfp_t);
+
+struct bch2_folio_reservation {
+ struct disk_reservation disk;
+ struct quota_res quota;
+};
+
+static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
+{
+ /* XXX: this should not be open coded */
+ return inode->ei_inode.bi_data_replicas
+ ? inode->ei_inode.bi_data_replicas - 1
+ : c->opts.data_replicas;
+}
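Reading the expression above: bi_data_replicas == 0 means the per-inode option is unset, so the filesystem-wide data_replicas option applies; a nonzero value N means N - 1 replicas, presumably stored off by one so that zero can stand for "unset". For example, bi_data_replicas == 3 requests 2 replicas.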
+
+static inline void bch2_folio_reservation_init(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct bch2_folio_reservation *res)
+{
+ memset(res, 0, sizeof(*res));
+
+ res->disk.nr_replicas = inode_nr_replicas(c, inode);
+}
+
+int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned);
+void bch2_bio_page_state_set(struct bio *, struct bkey_s_c);
+
+void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64);
+int bch2_mark_pagecache_reserved(struct bch_inode_info *, u64 *, u64, bool);
+
+int bch2_get_folio_disk_reservation(struct bch_fs *,
+ struct bch_inode_info *,
+ struct folio *, bool);
+
+void bch2_folio_reservation_put(struct bch_fs *,
+ struct bch_inode_info *,
+ struct bch2_folio_reservation *);
+int bch2_folio_reservation_get(struct bch_fs *,
+ struct bch_inode_info *,
+ struct folio *,
+ struct bch2_folio_reservation *,
+ size_t, size_t);
+ssize_t bch2_folio_reservation_get_partial(struct bch_fs *,
+ struct bch_inode_info *,
+ struct folio *,
+ struct bch2_folio_reservation *,
+ size_t, size_t);
+
+void bch2_set_folio_dirty(struct bch_fs *,
+ struct bch_inode_info *,
+ struct folio *,
+ struct bch2_folio_reservation *,
+ unsigned, unsigned);
+
+vm_fault_t bch2_page_fault(struct vm_fault *);
+vm_fault_t bch2_page_mkwrite(struct vm_fault *);
+void bch2_invalidate_folio(struct folio *, size_t, size_t);
+bool bch2_release_folio(struct folio *, gfp_t);
+
+loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool);
+loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool);
+int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool);
+
+#endif /* _BCACHEFS_FS_IO_PAGECACHE_H */
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index e4d2b39e..33d0e708 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -1,18 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS
#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
#include "clock.h"
#include "error.h"
+#include "extents.h"
+#include "extent_update.h"
#include "fs.h"
#include "fs-io.h"
+#include "fs-io-buffered.h"
+#include "fs-io-pagecache.h"
#include "fsck.h"
#include "inode.h"
#include "journal.h"
-#include "io.h"
+#include "io_misc.h"
#include "keylist.h"
#include "quota.h"
+#include "reflink.h"
+#include "trace.h"
#include <linux/aio.h>
#include <linux/backing-dev.h>
@@ -20,2773 +29,984 @@
#include <linux/migrate.h>
#include <linux/mmu_context.h>
#include <linux/pagevec.h>
+#include <linux/rmap.h>
#include <linux/sched/signal.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/uio.h>
-#include <linux/writeback.h>
-#include <trace/events/bcachefs.h>
#include <trace/events/writeback.h>
-struct quota_res {
- u64 sectors;
+struct nocow_flush {
+ struct closure *cl;
+ struct bch_dev *ca;
+ struct bio bio;
};
-struct i_sectors_hook {
- struct extent_insert_hook hook;
- struct bch_inode_info *inode;
- struct quota_res quota_res;
- s64 sectors;
- u64 new_i_size;
- unsigned flags;
- unsigned appending:1;
-};
-
-struct bchfs_write_op {
- struct bch_inode_info *inode;
- s64 sectors_added;
- bool is_dio;
- bool unalloc;
- u64 new_i_size;
-
- /* must be last: */
- struct bch_write_op op;
-};
-
-struct bch_writepage_io {
- struct closure cl;
- u64 new_sectors;
-
- /* must be last: */
- struct bchfs_write_op op;
-};
-
-struct dio_write {
- struct closure cl;
- struct kiocb *req;
- struct task_struct *task;
- unsigned loop:1,
- sync:1,
- free_iov:1;
- struct quota_res quota_res;
-
- struct iov_iter iter;
- struct iovec inline_vecs[2];
-
- /* must be last: */
- struct bchfs_write_op iop;
-};
-
-struct dio_read {
- struct closure cl;
- struct kiocb *req;
- long ret;
- struct bch_read_bio rbio;
-};
-
-/* pagecache_block must be held */
-static int write_invalidate_inode_pages_range(struct address_space *mapping,
- loff_t start, loff_t end)
+static void nocow_flush_endio(struct bio *_bio)
{
- int ret;
-
- /*
- * XXX: the way this is currently implemented, we can spin if a process
- * is continually redirtying a specific page
- */
- do {
- if (!mapping->nrpages &&
- !mapping->nrexceptional)
- return 0;
-
- ret = filemap_write_and_wait_range(mapping, start, end);
- if (ret)
- break;
-
- if (!mapping->nrpages)
- return 0;
- ret = invalidate_inode_pages2_range(mapping,
- start >> PAGE_SHIFT,
- end >> PAGE_SHIFT);
- } while (ret == -EBUSY);
+ struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);
- return ret;
+ closure_put(bio->cl);
+ percpu_ref_put(&bio->ca->io_ref);
+ bio_put(&bio->bio);
}
-/* quotas */
-
-#ifdef CONFIG_BCACHEFS_QUOTA
-
-static void bch2_quota_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res)
+void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct closure *cl)
{
- if (!res->sectors)
- return;
+ struct nocow_flush *bio;
+ struct bch_dev *ca;
+ struct bch_devs_mask devs;
+ unsigned dev;
- mutex_lock(&inode->ei_quota_lock);
- BUG_ON(res->sectors > inode->ei_quota_reserved);
+ dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX);
+ if (dev == BCH_SB_MEMBERS_MAX)
+ return;
- bch2_quota_acct(c, inode->ei_qid, Q_SPC,
- -((s64) res->sectors), BCH_QUOTA_PREALLOC);
- inode->ei_quota_reserved -= res->sectors;
- mutex_unlock(&inode->ei_quota_lock);
+ devs = inode->ei_devs_need_flush;
+ memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
- res->sectors = 0;
-}
+ for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
+ rcu_read_lock();
+ ca = rcu_dereference(c->devs[dev]);
+ if (ca && !percpu_ref_tryget(&ca->io_ref))
+ ca = NULL;
+ rcu_read_unlock();
-static int bch2_quota_reservation_add(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res,
- unsigned sectors,
- bool check_enospc)
-{
- int ret;
+ if (!ca)
+ continue;
- mutex_lock(&inode->ei_quota_lock);
- ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
- check_enospc ? BCH_QUOTA_PREALLOC : BCH_QUOTA_NOCHECK);
- if (likely(!ret)) {
- inode->ei_quota_reserved += sectors;
- res->sectors += sectors;
+ bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
+ REQ_OP_WRITE|REQ_PREFLUSH,
+ GFP_KERNEL,
+ &c->nocow_flush_bioset),
+ struct nocow_flush, bio);
+ bio->cl = cl;
+ bio->ca = ca;
+ bio->bio.bi_end_io = nocow_flush_endio;
+ closure_bio_submit(&bio->bio, cl);
}
- mutex_unlock(&inode->ei_quota_lock);
-
- return ret;
}
-#else
-
-static void bch2_quota_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res)
+static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
+ struct bch_inode_info *inode)
{
-}
+ struct closure cl;
+
+ closure_init_stack(&cl);
+ bch2_inode_flush_nocow_writes_async(c, inode, &cl);
+ closure_sync(&cl);
-static int bch2_quota_reservation_add(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res,
- unsigned sectors,
- bool check_enospc)
-{
return 0;
}
-#endif
-
/* i_size updates: */
-static int inode_set_size(struct bch_inode_info *inode,
+struct inode_new_size {
+ loff_t new_size;
+ u64 now;
+ unsigned fields;
+};
+
+static int inode_set_size(struct btree_trans *trans,
+ struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
void *p)
{
- loff_t *new_i_size = p;
+ struct inode_new_size *s = p;
- lockdep_assert_held(&inode->ei_update_lock);
+ bi->bi_size = s->new_size;
+ if (s->fields & ATTR_ATIME)
+ bi->bi_atime = s->now;
+ if (s->fields & ATTR_MTIME)
+ bi->bi_mtime = s->now;
+ if (s->fields & ATTR_CTIME)
+ bi->bi_ctime = s->now;
- bi->bi_size = *new_i_size;
return 0;
}
-static int __must_check bch2_write_inode_size(struct bch_fs *c,
- struct bch_inode_info *inode,
- loff_t new_size)
+int __must_check bch2_write_inode_size(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ loff_t new_size, unsigned fields)
{
- return __bch2_write_inode(c, inode, inode_set_size, &new_size, 0);
+ struct inode_new_size s = {
+ .new_size = new_size,
+ .now = bch2_current_time(c),
+ .fields = fields,
+ };
+
+ return bch2_write_inode(c, inode, inode_set_size, &s, fields);
}
-static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
- struct quota_res *quota_res, int sectors)
+void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
+ struct quota_res *quota_res, s64 sectors)
{
- mutex_lock(&inode->ei_quota_lock);
+ bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
+ "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
+ inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
+ inode->ei_inode.bi_sectors);
+ inode->v.i_blocks += sectors;
+
#ifdef CONFIG_BCACHEFS_QUOTA
- if (quota_res && sectors > 0) {
+ if (quota_res &&
+ !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) &&
+ sectors > 0) {
BUG_ON(sectors > quota_res->sectors);
BUG_ON(sectors > inode->ei_quota_reserved);
quota_res->sectors -= sectors;
inode->ei_quota_reserved -= sectors;
} else {
- bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, BCH_QUOTA_WARN);
+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
}
#endif
- inode->v.i_blocks += sectors;
- mutex_unlock(&inode->ei_quota_lock);
}
-/* i_sectors accounting: */
-
-static enum btree_insert_ret
-i_sectors_hook_fn(struct extent_insert_hook *hook,
- struct bpos committed_pos,
- struct bpos next_pos,
- struct bkey_s_c k,
- const struct bkey_i *insert)
-{
- struct i_sectors_hook *h = container_of(hook,
- struct i_sectors_hook, hook);
- s64 sectors = next_pos.offset - committed_pos.offset;
- int sign = bkey_extent_is_allocation(&insert->k) -
- (k.k && bkey_extent_is_allocation(k.k));
-
- EBUG_ON(!(h->inode->ei_inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY));
-
- h->sectors += sectors * sign;
-
- return BTREE_INSERT_OK;
-}
-
-static int i_sectors_dirty_finish_fn(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct i_sectors_hook *h = p;
-
- if (h->new_i_size != U64_MAX &&
- (!h->appending ||
- h->new_i_size > bi->bi_size))
- bi->bi_size = h->new_i_size;
- bi->bi_sectors += h->sectors;
- bi->bi_flags &= ~h->flags;
- return 0;
-}
-
-static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h)
-{
- int ret;
-
- mutex_lock(&h->inode->ei_update_lock);
- i_sectors_acct(c, h->inode, &h->quota_res, h->sectors);
-
- ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h, 0);
-
- if (!ret && h->new_i_size != U64_MAX)
- i_size_write(&h->inode->v, h->new_i_size);
- mutex_unlock(&h->inode->ei_update_lock);
-
- bch2_quota_reservation_put(c, h->inode, &h->quota_res);
-
- h->sectors = 0;
-
- return ret;
-}
-
-static int i_sectors_dirty_start_fn(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi, void *p)
-{
- struct i_sectors_hook *h = p;
-
- if (h->flags & BCH_INODE_I_SIZE_DIRTY)
- bi->bi_size = h->new_i_size;
-
- bi->bi_flags |= h->flags;
- return 0;
-}
-
-static int i_sectors_dirty_start(struct bch_fs *c, struct i_sectors_hook *h)
-{
- int ret;
-
- mutex_lock(&h->inode->ei_update_lock);
- ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_start_fn, h, 0);
- mutex_unlock(&h->inode->ei_update_lock);
-
- return ret;
-}
-
-static inline struct i_sectors_hook
-i_sectors_hook_init(struct bch_inode_info *inode, unsigned flags)
-{
- return (struct i_sectors_hook) {
- .hook.fn = i_sectors_hook_fn,
- .inode = inode,
- .sectors = 0,
- .new_i_size = U64_MAX,
- .flags = flags|BCH_INODE_I_SECTORS_DIRTY,
- };
-}
-
-/* normal i_size/i_sectors update machinery: */
-
-struct bchfs_extent_trans_hook {
- struct bchfs_write_op *op;
- struct extent_insert_hook hook;
-
- struct bch_inode_unpacked inode_u;
- struct bkey_inode_buf inode_p;
-
- bool need_inode_update;
-};
-
-static enum btree_insert_ret
-bchfs_extent_update_hook(struct extent_insert_hook *hook,
- struct bpos committed_pos,
- struct bpos next_pos,
- struct bkey_s_c k,
- const struct bkey_i *insert)
-{
- struct bchfs_extent_trans_hook *h = container_of(hook,
- struct bchfs_extent_trans_hook, hook);
- struct bch_inode_info *inode = h->op->inode;
- int sign = bkey_extent_is_allocation(&insert->k) -
- (k.k && bkey_extent_is_allocation(k.k));
- s64 sectors = (s64) (next_pos.offset - committed_pos.offset) * sign;
- u64 offset = min(next_pos.offset << 9, h->op->new_i_size);
- bool do_pack = false;
-
- if (h->op->unalloc &&
- !bch2_extent_is_fully_allocated(k))
- return BTREE_INSERT_ENOSPC;
-
- BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE));
-
- /* XXX: inode->i_size locking */
- if (offset > inode->ei_inode.bi_size) {
- if (!h->need_inode_update) {
- h->need_inode_update = true;
- return BTREE_INSERT_NEED_TRAVERSE;
- }
-
- /* truncate in progress? */
- if (h->inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)
- goto no_i_size_update;
-
- h->inode_u.bi_size = offset;
- do_pack = true;
-
- inode->ei_inode.bi_size = offset;
-
- spin_lock(&inode->v.i_lock);
- if (offset > inode->v.i_size) {
- if (h->op->is_dio)
- i_size_write(&inode->v, offset);
- else
- BUG();
- }
- spin_unlock(&inode->v.i_lock);
- }
-no_i_size_update:
- if (sectors) {
- if (!h->need_inode_update) {
- h->need_inode_update = true;
- return BTREE_INSERT_NEED_TRAVERSE;
- }
-
- h->inode_u.bi_sectors += sectors;
- do_pack = true;
-
- h->op->sectors_added += sectors;
- }
-
- if (do_pack)
- bch2_inode_pack(&h->inode_p, &h->inode_u);
-
- return BTREE_INSERT_OK;
-}
+/* fsync: */
-static int bchfs_write_index_update(struct bch_write_op *wop)
+static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum,
+ u64 *seq)
{
- struct bchfs_write_op *op = container_of(wop,
- struct bchfs_write_op, op);
- struct keylist *keys = &op->op.insert_keys;
- struct btree_trans trans;
- struct btree_iter *extent_iter, *inode_iter = NULL;
- struct bchfs_extent_trans_hook hook;
- struct bkey_i *k = bch2_keylist_front(keys);
- s64 orig_sectors_added = op->sectors_added;
- int ret;
-
- BUG_ON(k->k.p.inode != op->inode->v.i_ino);
-
- bch2_trans_init(&trans, wop->c);
-
- extent_iter = bch2_trans_get_iter(&trans,
- BTREE_ID_EXTENTS,
- bkey_start_pos(&bch2_keylist_front(keys)->k),
- BTREE_ITER_INTENT);
- BUG_ON(IS_ERR(extent_iter));
-
- hook.op = op;
- hook.hook.fn = bchfs_extent_update_hook;
- hook.need_inode_update = false;
-
- do {
- /* XXX: inode->i_size locking */
- k = bch2_keylist_front(keys);
- if (min(k->k.p.offset << 9, op->new_i_size) >
- op->inode->ei_inode.bi_size)
- hook.need_inode_update = true;
-
- /* optimization for fewer transaction restarts: */
- ret = bch2_btree_iter_traverse(extent_iter);
- if (ret)
- goto err;
-
- if (hook.need_inode_update) {
- struct bkey_s_c inode;
-
- if (!inode_iter) {
- inode_iter = bch2_trans_get_iter(&trans,
- BTREE_ID_INODES,
- POS(extent_iter->pos.inode, 0),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- BUG_ON(IS_ERR(inode_iter));
- }
-
- inode = bch2_btree_iter_peek_slot(inode_iter);
- if ((ret = btree_iter_err(inode)))
- goto err;
-
- if (WARN_ONCE(inode.k->type != BCH_INODE_FS,
- "inode %llu not found when updating",
- extent_iter->pos.inode)) {
- ret = -ENOENT;
- break;
- }
-
- if (WARN_ONCE(bkey_bytes(inode.k) >
- sizeof(hook.inode_p),
- "inode %llu too big (%zu bytes, buf %zu)",
- extent_iter->pos.inode,
- bkey_bytes(inode.k),
- sizeof(hook.inode_p))) {
- ret = -ENOENT;
- break;
- }
-
- bkey_reassemble(&hook.inode_p.inode.k_i, inode);
- ret = bch2_inode_unpack(bkey_s_c_to_inode(inode),
- &hook.inode_u);
- if (WARN_ONCE(ret,
- "error %i unpacking inode %llu",
- ret, extent_iter->pos.inode)) {
- ret = -ENOENT;
- break;
- }
-
- ret = bch2_btree_insert_at(wop->c, &wop->res,
- &hook.hook, op_journal_seq(wop),
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_USE_RESERVE,
- BTREE_INSERT_ENTRY(extent_iter, k),
- BTREE_INSERT_ENTRY_EXTRA_RES(inode_iter,
- &hook.inode_p.inode.k_i, 2));
- } else {
- ret = bch2_btree_insert_at(wop->c, &wop->res,
- &hook.hook, op_journal_seq(wop),
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_USE_RESERVE,
- BTREE_INSERT_ENTRY(extent_iter, k));
- }
-
- BUG_ON(bkey_cmp(extent_iter->pos, bkey_start_pos(&k->k)));
-
- if (WARN_ONCE(!ret != !k->k.size,
- "ret %i k->size %u", ret, k->k.size))
- ret = k->k.size ? -EINTR : 0;
-err:
- if (ret == -EINTR)
- continue;
- if (ret)
- break;
-
- BUG_ON(bkey_cmp(extent_iter->pos, k->k.p) < 0);
- bch2_keylist_pop_front(keys);
- } while (!bch2_keylist_empty(keys));
-
- bch2_trans_exit(&trans);
-
- if (op->is_dio) {
- struct dio_write *dio = container_of(op, struct dio_write, iop);
+ struct printbuf buf = PRINTBUF;
+ struct bch_inode_unpacked u;
+ struct btree_iter iter;
+ int ret = bch2_inode_peek(trans, &iter, &u, inum, 0);
+ if (ret)
+ return ret;
- i_sectors_acct(wop->c, op->inode, &dio->quota_res,
- op->sectors_added - orig_sectors_added);
+ u64 cur_seq = journal_cur_seq(&trans->c->journal);
+ *seq = min(cur_seq, u.bi_journal_seq);
+
+ if (fsck_err_on(u.bi_journal_seq > cur_seq,
+ trans, inode_journal_seq_in_future,
+ "inode journal seq in future (currently at %llu)\n%s",
+ cur_seq,
+ (bch2_inode_unpacked_to_text(&buf, &u),
+ buf.buf))) {
+ u.bi_journal_seq = cur_seq;
+ ret = bch2_inode_write(trans, &iter, &u);
}
-
+fsck_err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
return ret;
}
-static inline void bch2_fswrite_op_init(struct bchfs_write_op *op,
- struct bch_fs *c,
- struct bch_inode_info *inode,
- struct bch_io_opts opts,
- bool is_dio)
-{
- op->inode = inode;
- op->sectors_added = 0;
- op->is_dio = is_dio;
- op->unalloc = false;
- op->new_i_size = U64_MAX;
-
- bch2_write_op_init(&op->op, c, opts);
- op->op.target = opts.foreground_target;
- op->op.index_update_fn = bchfs_write_index_update;
- op_journal_seq_set(&op->op, &inode->ei_journal_seq);
-}
-
-static inline struct bch_io_opts io_opts(struct bch_fs *c, struct bch_inode_info *inode)
-{
- struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
-
- bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode->ei_inode));
- return opts;
-}
-
-/* page state: */
-
-/* stored in page->private: */
-
/*
- * bch_page_state has to (unfortunately) be manipulated with cmpxchg - we could
- * almost protected it with the page lock, except that bch2_writepage_io_done has
- * to update the sector counts (and from interrupt/bottom half context).
+ * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an
+ * insert trigger: look up the btree inode instead
*/
-struct bch_page_state {
-union { struct {
- /* existing data: */
- unsigned sectors:PAGE_SECTOR_SHIFT + 1;
- unsigned nr_replicas:4;
- unsigned compressed:1;
-
- /* Owns PAGE_SECTORS sized reservation: */
- unsigned reserved:1;
- unsigned reservation_replicas:4;
-
- /* Owns PAGE_SECTORS sized quota reservation: */
- unsigned quota_reserved:1;
-
- /*
- * Number of sectors on disk - for i_blocks
- * Uncompressed size, not compressed size:
- */
- unsigned dirty_sectors:PAGE_SECTOR_SHIFT + 1;
-};
- /* for cmpxchg: */
- unsigned long v;
-};
-};
-
-#define page_state_cmpxchg(_ptr, _new, _expr) \
-({ \
- unsigned long _v = READ_ONCE((_ptr)->v); \
- struct bch_page_state _old; \
- \
- do { \
- _old.v = _new.v = _v; \
- _expr; \
- \
- EBUG_ON(_new.sectors + _new.dirty_sectors > PAGE_SECTORS);\
- } while (_old.v != _new.v && \
- (_v = cmpxchg(&(_ptr)->v, _old.v, _new.v)) != _old.v); \
- \
- _old; \
-})
-
-static inline struct bch_page_state *page_state(struct page *page)
-{
- struct bch_page_state *s = (void *) &page->private;
-
- BUILD_BUG_ON(sizeof(*s) > sizeof(page->private));
-
- if (!PagePrivate(page))
- SetPagePrivate(page);
-
- return s;
-}
-
-static inline unsigned page_res_sectors(struct bch_page_state s)
+static int bch2_flush_inode(struct bch_fs *c,
+ struct bch_inode_info *inode)
{
+ if (c->opts.journal_flush_disabled)
+ return 0;
- return s.reserved ? s.reservation_replicas * PAGE_SECTORS : 0;
-}
-
-static void __bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode,
- struct bch_page_state s)
-{
- struct disk_reservation res = { .sectors = page_res_sectors(s) };
- struct quota_res quota_res = { .sectors = s.quota_reserved ? PAGE_SECTORS : 0 };
-
- bch2_quota_reservation_put(c, inode, &quota_res);
- bch2_disk_reservation_put(c, &res);
-}
-
-static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode,
- struct page *page)
-{
- struct bch_page_state s;
-
- s = page_state_cmpxchg(page_state(page), s, {
- s.reserved = 0;
- s.quota_reserved = 0;
- });
-
- __bch2_put_page_reservation(c, inode, s);
-}
-
-static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *inode,
- struct page *page, bool check_enospc)
-{
- struct bch_page_state *s = page_state(page), new, old;
-
- /* XXX: this should not be open coded */
- unsigned nr_replicas = inode->ei_inode.bi_data_replicas
- ? inode->ei_inode.bi_data_replicas - 1
- : c->opts.data_replicas;
-
- struct disk_reservation disk_res = bch2_disk_reservation_init(c,
- nr_replicas);
- struct quota_res quota_res = { 0 };
- int ret = 0;
-
- /*
- * XXX: this could likely be quite a bit simpler, page reservations
- * _should_ only be manipulated with page locked:
- */
-
- old = page_state_cmpxchg(s, new, {
- if (new.reserved
- ? (new.reservation_replicas < disk_res.nr_replicas)
- : (new.sectors < PAGE_SECTORS ||
- new.nr_replicas < disk_res.nr_replicas ||
- new.compressed)) {
- int sectors = (disk_res.nr_replicas * PAGE_SECTORS -
- page_res_sectors(new) -
- disk_res.sectors);
-
- if (sectors > 0) {
- ret = bch2_disk_reservation_add(c, &disk_res, sectors,
- !check_enospc
- ? BCH_DISK_RESERVATION_NOFAIL : 0);
- if (unlikely(ret))
- goto err;
- }
-
- new.reserved = 1;
- new.reservation_replicas = disk_res.nr_replicas;
- }
-
- if (!new.quota_reserved &&
- new.sectors + new.dirty_sectors < PAGE_SECTORS) {
- ret = bch2_quota_reservation_add(c, inode, &quota_res,
- PAGE_SECTORS - quota_res.sectors,
- check_enospc);
- if (unlikely(ret))
- goto err;
-
- new.quota_reserved = 1;
- }
- });
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync))
+ return -EROFS;
- quota_res.sectors -= (new.quota_reserved - old.quota_reserved) * PAGE_SECTORS;
- disk_res.sectors -= page_res_sectors(new) - page_res_sectors(old);
-err:
- bch2_quota_reservation_put(c, inode, &quota_res);
- bch2_disk_reservation_put(c, &disk_res);
+ u64 seq;
+ int ret = bch2_trans_commit_do(c, NULL, NULL, 0,
+ bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?:
+ bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?:
+ bch2_inode_flush_nocow_writes(c, inode);
+ bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
return ret;
}
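
The ?: chain above strings together three steps: commit a transaction that reads the inode's current journal sequence, flush the journal up to that sequence, then wait for outstanding nocow writes. A minimal sketch of just the first step as a standalone helper, reusing only calls visible above (the example_* name is hypothetical; bch2_trans_commit_do() supplies 'trans' and handles commit/restart):

static int example_inode_journal_seq(struct bch_fs *c, subvol_inum inum, u64 *seq)
{
	/* run bch2_get_inode_journal_seq_trans() inside a committed transaction */
	return bch2_trans_commit_do(c, NULL, NULL, 0,
			bch2_get_inode_journal_seq_trans(trans, inum, seq));
}
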
-static void bch2_clear_page_bits(struct page *page)
-{
- struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_page_state s;
-
- if (!PagePrivate(page))
- return;
-
- s.v = xchg(&page_state(page)->v, 0);
- ClearPagePrivate(page);
-
- if (s.dirty_sectors)
- i_sectors_acct(c, inode, NULL, -s.dirty_sectors);
-
- __bch2_put_page_reservation(c, inode, s);
-}
-
-int bch2_set_page_dirty(struct page *page)
-{
- struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct quota_res quota_res = { 0 };
- struct bch_page_state old, new;
-
- old = page_state_cmpxchg(page_state(page), new,
- new.dirty_sectors = PAGE_SECTORS - new.sectors;
- new.quota_reserved = 0;
- );
-
- quota_res.sectors += old.quota_reserved * PAGE_SECTORS;
-
- if (old.dirty_sectors != new.dirty_sectors)
- i_sectors_acct(c, inode, &quota_res,
- new.dirty_sectors - old.dirty_sectors);
- bch2_quota_reservation_put(c, inode, &quota_res);
-
- return __set_page_dirty_nobuffers(page);
-}
-
-int bch2_page_mkwrite(struct vm_fault *vmf)
+int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct page *page = vmf->page;
- struct file *file = vmf->vma->vm_file;
struct bch_inode_info *inode = file_bch_inode(file);
- struct address_space *mapping = inode->v.i_mapping;
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret = VM_FAULT_LOCKED;
+ int ret, err;
- sb_start_pagefault(inode->v.i_sb);
- file_update_time(file);
+ trace_bch2_fsync(file, datasync);
- /*
- * Not strictly necessary, but helps avoid dio writes livelocking in
- * write_invalidate_inode_pages_range() - can drop this if/when we get
- * a write_invalidate_inode_pages_range() that works without dropping
- * page lock before invalidating page
- */
- if (current->pagecache_lock != &mapping->add_lock)
- pagecache_add_get(&mapping->add_lock);
-
- lock_page(page);
- if (page->mapping != mapping ||
- page_offset(page) > i_size_read(&inode->v)) {
- unlock_page(page);
- ret = VM_FAULT_NOPAGE;
+ ret = file_write_and_wait_range(file, start, end);
+ if (ret)
goto out;
- }
-
- if (bch2_get_page_reservation(c, inode, page, true)) {
- unlock_page(page);
- ret = VM_FAULT_SIGBUS;
+ ret = sync_inode_metadata(&inode->v, 1);
+ if (ret)
goto out;
- }
-
- if (!PageDirty(page))
- set_page_dirty(page);
- wait_for_stable_page(page);
+ ret = bch2_flush_inode(c, inode);
out:
- if (current->pagecache_lock != &mapping->add_lock)
- pagecache_add_put(&mapping->add_lock);
- sb_end_pagefault(inode->v.i_sb);
- return ret;
-}
-
-void bch2_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
-{
- EBUG_ON(!PageLocked(page));
- EBUG_ON(PageWriteback(page));
-
- if (offset || length < PAGE_SIZE)
- return;
-
- bch2_clear_page_bits(page);
-}
-
-int bch2_releasepage(struct page *page, gfp_t gfp_mask)
-{
- /* XXX: this can't take locks that are held while we allocate memory */
- EBUG_ON(!PageLocked(page));
- EBUG_ON(PageWriteback(page));
-
- if (PageDirty(page))
- return 0;
-
- bch2_clear_page_bits(page);
- return 1;
-}
-
-#ifdef CONFIG_MIGRATION
-int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
- struct page *page, enum migrate_mode mode)
-{
- int ret;
-
- ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
- if (ret != MIGRATEPAGE_SUCCESS)
- return ret;
-
- if (PagePrivate(page)) {
- *page_state(newpage) = *page_state(page);
- ClearPagePrivate(page);
- }
-
- migrate_page_copy(newpage, page);
- return MIGRATEPAGE_SUCCESS;
-}
-#endif
+ ret = bch2_err_class(ret);
+ if (ret == -EROFS)
+ ret = -EIO;
-/* readpages/writepages: */
-
-static bool bio_can_add_page_contig(struct bio *bio, struct page *page)
-{
- sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT;
+ err = file_check_and_advance_wb_err(file);
+ if (!ret)
+ ret = err;
- return bio->bi_vcnt < bio->bi_max_vecs &&
- bio_end_sector(bio) == offset;
+ return ret;
}
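
bch2_fsync() has the standard VFS fsync prototype, so it is the kind of handler that gets wired up through file_operations; a rough sketch assuming the usual VFS hookup (the struct name is illustrative, not taken from this patch):

static const struct file_operations example_file_operations = {
	/* remaining methods elided */
	.fsync		= bch2_fsync,
};
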
-static void __bio_add_page(struct bio *bio, struct page *page)
-{
- bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) {
- .bv_page = page,
- .bv_len = PAGE_SIZE,
- .bv_offset = 0,
- };
-
- bio->bi_iter.bi_size += PAGE_SIZE;
-}
+/* truncate: */
-static int bio_add_page_contig(struct bio *bio, struct page *page)
+static inline int range_has_data(struct bch_fs *c, u32 subvol,
+ struct bpos start,
+ struct bpos end)
{
- sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT;
-
- EBUG_ON(!bio->bi_max_vecs);
-
- if (!bio->bi_vcnt)
- bio->bi_iter.bi_sector = offset;
- else if (!bio_can_add_page_contig(bio, page))
- return -1;
-
- __bio_add_page(bio, page);
- return 0;
+ return bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end,
+ subvol, 0, k, ({
+ bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k);
+ })));
}
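
range_has_data() is a thin wrapper around the for_each_btree_key_in_subvolume_max() iterator: the final expression of the body acts as a per-key predicate, and the construct evaluates nonzero once the predicate holds (or negative on error). A sketch of the same shape with a different predicate, assuming the iterator behaves as used above (the helper name is made up):

static inline int example_range_has_unwritten(struct bch_fs *c, u32 subvol,
					      struct bpos start, struct bpos end)
{
	return bch2_trans_run(c,
		for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end,
			subvol, 0, k, ({
				/* match unwritten (reserved but not yet written) extents */
				bkey_extent_is_data(k.k) && bkey_extent_is_unwritten(k);
			})));
}
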
-/* readpage(s): */
-
-static void bch2_readpages_end_io(struct bio *bio)
+static int __bch2_truncate_folio(struct bch_inode_info *inode,
+ pgoff_t index, loff_t start, loff_t end)
{
- struct bio_vec *bv;
- int i;
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct address_space *mapping = inode->v.i_mapping;
+ struct bch_folio *s;
+ unsigned start_offset;
+ unsigned end_offset;
+ unsigned i;
+ struct folio *folio;
+ s64 i_sectors_delta = 0;
+ int ret = 0;
+ u64 end_pos;
- bio_for_each_segment_all(bv, bio, i) {
- struct page *page = bv->bv_page;
+ folio = filemap_lock_folio(mapping, index);
+ if (IS_ERR_OR_NULL(folio)) {
+ /*
+ * XXX: we're doing two index lookups when we end up reading the
+ * folio
+ */
+ ret = range_has_data(c, inode->ei_inum.subvol,
+ POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)),
+ POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS));
+ if (ret <= 0)
+ return ret;
- if (!bio->bi_status) {
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
+ folio = __filemap_get_folio(mapping, index,
+ FGP_LOCK|FGP_CREAT, GFP_KERNEL);
+ if (IS_ERR(folio)) {
+ ret = -ENOMEM;
+ goto out;
}
- unlock_page(page);
}
- bio_put(bio);
-}
+ BUG_ON(start >= folio_end_pos(folio));
+ BUG_ON(end <= folio_pos(folio));
-struct readpages_iter {
- struct address_space *mapping;
- struct list_head pages;
- unsigned nr_pages;
-};
-
-static inline void page_state_init_for_read(struct page *page)
-{
- struct bch_page_state *s = page_state(page);
-
- BUG_ON(s->reserved);
- s->sectors = 0;
- s->compressed = 0;
-}
-
-static int readpage_add_page(struct readpages_iter *iter, struct page *page)
-{
- int ret;
-
- prefetchw(&page->flags);
-
- ret = add_to_page_cache_lru(page, iter->mapping,
- page->index, GFP_NOFS);
- if (!ret)
- page_state_init_for_read(page);
-
- put_page(page);
- return ret;
-}
-
-static inline struct page *readpage_iter_next(struct readpages_iter *iter)
-{
- while (iter->nr_pages) {
- struct page *page =
- list_last_entry(&iter->pages, struct page, lru);
+ start_offset = max(start, folio_pos(folio)) - folio_pos(folio);
+ end_offset = min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio);
- prefetchw(&page->flags);
- list_del(&page->lru);
- iter->nr_pages--;
-
- if (!readpage_add_page(iter, page))
- return page;
+ /* Folio boundary? Nothing to do */
+ if (start_offset == 0 &&
+ end_offset == folio_size(folio)) {
+ ret = 0;
+ goto unlock;
}
- return NULL;
-}
-
-#define for_each_readpage_page(_iter, _page) \
- for (; \
- ((_page) = __readpage_next_page(&(_iter)));) \
-
-static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
-{
- struct bvec_iter iter;
- struct bio_vec bv;
- bool compressed = bch2_extent_is_compressed(k);
- unsigned nr_ptrs = bch2_extent_nr_dirty_ptrs(k);
-
- bio_for_each_segment(bv, bio, iter) {
- struct bch_page_state *s = page_state(bv.bv_page);
-
- /* sectors in @k from the start of this page: */
- unsigned k_sectors = k.k->size - (iter.bi_sector - k.k->p.offset);
-
- unsigned page_sectors = min(bv.bv_len >> 9, k_sectors);
-
- s->nr_replicas = !s->sectors
- ? nr_ptrs
- : min_t(unsigned, s->nr_replicas, nr_ptrs);
-
- BUG_ON(s->sectors + page_sectors > PAGE_SECTORS);
- s->sectors += page_sectors;
-
- s->compressed |= compressed;
+ s = bch2_folio_create(folio, 0);
+ if (!s) {
+ ret = -ENOMEM;
+ goto unlock;
}
-}
-static void readpage_bio_extend(struct readpages_iter *iter,
- struct bio *bio, u64 offset,
- bool get_more)
-{
- struct page *page;
- pgoff_t page_offset;
- int ret;
-
- while (bio_end_sector(bio) < offset &&
- bio->bi_vcnt < bio->bi_max_vecs) {
- page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT;
-
- if (iter->nr_pages) {
- page = list_last_entry(&iter->pages, struct page, lru);
- if (page->index != page_offset)
- break;
-
- list_del(&page->lru);
- iter->nr_pages--;
- } else if (get_more) {
- rcu_read_lock();
- page = radix_tree_lookup(&iter->mapping->page_tree, page_offset);
- rcu_read_unlock();
-
- if (page && !radix_tree_exceptional_entry(page))
- break;
-
- page = __page_cache_alloc(readahead_gfp_mask(iter->mapping));
- if (!page)
- break;
-
- page->index = page_offset;
- ClearPageReadahead(bio->bi_io_vec[bio->bi_vcnt - 1].bv_page);
- } else {
- break;
- }
-
- ret = readpage_add_page(iter, page);
+ if (!folio_test_uptodate(folio)) {
+ ret = bch2_read_single_folio(folio, mapping);
if (ret)
- break;
-
- __bio_add_page(bio, page);
+ goto unlock;
}
- if (!iter->nr_pages)
- SetPageReadahead(bio->bi_io_vec[bio->bi_vcnt - 1].bv_page);
-}
-
-static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
- struct bch_read_bio *rbio, u64 inum,
- struct readpages_iter *readpages_iter)
-{
- struct bio *bio = &rbio->bio;
- int flags = BCH_READ_RETRY_IF_STALE|
- BCH_READ_MAY_PROMOTE;
-
- rbio->c = c;
- rbio->start_time = local_clock();
-
- while (1) {
- BKEY_PADDED(k) tmp;
- struct bkey_s_c k;
- unsigned bytes;
-
- bch2_btree_iter_set_pos(iter, POS(inum, bio->bi_iter.bi_sector));
-
- k = bch2_btree_iter_peek_slot(iter);
- BUG_ON(!k.k);
-
- if (IS_ERR(k.k)) {
- int ret = bch2_btree_iter_unlock(iter);
- BUG_ON(!ret);
- bcache_io_error(c, bio, "btree IO error %i", ret);
- bio_endio(bio);
- return;
- }
-
- bkey_reassemble(&tmp.k, k);
- bch2_btree_iter_unlock(iter);
- k = bkey_i_to_s_c(&tmp.k);
-
- if (readpages_iter) {
- bool want_full_extent = false;
-
- if (bkey_extent_is_data(k.k)) {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- struct bch_extent_crc_unpacked crc;
- const union bch_extent_entry *i;
-
- extent_for_each_crc(e, crc, i)
- want_full_extent |= ((crc.csum_type != 0) |
- (crc.compression_type != 0));
- }
-
- readpage_bio_extend(readpages_iter,
- bio, k.k->p.offset,
- want_full_extent);
- }
-
- bytes = (min_t(u64, k.k->p.offset, bio_end_sector(bio)) -
- bio->bi_iter.bi_sector) << 9;
- swap(bio->bi_iter.bi_size, bytes);
-
- if (bytes == bio->bi_iter.bi_size)
- flags |= BCH_READ_LAST_FRAGMENT;
-
- if (bkey_extent_is_allocation(k.k))
- bch2_add_page_sectors(bio, k);
-
- bch2_read_extent(c, rbio, k, flags);
+ ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
+ if (ret)
+ goto unlock;
- if (flags & BCH_READ_LAST_FRAGMENT)
- return;
+ for (i = round_up(start_offset, block_bytes(c)) >> 9;
+ i < round_down(end_offset, block_bytes(c)) >> 9;
+ i++) {
+ s->s[i].nr_replicas = 0;
- swap(bio->bi_iter.bi_size, bytes);
- bio_advance(bio, bytes);
+ i_sectors_delta -= s->s[i].state == SECTOR_dirty;
+ bch2_folio_sector_set(folio, s, i, SECTOR_unallocated);
}
-}
-
-int bch2_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
-{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_io_opts opts = io_opts(c, inode);
- struct btree_iter iter;
- struct page *page;
- struct readpages_iter readpages_iter = {
- .mapping = mapping, .nr_pages = nr_pages
- };
-
- bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
- BTREE_ITER_SLOTS);
-
- INIT_LIST_HEAD(&readpages_iter.pages);
- list_add(&readpages_iter.pages, pages);
- list_del_init(pages);
-
- if (current->pagecache_lock != &mapping->add_lock)
- pagecache_add_get(&mapping->add_lock);
- while ((page = readpage_iter_next(&readpages_iter))) {
- unsigned n = max_t(unsigned,
- min_t(unsigned, readpages_iter.nr_pages + 1,
- BIO_MAX_PAGES),
- c->sb.encoded_extent_max >> PAGE_SECTOR_SHIFT);
+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
- struct bch_read_bio *rbio =
- rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read),
- opts);
+ /*
+ * Caller needs to know whether this folio will be written out by
+ * writeback - doing an i_size update if necessary - or whether it will
+ * be responsible for the i_size update.
+ *
+ * Note that we shouldn't ever see a folio beyond EOF, but check and
+ * warn if so. This has been observed after failures to clean up folios
+ * after a short write, and there's still a chance reclaim will fix
+ * things up.
+ */
+ WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size);
+ end_pos = folio_end_pos(folio);
+ if (inode->v.i_size > folio_pos(folio))
+ end_pos = min_t(u64, inode->v.i_size, end_pos);
+ ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty;
- rbio->bio.bi_end_io = bch2_readpages_end_io;
- bio_add_page_contig(&rbio->bio, page);
- bchfs_read(c, &iter, rbio, inode->v.i_ino, &readpages_iter);
- }
+ folio_zero_segment(folio, start_offset, end_offset);
- if (current->pagecache_lock != &mapping->add_lock)
- pagecache_add_put(&mapping->add_lock);
+ /*
+ * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
+ *
+ * XXX: because we aren't currently tracking whether the folio has actual
+ * data in it (vs. just 0s, or only partially written) this is wrong. ick.
+ */
+ BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false));
- return 0;
+ /*
+ * This removes any writeable userspace mappings; we need to force
+ * .page_mkwrite to be called again before any mmapped writes, to
+ * redirty the full page:
+ */
+ folio_mkclean(folio);
+ filemap_dirty_folio(mapping, folio);
+unlock:
+ folio_unlock(folio);
+ folio_put(folio);
+out:
+ return ret;
}
-static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
- u64 inum, struct page *page)
+static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from)
{
- struct btree_iter iter;
-
- page_state_init_for_read(page);
-
- bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC);
- bio_add_page_contig(&rbio->bio, page);
-
- bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
- BTREE_ITER_SLOTS);
- bchfs_read(c, &iter, rbio, inum, NULL);
+ return __bch2_truncate_folio(inode, from >> PAGE_SHIFT,
+ from, ANYSINT_MAX(loff_t));
}
-int bch2_readpage(struct file *file, struct page *page)
+static int bch2_truncate_folios(struct bch_inode_info *inode,
+ loff_t start, loff_t end)
{
- struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_io_opts opts = io_opts(c, inode);
- struct bch_read_bio *rbio;
-
- rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts);
- rbio->bio.bi_end_io = bch2_readpages_end_io;
-
- __bchfs_readpage(c, rbio, inode->v.i_ino, page);
- return 0;
-}
+ int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT,
+ start, end);
-static void bch2_read_single_page_end_io(struct bio *bio)
-{
- complete(bio->bi_private);
+ if (ret >= 0 &&
+ start >> PAGE_SHIFT != end >> PAGE_SHIFT)
+ ret = __bch2_truncate_folio(inode,
+ (end - 1) >> PAGE_SHIFT,
+ start, end);
+ return ret;
}
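
As the comment in __bch2_truncate_folio() explains, these helpers return less than zero on error and otherwise a flag indicating whether the folio straddling i_size was left dirty, i.e. whether writeback (rather than the caller) is responsible for the i_size update. A caller sketch in the style of bchfs_fpunch() below (the example_* name is hypothetical):

static int example_prepare_punch(struct bch_inode_info *inode,
				 loff_t offset, loff_t end,
				 bool *truncated_last_page)
{
	int ret = bch2_truncate_folios(inode, offset, end);
	if (ret < 0)
		return ret;

	/* positive return: the straddling folio is dirty and will be written back */
	*truncated_last_page = ret != 0;
	return 0;
}
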
-static int bch2_read_single_page(struct page *page,
- struct address_space *mapping)
+static int bch2_extend(struct mnt_idmap *idmap,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *inode_u,
+ struct iattr *iattr)
{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_read_bio *rbio;
+ struct address_space *mapping = inode->v.i_mapping;
int ret;
- DECLARE_COMPLETION_ONSTACK(done);
-
- rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read),
- io_opts(c, inode));
- rbio->bio.bi_private = &done;
- rbio->bio.bi_end_io = bch2_read_single_page_end_io;
-
- __bchfs_readpage(c, rbio, inode->v.i_ino, page);
- wait_for_completion(&done);
-
- ret = blk_status_to_errno(rbio->bio.bi_status);
- bio_put(&rbio->bio);
- if (ret < 0)
+ /*
+ * sync appends:
+ *
+ * this has to be done _before_ extending i_size:
+ */
+ ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX);
+ if (ret)
return ret;
- SetPageUptodate(page);
- return 0;
-}
-
-/* writepages: */
-
-struct bch_writepage_state {
- struct bch_writepage_io *io;
- struct bch_io_opts opts;
-};
-
-static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
- struct bch_inode_info *inode)
-{
- return (struct bch_writepage_state) { .opts = io_opts(c, inode) };
-}
-
-static void bch2_writepage_io_free(struct closure *cl)
-{
- struct bch_writepage_io *io = container_of(cl,
- struct bch_writepage_io, cl);
+ truncate_setsize(&inode->v, iattr->ia_size);
- bio_put(&io->op.op.wbio.bio);
+ return bch2_setattr_nonsize(idmap, inode, iattr);
}
-static void bch2_writepage_io_done(struct closure *cl)
+int bchfs_truncate(struct mnt_idmap *idmap,
+ struct bch_inode_info *inode, struct iattr *iattr)
{
- struct bch_writepage_io *io = container_of(cl,
- struct bch_writepage_io, cl);
- struct bch_fs *c = io->op.op.c;
- struct bio *bio = &io->op.op.wbio.bio;
- struct bio_vec *bvec;
- unsigned i;
-
- if (io->op.op.error) {
- bio_for_each_segment_all(bvec, bio, i)
- SetPageError(bvec->bv_page);
- set_bit(AS_EIO, &io->op.inode->v.i_mapping->flags);
- }
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct address_space *mapping = inode->v.i_mapping;
+ struct bch_inode_unpacked inode_u;
+ s64 i_sectors_delta = 0;
+ int ret = 0;
/*
- * racing with fallocate can cause us to add fewer sectors than
- * expected - but we shouldn't add more sectors than expected:
 * If the truncate call will change the size of the file, the
+ * cmtimes should be updated. If the size will not change, we
+ * do not need to update the cmtimes.
*/
- BUG_ON(io->op.sectors_added > (s64) io->new_sectors);
+ if (iattr->ia_size != inode->v.i_size) {
+ if (!(iattr->ia_valid & ATTR_MTIME))
+ ktime_get_coarse_real_ts64(&iattr->ia_mtime);
+ if (!(iattr->ia_valid & ATTR_CTIME))
+ ktime_get_coarse_real_ts64(&iattr->ia_ctime);
+ iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME;
+ }
- /*
- * (error (due to going RO) halfway through a page can screw that up
- * slightly)
- * XXX wtf?
- BUG_ON(io->op.sectors_added - io->new_sectors >= (s64) PAGE_SECTORS);
- */
+ inode_dio_wait(&inode->v);
+ bch2_pagecache_block_get(inode);
+
+ ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
+ if (ret)
+ goto err;
/*
- * PageWriteback is effectively our ref on the inode - fixup i_blocks
- * before calling end_page_writeback:
+ * check this before next assertion; on filesystem error our normal
+ * invariants are a bit broken (truncate has to truncate the page cache
+ * before the inode).
*/
- if (io->op.sectors_added != io->new_sectors)
- i_sectors_acct(c, io->op.inode, NULL,
- io->op.sectors_added - (s64) io->new_sectors);
+ ret = bch2_journal_error(&c->journal);
+ if (ret)
+ goto err;
- bio_for_each_segment_all(bvec, bio, i)
- end_page_writeback(bvec->bv_page);
+ WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
+ inode->v.i_size < inode_u.bi_size,
+ "truncate spotted in mem i_size < btree i_size: %llu < %llu\n",
+ (u64) inode->v.i_size, inode_u.bi_size);
- closure_return_with_destructor(&io->cl, bch2_writepage_io_free);
-}
-
-static void bch2_writepage_do_io(struct bch_writepage_state *w)
-{
- struct bch_writepage_io *io = w->io;
+ if (iattr->ia_size > inode->v.i_size) {
+ ret = bch2_extend(idmap, inode, &inode_u, iattr);
+ goto err;
+ }
- w->io = NULL;
- closure_call(&io->op.op.cl, bch2_write, NULL, &io->cl);
- continue_at(&io->cl, bch2_writepage_io_done, NULL);
-}
+ iattr->ia_valid &= ~ATTR_SIZE;
-/*
- * Get a bch_writepage_io and add @page to it - appending to an existing one if
- * possible, else allocating a new one:
- */
-static void bch2_writepage_io_alloc(struct bch_fs *c,
- struct bch_writepage_state *w,
- struct bch_inode_info *inode,
- struct page *page,
- unsigned nr_replicas)
-{
- struct bch_write_op *op;
- u64 offset = (u64) page->index << PAGE_SECTOR_SHIFT;
-
- w->io = container_of(bio_alloc_bioset(GFP_NOFS,
- BIO_MAX_PAGES,
- &c->writepage_bioset),
- struct bch_writepage_io, op.op.wbio.bio);
-
- closure_init(&w->io->cl, NULL);
- w->io->new_sectors = 0;
- bch2_fswrite_op_init(&w->io->op, c, inode, w->opts, false);
- op = &w->io->op.op;
- op->nr_replicas = nr_replicas;
- op->res.nr_replicas = nr_replicas;
- op->write_point = writepoint_hashed(inode->ei_last_dirtied);
- op->pos = POS(inode->v.i_ino, offset);
- op->wbio.bio.bi_iter.bi_sector = offset;
-}
+ ret = bch2_truncate_folio(inode, iattr->ia_size);
+ if (unlikely(ret < 0))
+ goto err;
-static int __bch2_writepage(struct page *page,
- struct writeback_control *wbc,
- void *data)
-{
- struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_writepage_state *w = data;
- struct bch_page_state new, old;
- unsigned offset;
- loff_t i_size = i_size_read(&inode->v);
- pgoff_t end_index = i_size >> PAGE_SHIFT;
-
- EBUG_ON(!PageUptodate(page));
-
- /* Is the page fully inside i_size? */
- if (page->index < end_index)
- goto do_io;
-
- /* Is the page fully outside i_size? (truncate in progress) */
- offset = i_size & (PAGE_SIZE - 1);
- if (page->index > end_index || !offset) {
- unlock_page(page);
- return 0;
- }
+ truncate_setsize(&inode->v, iattr->ia_size);
/*
- * The page straddles i_size. It must be zeroed out on each and every
- * writepage invocation because it may be mmapped. "A file is mapped
- * in multiples of the page size. For a file that is not a multiple of
- * the page size, the remaining memory is zeroed when mapped, and
- * writes to that region are not written out to the file."
+ * When extending, we're going to write the new i_size to disk
+ * immediately so we need to flush anything above the current on disk
+ * i_size first:
+ *
+ * Also, when extending we need to flush the page that i_size currently
+ * straddles - if it's mapped to userspace, we need to ensure that
+ * userspace has to redirty it and call .mkwrite -> set_page_dirty
+ * again to allocate the part of the page that was extended.
*/
- zero_user_segment(page, offset, PAGE_SIZE);
-do_io:
- /* Before unlocking the page, transfer reservation to w->io: */
- old = page_state_cmpxchg(page_state(page), new, {
- EBUG_ON(!new.reserved &&
- (new.sectors != PAGE_SECTORS ||
- new.compressed));
-
- if (new.reserved)
- new.nr_replicas = new.reservation_replicas;
- new.reserved = 0;
-
- new.compressed |= w->opts.compression != 0;
-
- new.sectors += new.dirty_sectors;
- new.dirty_sectors = 0;
- });
-
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
- unlock_page(page);
-
- if (w->io &&
- (w->io->op.op.res.nr_replicas != new.nr_replicas ||
- !bio_can_add_page_contig(&w->io->op.op.wbio.bio, page)))
- bch2_writepage_do_io(w);
-
- if (!w->io)
- bch2_writepage_io_alloc(c, w, inode, page, new.nr_replicas);
-
- w->io->new_sectors += new.sectors - old.sectors;
-
- BUG_ON(inode != w->io->op.inode);
- BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page));
+ if (iattr->ia_size > inode_u.bi_size)
+ ret = filemap_write_and_wait_range(mapping,
+ inode_u.bi_size,
+ iattr->ia_size - 1);
+ else if (iattr->ia_size & (PAGE_SIZE - 1))
+ ret = filemap_write_and_wait_range(mapping,
+ round_down(iattr->ia_size, PAGE_SIZE),
+ iattr->ia_size - 1);
+ if (ret)
+ goto err;
- if (old.reserved)
- w->io->op.op.res.sectors += old.reservation_replicas * PAGE_SECTORS;
+ ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta);
+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
- w->io->op.new_i_size = i_size;
+ if (unlikely(ret)) {
+ /*
+ * If we error here, VFS caches are now inconsistent with btree
+ */
+ set_bit(EI_INODE_ERROR, &inode->ei_flags);
+ goto err;
+ }
- if (wbc->sync_mode == WB_SYNC_ALL)
- w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC;
+ bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
+ !bch2_journal_error(&c->journal), c,
+ "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
+ inode->v.i_ino, (u64) inode->v.i_blocks,
+ inode->ei_inode.bi_sectors);
- return 0;
+ ret = bch2_setattr_nonsize(idmap, inode, iattr);
+err:
+ bch2_pagecache_block_put(inode);
+ return bch2_err_class(ret);
}
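
bchfs_truncate() only deals with size changes; a typical .setattr path would dispatch on ATTR_SIZE and fall back to bch2_setattr_nonsize() otherwise. A hedged sketch of such a dispatcher (name hypothetical, permission checks and the like omitted):

static int example_setattr(struct mnt_idmap *idmap,
			   struct bch_inode_info *inode, struct iattr *iattr)
{
	return iattr->ia_valid & ATTR_SIZE
		? bchfs_truncate(idmap, inode, iattr)
		: bch2_setattr_nonsize(idmap, inode, iattr);
}
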
-int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
-{
- struct bch_fs *c = mapping->host->i_sb->s_fs_info;
- struct bch_writepage_state w =
- bch_writepage_state_init(c, to_bch_ei(mapping->host));
- struct blk_plug plug;
- int ret;
-
- blk_start_plug(&plug);
- ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
- if (w.io)
- bch2_writepage_do_io(&w);
- blk_finish_plug(&plug);
- return ret;
-}
+/* fallocate: */
-int bch2_writepage(struct page *page, struct writeback_control *wbc)
+static int inode_update_times_fn(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi, void *p)
{
- struct bch_fs *c = page->mapping->host->i_sb->s_fs_info;
- struct bch_writepage_state w =
- bch_writepage_state_init(c, to_bch_ei(page->mapping->host));
- int ret;
-
- ret = __bch2_writepage(page, wbc, &w);
- if (w.io)
- bch2_writepage_do_io(&w);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
- return ret;
+ bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
+ return 0;
}
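
inode_update_times_fn() follows the bch2_write_inode() callback convention: the callback mutates the unpacked inode, and the caller commits it with the matching ATTR_* flags while holding ei_update_lock (as bchfs_fpunch() does below). A minimal sketch of another callback in the same shape (the example_* names are hypothetical):

static int example_touch_mtime_fn(struct btree_trans *trans,
				  struct bch_inode_info *inode,
				  struct bch_inode_unpacked *bi, void *p)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;

	bi->bi_mtime = bch2_current_time(c);
	return 0;
}

static int example_touch_mtime(struct bch_fs *c, struct bch_inode_info *inode)
{
	int ret;

	mutex_lock(&inode->ei_update_lock);
	ret = bch2_write_inode(c, inode, example_touch_mtime_fn, NULL, ATTR_MTIME);
	mutex_unlock(&inode->ei_update_lock);
	return ret;
}
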
-/* buffered writes: */
-
-int bch2_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+static noinline long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- pgoff_t index = pos >> PAGE_SHIFT;
- unsigned offset = pos & (PAGE_SIZE - 1);
- struct page *page;
- int ret = -ENOMEM;
-
- BUG_ON(inode_unhashed(&inode->v));
+ u64 end = offset + len;
+ u64 block_start = round_up(offset, block_bytes(c));
+ u64 block_end = round_down(end, block_bytes(c));
+ bool truncated_last_page;
+ int ret = 0;
- /* Not strictly necessary - same reason as mkwrite(): */
- pagecache_add_get(&mapping->add_lock);
+ ret = bch2_truncate_folios(inode, offset, end);
+ if (unlikely(ret < 0))
+ goto err;
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page)
- goto err_unlock;
+ truncated_last_page = ret;
- if (PageUptodate(page))
- goto out;
+ truncate_pagecache_range(&inode->v, offset, end - 1);
- /* If we're writing entire page, don't need to read it in first: */
- if (len == PAGE_SIZE)
- goto out;
+ if (block_start < block_end) {
+ s64 i_sectors_delta = 0;
- if (!offset && pos + len >= inode->v.i_size) {
- zero_user_segment(page, len, PAGE_SIZE);
- flush_dcache_page(page);
- goto out;
+ ret = bch2_fpunch(c, inode_inum(inode),
+ block_start >> 9, block_end >> 9,
+ &i_sectors_delta);
+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
}
- if (index > inode->v.i_size >> PAGE_SHIFT) {
- zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE);
- flush_dcache_page(page);
- goto out;
- }
-readpage:
- ret = bch2_read_single_page(page, mapping);
- if (ret)
- goto err;
-out:
- ret = bch2_get_page_reservation(c, inode, page, true);
- if (ret) {
- if (!PageUptodate(page)) {
- /*
- * If the page hasn't been read in, we won't know if we
- * actually need a reservation - we don't actually need
- * to read here, we just need to check if the page is
- * fully backed by uncompressed data:
- */
- goto readpage;
- }
-
- goto err;
+ mutex_lock(&inode->ei_update_lock);
+ if (end >= inode->v.i_size && !truncated_last_page) {
+ ret = bch2_write_inode_size(c, inode, inode->v.i_size,
+ ATTR_MTIME|ATTR_CTIME);
+ } else {
+ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+ ATTR_MTIME|ATTR_CTIME);
}
-
- *pagep = page;
- return 0;
+ mutex_unlock(&inode->ei_update_lock);
err:
- unlock_page(page);
- put_page(page);
- *pagep = NULL;
-err_unlock:
- pagecache_add_put(&mapping->add_lock);
return ret;
}
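
As a rough worked example of the alignment above, assuming 4096-byte blocks: punching offset = 5000, len = 10000 gives end = 15000, block_start = round_up(5000, 4096) = 8192 and block_end = round_down(15000, 4096) = 12288. Only the fully covered blocks, sectors 16-23 (bytes [8192, 12288)), are punched out of the extents btree; the misaligned head [5000, 8192) and tail [12288, 15000) are instead zeroed in the page cache by the bch2_truncate_folios() / truncate_pagecache_range() calls above.
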
-int bch2_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+static noinline long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
+ loff_t offset, loff_t len,
+ bool insert)
{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct address_space *mapping = inode->v.i_mapping;
+ s64 i_sectors_delta = 0;
+ int ret = 0;
- lockdep_assert_held(&inode->v.i_rwsem);
+ if ((offset | len) & (block_bytes(c) - 1))
+ return -EINVAL;
- if (unlikely(copied < len && !PageUptodate(page))) {
- /*
- * The page needs to be read in, but that would destroy
- * our partial write - simplest thing is to just force
- * userspace to redo the write:
- */
- zero_user(page, 0, PAGE_SIZE);
- flush_dcache_page(page);
- copied = 0;
+ if (insert) {
+ if (offset >= inode->v.i_size)
+ return -EINVAL;
+ } else {
+ if (offset + len >= inode->v.i_size)
+ return -EINVAL;
}
- spin_lock(&inode->v.i_lock);
- if (pos + copied > inode->v.i_size)
- i_size_write(&inode->v, pos + copied);
- spin_unlock(&inode->v.i_lock);
-
- if (copied) {
- if (!PageUptodate(page))
- SetPageUptodate(page);
- if (!PageDirty(page))
- set_page_dirty(page);
+ ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
+ if (ret)
+ return ret;
- inode->ei_last_dirtied = (unsigned long) current;
- } else {
- bch2_put_page_reservation(c, inode, page);
- }
+ if (insert)
+ i_size_write(&inode->v, inode->v.i_size + len);
- unlock_page(page);
- put_page(page);
- pagecache_add_put(&mapping->add_lock);
+ ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9,
+ insert, &i_sectors_delta);
+ if (!ret && !insert)
+ i_size_write(&inode->v, inode->v.i_size - len);
+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
- return copied;
+ return ret;
}
-#define WRITE_BATCH_PAGES 32
-
-static int __bch2_buffered_write(struct bch_inode_info *inode,
- struct address_space *mapping,
- struct iov_iter *iter,
- loff_t pos, unsigned len)
+static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
+ u64 start_sector, u64 end_sector)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct page *pages[WRITE_BATCH_PAGES];
- unsigned long index = pos >> PAGE_SHIFT;
- unsigned offset = pos & (PAGE_SIZE - 1);
- unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
- unsigned i, copied = 0, nr_pages_copied = 0;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bpos end_pos = POS(inode->v.i_ino, end_sector);
+ struct bch_io_opts opts;
int ret = 0;
- BUG_ON(!len);
- BUG_ON(nr_pages > ARRAY_SIZE(pages));
-
- for (i = 0; i < nr_pages; i++) {
- pages[i] = grab_cache_page_write_begin(mapping, index + i, 0);
- if (!pages[i]) {
- nr_pages = i;
- ret = -ENOMEM;
- goto out;
- }
- }
-
- if (offset && !PageUptodate(pages[0])) {
- ret = bch2_read_single_page(pages[0], mapping);
- if (ret)
- goto out;
- }
-
- if ((pos + len) & (PAGE_SIZE - 1) &&
- !PageUptodate(pages[nr_pages - 1])) {
- if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) {
- zero_user(pages[nr_pages - 1], 0, PAGE_SIZE);
- } else {
- ret = bch2_read_single_page(pages[nr_pages - 1], mapping);
- if (ret)
- goto out;
- }
- }
-
- for (i = 0; i < nr_pages; i++) {
- ret = bch2_get_page_reservation(c, inode, pages[i], true);
+ bch2_inode_opts_get(&opts, c, &inode->ei_inode);
- if (ret && !PageUptodate(pages[i])) {
- ret = bch2_read_single_page(pages[i], mapping);
- if (ret)
- goto out;
-
- ret = bch2_get_page_reservation(c, inode, pages[i], true);
- }
-
- if (ret)
- goto out;
- }
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ POS(inode->v.i_ino, start_sector),
+ BTREE_ITER_slots|BTREE_ITER_intent);
- if (mapping_writably_mapped(mapping))
- for (i = 0; i < nr_pages; i++)
- flush_dcache_page(pages[i]);
+ while (!ret) {
+ s64 i_sectors_delta = 0;
+ struct quota_res quota_res = { 0 };
+ struct bkey_s_c k;
+ unsigned sectors;
+ bool is_allocation;
+ u64 hole_start, hole_end;
+ u32 snapshot;
- while (copied < len) {
- struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
- unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1);
- unsigned pg_bytes = min_t(unsigned, len - copied,
- PAGE_SIZE - pg_offset);
- unsigned pg_copied = iov_iter_copy_from_user_atomic(page,
- iter, pg_offset, pg_bytes);
+ bch2_trans_begin(trans);
- if (!pg_copied)
+ if (bkey_ge(iter.pos, end_pos))
break;
- flush_dcache_page(page);
- iov_iter_advance(iter, pg_copied);
- copied += pg_copied;
- }
-
- if (!copied)
- goto out;
-
- nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE);
- inode->ei_last_dirtied = (unsigned long) current;
-
- spin_lock(&inode->v.i_lock);
- if (pos + copied > inode->v.i_size)
- i_size_write(&inode->v, pos + copied);
- spin_unlock(&inode->v.i_lock);
-
- if (copied < len &&
- ((offset + copied) & (PAGE_SIZE - 1))) {
- struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
-
- if (!PageUptodate(page)) {
- zero_user(page, 0, PAGE_SIZE);
- copied -= (offset + copied) & (PAGE_SIZE - 1);
- }
- }
-out:
- for (i = 0; i < nr_pages_copied; i++) {
- if (!PageUptodate(pages[i]))
- SetPageUptodate(pages[i]);
- if (!PageDirty(pages[i]))
- set_page_dirty(pages[i]);
- unlock_page(pages[i]);
- put_page(pages[i]);
- }
-
- for (i = nr_pages_copied; i < nr_pages; i++) {
- if (!PageDirty(pages[i]))
- bch2_put_page_reservation(c, inode, pages[i]);
- unlock_page(pages[i]);
- put_page(pages[i]);
- }
+ ret = bch2_subvolume_get_snapshot(trans,
+ inode->ei_inum.subvol, &snapshot);
+ if (ret)
+ goto bkey_err;
- return copied ?: ret;
-}
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
-static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct bch_inode_info *inode = file_bch_inode(file);
- loff_t pos = iocb->ki_pos;
- ssize_t written = 0;
- int ret = 0;
-
- pagecache_add_get(&mapping->add_lock);
+ k = bch2_btree_iter_peek_slot(&iter);
+ if ((ret = bkey_err(k)))
+ goto bkey_err;
- do {
- unsigned offset = pos & (PAGE_SIZE - 1);
- unsigned bytes = min_t(unsigned long, iov_iter_count(iter),
- PAGE_SIZE * WRITE_BATCH_PAGES - offset);
-again:
- /*
- * Bring in the user page that we will copy from _first_.
- * Otherwise there's a nasty deadlock on copying from the
- * same page as we're writing to, without it being marked
- * up-to-date.
- *
- * Not only is this an optimisation, but it is also required
- * to check that the address is actually valid, when atomic
- * usercopies are used, below.
- */
- if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
- bytes = min_t(unsigned long, iov_iter_count(iter),
- PAGE_SIZE - offset);
+ hole_start = iter.pos.offset;
+ hole_end = bpos_min(k.k->p, end_pos).offset;
+ is_allocation = bkey_extent_is_allocation(k.k);
- if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
- ret = -EFAULT;
- break;
- }
+ /* already reserved */
+ if (bkey_extent_is_reservation(k) &&
+ bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) {
+ bch2_btree_iter_advance(&iter);
+ continue;
}
- if (unlikely(fatal_signal_pending(current))) {
- ret = -EINTR;
- break;
+ if (bkey_extent_is_data(k.k) &&
+ !(mode & FALLOC_FL_ZERO_RANGE)) {
+ bch2_btree_iter_advance(&iter);
+ continue;
}
- ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
- if (unlikely(ret < 0))
- break;
-
- cond_resched();
-
- if (unlikely(ret == 0)) {
+ if (!(mode & FALLOC_FL_ZERO_RANGE)) {
/*
- * If we were unable to copy any data at all, we must
- * fall back to a single segment length write.
- *
- * If we didn't fallback here, we could livelock
- * because not all segments in the iov can be copied at
- * once without a pagefault.
+ * Lock ordering - can't be holding btree locks while
+ * blocking on a folio lock:
*/
- bytes = min_t(unsigned long, PAGE_SIZE - offset,
- iov_iter_single_seg_count(iter));
- goto again;
- }
- pos += ret;
- written += ret;
-
- balance_dirty_pages_ratelimited(mapping);
- } while (iov_iter_count(iter));
-
- pagecache_add_put(&mapping->add_lock);
-
- return written ? written : ret;
-}
-
-/* O_DIRECT reads */
-
-static void bch2_dio_read_complete(struct closure *cl)
-{
- struct dio_read *dio = container_of(cl, struct dio_read, cl);
-
- dio->req->ki_complete(dio->req, dio->ret, 0);
- bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */
-}
-
-static void bch2_direct_IO_read_endio(struct bio *bio)
-{
- struct dio_read *dio = bio->bi_private;
-
- if (bio->bi_status)
- dio->ret = blk_status_to_errno(bio->bi_status);
-
- closure_put(&dio->cl);
-}
-
-static void bch2_direct_IO_read_split_endio(struct bio *bio)
-{
- bch2_direct_IO_read_endio(bio);
- bio_check_pages_dirty(bio); /* transfers ownership */
-}
-
-static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
-{
- struct file *file = req->ki_filp;
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_io_opts opts = io_opts(c, inode);
- struct dio_read *dio;
- struct bio *bio;
- loff_t offset = req->ki_pos;
- bool sync = is_sync_kiocb(req);
- size_t shorten;
- ssize_t ret;
-
- if ((offset|iter->count) & (block_bytes(c) - 1))
- return -EINVAL;
-
- ret = min_t(loff_t, iter->count,
- max_t(loff_t, 0, i_size_read(&inode->v) - offset));
-
- if (!ret)
- return ret;
-
- shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
- iter->count -= shorten;
-
- bio = bio_alloc_bioset(GFP_KERNEL,
- iov_iter_npages(iter, BIO_MAX_PAGES),
- &c->dio_read_bioset);
-
- bio->bi_end_io = bch2_direct_IO_read_endio;
-
- dio = container_of(bio, struct dio_read, rbio.bio);
- closure_init(&dio->cl, NULL);
-
- /*
- * this is a _really_ horrible hack just to avoid an atomic sub at the
- * end:
- */
- if (!sync) {
- set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
- atomic_set(&dio->cl.remaining,
- CLOSURE_REMAINING_INITIALIZER -
- CLOSURE_RUNNING +
- CLOSURE_DESTRUCTOR);
- } else {
- atomic_set(&dio->cl.remaining,
- CLOSURE_REMAINING_INITIALIZER + 1);
- }
-
- dio->req = req;
- dio->ret = ret;
-
- goto start;
- while (iter->count) {
- bio = bio_alloc_bioset(GFP_KERNEL,
- iov_iter_npages(iter, BIO_MAX_PAGES),
- &c->bio_read);
- bio->bi_end_io = bch2_direct_IO_read_split_endio;
-start:
- bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC);
- bio->bi_iter.bi_sector = offset >> 9;
- bio->bi_private = dio;
-
- ret = bio_iov_iter_get_pages(bio, iter);
- if (ret < 0) {
- /* XXX: fault inject this path */
- bio->bi_status = BLK_STS_RESOURCE;
- bio_endio(bio);
- break;
- }
-
- offset += bio->bi_iter.bi_size;
- bio_set_pages_dirty(bio);
-
- if (iter->count)
- closure_get(&dio->cl);
-
- bch2_read(c, rbio_init(bio, opts), inode->v.i_ino);
- }
-
- iter->count += shorten;
-
- if (sync) {
- closure_sync(&dio->cl);
- closure_debug_destroy(&dio->cl);
- ret = dio->ret;
- bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */
- return ret;
- } else {
- return -EIOCBQUEUED;
- }
-}
-
-/* O_DIRECT writes */
-
-static void bch2_dio_write_loop_async(struct closure *);
-
-static long bch2_dio_write_loop(struct dio_write *dio)
-{
- struct kiocb *req = dio->req;
- struct address_space *mapping = req->ki_filp->f_mapping;
- struct bch_inode_info *inode = dio->iop.inode;
- struct bio *bio = &dio->iop.op.wbio.bio;
- struct bio_vec *bv;
- bool sync;
- long ret;
- int i;
-
- if (dio->loop)
- goto loop;
-
- inode_dio_begin(&inode->v);
- __pagecache_block_get(&mapping->add_lock);
-
- /* Write and invalidate pagecache range that we're writing to: */
- ret = write_invalidate_inode_pages_range(mapping, req->ki_pos,
- req->ki_pos + iov_iter_count(&dio->iter) - 1);
- if (unlikely(ret))
- goto err;
-
- while (1) {
- BUG_ON(current->pagecache_lock);
- current->pagecache_lock = &mapping->add_lock;
- if (current != dio->task)
- use_mm(dio->task->mm);
-
- ret = bio_iov_iter_get_pages(bio, &dio->iter);
-
- if (current != dio->task)
- unuse_mm(dio->task->mm);
- current->pagecache_lock = NULL;
-
- if (unlikely(ret < 0))
- goto err;
-
- /* gup might have faulted pages back in: */
- ret = write_invalidate_inode_pages_range(mapping,
- req->ki_pos + (dio->iop.op.written << 9),
- req->ki_pos + iov_iter_count(&dio->iter) - 1);
- if (unlikely(ret))
- goto err;
-
- dio->iop.op.pos = POS(inode->v.i_ino,
- (req->ki_pos >> 9) + dio->iop.op.written);
-
- task_io_account_write(bio->bi_iter.bi_size);
-
- closure_call(&dio->iop.op.cl, bch2_write, NULL, &dio->cl);
-
- if (!dio->sync && !dio->loop && dio->iter.count) {
- struct iovec *iov = dio->inline_vecs;
-
- if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
- iov = kmalloc(dio->iter.nr_segs * sizeof(*iov),
- GFP_KERNEL);
- if (unlikely(!iov)) {
- dio->iop.op.error = -ENOMEM;
- goto err_wait_io;
- }
-
- dio->free_iov = true;
+ if (bch2_clamp_data_hole(&inode->v,
+ &hole_start,
+ &hole_end,
+ opts.data_replicas, true)) {
+ ret = drop_locks_do(trans,
+ (bch2_clamp_data_hole(&inode->v,
+ &hole_start,
+ &hole_end,
+ opts.data_replicas, false), 0));
+ if (ret)
+ goto bkey_err;
}
+ bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start));
- memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov));
- dio->iter.iov = iov;
- }
-err_wait_io:
- dio->loop = true;
+ if (ret)
+ goto bkey_err;
- if (!dio->sync) {
- continue_at(&dio->cl, bch2_dio_write_loop_async, NULL);
- return -EIOCBQUEUED;
+ if (hole_start == hole_end)
+ continue;
}
- closure_sync(&dio->cl);
-loop:
- bio_for_each_segment_all(bv, bio, i)
- put_page(bv->bv_page);
- if (!dio->iter.count || dio->iop.op.error)
- break;
- bio_reset(bio);
- }
-
- ret = dio->iop.op.error ?: ((long) dio->iop.op.written << 9);
-err:
- __pagecache_block_put(&mapping->add_lock);
- bch2_disk_reservation_put(dio->iop.op.c, &dio->iop.op.res);
- bch2_quota_reservation_put(dio->iop.op.c, inode, &dio->quota_res);
-
- if (dio->free_iov)
- kfree(dio->iter.iov);
+ sectors = hole_end - hole_start;
- closure_debug_destroy(&dio->cl);
+ if (!is_allocation) {
+ ret = bch2_quota_reservation_add(c, inode,
+ &quota_res, sectors, true);
+ if (unlikely(ret))
+ goto bkey_err;
+ }
- sync = dio->sync;
- bio_put(bio);
+ ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter,
+ sectors, opts, &i_sectors_delta,
+ writepoint_hashed((unsigned long) current));
+ if (ret)
+ goto bkey_err;
- /* inode->i_dio_count is our ref on inode and thus bch_fs */
- inode_dio_end(&inode->v);
+ bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
- if (!sync) {
- req->ki_complete(req, ret, 0);
- ret = -EIOCBQUEUED;
+ if (bch2_mark_pagecache_reserved(inode, &hole_start,
+ iter.pos.offset, true)) {
+ ret = drop_locks_do(trans,
+ bch2_mark_pagecache_reserved(inode, &hole_start,
+ iter.pos.offset, false));
+ if (ret)
+ goto bkey_err;
+ }
+bkey_err:
+ bch2_quota_reservation_put(c, inode, &quota_res);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
}
- return ret;
-}
-
-static void bch2_dio_write_loop_async(struct closure *cl)
-{
- struct dio_write *dio = container_of(cl, struct dio_write, cl);
- bch2_dio_write_loop(dio);
-}
+ if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) {
+ struct quota_res quota_res = { 0 };
+ s64 i_sectors_delta = 0;
-static int bch2_direct_IO_write(struct kiocb *req,
- struct iov_iter *iter,
- bool swap)
-{
- struct file *file = req->ki_filp;
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct dio_write *dio;
- struct bio *bio;
- loff_t offset = req->ki_pos;
- ssize_t ret;
-
- lockdep_assert_held(&inode->v.i_rwsem);
-
- if (unlikely(!iter->count))
- return 0;
-
- if (unlikely((offset|iter->count) & (block_bytes(c) - 1)))
- return -EINVAL;
-
- bio = bio_alloc_bioset(GFP_KERNEL,
- iov_iter_npages(iter, BIO_MAX_PAGES),
- &c->dio_write_bioset);
- dio = container_of(bio, struct dio_write, iop.op.wbio.bio);
- closure_init(&dio->cl, NULL);
- dio->req = req;
- dio->task = current;
- dio->loop = false;
- dio->sync = is_sync_kiocb(req) ||
- offset + iter->count > inode->v.i_size;
- dio->free_iov = false;
- dio->quota_res.sectors = 0;
- dio->iter = *iter;
- bch2_fswrite_op_init(&dio->iop, c, inode, io_opts(c, inode), true);
- dio->iop.op.write_point = writepoint_hashed((unsigned long) dio->task);
- dio->iop.op.flags |= BCH_WRITE_NOPUT_RESERVATION;
-
- if ((req->ki_flags & IOCB_DSYNC) &&
- !c->opts.journal_flush_disabled)
- dio->iop.op.flags |= BCH_WRITE_FLUSH;
-
- ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
- iter->count >> 9, true);
- if (unlikely(ret))
- goto err;
-
- ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9,
- dio->iop.op.opts.data_replicas, 0);
- if (unlikely(ret)) {
- if (bch2_check_range_allocated(c, POS(inode->v.i_ino,
- offset >> 9),
- iter->count >> 9))
- goto err;
-
- dio->iop.unalloc = true;
+ bch2_fpunch_at(trans, &iter, inode_inum(inode),
+ end_sector, &i_sectors_delta);
+ bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
+ bch2_quota_reservation_put(c, inode, &quota_res);
}
- dio->iop.op.nr_replicas = dio->iop.op.res.nr_replicas;
-
- return bch2_dio_write_loop(dio);
-err:
- bch2_disk_reservation_put(c, &dio->iop.op.res);
- bch2_quota_reservation_put(c, inode, &dio->quota_res);
- closure_debug_destroy(&dio->cl);
- bio_put(bio);
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
return ret;
}
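
The lock-ordering comment above captures the idiom used twice in this loop: operations that may block on a folio lock are first attempted in nonblocking mode while btree locks are held, and only if that fails are the btree locks dropped around a blocking retry via drop_locks_do(). A sketch of the pattern in isolation (names hypothetical, semantics inferred from the calls above):

static int example_reserve_pagecache(struct btree_trans *trans,
				     struct bch_inode_info *inode,
				     u64 *start, u64 end)
{
	/* nonblocking attempt while btree locks are still held */
	if (!bch2_mark_pagecache_reserved(inode, start, end, true))
		return 0;

	/* must block on folio locks: drop btree locks around the call */
	return drop_locks_do(trans,
		bch2_mark_pagecache_reserved(inode, start, end, false));
}
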
-ssize_t bch2_direct_IO(struct kiocb *req, struct iov_iter *iter)
+static noinline long bchfs_fallocate(struct bch_inode_info *inode, int mode,
+ loff_t offset, loff_t len)
{
- struct blk_plug plug;
- ssize_t ret;
-
- blk_start_plug(&plug);
- ret = iov_iter_rw(iter) == WRITE
- ? bch2_direct_IO_write(req, iter, false)
- : bch2_direct_IO_read(req, iter);
- blk_finish_plug(&plug);
-
- return ret;
-}
-
-static ssize_t
-bch2_direct_write(struct kiocb *iocb, struct iov_iter *iter)
-{
- return bch2_direct_IO_write(iocb, iter, true);
-}
-
-static ssize_t __bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
- struct file *file = iocb->ki_filp;
- struct bch_inode_info *inode = file_bch_inode(file);
- ssize_t ret;
-
- /* We can write back this queue in page reclaim */
- current->backing_dev_info = inode_to_bdi(&inode->v);
- ret = file_remove_privs(file);
- if (ret)
- goto out;
-
- ret = file_update_time(file);
- if (ret)
- goto out;
-
- ret = iocb->ki_flags & IOCB_DIRECT
- ? bch2_direct_write(iocb, from)
- : bch2_buffered_write(iocb, from);
-
- if (likely(ret > 0))
- iocb->ki_pos += ret;
-out:
- current->backing_dev_info = NULL;
- return ret;
-}
-
-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
- struct bch_inode_info *inode = file_bch_inode(iocb->ki_filp);
- bool direct = iocb->ki_flags & IOCB_DIRECT;
- ssize_t ret;
-
- inode_lock(&inode->v);
- ret = generic_write_checks(iocb, from);
- if (ret > 0)
- ret = __bch2_write_iter(iocb, from);
- inode_unlock(&inode->v);
-
- if (ret > 0 && !direct)
- ret = generic_write_sync(iocb, ret);
-
- return ret;
-}
-
-/* fsync: */
-
-int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
-{
- struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret;
-
- ret = file_write_and_wait_range(file, start, end);
- if (ret)
- return ret;
-
- if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC))
- goto out;
-
- ret = sync_inode_metadata(&inode->v, 1);
- if (ret)
- return ret;
-out:
- if (c->opts.journal_flush_disabled)
- return 0;
+ u64 end = offset + len;
+ u64 block_start = round_down(offset, block_bytes(c));
+ u64 block_end = round_up(end, block_bytes(c));
+ bool truncated_last_page = false;
+ int ret, ret2 = 0;
- return bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq);
-}
-
-/* truncate: */
-
-static inline int range_has_data(struct bch_fs *c,
- struct bpos start,
- struct bpos end)
-{
-
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
- start, 0, k) {
- if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
- break;
-
- if (bkey_extent_is_data(k.k)) {
- ret = 1;
- break;
- }
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
+ ret = inode_newsize_ok(&inode->v, end);
+ if (ret)
+ return ret;
}
- return bch2_btree_iter_unlock(&iter) ?: ret;
-}
-
-static int __bch2_truncate_page(struct bch_inode_info *inode,
- pgoff_t index, loff_t start, loff_t end)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct address_space *mapping = inode->v.i_mapping;
- unsigned start_offset = start & (PAGE_SIZE - 1);
- unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
- struct page *page;
- int ret = 0;
-
- /* Page boundary? Nothing to do */
- if (!((index == start >> PAGE_SHIFT && start_offset) ||
- (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE)))
- return 0;
+ if (mode & FALLOC_FL_ZERO_RANGE) {
+ ret = bch2_truncate_folios(inode, offset, end);
+ if (unlikely(ret < 0))
+ return ret;
- /* Above i_size? */
- if (index << PAGE_SHIFT >= inode->v.i_size)
- return 0;
+ truncated_last_page = ret;
- page = find_lock_page(mapping, index);
- if (!page) {
- /*
- * XXX: we're doing two index lookups when we end up reading the
- * page
- */
- ret = range_has_data(c,
- POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT),
- POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT));
- if (ret <= 0)
- return ret;
+ truncate_pagecache_range(&inode->v, offset, end - 1);
- page = find_or_create_page(mapping, index, GFP_KERNEL);
- if (unlikely(!page)) {
- ret = -ENOMEM;
- goto out;
- }
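+ /*
+ * Partial head/tail blocks were zeroed via the pagecache above;
+ * only the block-aligned middle of the range goes through the
+ * extents btree:
+ */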
+ block_start = round_up(offset, block_bytes(c));
+ block_end = round_down(end, block_bytes(c));
}
- if (!PageUptodate(page)) {
- ret = bch2_read_single_page(page, mapping);
- if (ret)
- goto unlock;
- }
+ ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9);
/*
- * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
- *
- * XXX: because we aren't currently tracking whether the page has actual
- * data in it (vs. just 0s, or only partially written) this wrong. ick.
+ * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update,
+ * so that the VFS cache i_size is consistent with the btree i_size:
*/
- ret = bch2_get_page_reservation(c, inode, page, false);
- BUG_ON(ret);
-
- if (index == start >> PAGE_SHIFT &&
- index == end >> PAGE_SHIFT)
- zero_user_segment(page, start_offset, end_offset);
- else if (index == start >> PAGE_SHIFT)
- zero_user_segment(page, start_offset, PAGE_SIZE);
- else if (index == end >> PAGE_SHIFT)
- zero_user_segment(page, 0, end_offset);
-
- if (!PageDirty(page))
- set_page_dirty(page);
-unlock:
- unlock_page(page);
- put_page(page);
-out:
- return ret;
-}
-
-static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from)
-{
- return __bch2_truncate_page(inode, from >> PAGE_SHIFT,
- from, from + PAGE_SIZE);
-}
-
-static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct address_space *mapping = inode->v.i_mapping;
- int ret;
-
- ret = filemap_write_and_wait_range(mapping,
- inode->ei_inode.bi_size, S64_MAX);
- if (ret)
+ if (ret &&
+ !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)))
return ret;
- truncate_setsize(&inode->v, iattr->ia_size);
- setattr_copy(&inode->v, iattr);
-
- mutex_lock(&inode->ei_update_lock);
- inode->v.i_mtime = inode->v.i_ctime = current_time(&inode->v);
- ret = bch2_write_inode_size(c, inode, inode->v.i_size);
- mutex_unlock(&inode->ei_update_lock);
+ if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size)
+ end = inode->v.i_size;
- return ret;
-}
-
-int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct address_space *mapping = inode->v.i_mapping;
- struct i_sectors_hook i_sectors_hook =
- i_sectors_hook_init(inode, BCH_INODE_I_SIZE_DIRTY);
- bool shrink;
- int ret = 0;
-
- inode_dio_wait(&inode->v);
- pagecache_block_get(&mapping->add_lock);
-
- BUG_ON(inode->v.i_size < inode->ei_inode.bi_size);
-
- shrink = iattr->ia_size <= inode->v.i_size;
+ if (end >= inode->v.i_size &&
+ (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) ||
+ !(mode & FALLOC_FL_KEEP_SIZE))) {
+ spin_lock(&inode->v.i_lock);
+ i_size_write(&inode->v, end);
+ spin_unlock(&inode->v.i_lock);
- if (!shrink) {
- ret = bch2_extend(inode, iattr);
- goto err_put_pagecache;
+ mutex_lock(&inode->ei_update_lock);
+ ret2 = bch2_write_inode_size(c, inode, end, 0);
+ mutex_unlock(&inode->ei_update_lock);
}
- ret = bch2_truncate_page(inode, iattr->ia_size);
- if (unlikely(ret))
- goto err_put_pagecache;
-
- if (iattr->ia_size > inode->ei_inode.bi_size)
- ret = filemap_write_and_wait_range(mapping,
- inode->ei_inode.bi_size,
- iattr->ia_size - 1);
- else if (iattr->ia_size & (PAGE_SIZE - 1))
- ret = filemap_write_and_wait_range(mapping,
- round_down(iattr->ia_size, PAGE_SIZE),
- iattr->ia_size - 1);
- if (ret)
- goto err_put_pagecache;
-
- i_sectors_hook.new_i_size = iattr->ia_size;
-
- ret = i_sectors_dirty_start(c, &i_sectors_hook);
- if (unlikely(ret))
- goto err_put_pagecache;
-
- truncate_setsize(&inode->v, iattr->ia_size);
-
- ret = bch2_inode_truncate(c, inode->v.i_ino,
- round_up(iattr->ia_size, PAGE_SIZE) >> 9,
- &i_sectors_hook.hook,
- &inode->ei_journal_seq);
- if (unlikely(ret))
- goto err_put_sectors_dirty;
-
- setattr_copy(&inode->v, iattr);
- inode->v.i_mtime = inode->v.i_ctime = current_time(&inode->v);
-out:
- ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
-err_put_pagecache:
- pagecache_block_put(&mapping->add_lock);
- return ret;
-err_put_sectors_dirty:
- /*
- * On error - in particular, bch2_truncate_page() error - don't clear
- * I_SIZE_DIRTY, as we've left data above i_size!:
- */
- i_sectors_hook.flags &= ~BCH_INODE_I_SIZE_DIRTY;
- goto out;
+ return ret ?: ret2;
}
-/* fallocate: */
-
-static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
+long bch2_fallocate_dispatch(struct file *file, int mode,
+ loff_t offset, loff_t len)
{
+ struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct address_space *mapping = inode->v.i_mapping;
- u64 ino = inode->v.i_ino;
- u64 discard_start = round_up(offset, PAGE_SIZE) >> 9;
- u64 discard_end = round_down(offset + len, PAGE_SIZE) >> 9;
- int ret = 0;
+ long ret;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate))
+ return -EROFS;
inode_lock(&inode->v);
inode_dio_wait(&inode->v);
- pagecache_block_get(&mapping->add_lock);
+ bch2_pagecache_block_get(inode);
- ret = __bch2_truncate_page(inode,
- offset >> PAGE_SHIFT,
- offset, offset + len);
- if (unlikely(ret))
+ ret = file_modified(file);
+ if (ret)
goto err;
- if (offset >> PAGE_SHIFT !=
- (offset + len) >> PAGE_SHIFT) {
- ret = __bch2_truncate_page(inode,
- (offset + len) >> PAGE_SHIFT,
- offset, offset + len);
- if (unlikely(ret))
- goto err;
- }
-
- truncate_pagecache_range(&inode->v, offset, offset + len - 1);
-
- if (discard_start < discard_end) {
- /*
- * We need to pass in a disk reservation here because we might
- * be splitting a compressed extent into two. This isn't a
- * problem with truncate because truncate will never split an
- * extent, only truncate it...
- */
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(c, 0);
- struct i_sectors_hook i_sectors_hook =
- i_sectors_hook_init(inode, 0);
- int ret;
-
- ret = i_sectors_dirty_start(c, &i_sectors_hook);
- if (unlikely(ret))
- goto err;
-
- ret = bch2_btree_delete_range(c,
- BTREE_ID_EXTENTS,
- POS(ino, discard_start),
- POS(ino, discard_end),
- ZERO_VERSION,
- &disk_res,
- &i_sectors_hook.hook,
- &inode->ei_journal_seq);
-
- ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
- }
+ if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
+ ret = bchfs_fallocate(inode, mode, offset, len);
+ else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
+ ret = bchfs_fpunch(inode, offset, len);
+ else if (mode == FALLOC_FL_INSERT_RANGE)
+ ret = bchfs_fcollapse_finsert(inode, offset, len, true);
+ else if (mode == FALLOC_FL_COLLAPSE_RANGE)
+ ret = bchfs_fcollapse_finsert(inode, offset, len, false);
+ else
+ ret = -EOPNOTSUPP;
err:
- pagecache_block_put(&mapping->add_lock);
+ bch2_pagecache_block_put(inode);
inode_unlock(&inode->v);
+ bch2_write_ref_put(c, BCH_WRITE_REF_fallocate);
- return ret;
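+ /* map private bcachefs error codes back to their standard errno class */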
+ return bch2_err_class(ret);
}
-static long bch2_fcollapse(struct bch_inode_info *inode,
- loff_t offset, loff_t len)
+/*
+ * Take a quota reservation for unallocated blocks in a given file range;
+ * does not check the pagecache.
+ */
+static int quota_reserve_range(struct bch_inode_info *inode,
+ struct quota_res *res,
+ u64 start, u64 end)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct address_space *mapping = inode->v.i_mapping;
- struct btree_trans trans;
- struct btree_iter *src, *dst;
- BKEY_PADDED(k) copy;
- struct bkey_s_c k;
- struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0);
- loff_t new_size;
- int ret;
+ u64 sectors = end - start;
+
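+ /*
+ * Subtract sectors already covered by allocated extents; only the
+ * unallocated remainder needs a quota reservation:
+ */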
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_max(trans, iter,
+ BTREE_ID_extents,
+ POS(inode->v.i_ino, start),
+ POS(inode->v.i_ino, end - 1),
+ inode->ei_inum.subvol, 0, k, ({
+ if (bkey_extent_is_allocation(k.k)) {
+ u64 s = min(end, k.k->p.offset) -
+ max(start, bkey_start_offset(k.k));
+ BUG_ON(s > sectors);
+ sectors -= s;
+ }
- if ((offset | len) & (block_bytes(c) - 1))
- return -EINVAL;
+ 0;
+ })));
- bch2_trans_init(&trans, c);
+ return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
+}
- dst = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
- POS(inode->v.i_ino, offset >> 9),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- BUG_ON(IS_ERR(dst));
+loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
+ struct file *file_dst, loff_t pos_dst,
+ loff_t len, unsigned remap_flags)
+{
+ struct bch_inode_info *src = file_bch_inode(file_src);
+ struct bch_inode_info *dst = file_bch_inode(file_dst);
+ struct bch_fs *c = src->v.i_sb->s_fs_info;
+ struct quota_res quota_res = { 0 };
+ s64 i_sectors_delta = 0;
+ u64 aligned_len;
+ loff_t ret = 0;
- /* position will be set from dst iter's position: */
- src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN,
- BTREE_ITER_SLOTS);
- BUG_ON(IS_ERR(src));
+ if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
+ return -EINVAL;
- /*
- * We need i_mutex to keep the page cache consistent with the extents
- * btree, and the btree consistent with i_size - we don't need outside
- * locking for the extents btree itself, because we're using linked
- * iterators
- */
- inode_lock(&inode->v);
- inode_dio_wait(&inode->v);
- pagecache_block_get(&mapping->add_lock);
+ if ((pos_src & (block_bytes(c) - 1)) ||
+ (pos_dst & (block_bytes(c) - 1)))
+ return -EINVAL;
- ret = -EINVAL;
- if (offset + len >= inode->v.i_size)
- goto err;
+ if (src == dst &&
+ abs(pos_src - pos_dst) < len)
+ return -EINVAL;
- if (inode->v.i_size < len)
- goto err;
+ lock_two_nondirectories(&src->v, &dst->v);
+ bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
- new_size = inode->v.i_size - len;
+ inode_dio_wait(&src->v);
+ inode_dio_wait(&dst->v);
- ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
- if (ret)
+ ret = generic_remap_file_range_prep(file_src, pos_src,
+ file_dst, pos_dst,
+ &len, remap_flags);
+ if (ret < 0 || len == 0)
goto err;
- ret = i_sectors_dirty_start(c, &i_sectors_hook);
+ aligned_len = round_up((u64) len, block_bytes(c));
+
+ ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping,
+ pos_dst, pos_dst + len - 1);
if (ret)
goto err;
- while (bkey_cmp(dst->pos,
- POS(inode->v.i_ino,
- round_up(new_size, PAGE_SIZE) >> 9)) < 0) {
- struct disk_reservation disk_res;
-
- bch2_btree_iter_set_pos(src,
- POS(dst->pos.inode, dst->pos.offset + (len >> 9)));
-
- k = bch2_btree_iter_peek_slot(src);
- if ((ret = btree_iter_err(k)))
- goto btree_iter_err;
-
- bkey_reassemble(&copy.k, k);
-
- bch2_cut_front(src->pos, &copy.k);
- copy.k.k.p.offset -= len >> 9;
-
- BUG_ON(bkey_cmp(dst->pos, bkey_start_pos(&copy.k.k)));
-
- ret = bch2_disk_reservation_get(c, &disk_res, copy.k.k.size,
- bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(&copy.k)),
- BCH_DISK_RESERVATION_NOFAIL);
- BUG_ON(ret);
-
- ret = bch2_btree_insert_at(c, &disk_res, &i_sectors_hook.hook,
- &inode->ei_journal_seq,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOFAIL,
- BTREE_INSERT_ENTRY(dst, &copy.k));
- bch2_disk_reservation_put(c, &disk_res);
-btree_iter_err:
- if (ret == -EINTR)
- ret = 0;
- if (ret) {
- bch2_trans_exit(&trans);
- goto err_put_sectors_dirty;
- }
- /*
- * XXX: if we error here we've left data with multiple
- * pointers... which isn't a _super_ serious problem...
- */
-
- bch2_btree_iter_cond_resched(src);
- }
-
- bch2_trans_exit(&trans);
-
- ret = bch2_inode_truncate(c, inode->v.i_ino,
- round_up(new_size, block_bytes(c)) >> 9,
- &i_sectors_hook.hook,
- &inode->ei_journal_seq);
+ ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9,
+ (pos_dst + aligned_len) >> 9);
if (ret)
- goto err_put_sectors_dirty;
-
- i_sectors_hook.new_i_size = new_size;
-err_put_sectors_dirty:
- ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
-err:
- pagecache_block_put(&mapping->add_lock);
- inode_unlock(&inode->v);
- return ret;
-}
-
-static long bch2_fallocate(struct bch_inode_info *inode, int mode,
- loff_t offset, loff_t len)
-{
- struct address_space *mapping = inode->v.i_mapping;
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0);
- struct btree_iter iter;
- struct bpos end_pos;
- loff_t block_start, block_end;
- loff_t end = offset + len;
- unsigned sectors;
- unsigned replicas = io_opts(c, inode).data_replicas;
- int ret;
-
- bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
- inode_lock(&inode->v);
- inode_dio_wait(&inode->v);
- pagecache_block_get(&mapping->add_lock);
-
- if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
- ret = inode_newsize_ok(&inode->v, end);
- if (ret)
- goto err;
- }
-
- if (mode & FALLOC_FL_ZERO_RANGE) {
- ret = __bch2_truncate_page(inode,
- offset >> PAGE_SHIFT,
- offset, end);
-
- if (!ret &&
- offset >> PAGE_SHIFT != end >> PAGE_SHIFT)
- ret = __bch2_truncate_page(inode,
- end >> PAGE_SHIFT,
- offset, end);
-
- if (unlikely(ret))
- goto err;
-
- truncate_pagecache_range(&inode->v, offset, end - 1);
-
- block_start = round_up(offset, PAGE_SIZE);
- block_end = round_down(end, PAGE_SIZE);
- } else {
- block_start = round_down(offset, PAGE_SIZE);
- block_end = round_up(end, PAGE_SIZE);
- }
-
- bch2_btree_iter_set_pos(&iter, POS(inode->v.i_ino, block_start >> 9));
- end_pos = POS(inode->v.i_ino, block_end >> 9);
-
- ret = i_sectors_dirty_start(c, &i_sectors_hook);
- if (unlikely(ret))
goto err;
- while (bkey_cmp(iter.pos, end_pos) < 0) {
- struct disk_reservation disk_res = { 0 };
- struct bkey_i_reservation reservation;
- struct bkey_s_c k;
-
- k = bch2_btree_iter_peek_slot(&iter);
- if ((ret = btree_iter_err(k)))
- goto btree_iter_err;
+ if (!(remap_flags & REMAP_FILE_DEDUP))
+ file_update_time(file_dst);
- /* already reserved */
- if (k.k->type == BCH_RESERVATION &&
- bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) {
- bch2_btree_iter_next_slot(&iter);
- continue;
- }
+ bch2_mark_pagecache_unallocated(src, pos_src >> 9,
+ (pos_src + aligned_len) >> 9);
- if (bkey_extent_is_data(k.k)) {
- if (!(mode & FALLOC_FL_ZERO_RANGE)) {
- bch2_btree_iter_next_slot(&iter);
- continue;
- }
- }
-
- bkey_reservation_init(&reservation.k_i);
- reservation.k.type = BCH_RESERVATION;
- reservation.k.p = k.k->p;
- reservation.k.size = k.k->size;
-
- bch2_cut_front(iter.pos, &reservation.k_i);
- bch2_cut_back(end_pos, &reservation.k);
-
- sectors = reservation.k.size;
- reservation.v.nr_replicas = bch2_extent_nr_dirty_ptrs(k);
-
- if (!bkey_extent_is_allocation(k.k)) {
- ret = bch2_quota_reservation_add(c, inode,
- &i_sectors_hook.quota_res,
- sectors, true);
- if (unlikely(ret))
- goto btree_iter_err;
- }
-
- if (reservation.v.nr_replicas < replicas ||
- bch2_extent_is_compressed(k)) {
- ret = bch2_disk_reservation_get(c, &disk_res, sectors,
- replicas, 0);
- if (unlikely(ret))
- goto btree_iter_err;
-
- reservation.v.nr_replicas = disk_res.nr_replicas;
- }
-
- ret = bch2_btree_insert_at(c, &disk_res, &i_sectors_hook.hook,
- &inode->ei_journal_seq,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOFAIL,
- BTREE_INSERT_ENTRY(&iter, &reservation.k_i));
- bch2_disk_reservation_put(c, &disk_res);
-btree_iter_err:
- if (ret == -EINTR)
- ret = 0;
- if (ret) {
- bch2_btree_iter_unlock(&iter);
- goto err_put_sectors_dirty;
- }
+ ret = bch2_remap_range(c,
+ inode_inum(dst), pos_dst >> 9,
+ inode_inum(src), pos_src >> 9,
+ aligned_len >> 9,
+ pos_dst + len, &i_sectors_delta);
+ if (ret < 0)
+ goto err;
- }
- bch2_btree_iter_unlock(&iter);
+ /*
+ * due to alignment, we might have remapped slightly more than requested
+ */
+ ret = min((u64) ret << 9, (u64) len);
- ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
+ bch2_i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- end > inode->v.i_size) {
- i_size_write(&inode->v, end);
+ spin_lock(&dst->v.i_lock);
+ if (pos_dst + ret > dst->v.i_size)
+ i_size_write(&dst->v, pos_dst + ret);
+ spin_unlock(&dst->v.i_lock);
- mutex_lock(&inode->ei_update_lock);
- ret = bch2_write_inode_size(c, inode, inode->v.i_size);
- mutex_unlock(&inode->ei_update_lock);
- }
-
- /* blech */
- if ((mode & FALLOC_FL_KEEP_SIZE) &&
- (mode & FALLOC_FL_ZERO_RANGE) &&
- inode->ei_inode.bi_size != inode->v.i_size) {
- /* sync appends.. */
- ret = filemap_write_and_wait_range(mapping,
- inode->ei_inode.bi_size, S64_MAX);
- if (ret)
- goto err;
-
- if (inode->ei_inode.bi_size != inode->v.i_size) {
- mutex_lock(&inode->ei_update_lock);
- ret = bch2_write_inode_size(c, inode, inode->v.i_size);
- mutex_unlock(&inode->ei_update_lock);
- }
- }
-
- pagecache_block_put(&mapping->add_lock);
- inode_unlock(&inode->v);
-
- return 0;
-err_put_sectors_dirty:
- ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
+ if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
+ IS_SYNC(file_inode(file_dst)))
+ ret = bch2_flush_inode(c, dst);
err:
- pagecache_block_put(&mapping->add_lock);
- inode_unlock(&inode->v);
- return ret;
-}
-
-long bch2_fallocate_dispatch(struct file *file, int mode,
- loff_t offset, loff_t len)
-{
- struct bch_inode_info *inode = file_bch_inode(file);
-
- if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
- return bch2_fallocate(inode, mode, offset, len);
-
- if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
- return bch2_fpunch(inode, offset, len);
-
- if (mode == FALLOC_FL_COLLAPSE_RANGE)
- return bch2_fcollapse(inode, offset, len);
+ bch2_quota_reservation_put(c, dst, &quota_res);
+ bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
+ unlock_two_nondirectories(&src->v, &dst->v);
- return -EOPNOTSUPP;
+ return bch2_err_class(ret);
}
/* fseek: */
-static bool page_is_data(struct page *page)
-{
- /* XXX: should only have to check PageDirty */
- return PagePrivate(page) &&
- (page_state(page)->sectors ||
- page_state(page)->dirty_sectors);
-}
-
-static loff_t bch2_next_pagecache_data(struct inode *vinode,
- loff_t start_offset,
- loff_t end_offset)
-{
- struct address_space *mapping = vinode->i_mapping;
- struct page *page;
- pgoff_t index;
-
- for (index = start_offset >> PAGE_SHIFT;
- index < end_offset >> PAGE_SHIFT;
- index++) {
- if (find_get_pages(mapping, &index, 1, &page)) {
- lock_page(page);
-
- if (page_is_data(page))
- end_offset =
- min(end_offset,
- max(start_offset,
- ((loff_t) index) << PAGE_SHIFT));
- unlock_page(page);
- put_page(page);
- } else {
- break;
- }
- }
-
- return end_offset;
-}
-
static loff_t bch2_seek_data(struct file *file, u64 offset)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_iter iter;
- struct bkey_s_c k;
+ subvol_inum inum = inode_inum(inode);
u64 isize, next_data = MAX_LFS_FILESIZE;
- int ret;
isize = i_size_read(&inode->v);
if (offset >= isize)
return -ENXIO;
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
- POS(inode->v.i_ino, offset >> 9), 0, k) {
- if (k.k->p.inode != inode->v.i_ino) {
- break;
- } else if (bkey_extent_is_data(k.k)) {
- next_data = max(offset, bkey_start_offset(k.k) << 9);
- break;
- } else if (k.k->p.offset >> 9 > isize)
- break;
- }
-
- ret = bch2_btree_iter_unlock(&iter);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
+ POS(inode->v.i_ino, offset >> 9),
+ POS(inode->v.i_ino, U64_MAX),
+ inum.subvol, 0, k, ({
+ if (bkey_extent_is_data(k.k)) {
+ next_data = max(offset, bkey_start_offset(k.k) << 9);
+ break;
+ } else if (k.k->p.offset >> 9 > isize)
+ break;
+ 0;
+ })));
if (ret)
return ret;
if (next_data > offset)
- next_data = bch2_next_pagecache_data(&inode->v,
- offset, next_data);
+ next_data = bch2_seek_pagecache_data(&inode->v,
+ offset, next_data, 0, false);
- if (next_data > isize)
+ if (next_data >= isize)
return -ENXIO;
return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
}
-static bool page_slot_is_data(struct address_space *mapping, pgoff_t index)
-{
- struct page *page;
- bool ret;
-
- page = find_lock_entry(mapping, index);
- if (!page || radix_tree_exception(page))
- return false;
-
- ret = page_is_data(page);
- unlock_page(page);
-
- return ret;
-}
-
-static loff_t bch2_next_pagecache_hole(struct inode *vinode,
- loff_t start_offset,
- loff_t end_offset)
-{
- struct address_space *mapping = vinode->i_mapping;
- pgoff_t index;
-
- for (index = start_offset >> PAGE_SHIFT;
- index < end_offset >> PAGE_SHIFT;
- index++)
- if (!page_slot_is_data(mapping, index))
- end_offset = max(start_offset,
- ((loff_t) index) << PAGE_SHIFT);
-
- return end_offset;
-}
-
static loff_t bch2_seek_hole(struct file *file, u64 offset)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_iter iter;
- struct bkey_s_c k;
+ subvol_inum inum = inode_inum(inode);
u64 isize, next_hole = MAX_LFS_FILESIZE;
- int ret;
isize = i_size_read(&inode->v);
if (offset >= isize)
return -ENXIO;
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
- POS(inode->v.i_ino, offset >> 9),
- BTREE_ITER_SLOTS, k) {
- if (k.k->p.inode != inode->v.i_ino) {
- next_hole = bch2_next_pagecache_hole(&inode->v,
- offset, MAX_LFS_FILESIZE);
- break;
- } else if (!bkey_extent_is_data(k.k)) {
- next_hole = bch2_next_pagecache_hole(&inode->v,
- max(offset, bkey_start_offset(k.k) << 9),
- k.k->p.offset << 9);
-
- if (next_hole < k.k->p.offset << 9)
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
+ POS(inode->v.i_ino, offset >> 9),
+ POS(inode->v.i_ino, U64_MAX),
+ inum.subvol, BTREE_ITER_slots, k, ({
+ if (k.k->p.inode != inode->v.i_ino) {
+ next_hole = bch2_seek_pagecache_hole(&inode->v,
+ offset, MAX_LFS_FILESIZE, 0, false);
break;
- } else {
- offset = max(offset, bkey_start_offset(k.k) << 9);
- }
- }
-
- ret = bch2_btree_iter_unlock(&iter);
+ } else if (!bkey_extent_is_data(k.k)) {
+ next_hole = bch2_seek_pagecache_hole(&inode->v,
+ max(offset, bkey_start_offset(k.k) << 9),
+ k.k->p.offset << 9, 0, false);
+
+ if (next_hole < k.k->p.offset << 9)
+ break;
+ } else {
+ offset = max(offset, bkey_start_offset(k.k) << 9);
+ }
+ 0;
+ })));
if (ret)
return ret;
@@ -2798,46 +1018,40 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
{
+ loff_t ret;
+
switch (whence) {
case SEEK_SET:
case SEEK_CUR:
case SEEK_END:
- return generic_file_llseek(file, offset, whence);
+ ret = generic_file_llseek(file, offset, whence);
+ break;
case SEEK_DATA:
- return bch2_seek_data(file, offset);
+ ret = bch2_seek_data(file, offset);
+ break;
case SEEK_HOLE:
- return bch2_seek_hole(file, offset);
+ ret = bch2_seek_hole(file, offset);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
}
- return -EINVAL;
+ return bch2_err_class(ret);
}
void bch2_fs_fsio_exit(struct bch_fs *c)
{
- bioset_exit(&c->dio_write_bioset);
- bioset_exit(&c->dio_read_bioset);
- bioset_exit(&c->writepage_bioset);
+ bioset_exit(&c->nocow_flush_bioset);
}
int bch2_fs_fsio_init(struct bch_fs *c)
{
- int ret = 0;
+ if (bioset_init(&c->nocow_flush_bioset,
+ 1, offsetof(struct nocow_flush, bio), 0))
+ return -BCH_ERR_ENOMEM_nocow_flush_bioset_init;
- pr_verbose_init(c->opts, "");
-
- if (bioset_init(&c->writepage_bioset,
- 4, offsetof(struct bch_writepage_io, op.op.wbio.bio),
- BIOSET_NEED_BVECS) ||
- bioset_init(&c->dio_read_bioset,
- 4, offsetof(struct dio_read, rbio.bio),
- BIOSET_NEED_BVECS) ||
- bioset_init(&c->dio_write_bioset,
- 4, offsetof(struct dio_write, iop.op.wbio.bio),
- BIOSET_NEED_BVECS))
- ret = -ENOMEM;
-
- pr_verbose_init(c->opts, "ret %i", ret);
- return ret;
+ return 0;
}
#endif /* NO_BCACHEFS_FS */
diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h
index 30d1ea9d..ca70346e 100644
--- a/libbcachefs/fs-io.h
+++ b/libbcachefs/fs-io.h
@@ -1,43 +1,178 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FS_IO_H
#define _BCACHEFS_FS_IO_H
#ifndef NO_BCACHEFS_FS
#include "buckets.h"
-#include "io_types.h"
+#include "fs.h"
+#include "io_write_types.h"
+#include "quota.h"
#include <linux/uio.h>
-int bch2_set_page_dirty(struct page *);
+struct folio_vec {
+ struct folio *fv_folio;
+ size_t fv_offset;
+ size_t fv_len;
+};
-int bch2_writepage(struct page *, struct writeback_control *);
-int bch2_readpage(struct file *, struct page *);
+static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv)
+{
-int bch2_writepages(struct address_space *, struct writeback_control *);
-int bch2_readpages(struct file *, struct address_space *,
- struct list_head *, unsigned);
+ struct folio *folio = page_folio(bv.bv_page);
+ size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) +
+ bv.bv_offset;
+ size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len);
-int bch2_write_begin(struct file *, struct address_space *, loff_t,
- unsigned, unsigned, struct page **, void **);
-int bch2_write_end(struct file *, struct address_space *, loff_t,
- unsigned, unsigned, struct page *, void *);
+ return (struct folio_vec) {
+ .fv_folio = folio,
+ .fv_offset = offset,
+ .fv_len = len,
+ };
+}
-ssize_t bch2_direct_IO(struct kiocb *, struct iov_iter *);
+static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio,
+ struct bvec_iter iter)
+{
+ return biovec_to_foliovec(bio_iter_iovec(bio, iter));
+}
-ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
+#define __bio_for_each_folio(bvl, bio, iter, start) \
+ for (iter = (start); \
+ (iter).bi_size && \
+ ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \
+ bio_advance_iter_single((bio), &(iter), (bvl).fv_len))
+
+/**
+ * bio_for_each_folio - iterate over folios within a bio
+ *
+ * Like other non-_all versions, this iterates over what bio->bi_iter currently
+ * points to. This version is for drivers, where the bio may have previously
+ * been split or cloned.
+ */
+#define bio_for_each_folio(bvl, bio, iter) \
+ __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter)
+
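+/*
+ * Usage sketch (illustrative); assumes @bio is a fully set up bio:
+ *
+ *	struct folio_vec fv;
+ *	struct bvec_iter iter;
+ *
+ *	bio_for_each_folio(fv, bio, iter)
+ *		flush_dcache_folio(fv.fv_folio);
+ */
+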
+struct quota_res {
+ u64 sectors;
+};
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+
+static inline void __bch2_quota_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res)
+{
+ BUG_ON(res->sectors > inode->ei_quota_reserved);
+
+ bch2_quota_acct(c, inode->ei_qid, Q_SPC,
+ -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
+ inode->ei_quota_reserved -= res->sectors;
+ res->sectors = 0;
+}
+
+static inline void bch2_quota_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res)
+{
+ if (res->sectors) {
+ mutex_lock(&inode->ei_quota_lock);
+ __bch2_quota_reservation_put(c, inode, res);
+ mutex_unlock(&inode->ei_quota_lock);
+ }
+}
+
+static inline int bch2_quota_reservation_add(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res,
+ u64 sectors,
+ bool check_enospc)
+{
+ int ret;
+
+ if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags))
+ return 0;
+
+ mutex_lock(&inode->ei_quota_lock);
+ ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
+ check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
+ if (likely(!ret)) {
+ inode->ei_quota_reserved += sectors;
+ res->sectors += sectors;
+ }
+ mutex_unlock(&inode->ei_quota_lock);
+
+ return ret;
+}
+
+#else
+
+static inline void __bch2_quota_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res) {}
+
+static inline void bch2_quota_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res) {}
+
+static inline int bch2_quota_reservation_add(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res,
+ unsigned sectors,
+ bool check_enospc)
+{
+ return 0;
+}
+
+#endif
+
+void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *,
+ struct quota_res *, s64);
+
+static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
+ struct quota_res *quota_res, s64 sectors)
+{
+ if (sectors) {
+ mutex_lock(&inode->ei_quota_lock);
+ __bch2_i_sectors_acct(c, inode, quota_res, sectors);
+ mutex_unlock(&inode->ei_quota_lock);
+ }
+}
+
+static inline struct address_space *faults_disabled_mapping(void)
+{
+ return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
+}
+
+static inline void set_fdm_dropped_locks(void)
+{
+ current->faults_disabled_mapping =
+ (void *) (((unsigned long) current->faults_disabled_mapping)|1);
+}
+
+static inline bool fdm_dropped_locks(void)
+{
+ return ((unsigned long) current->faults_disabled_mapping) & 1;
+}
+
+void bch2_inode_flush_nocow_writes_async(struct bch_fs *,
+ struct bch_inode_info *, struct closure *);
+
+int __must_check bch2_write_inode_size(struct bch_fs *,
+ struct bch_inode_info *,
+ loff_t, unsigned);
int bch2_fsync(struct file *, loff_t, loff_t, int);
-int bch2_truncate(struct bch_inode_info *, struct iattr *);
+int bchfs_truncate(struct mnt_idmap *,
+ struct bch_inode_info *, struct iattr *);
long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
-loff_t bch2_llseek(struct file *, loff_t, int);
+loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
+ loff_t, loff_t, unsigned);
-int bch2_page_mkwrite(struct vm_fault *);
-void bch2_invalidatepage(struct page *, unsigned int, unsigned int);
-int bch2_releasepage(struct page *, gfp_t);
-int bch2_migrate_page(struct address_space *, struct page *,
- struct page *, enum migrate_mode);
+loff_t bch2_llseek(struct file *, loff_t, int);
void bch2_fs_fsio_exit(struct bch_fs *);
int bch2_fs_fsio_init(struct bch_fs *);
diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c
index 336dbd4b..15725b4c 100644
--- a/libbcachefs/fs-ioctl.c
+++ b/libbcachefs/fs-ioctl.c
@@ -1,100 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS
#include "bcachefs.h"
#include "chardev.h"
+#include "dirent.h"
#include "fs.h"
+#include "fs-common.h"
#include "fs-ioctl.h"
#include "quota.h"
#include <linux/compat.h>
+#include <linux/fsnotify.h>
#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+#include <linux/writeback.h>
#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32)
-
-/* Inode flags: */
-
-/* bcachefs inode flags -> vfs inode flags: */
-static const unsigned bch_flags_to_vfs[] = {
- [__BCH_INODE_SYNC] = S_SYNC,
- [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE,
- [__BCH_INODE_APPEND] = S_APPEND,
- [__BCH_INODE_NOATIME] = S_NOATIME,
-};
-
-/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
-static const unsigned bch_flags_to_uflags[] = {
- [__BCH_INODE_SYNC] = FS_SYNC_FL,
- [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL,
- [__BCH_INODE_APPEND] = FS_APPEND_FL,
- [__BCH_INODE_NODUMP] = FS_NODUMP_FL,
- [__BCH_INODE_NOATIME] = FS_NOATIME_FL,
-};
-
-/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
-static const unsigned bch_flags_to_xflags[] = {
- [__BCH_INODE_SYNC] = FS_XFLAG_SYNC,
- [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE,
- [__BCH_INODE_APPEND] = FS_XFLAG_APPEND,
- [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP,
- [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME,
- //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT;
-};
-
-#define set_flags(_map, _in, _out) \
-do { \
- unsigned _i; \
- \
- for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
- if ((_in) & (1 << _i)) \
- (_out) |= _map[_i]; \
- else \
- (_out) &= ~_map[_i]; \
-} while (0)
-
-#define map_flags(_map, _in) \
-({ \
- unsigned _out = 0; \
- \
- set_flags(_map, _in, _out); \
- _out; \
-})
-
-#define map_flags_rev(_map, _in) \
-({ \
- unsigned _i, _out = 0; \
- \
- for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
- if ((_in) & _map[_i]) { \
- (_out) |= 1 << _i; \
- (_in) &= ~_map[_i]; \
- } \
- (_out); \
-})
-
-#define map_defined(_map) \
-({ \
- unsigned _in = ~0; \
- \
- map_flags_rev(_map, _in); \
-})
-
-/* Set VFS inode flags from bcachefs inode: */
-void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
-{
- set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
-}
+#define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
+#define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
+#define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
struct flags_set {
unsigned mask;
unsigned flags;
unsigned projid;
+
+ bool set_projinherit;
+ bool projinherit;
};
-static int bch2_inode_flags_set(struct bch_inode_info *inode,
+static int bch2_inode_flags_set(struct btree_trans *trans,
+ struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
void *p)
{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
/*
* We're relying on btree locking here for exclusion with other ioctl
* calls - use the flags in the btree (@bi), not inode->i_flags:
@@ -103,18 +45,24 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode,
unsigned newflags = s->flags;
unsigned oldflags = bi->bi_flags & s->mask;
- if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) &&
+ if (((newflags ^ oldflags) & (BCH_INODE_append|BCH_INODE_immutable)) &&
!capable(CAP_LINUX_IMMUTABLE))
return -EPERM;
- if (!S_ISREG(inode->v.i_mode) &&
- !S_ISDIR(inode->v.i_mode) &&
- (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags)
+ if (!S_ISREG(bi->bi_mode) &&
+ !S_ISDIR(bi->bi_mode) &&
+ (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags)
return -EINVAL;
+ if (s->set_projinherit) {
+ bi->bi_fields_set &= ~(1 << Inode_opt_project);
+ bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project);
+ }
+
bi->bi_flags &= ~s->mask;
bi->bi_flags |= newflags;
- inode->v.i_ctime = current_time(&inode->v);
+
+ bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v));
return 0;
}
@@ -146,16 +94,15 @@ static int bch2_ioc_setflags(struct bch_fs *c,
return ret;
inode_lock(&inode->v);
- if (!inode_owner_or_capable(&inode->v)) {
+ if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
ret = -EACCES;
goto setflags_out;
}
mutex_lock(&inode->ei_update_lock);
- ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &s, 0);
-
- if (!ret)
- bch2_inode_flags_to_vfs(inode);
+ ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
+ bch2_write_inode(c, inode, bch2_inode_flags_set, &s,
+ ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock);
setflags_out:
@@ -170,42 +117,31 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
struct fsxattr fa = { 0 };
fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
- fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
- return copy_to_user(arg, &fa, sizeof(fa));
-}
+ if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
+ fa.fsx_xflags |= FS_XFLAG_PROJINHERIT;
-static int bch2_set_projid(struct bch_fs *c,
- struct bch_inode_info *inode,
- u32 projid)
-{
- struct bch_qid qid = inode->ei_qid;
- int ret;
-
- if (projid == inode->ei_qid.q[QTYP_PRJ])
- return 0;
-
- qid.q[QTYP_PRJ] = projid;
+ fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
- return bch2_quota_transfer(c, 1 << QTYP_PRJ, qid, inode->ei_qid,
- inode->v.i_blocks +
- inode->ei_quota_reserved);
- if (ret)
- return ret;
+ if (copy_to_user(arg, &fa, sizeof(fa)))
+ return -EFAULT;
- inode->ei_qid.q[QTYP_PRJ] = projid;
return 0;
}
-static int fssetxattr_inode_update_fn(struct bch_inode_info *inode,
+static int fssetxattr_inode_update_fn(struct btree_trans *trans,
+ struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
void *p)
{
struct flags_set *s = p;
- bi->bi_project = s->projid;
+ if (s->projid != bi->bi_project) {
+ bi->bi_fields_set |= 1U << Inode_opt_project;
+ bi->bi_project = s->projid;
+ }
- return bch2_inode_flags_set(inode, bi, p);
+ return bch2_inode_flags_set(trans, inode, bi, p);
}
static int bch2_ioc_fssetxattr(struct bch_fs *c,
@@ -220,31 +156,38 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
if (copy_from_user(&fa, arg, sizeof(fa)))
return -EFAULT;
+ s.set_projinherit = true;
+ s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0;
+ fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT;
+
s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags);
if (fa.fsx_xflags)
return -EOPNOTSUPP;
- s.projid = fa.fsx_projid;
+ if (fa.fsx_projid >= U32_MAX)
+ return -EINVAL;
+
+ /*
+ * inode fields accessible via the xattr interface are stored with a +1
+ * bias, so that 0 means unset:
+ */
+ s.projid = fa.fsx_projid + 1;
ret = mnt_want_write_file(file);
if (ret)
return ret;
inode_lock(&inode->v);
- if (!inode_owner_or_capable(&inode->v)) {
+ if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
ret = -EACCES;
goto err;
}
mutex_lock(&inode->ei_update_lock);
- ret = bch2_set_projid(c, inode, fa.fsx_projid);
- if (ret)
- goto err_unlock;
-
- ret = __bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, 0);
- if (!ret)
- bch2_inode_flags_to_vfs(inode);
-err_unlock:
+ ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
+ bch2_set_projid(c, inode, fa.fsx_projid) ?:
+ bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
+ ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock);
err:
inode_unlock(&inode->v);
@@ -252,42 +195,411 @@ err:
return ret;
}
+static int bch2_reinherit_attrs_fn(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct bch_inode_info *dir = p;
+
+ return !bch2_reinherit_attrs(bi, &dir->ei_inode);
+}
+
+static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
+ struct file *file,
+ struct bch_inode_info *src,
+ const char __user *name)
+{
+ struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode);
+ struct bch_inode_info *dst;
+ struct inode *vinode = NULL;
+ char *kname = NULL;
+ struct qstr qstr;
+ int ret = 0;
+ subvol_inum inum;
+
+ kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL);
+ if (!kname)
+ return -ENOMEM;
+
+ ret = strncpy_from_user(kname, name, BCH_NAME_MAX);
+ if (unlikely(ret < 0))
+ goto err1;
+
+ qstr.len = ret;
+ qstr.name = kname;
+
+ ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum);
+ if (ret)
+ goto err1;
+
+ vinode = bch2_vfs_inode_get(c, inum);
+ ret = PTR_ERR_OR_ZERO(vinode);
+ if (ret)
+ goto err1;
+
+ dst = to_bch_ei(vinode);
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto err2;
+
+ bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst);
+
+ if (inode_attr_changing(src, dst, Inode_opt_project)) {
+ ret = bch2_fs_quota_transfer(c, dst,
+ src->ei_qid,
+ 1 << QTYP_PRJ,
+ KEY_TYPE_QUOTA_PREALLOC);
+ if (ret)
+ goto err3;
+ }
+
+ ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0);
+err3:
+ bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst);
+
+ /* return true if we did work */
+ if (ret >= 0)
+ ret = !ret;
+
+ mnt_drop_write_file(file);
+err2:
+ iput(vinode);
+err1:
+ kfree(kname);
+
+ return ret;
+}
+
+static int bch2_ioc_getversion(struct bch_inode_info *inode, u32 __user *arg)
+{
+ return put_user(inode->v.i_generation, arg);
+}
+
+static int bch2_ioc_getlabel(struct bch_fs *c, char __user *user_label)
+{
+ int ret;
+ size_t len;
+ char label[BCH_SB_LABEL_SIZE];
+
+ BUILD_BUG_ON(BCH_SB_LABEL_SIZE >= FSLABEL_MAX);
+
+ mutex_lock(&c->sb_lock);
+ memcpy(label, c->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
+ mutex_unlock(&c->sb_lock);
+
+ len = strnlen(label, BCH_SB_LABEL_SIZE);
+ if (len == BCH_SB_LABEL_SIZE) {
+ bch_warn(c,
+ "label is too long, return the first %zu bytes",
+ --len);
+ }
+
+ ret = copy_to_user(user_label, label, len);
+
+ return ret ? -EFAULT : 0;
+}
+
+static int bch2_ioc_setlabel(struct bch_fs *c,
+ struct file *file,
+ struct bch_inode_info *inode,
+ const char __user *user_label)
+{
+ int ret;
+ char label[BCH_SB_LABEL_SIZE];
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(label, user_label, sizeof(label)))
+ return -EFAULT;
+
+ if (strnlen(label, BCH_SB_LABEL_SIZE) == BCH_SB_LABEL_SIZE) {
+ bch_err(c,
+ "unable to set label with more than %d bytes",
+ BCH_SB_LABEL_SIZE - 1);
+ return -EINVAL;
+ }
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ mutex_lock(&c->sb_lock);
+ strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE);
+ ret = bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
+{
+ u32 flags;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (get_user(flags, arg))
+ return -EFAULT;
+
+ bch_notice(c, "shutdown by ioctl type %u", flags);
+
+ switch (flags) {
+ case FSOP_GOING_FLAGS_DEFAULT:
+ ret = bdev_freeze(c->vfs_sb->s_bdev);
+ if (ret)
+ break;
+ bch2_journal_flush(&c->journal);
+ bch2_fs_emergency_read_only(c);
+ bdev_thaw(c->vfs_sb->s_bdev);
+ break;
+ case FSOP_GOING_FLAGS_LOGFLUSH:
+ bch2_journal_flush(&c->journal);
+ fallthrough;
+ case FSOP_GOING_FLAGS_NOLOGFLUSH:
+ bch2_fs_emergency_read_only(c);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
+ struct bch_ioctl_subvolume arg)
+{
+ struct inode *dir;
+ struct bch_inode_info *inode;
+ struct user_namespace *s_user_ns;
+ struct dentry *dst_dentry;
+ struct path src_path, dst_path;
+ int how = LOOKUP_FOLLOW;
+ int error;
+ subvol_inum snapshot_src = { 0 };
+ unsigned lookup_flags = 0;
+ unsigned create_flags = BCH_CREATE_SUBVOL;
+
+ if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE|
+ BCH_SUBVOL_SNAPSHOT_RO))
+ return -EINVAL;
+
+ if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
+ (arg.src_ptr ||
+ (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)))
+ return -EINVAL;
+
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+ create_flags |= BCH_CREATE_SNAPSHOT;
+
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
+ create_flags |= BCH_CREATE_SNAPSHOT_RO;
+
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) {
+ /* sync_inodes_sb() requires s_umount to be held */
+ down_read(&c->vfs_sb->s_umount);
+ sync_inodes_sb(c->vfs_sb);
+ up_read(&c->vfs_sb->s_umount);
+ }
+
+ if (arg.src_ptr) {
+ error = user_path_at(arg.dirfd,
+ (const char __user *)(unsigned long)arg.src_ptr,
+ how, &src_path);
+ if (error)
+ goto err1;
+
+ if (src_path.dentry->d_sb->s_fs_info != c) {
+ path_put(&src_path);
+ error = -EXDEV;
+ goto err1;
+ }
+
+ snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode));
+ }
+
+ dst_dentry = user_path_create(arg.dirfd,
+ (const char __user *)(unsigned long)arg.dst_ptr,
+ &dst_path, lookup_flags);
+ error = PTR_ERR_OR_ZERO(dst_dentry);
+ if (error)
+ goto err2;
+
+ if (dst_dentry->d_sb->s_fs_info != c) {
+ error = -EXDEV;
+ goto err3;
+ }
+
+ if (dst_dentry->d_inode) {
+ error = -BCH_ERR_EEXIST_subvolume_create;
+ goto err3;
+ }
+
+ dir = dst_path.dentry->d_inode;
+ if (IS_DEADDIR(dir)) {
+ error = -BCH_ERR_ENOENT_directory_dead;
+ goto err3;
+ }
+
+ s_user_ns = dir->i_sb->s_user_ns;
+ if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
+ !kgid_has_mapping(s_user_ns, current_fsgid())) {
+ error = -EOVERFLOW;
+ goto err3;
+ }
+
+ error = inode_permission(file_mnt_idmap(filp),
+ dir, MAY_WRITE | MAY_EXEC);
+ if (error)
+ goto err3;
+
+ if (!IS_POSIXACL(dir))
+ arg.mode &= ~current_umask();
+
+ error = security_path_mkdir(&dst_path, dst_dentry, arg.mode);
+ if (error)
+ goto err3;
+
+ if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
+ !arg.src_ptr)
+ snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol;
+
+ down_write(&c->snapshot_create_lock);
+ inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir),
+ dst_dentry, arg.mode|S_IFDIR,
+ 0, snapshot_src, create_flags);
+ up_write(&c->snapshot_create_lock);
+
+ error = PTR_ERR_OR_ZERO(inode);
+ if (error)
+ goto err3;
+
+ d_instantiate(dst_dentry, &inode->v);
+ fsnotify_mkdir(dir, dst_dentry);
+err3:
+ done_path_create(&dst_path, dst_dentry);
+err2:
+ if (arg.src_ptr)
+ path_put(&src_path);
+err1:
+ return error;
+}
+
+static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
+ struct bch_ioctl_subvolume arg)
+{
+ const char __user *name = (void __user *)(unsigned long)arg.dst_ptr;
+ struct path path;
+ struct inode *dir;
+ struct dentry *victim;
+ int ret = 0;
+
+ if (arg.flags)
+ return -EINVAL;
+
+ victim = user_path_locked_at(arg.dirfd, name, &path);
+ if (IS_ERR(victim))
+ return PTR_ERR(victim);
+
+ dir = d_inode(path.dentry);
+ if (victim->d_sb->s_fs_info != c) {
+ ret = -EXDEV;
+ goto err;
+ }
+ if (!d_is_positive(victim)) {
+ ret = -ENOENT;
+ goto err;
+ }
+ ret = __bch2_unlink(dir, victim, true);
+ if (!ret) {
+ fsnotify_rmdir(dir, victim);
+ d_delete(victim);
+ }
+err:
+ inode_unlock(dir);
+ dput(victim);
+ path_put(&path);
+ return ret;
+}
+
long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
struct bch_inode_info *inode = file_bch_inode(file);
- struct super_block *sb = inode->v.i_sb;
- struct bch_fs *c = sb->s_fs_info;
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ long ret;
switch (cmd) {
case FS_IOC_GETFLAGS:
- return bch2_ioc_getflags(inode, (int __user *) arg);
+ ret = bch2_ioc_getflags(inode, (int __user *) arg);
+ break;
case FS_IOC_SETFLAGS:
- return bch2_ioc_setflags(c, file, inode, (int __user *) arg);
+ ret = bch2_ioc_setflags(c, file, inode, (int __user *) arg);
+ break;
case FS_IOC_FSGETXATTR:
- return bch2_ioc_fsgetxattr(inode, (void __user *) arg);
+ ret = bch2_ioc_fsgetxattr(inode, (void __user *) arg);
+ break;
+
case FS_IOC_FSSETXATTR:
- return bch2_ioc_fssetxattr(c, file, inode, (void __user *) arg);
+ ret = bch2_ioc_fssetxattr(c, file, inode,
+ (void __user *) arg);
+ break;
+
+ case BCHFS_IOC_REINHERIT_ATTRS:
+ ret = bch2_ioc_reinherit_attrs(c, file, inode,
+ (void __user *) arg);
+ break;
case FS_IOC_GETVERSION:
- return -ENOTTY;
+ ret = bch2_ioc_getversion(inode, (u32 __user *) arg);
+ break;
+
case FS_IOC_SETVERSION:
- return -ENOTTY;
+ ret = -ENOTTY;
+ break;
+
+ case FS_IOC_GETFSLABEL:
+ ret = bch2_ioc_getlabel(c, (void __user *) arg);
+ break;
+
+ case FS_IOC_SETFSLABEL:
+ ret = bch2_ioc_setlabel(c, file, inode, (const void __user *) arg);
+ break;
case FS_IOC_GOINGDOWN:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ ret = bch2_ioc_goingdown(c, (u32 __user *) arg);
+ break;
- down_write(&sb->s_umount);
- sb->s_flags |= MS_RDONLY;
- bch2_fs_emergency_read_only(c);
- up_write(&sb->s_umount);
- return 0;
+ case BCH_IOCTL_SUBVOLUME_CREATE: {
+ struct bch_ioctl_subvolume i;
+
+ ret = copy_from_user(&i, (void __user *) arg, sizeof(i))
+ ? -EFAULT
+ : bch2_ioctl_subvolume_create(c, file, i);
+ break;
+ }
+
+ case BCH_IOCTL_SUBVOLUME_DESTROY: {
+ struct bch_ioctl_subvolume i;
+
+ ret = copy_from_user(&i, (void __user *) arg, sizeof(i))
+ ? -EFAULT
+ : bch2_ioctl_subvolume_destroy(c, file, i);
+ break;
+ }
default:
- return bch2_fs_ioctl(c, cmd, (void __user *) arg);
+ ret = bch2_fs_ioctl(c, cmd, (void __user *) arg);
+ break;
}
+
+ return bch2_err_class(ret);
}
#ifdef CONFIG_COMPAT
@@ -295,12 +607,18 @@ long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
/* These are just misnamed, they actually get/put from/to user an int */
switch (cmd) {
- case FS_IOC_GETFLAGS:
+ case FS_IOC32_GETFLAGS:
cmd = FS_IOC_GETFLAGS;
break;
case FS_IOC32_SETFLAGS:
cmd = FS_IOC_SETFLAGS;
break;
+ case FS_IOC32_GETVERSION:
+ cmd = FS_IOC_GETVERSION;
+ break;
+ case FS_IOC_GETFSLABEL:
+ case FS_IOC_SETFSLABEL:
+ break;
default:
return -ENOIOCTLCMD;
}
diff --git a/libbcachefs/fs-ioctl.h b/libbcachefs/fs-ioctl.h
index c14e583d..d30f9bb0 100644
--- a/libbcachefs/fs-ioctl.h
+++ b/libbcachefs/fs-ioctl.h
@@ -1,7 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FS_IOCTL_H
#define _BCACHEFS_FS_IOCTL_H
-void bch2_inode_flags_to_vfs(struct bch_inode_info *);
+/* Inode flags: */
+
+/* bcachefs inode flags -> vfs inode flags: */
+static const __maybe_unused unsigned bch_flags_to_vfs[] = {
+ [__BCH_INODE_sync] = S_SYNC,
+ [__BCH_INODE_immutable] = S_IMMUTABLE,
+ [__BCH_INODE_append] = S_APPEND,
+ [__BCH_INODE_noatime] = S_NOATIME,
+};
+
+/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
+static const __maybe_unused unsigned bch_flags_to_uflags[] = {
+ [__BCH_INODE_sync] = FS_SYNC_FL,
+ [__BCH_INODE_immutable] = FS_IMMUTABLE_FL,
+ [__BCH_INODE_append] = FS_APPEND_FL,
+ [__BCH_INODE_nodump] = FS_NODUMP_FL,
+ [__BCH_INODE_noatime] = FS_NOATIME_FL,
+};
+
+/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
+static const __maybe_unused unsigned bch_flags_to_xflags[] = {
+ [__BCH_INODE_sync] = FS_XFLAG_SYNC,
+ [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE,
+ [__BCH_INODE_append] = FS_XFLAG_APPEND,
+ [__BCH_INODE_nodump] = FS_XFLAG_NODUMP,
+ [__BCH_INODE_noatime] = FS_XFLAG_NOATIME,
+ //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT;
+};
+
+#define set_flags(_map, _in, _out) \
+do { \
+ unsigned _i; \
+ \
+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
+ if ((_in) & (1 << _i)) \
+ (_out) |= _map[_i]; \
+ else \
+ (_out) &= ~_map[_i]; \
+} while (0)
+
+#define map_flags(_map, _in) \
+({ \
+ unsigned _out = 0; \
+ \
+ set_flags(_map, _in, _out); \
+ _out; \
+})
+
+#define map_flags_rev(_map, _in) \
+({ \
+ unsigned _i, _out = 0; \
+ \
+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
+ if ((_in) & _map[_i]) { \
+ (_out) |= 1 << _i; \
+ (_in) &= ~_map[_i]; \
+ } \
+ (_out); \
+})
+
+#define map_defined(_map) \
+({ \
+ unsigned _in = ~0; \
+ \
+ map_flags_rev(_map, _in); \
+})
+
+/* Set VFS inode flags from bcachefs inode: */
+static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
+{
+ set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
+}
long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);
long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long);
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index c51a65da..c6e7df7c 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -1,339 +1,593 @@
+// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS
#include "bcachefs.h"
#include "acl.h"
+#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
#include "chardev.h"
#include "dirent.h"
+#include "errcode.h"
#include "extents.h"
#include "fs.h"
+#include "fs-common.h"
#include "fs-io.h"
#include "fs-ioctl.h"
+#include "fs-io-buffered.h"
+#include "fs-io-direct.h"
+#include "fs-io-pagecache.h"
#include "fsck.h"
#include "inode.h"
-#include "io.h"
+#include "io_read.h"
#include "journal.h"
#include "keylist.h"
#include "quota.h"
+#include "rebalance.h"
+#include "snapshot.h"
#include "super.h"
#include "xattr.h"
+#include "trace.h"
#include <linux/aio.h>
#include <linux/backing-dev.h>
#include <linux/exportfs.h>
+#include <linux/fiemap.h>
+#include <linux/fs_context.h>
#include <linux/module.h>
+#include <linux/pagemap.h>
#include <linux/posix_acl.h>
#include <linux/random.h>
+#include <linux/seq_file.h>
+#include <linux/siphash.h>
#include <linux/statfs.h>
+#include <linux/string.h>
#include <linux/xattr.h>
static struct kmem_cache *bch2_inode_cache;
-static void bch2_vfs_inode_init(struct bch_fs *,
+static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
struct bch_inode_info *,
- struct bch_inode_unpacked *);
+ struct bch_inode_unpacked *,
+ struct bch_subvolume *);
-static void journal_seq_copy(struct bch_inode_info *dst,
- u64 journal_seq)
-{
- u64 old, v = READ_ONCE(dst->ei_journal_seq);
-
- do {
- old = v;
-
- if (old >= journal_seq)
- break;
- } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old);
-}
-
-/*
- * I_SIZE_DIRTY requires special handling:
- *
- * To the recovery code, the flag means that there is stale data past i_size
- * that needs to be deleted; it's used for implementing atomic appends and
- * truncates.
- *
- * On append, we set I_SIZE_DIRTY before doing the write, then after the write
- * we clear I_SIZE_DIRTY atomically with updating i_size to the new larger size
- * that exposes the data we just wrote.
- *
- * On truncate, it's the reverse: We set I_SIZE_DIRTY atomically with setting
- * i_size to the new smaller size, then we delete the data that we just made
- * invisible, and then we clear I_SIZE_DIRTY.
- *
- * Because there can be multiple appends in flight at a time, we need a refcount
- * (i_size_dirty_count) instead of manipulating the flag directly. Nonzero
- * refcount means I_SIZE_DIRTY is set, zero means it's cleared.
- *
- * Because write_inode() can be called at any time, i_size_dirty_count means
- * something different to the runtime code - it means to write_inode() "don't
- * update i_size yet".
- *
- * We don't clear I_SIZE_DIRTY directly, we let write_inode() clear it when
- * i_size_dirty_count is zero - but the reverse is not true, I_SIZE_DIRTY must
- * be set explicitly.
- */
-
-void bch2_inode_update_after_write(struct bch_fs *c,
+void bch2_inode_update_after_write(struct btree_trans *trans,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
unsigned fields)
{
- set_nlink(&inode->v, bi->bi_flags & BCH_INODE_UNLINKED
- ? 0
- : bi->bi_nlink + nlink_bias(inode->v.i_mode));
+ struct bch_fs *c = trans->c;
+
+ BUG_ON(bi->bi_inum != inode->v.i_ino);
+
+ bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum));
+
+ set_nlink(&inode->v, bch2_inode_nlink_get(bi));
i_uid_write(&inode->v, bi->bi_uid);
i_gid_write(&inode->v, bi->bi_gid);
inode->v.i_mode = bi->bi_mode;
if (fields & ATTR_ATIME)
- inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime);
+ inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
if (fields & ATTR_MTIME)
- inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime);
+ inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
if (fields & ATTR_CTIME)
- inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime);
+ inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));
inode->ei_inode = *bi;
- inode->ei_qid = bch_qid(bi);
+
+ bch2_inode_flags_to_vfs(inode);
}
-int __must_check bch2_write_inode_trans(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *inode_u,
- inode_set_fn set,
- void *p)
+int __must_check bch2_write_inode(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ inode_set_fn set,
+ void *p, unsigned fields)
{
- struct btree_iter *iter;
- struct bkey_inode_buf *inode_p;
- struct bkey_s_c k;
- u64 inum = inode->v.i_ino;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter = { NULL };
+ struct bch_inode_unpacked inode_u;
int ret;
+retry:
+ bch2_trans_begin(trans);
- lockdep_assert_held(&inode->ei_update_lock);
+ ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent);
+ if (ret)
+ goto err;
- iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inum, 0),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
+ struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u);
- k = bch2_btree_iter_peek_slot(iter);
- if ((ret = btree_iter_err(k)))
- return ret;
+ ret = (set ? set(trans, inode, &inode_u, p) : 0);
+ if (ret)
+ goto err;
- if (WARN_ONCE(k.k->type != BCH_INODE_FS,
- "inode %llu not found when updating", inum))
- return -ENOENT;
+ struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u);
- ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode_u);
- if (WARN_ONCE(ret,
- "error %i unpacking inode %llu", ret, inum))
- return -ENOENT;
+ if (memcmp(&old_r, &new_r, sizeof(new_r))) {
+ ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum);
+ if (ret)
+ goto err;
+ }
- BUG_ON(inode_u->bi_size != inode->ei_inode.bi_size);
+ ret = bch2_inode_write(trans, &iter, &inode_u) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
- BUG_ON(inode_u->bi_size != inode->ei_inode.bi_size &&
- !(inode_u->bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
- inode_u->bi_size > i_size_read(&inode->v));
+ /*
+ * the btree node lock protects inode->ei_inode, not ei_update_lock;
+ * this is important for inode updates via bchfs_write_index_update
+ */
+ if (!ret)
+ bch2_inode_update_after_write(trans, inode, &inode_u, fields);
+err:
+ bch2_trans_iter_exit(trans, &iter);
- if (set) {
- ret = set(inode, inode_u, p);
- if (ret)
- return ret;
- }
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
- inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
- if (IS_ERR(inode_p))
- return PTR_ERR(inode_p);
+ bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
+ "%s: inode %llu:%llu not found when updating",
+ bch2_err_str(ret),
+ inode_inum(inode).subvol,
+ inode_inum(inode).inum);
- bch2_inode_pack(inode_p, inode_u);
- bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
- return 0;
+ bch2_trans_put(trans);
+ return ret < 0 ? ret : 0;
}
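
The retry loop above is the standard bcachefs transaction idiom: everything between bch2_trans_begin() and the commit can run more than once if the commit returns BCH_ERR_transaction_restart, so any inode_set_fn passed in has to be safe to re-run. A minimal caller sketch, assuming a hypothetical callback name and on-disk field (only the bch2_write_inode() signature itself comes from this patch; the locking conventions of real callers are omitted):

static int set_foo_fn(struct btree_trans *trans, struct bch_inode_info *inode,
		      struct bch_inode_unpacked *bi, void *p)
{
	bi->bi_foo = *(u64 *) p;	/* bi_foo is a made-up field */
	return 0;
}

	/* ...somewhere in a caller: */
	u64 v = 1;
	int ret = bch2_write_inode(c, inode, set_foo_fn, &v, 0);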
-int __must_check __bch2_write_inode(struct bch_fs *c,
- struct bch_inode_info *inode,
- inode_set_fn set,
- void *p, unsigned fields)
+int bch2_fs_quota_transfer(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct bch_qid new_qid,
+ unsigned qtypes,
+ enum quota_acct_mode mode)
{
- struct btree_trans trans;
- struct bch_inode_unpacked inode_u;
+ unsigned i;
int ret;
- bch2_trans_init(&trans, c);
-retry:
- bch2_trans_begin(&trans);
-
- ret = bch2_write_inode_trans(&trans, inode, &inode_u, set, p) ?:
- bch2_trans_commit(&trans, NULL, NULL,
- &inode->ei_journal_seq,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOUNLOCK|
- BTREE_INSERT_NOFAIL);
- if (ret == -EINTR)
- goto retry;
+ qtypes &= enabled_qtypes(c);
+
+ for (i = 0; i < QTYP_NR; i++)
+ if (new_qid.q[i] == inode->ei_qid.q[i])
+ qtypes &= ~(1U << i);
+
+ if (!qtypes)
+ return 0;
+
+ mutex_lock(&inode->ei_quota_lock);
+
+ ret = bch2_quota_transfer(c, qtypes, new_qid,
+ inode->ei_qid,
+ inode->v.i_blocks +
+ inode->ei_quota_reserved,
+ mode);
+ if (!ret)
+ for (i = 0; i < QTYP_NR; i++)
+ if (qtypes & (1 << i))
+ inode->ei_qid.q[i] = new_qid.q[i];
+
+ mutex_unlock(&inode->ei_quota_lock);
+
+ return ret;
+}
+
+static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
+{
+ return a.subvol == b.subvol && a.inum == b.inum;
+}
+
+static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
+{
+ const subvol_inum *inum = data;
+ siphash_key_t k = { .key[0] = seed };
+
+ return siphash_2u64(inum->subvol, inum->inum, &k);
+}
+
+static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed)
+{
+ const struct bch_inode_info *inode = data;
+
+ return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed);
+}
+
+static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const struct bch_inode_info *inode = obj;
+ const subvol_inum *v = arg->key;
+
+ return !subvol_inum_eq(inode->ei_inum, *v);
+}
+
+static const struct rhashtable_params bch2_vfs_inodes_params = {
+ .head_offset = offsetof(struct bch_inode_info, hash),
+ .key_offset = offsetof(struct bch_inode_info, ei_inum),
+ .key_len = sizeof(subvol_inum),
+ .hashfn = bch2_vfs_inode_hash_fn,
+ .obj_hashfn = bch2_vfs_inode_obj_hash_fn,
+ .obj_cmpfn = bch2_vfs_inode_cmp_fn,
+ .automatic_shrinking = true,
+};
+
+static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = {
+ .head_offset = offsetof(struct bch_inode_info, by_inum_hash),
+ .key_offset = offsetof(struct bch_inode_info, ei_inum.inum),
+ .key_len = sizeof(u64),
+ .automatic_shrinking = true,
+};
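
Two views of the same cached inodes are kept here: vfs_inodes_table is keyed on the full (subvol, inum) pair via siphash, while vfs_inodes_by_inum_table is an rhltable keyed on the bare inode number, so every cached inode sharing an inum, whatever its subvolume, lands on one list. A rough sketch of a stock lookup over the by-inum table using these params (illustrative helper only, caller assumed to hold rcu_read_lock(); the function that follows open-codes its own walk so it can also cope with table resizes and allocation failures mid-scan):

static void walk_cached_inodes_with_inum(struct bch_fs *c, u64 inum)
{
	struct rhlist_head *list = rhltable_lookup(&c->vfs_inodes_by_inum_table,
						   &inum, bch2_vfs_inodes_by_inum_params);
	struct rhlist_head *pos;
	struct bch_inode_info *inode;

	rhl_for_each_entry_rcu(inode, pos, list, by_inum_hash)
		pr_debug("inum %llu cached in subvol %u\n",
			 inum, inode->ei_inum.subvol);
}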
+
+int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
+{
+ struct bch_fs *c = trans->c;
+ struct rhltable *ht = &c->vfs_inodes_by_inum_table;
+ u64 inum = p.offset;
+ DARRAY(u32) subvols;
+ int ret = 0;
+
+ if (!test_bit(BCH_FS_started, &c->flags))
+ return false;
+
+ darray_init(&subvols);
+restart_from_top:
/*
- * the btree node lock protects inode->ei_inode, not ei_update_lock;
- * this is important for inode updates via bchfs_write_index_update
+ * Tweaked version of __rhashtable_lookup(); we need to get a list of
+ * subvolumes in which the given inode number is open.
+ *
+ * For this to work, we don't include the subvolume ID in the key that
+ * we hash - all inodes with the same inode number regardless of
+ * subvolume will hash to the same slot.
+ *
+ * This will be less than ideal if the same file is ever open
+ * simultaneously in many different snapshots:
*/
- if (!ret)
- bch2_inode_update_after_write(c, inode, &inode_u, fields);
+ rcu_read_lock();
+ struct rhash_lock_head __rcu *const *bkt;
+ struct rhash_head *he;
+ unsigned int hash;
+ struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht);
+restart:
+ hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params);
+ bkt = rht_bucket(tbl, hash);
+ do {
+ struct bch_inode_info *inode;
+
+ rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
+ if (inode->ei_inum.inum == inum) {
+ ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
+ GFP_NOWAIT|__GFP_NOWARN);
+ if (ret) {
+ rcu_read_unlock();
+ ret = darray_make_room(&subvols, 1);
+ if (ret)
+ goto err;
+ subvols.nr = 0;
+ goto restart_from_top;
+ }
+ }
+ }
+		/* An object might have been moved to a different hash chain
+		 * while we walk along it - better check and retry.
+		 */
+ } while (he != RHT_NULLS_MARKER(bkt));
- bch2_trans_exit(&trans);
- return ret < 0 ? ret : 0;
+ /* Ensure we see any new tables. */
+ smp_rmb();
+
+ tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht);
+ if (unlikely(tbl))
+ goto restart;
+ rcu_read_unlock();
+
+ darray_for_each(subvols, i) {
+ u32 snap;
+ ret = bch2_subvolume_get_snapshot(trans, *i, &snap);
+ if (ret)
+ goto err;
+
+ ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot);
+ if (ret)
+ break;
+ }
+err:
+ darray_exit(&subvols);
+ return ret;
}
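
One behavioural note worth spelling out (editorial, not from the patch):

	/*
	 * Despite the boolean-sounding name, the return value is an int:
	 * a negative bch2 error, 0 if no cached inode is relevant, or the
	 * nonzero bch2_snapshot_is_ancestor() result when some subvolume
	 * holding this inum open is an ancestor of p.snapshot.
	 */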
-static struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
+static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
+{
+ return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
+}
+
+static void __wait_on_freeing_inode(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ subvol_inum inum)
+{
+ wait_queue_head_t *wq;
+ struct wait_bit_queue_entry wait;
+
+ wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW);
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode->v.i_lock);
+
+ if (__bch2_inode_hash_find(c, inum) == inode)
+ schedule_timeout(HZ * 10);
+ finish_wait(wq, &wait.wq_entry);
+}
+
+static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans,
+ subvol_inum inum)
{
- struct bch_inode_unpacked inode_u;
struct bch_inode_info *inode;
- int ret;
+repeat:
+ inode = __bch2_inode_hash_find(c, inum);
+ if (inode) {
+ spin_lock(&inode->v.i_lock);
+ if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
+ spin_unlock(&inode->v.i_lock);
+ return NULL;
+ }
+ if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
+ if (!trans) {
+ __wait_on_freeing_inode(c, inode, inum);
+ } else {
+ bch2_trans_unlock(trans);
+ __wait_on_freeing_inode(c, inode, inum);
+ int ret = bch2_trans_relock(trans);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+ goto repeat;
+ }
+ __iget(&inode->v);
+ spin_unlock(&inode->v.i_lock);
+ }
- inode = to_bch_ei(iget_locked(c->vfs_sb, inum));
- if (unlikely(!inode))
- return ERR_PTR(-ENOMEM);
- if (!(inode->v.i_state & I_NEW))
- return &inode->v;
+ return inode;
+}
- ret = bch2_inode_find_by_inum(c, inum, &inode_u);
- if (ret) {
- iget_failed(&inode->v);
- return ERR_PTR(ret);
+static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
+{
+ spin_lock(&inode->v.i_lock);
+ bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
+ spin_unlock(&inode->v.i_lock);
+
+ if (remove) {
+ int ret = rhltable_remove(&c->vfs_inodes_by_inum_table,
+ &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params);
+ BUG_ON(ret);
+
+ ret = rhashtable_remove_fast(&c->vfs_inodes_table,
+ &inode->hash, bch2_vfs_inodes_params);
+ BUG_ON(ret);
+ inode->v.i_hash.pprev = NULL;
+ /*
+ * This pairs with the bch2_inode_hash_find() ->
+ * __wait_on_freeing_inode() path
+ */
+ inode_wake_up_bit(&inode->v, __I_NEW);
}
+}
- bch2_vfs_inode_init(c, inode, &inode_u);
+static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
+ struct btree_trans *trans,
+ struct bch_inode_info *inode)
+{
+ struct bch_inode_info *old = inode;
- inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum);
+ set_bit(EI_INODE_HASHED, &inode->ei_flags);
+retry:
+ if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table,
+ &inode->ei_inum,
+ &inode->hash,
+ bch2_vfs_inodes_params))) {
+ old = bch2_inode_hash_find(c, trans, inode->ei_inum);
+ if (!old)
+ goto retry;
- unlock_new_inode(&inode->v);
+ clear_bit(EI_INODE_HASHED, &inode->ei_flags);
- return &inode->v;
+ /*
+ * bcachefs doesn't use I_NEW; we have no use for it since we
+ * only insert fully created inodes in the inode hash table. But
+ * discard_new_inode() expects it to be set...
+ */
+ inode->v.i_state |= I_NEW;
+ /*
+		 * We don't want bch2_evict_inode() to delete the inode on disk;
+		 * we just raced and had another inode in cache. Normally new
+		 * inodes don't have nlink == 0 - except tmpfiles do...
+ */
+ set_nlink(&inode->v, 1);
+ discard_new_inode(&inode->v);
+ return old;
+ } else {
+ int ret = rhltable_insert(&c->vfs_inodes_by_inum_table,
+ &inode->by_inum_hash,
+ bch2_vfs_inodes_by_inum_params);
+ BUG_ON(ret);
+
+ inode_fake_hash(&inode->v);
+
+ inode_sb_list_add(&inode->v);
+
+ mutex_lock(&c->vfs_inodes_lock);
+ list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
+ mutex_unlock(&c->vfs_inodes_lock);
+ return inode;
+ }
}
-static void bch2_inode_init_owner(struct bch_inode_unpacked *inode_u,
- const struct inode *dir, umode_t mode)
+#define memalloc_flags_do(_flags, _do) \
+({ \
+ unsigned _saved_flags = memalloc_flags_save(_flags); \
+ typeof(_do) _ret = _do; \
+ memalloc_noreclaim_restore(_saved_flags); \
+ _ret; \
+})
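
No caller of this macro appears in this hunk; for orientation, a call has the following shape, with the flag choice purely illustrative and some_allocating_call() a stand-in:

	/* run one expression with extra PF_* task flags set, then restore: */
	ret = memalloc_flags_do(PF_MEMALLOC_NOFS, some_allocating_call(c));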
+
+static struct inode *bch2_alloc_inode(struct super_block *sb)
{
- kuid_t uid = current_fsuid();
- kgid_t gid;
+ BUG();
+}
- if (dir && dir->i_mode & S_ISGID) {
- gid = dir->i_gid;
- if (S_ISDIR(mode))
- mode |= S_ISGID;
- } else
- gid = current_fsgid();
+static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp)
+{
+ struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb,
+ bch2_inode_cache, gfp);
+ if (!inode)
+ return NULL;
- inode_u->bi_uid = from_kuid(dir->i_sb->s_user_ns, uid);
- inode_u->bi_gid = from_kgid(dir->i_sb->s_user_ns, gid);
- inode_u->bi_mode = mode;
+ inode_init_once(&inode->v);
+ mutex_init(&inode->ei_update_lock);
+ two_state_lock_init(&inode->ei_pagecache_lock);
+ INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
+ inode->ei_flags = 0;
+ mutex_init(&inode->ei_quota_lock);
+ memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
+
+ if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) {
+ kmem_cache_free(bch2_inode_cache, inode);
+ return NULL;
+ }
+
+ return inode;
}
-static int inode_update_for_create_fn(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
+/*
+ * Allocate a new inode, dropping/retaking btree locks if necessary:
+ */
+static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_inode_unpacked *new_inode = p;
- struct timespec now = current_time(&inode->v);
+ struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT);
- bi->bi_mtime = bi->bi_ctime = timespec_to_bch2_time(c, now);
+ if (unlikely(!inode)) {
+ int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM);
+ if (ret && inode) {
+ __destroy_inode(&inode->v);
+ kmem_cache_free(bch2_inode_cache, inode);
+ }
+ if (ret)
+ return ERR_PTR(ret);
+ }
- if (S_ISDIR(new_inode->bi_mode))
- bi->bi_nlink++;
+ return inode;
+}
+
+static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans,
+ subvol_inum inum,
+ struct bch_inode_unpacked *bi,
+ struct bch_subvolume *subvol)
+{
+ struct bch_inode_info *inode = bch2_new_inode(trans);
+ if (IS_ERR(inode))
+ return inode;
+
+ bch2_vfs_inode_init(trans, inum, inode, bi, subvol);
+
+ return bch2_inode_hash_insert(trans->c, trans, inode);
- return 0;
}
-static struct bch_inode_info *
-__bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
- umode_t mode, dev_t rdev, bool tmpfile)
+struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
+{
+ struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum);
+ if (inode)
+ return &inode->v;
+
+ struct btree_trans *trans = bch2_trans_get(c);
+
+ struct bch_inode_unpacked inode_u;
+ struct bch_subvolume subvol;
+ int ret = lockrestart_do(trans,
+ bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+ bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
+ PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
+ bch2_trans_put(trans);
+
+ return ret ? ERR_PTR(ret) : &inode->v;
+}
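
lockrestart_do() comes from the btree update headers and is not shown in this patch; expanded by hand, the wrapped part of the call above is roughly:

	do {
		bch2_trans_begin(trans);
		ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
		      bch2_inode_find_by_inum_trans(trans, inum, &inode_u);
	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));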
+
+struct bch_inode_info *
+__bch2_create(struct mnt_idmap *idmap,
+ struct bch_inode_info *dir, struct dentry *dentry,
+ umode_t mode, dev_t rdev, subvol_inum snapshot_src,
+ unsigned flags)
{
struct bch_fs *c = dir->v.i_sb->s_fs_info;
- struct btree_trans trans;
+ struct btree_trans *trans;
struct bch_inode_unpacked dir_u;
- struct bch_inode_info *inode, *old;
+ struct bch_inode_info *inode;
struct bch_inode_unpacked inode_u;
- struct bch_hash_info hash_info;
struct posix_acl *default_acl = NULL, *acl = NULL;
+ subvol_inum inum;
+ struct bch_subvolume subvol;
+ u64 journal_seq = 0;
+ kuid_t kuid;
+ kgid_t kgid;
int ret;
- bch2_inode_init(c, &inode_u, 0, 0, 0, rdev, &dir->ei_inode);
- bch2_inode_init_owner(&inode_u, &dir->v, mode);
-
- inode_u.bi_project = dir->ei_qid.q[QTYP_PRJ];
-
- hash_info = bch2_hash_info_init(c, &inode_u);
-
- if (tmpfile)
- inode_u.bi_flags |= BCH_INODE_UNLINKED;
-
- ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, BCH_QUOTA_PREALLOC);
- if (ret)
- return ERR_PTR(ret);
-
+ /*
+ * preallocate acls + vfs inode before btree transaction, so that
+ * nothing can fail after the transaction succeeds:
+ */
#ifdef CONFIG_BCACHEFS_POSIX_ACL
- ret = posix_acl_create(&dir->v, &inode_u.bi_mode, &default_acl, &acl);
+ ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
if (ret)
- goto err;
+ return ERR_PTR(ret);
#endif
-
- /*
- * preallocate vfs inode before btree transaction, so that nothing can
- * fail after the transaction succeeds:
- */
- inode = to_bch_ei(new_inode(c->vfs_sb));
+ inode = __bch2_new_inode(c, GFP_NOFS);
if (unlikely(!inode)) {
- ret = -ENOMEM;
+ inode = ERR_PTR(-ENOMEM);
goto err;
}
- bch2_trans_init(&trans, c);
+ bch2_inode_init_early(c, &inode_u);
+
+ if (!(flags & BCH_CREATE_TMPFILE))
+ mutex_lock(&dir->ei_update_lock);
+
+ trans = bch2_trans_get(c);
retry:
- bch2_trans_begin(&trans);
-
- ret = __bch2_inode_create(&trans, &inode_u,
- BLOCKDEV_INODE_MAX, 0,
- &c->unused_inode_hint) ?:
- (default_acl
- ? bch2_set_acl_trans(&trans, &inode_u, &hash_info,
- default_acl, ACL_TYPE_DEFAULT)
- : 0) ?:
- (acl
- ? bch2_set_acl_trans(&trans, &inode_u, &hash_info,
- acl, ACL_TYPE_ACCESS)
- : 0) ?:
- (!tmpfile
- ? __bch2_dirent_create(&trans, dir->v.i_ino,
- &dir->ei_str_hash,
- mode_to_type(mode),
- &dentry->d_name,
- inode_u.bi_inum,
- BCH_HASH_SET_MUST_CREATE)
- : 0) ?:
- (!tmpfile
- ? bch2_write_inode_trans(&trans, dir, &dir_u,
- inode_update_for_create_fn,
- &inode_u)
- : 0) ?:
- bch2_trans_commit(&trans, NULL, NULL,
- &inode->ei_journal_seq,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOUNLOCK);
- if (ret == -EINTR)
- goto retry;
+ bch2_trans_begin(trans);
+
+ kuid = mapped_fsuid(idmap, i_user_ns(&dir->v));
+ kgid = mapped_fsgid(idmap, i_user_ns(&dir->v));
+ ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?:
+ bch2_create_trans(trans,
+ inode_inum(dir), &dir_u, &inode_u,
+ !(flags & BCH_CREATE_TMPFILE)
+ ? &dentry->d_name : NULL,
+ from_kuid(i_user_ns(&dir->v), kuid),
+ from_kgid(i_user_ns(&dir->v), kgid),
+ mode, rdev,
+ default_acl, acl, snapshot_src, flags) ?:
+ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
+ KEY_TYPE_QUOTA_PREALLOC);
if (unlikely(ret))
+ goto err_before_quota;
+
+ inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol;
+ inum.inum = inode_u.bi_inum;
+
+ ret = bch2_subvolume_get(trans, inum.subvol, true,
+ BTREE_ITER_with_updates, &subvol) ?:
+ bch2_trans_commit(trans, NULL, &journal_seq, 0);
+ if (unlikely(ret)) {
+ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
+ KEY_TYPE_QUOTA_WARN);
+err_before_quota:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
goto err_trans;
+ }
- atomic_long_inc(&c->nr_inodes);
-
- if (!tmpfile) {
- bch2_inode_update_after_write(c, dir, &dir_u,
+ if (!(flags & BCH_CREATE_TMPFILE)) {
+ bch2_inode_update_after_write(trans, dir, &dir_u,
ATTR_MTIME|ATTR_CTIME);
- journal_seq_copy(dir, inode->ei_journal_seq);
+ mutex_unlock(&dir->ei_update_lock);
}
- bch2_vfs_inode_init(c, inode, &inode_u);
+ bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
@@ -342,90 +596,129 @@ retry:
* we must insert the new inode into the inode cache before calling
* bch2_trans_exit() and dropping locks, else we could race with another
* thread pulling the inode in and modifying it:
+ *
+ * also, calling bch2_inode_hash_insert() without passing in the
+ * transaction object is sketchy - if we could ever end up in
+ * __wait_on_freeing_inode(), we'd risk deadlock.
+ *
+ * But that shouldn't be possible, since we still have the inode locked
+ * that we just created, and we _really_ can't take a transaction
+ * restart here.
*/
-
- old = to_bch_ei(insert_inode_locked2(&inode->v));
- if (unlikely(old)) {
- /*
- * We raced, another process pulled the new inode into cache
- * before us:
- */
- old->ei_journal_seq = inode->ei_journal_seq;
- make_bad_inode(&inode->v);
- iput(&inode->v);
-
- inode = old;
- } else {
- /*
- * we really don't want insert_inode_locked2() to be setting
- * I_NEW...
- */
- unlock_new_inode(&inode->v);
- }
-
- bch2_trans_exit(&trans);
-out:
+ inode = bch2_inode_hash_insert(c, NULL, inode);
+ bch2_trans_put(trans);
+err:
posix_acl_release(default_acl);
posix_acl_release(acl);
return inode;
err_trans:
- bch2_trans_exit(&trans);
+ if (!(flags & BCH_CREATE_TMPFILE))
+ mutex_unlock(&dir->ei_update_lock);
+
+ bch2_trans_put(trans);
make_bad_inode(&inode->v);
iput(&inode->v);
-err:
- bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, BCH_QUOTA_WARN);
inode = ERR_PTR(ret);
- goto out;
+ goto err;
}
/* methods */
+static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
+ subvol_inum dir, struct bch_hash_info *dir_hash_info,
+ const struct qstr *name)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter dirent_iter = {};
+ subvol_inum inum = {};
+ struct printbuf buf = PRINTBUF;
+
+ struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
+ dir_hash_info, dir, name, 0);
+ int ret = bkey_err(k);
+ if (ret)
+ return ERR_PTR(ret);
+
+ ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret)
+ goto err;
+
+ struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum);
+ if (inode)
+ goto out;
+
+ struct bch_subvolume subvol;
+ struct bch_inode_unpacked inode_u;
+ ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+ bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
+ PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
+
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
+ c, "dirent to missing inode:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ if (ret)
+ goto err;
+
+ /* regular files may have hardlinks: */
+ if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) &&
+ !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)),
+ c,
+ "dirent points to inode that does not point back:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k),
+ prt_printf(&buf, "\n "),
+ bch2_inode_unpacked_to_text(&buf, &inode_u),
+ buf.buf))) {
+ ret = -ENOENT;
+ goto err;
+ }
+out:
+ bch2_trans_iter_exit(trans, &dirent_iter);
+ printbuf_exit(&buf);
+ return inode;
+err:
+ inode = ERR_PTR(ret);
+ goto out;
+}
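
A reading note on the error chaining used here and throughout the patch: these are GNU "a ?: b" expressions, and every helper in the chain returns 0 on success, so later calls only run if everything before them succeeded. With hypothetical helpers foo/bar/baz:

	/* this: */
	ret = foo(trans) ?: bar(trans) ?: baz(trans);

	/* behaves like: */
	ret = foo(trans);
	if (!ret)
		ret = bar(trans);
	if (!ret)
		ret = baz(trans);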
+
static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
unsigned int flags)
{
struct bch_fs *c = vdir->i_sb->s_fs_info;
struct bch_inode_info *dir = to_bch_ei(vdir);
- struct inode *vinode = NULL;
- u64 inum;
-
- inum = bch2_dirent_lookup(c, dir->v.i_ino,
- &dir->ei_str_hash,
- &dentry->d_name);
+ struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
- if (inum)
- vinode = bch2_vfs_inode_get(c, inum);
+ struct bch_inode_info *inode;
+ bch2_trans_do(c,
+ PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
+ &hash, &dentry->d_name)));
+ if (IS_ERR(inode))
+ inode = NULL;
- return d_splice_alias(vinode, dentry);
+ return d_splice_alias(&inode->v, dentry);
}
-static int bch2_create(struct inode *vdir, struct dentry *dentry,
- umode_t mode, bool excl)
+static int bch2_mknod(struct mnt_idmap *idmap,
+ struct inode *vdir, struct dentry *dentry,
+ umode_t mode, dev_t rdev)
{
struct bch_inode_info *inode =
- __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFREG, 0, false);
+ __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev,
+ (subvol_inum) { 0 }, 0);
if (IS_ERR(inode))
- return PTR_ERR(inode);
+ return bch2_err_class(PTR_ERR(inode));
d_instantiate(dentry, &inode->v);
return 0;
}
-static int inode_update_for_link_fn(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
+static int bch2_create(struct mnt_idmap *idmap,
+ struct inode *vdir, struct dentry *dentry,
+ umode_t mode, bool excl)
{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct timespec now = current_time(&inode->v);
-
- bi->bi_ctime = timespec_to_bch2_time(c, now);
-
- if (bi->bi_flags & BCH_INODE_UNLINKED)
- bi->bi_flags &= ~BCH_INODE_UNLINKED;
- else
- bi->bi_nlink++;
-
- return 0;
+ return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0);
}
static int __bch2_link(struct bch_fs *c,
@@ -433,37 +726,26 @@ static int __bch2_link(struct bch_fs *c,
struct bch_inode_info *dir,
struct dentry *dentry)
{
- struct btree_trans trans;
- struct bch_inode_unpacked inode_u;
+ struct bch_inode_unpacked dir_u, inode_u;
int ret;
- lockdep_assert_held(&inode->v.i_rwsem);
+ mutex_lock(&inode->ei_update_lock);
+ struct btree_trans *trans = bch2_trans_get(c);
- bch2_trans_init(&trans, c);
-retry:
- bch2_trans_begin(&trans);
-
- ret = __bch2_dirent_create(&trans, dir->v.i_ino,
- &dir->ei_str_hash,
- mode_to_type(inode->v.i_mode),
- &dentry->d_name,
- inode->v.i_ino,
- BCH_HASH_SET_MUST_CREATE) ?:
- bch2_write_inode_trans(&trans, inode, &inode_u,
- inode_update_for_link_fn,
- NULL) ?:
- bch2_trans_commit(&trans, NULL, NULL,
- &inode->ei_journal_seq,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOUNLOCK);
-
- if (ret == -EINTR)
- goto retry;
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_link_trans(trans,
+ inode_inum(dir), &dir_u,
+ inode_inum(inode), &inode_u,
+ &dentry->d_name));
- if (likely(!ret))
- bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME);
+ if (likely(!ret)) {
+ bch2_inode_update_after_write(trans, dir, &dir_u,
+ ATTR_MTIME|ATTR_CTIME);
+ bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
+ }
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
+ mutex_unlock(&inode->ei_update_lock);
return ret;
}
@@ -475,101 +757,82 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
int ret;
- ret = __bch2_link(c, inode, dir, dentry);
+ lockdep_assert_held(&inode->v.i_rwsem);
+
+ ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
+ bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
+ __bch2_link(c, inode, dir, dentry);
if (unlikely(ret))
- return ret;
+ return bch2_err_class(ret);
ihold(&inode->v);
d_instantiate(dentry, &inode->v);
return 0;
}
-static int inode_update_dir_for_unlink_fn(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_inode_info *unlink_inode = p;
- struct timespec now = current_time(&inode->v);
-
- bi->bi_mtime = bi->bi_ctime = timespec_to_bch2_time(c, now);
-
- bi->bi_nlink -= S_ISDIR(unlink_inode->v.i_mode);
-
- return 0;
-}
-
-static int inode_update_for_unlink_fn(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct timespec now = current_time(&inode->v);
-
- bi->bi_ctime = timespec_to_bch2_time(c, now);
- if (bi->bi_nlink)
- bi->bi_nlink--;
- else
- bi->bi_flags |= BCH_INODE_UNLINKED;
-
- return 0;
-}
-
-static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
+int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
+ bool deleting_snapshot)
{
struct bch_fs *c = vdir->i_sb->s_fs_info;
struct bch_inode_info *dir = to_bch_ei(vdir);
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
struct bch_inode_unpacked dir_u, inode_u;
- struct btree_trans trans;
int ret;
- bch2_trans_init(&trans, c);
-retry:
- bch2_trans_begin(&trans);
-
- ret = __bch2_dirent_delete(&trans, dir->v.i_ino,
- &dir->ei_str_hash,
- &dentry->d_name) ?:
- bch2_write_inode_trans(&trans, dir, &dir_u,
- inode_update_dir_for_unlink_fn,
- inode) ?:
- bch2_write_inode_trans(&trans, inode, &inode_u,
- inode_update_for_unlink_fn,
- NULL) ?:
- bch2_trans_commit(&trans, NULL, NULL,
- &dir->ei_journal_seq,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOUNLOCK|
- BTREE_INSERT_NOFAIL);
- if (ret == -EINTR)
- goto retry;
- if (ret)
- goto err;
+ bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
+
+ struct btree_trans *trans = bch2_trans_get(c);
- if (dir->ei_journal_seq > inode->ei_journal_seq)
- inode->ei_journal_seq = dir->ei_journal_seq;
+ ret = commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc,
+ bch2_unlink_trans(trans,
+ inode_inum(dir), &dir_u,
+ &inode_u, &dentry->d_name,
+ deleting_snapshot));
+ if (unlikely(ret))
+ goto err;
- bch2_inode_update_after_write(c, dir, &dir_u,
+ bch2_inode_update_after_write(trans, dir, &dir_u,
ATTR_MTIME|ATTR_CTIME);
- bch2_inode_update_after_write(c, inode, &inode_u,
+ bch2_inode_update_after_write(trans, inode, &inode_u,
ATTR_MTIME);
+
+ if (inode_u.bi_subvol) {
+ /*
+ * Subvolume deletion is asynchronous, but we still want to tell
+ * the VFS that it's been deleted here:
+ */
+ set_nlink(&inode->v, 0);
+ }
err:
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
+ bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
return ret;
}
-static int bch2_symlink(struct inode *vdir, struct dentry *dentry,
+static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
+{
+	struct bch_inode_info *dir = to_bch_ei(vdir);
+ struct bch_fs *c = dir->v.i_sb->s_fs_info;
+
+ int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
+ __bch2_unlink(vdir, dentry, false);
+ return bch2_err_class(ret);
+}
+
+static int bch2_symlink(struct mnt_idmap *idmap,
+ struct inode *vdir, struct dentry *dentry,
const char *symname)
{
struct bch_fs *c = vdir->i_sb->s_fs_info;
struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
int ret;
- inode = __bch2_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0, true);
- if (unlikely(IS_ERR(inode)))
- return PTR_ERR(inode);
+ inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
+ (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
+ if (IS_ERR(inode))
+ return bch2_err_class(PTR_ERR(inode));
inode_lock(&inode->v);
ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
@@ -582,8 +845,6 @@ static int bch2_symlink(struct inode *vdir, struct dentry *dentry,
if (unlikely(ret))
goto err;
- journal_seq_copy(dir, inode->ei_journal_seq);
-
ret = __bch2_link(c, inode, dir, dentry);
if (unlikely(ret))
goto err;
@@ -592,195 +853,179 @@ static int bch2_symlink(struct inode *vdir, struct dentry *dentry,
return 0;
err:
iput(&inode->v);
- return ret;
-}
-
-static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode)
-{
- struct bch_inode_info *inode =
- __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFDIR, 0, false);
-
- if (IS_ERR(inode))
- return PTR_ERR(inode);
-
- d_instantiate(dentry, &inode->v);
- return 0;
-}
-
-static int bch2_rmdir(struct inode *vdir, struct dentry *dentry)
-{
- struct bch_fs *c = vdir->i_sb->s_fs_info;
-
- if (bch2_empty_dir(c, dentry->d_inode->i_ino))
- return -ENOTEMPTY;
-
- return bch2_unlink(vdir, dentry);
+ return bch2_err_class(ret);
}
-static int bch2_mknod(struct inode *vdir, struct dentry *dentry,
- umode_t mode, dev_t rdev)
+static int bch2_mkdir(struct mnt_idmap *idmap,
+ struct inode *vdir, struct dentry *dentry, umode_t mode)
{
- struct bch_inode_info *inode =
- __bch2_create(to_bch_ei(vdir), dentry, mode, rdev, false);
-
- if (IS_ERR(inode))
- return PTR_ERR(inode);
-
- d_instantiate(dentry, &inode->v);
- return 0;
+ return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0);
}
-struct rename_info {
- u64 now;
- struct bch_inode_info *src_dir;
- struct bch_inode_info *dst_dir;
- struct bch_inode_info *src_inode;
- struct bch_inode_info *dst_inode;
- enum bch_rename_mode mode;
-};
-
-static int inode_update_for_rename_fn(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct rename_info *info = p;
-
- if (inode == info->src_dir) {
- bi->bi_nlink -= S_ISDIR(info->src_inode->v.i_mode);
- bi->bi_nlink += info->dst_inode &&
- S_ISDIR(info->dst_inode->v.i_mode) &&
- info->mode == BCH_RENAME_EXCHANGE;
- }
-
- if (inode == info->dst_dir) {
- bi->bi_nlink += S_ISDIR(info->src_inode->v.i_mode);
- bi->bi_nlink -= info->dst_inode &&
- S_ISDIR(info->dst_inode->v.i_mode);
- }
-
- if (inode == info->dst_inode &&
- info->mode == BCH_RENAME_OVERWRITE) {
- BUG_ON(bi->bi_nlink &&
- S_ISDIR(info->dst_inode->v.i_mode));
-
- if (bi->bi_nlink)
- bi->bi_nlink--;
- else
- bi->bi_flags |= BCH_INODE_UNLINKED;
- }
-
- if (inode == info->src_dir ||
- inode == info->dst_dir)
- bi->bi_mtime = info->now;
- bi->bi_ctime = info->now;
-
- return 0;
-}
-
-static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry,
+static int bch2_rename2(struct mnt_idmap *idmap,
+ struct inode *src_vdir, struct dentry *src_dentry,
struct inode *dst_vdir, struct dentry *dst_dentry,
unsigned flags)
{
struct bch_fs *c = src_vdir->i_sb->s_fs_info;
- struct rename_info i = {
- .now = timespec_to_bch2_time(c,
- current_time(src_vdir)),
- .src_dir = to_bch_ei(src_vdir),
- .dst_dir = to_bch_ei(dst_vdir),
- .src_inode = to_bch_ei(src_dentry->d_inode),
- .dst_inode = to_bch_ei(dst_dentry->d_inode),
- .mode = flags & RENAME_EXCHANGE
- ? BCH_RENAME_EXCHANGE
- : dst_dentry->d_inode
- ? BCH_RENAME_OVERWRITE : BCH_RENAME,
- };
- struct btree_trans trans;
+ struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
+ struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
+ struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
+ struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
struct bch_inode_unpacked dst_dir_u, src_dir_u;
- struct bch_inode_unpacked src_inode_u, dst_inode_u;
- u64 journal_seq = 0;
+ struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u;
+ struct btree_trans *trans;
+ enum bch_rename_mode mode = flags & RENAME_EXCHANGE
+ ? BCH_RENAME_EXCHANGE
+ : dst_dentry->d_inode
+ ? BCH_RENAME_OVERWRITE : BCH_RENAME;
+ bool whiteout = !!(flags & RENAME_WHITEOUT);
int ret;
- if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
+ if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT))
return -EINVAL;
- if (i.mode == BCH_RENAME_OVERWRITE) {
- if (S_ISDIR(i.src_inode->v.i_mode) !=
- S_ISDIR(i.dst_inode->v.i_mode))
- return -ENOTDIR;
-
- if (S_ISDIR(i.src_inode->v.i_mode) &&
- bch2_empty_dir(c, i.dst_inode->v.i_ino))
- return -ENOTEMPTY;
-
- ret = filemap_write_and_wait_range(i.src_inode->v.i_mapping,
+ if (mode == BCH_RENAME_OVERWRITE) {
+ ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
0, LLONG_MAX);
if (ret)
return ret;
}
- bch2_trans_init(&trans, c);
+ bch2_lock_inodes(INODE_UPDATE_LOCK,
+ src_dir,
+ dst_dir,
+ src_inode,
+ dst_inode);
+
+ trans = bch2_trans_get(c);
+
+ ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
+ bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
+ if (ret)
+ goto err_tx_restart;
+
+ if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
+ ret = bch2_fs_quota_transfer(c, src_inode,
+ dst_dir->ei_qid,
+ 1 << QTYP_PRJ,
+ KEY_TYPE_QUOTA_PREALLOC);
+ if (ret)
+ goto err;
+ }
+
+ if (mode == BCH_RENAME_EXCHANGE &&
+ inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
+ ret = bch2_fs_quota_transfer(c, dst_inode,
+ src_dir->ei_qid,
+ 1 << QTYP_PRJ,
+ KEY_TYPE_QUOTA_PREALLOC);
+ if (ret)
+ goto err;
+ }
retry:
- bch2_trans_begin(&trans);
- i.now = timespec_to_bch2_time(c, current_time(src_vdir)),
-
- ret = bch2_dirent_rename(&trans,
- i.src_dir, &src_dentry->d_name,
- i.dst_dir, &dst_dentry->d_name,
- i.mode) ?:
- bch2_write_inode_trans(&trans, i.src_dir, &src_dir_u,
- inode_update_for_rename_fn, &i) ?:
- (i.src_dir != i.dst_dir
- ? bch2_write_inode_trans(&trans, i.dst_dir, &dst_dir_u,
- inode_update_for_rename_fn, &i)
- : 0 ) ?:
- bch2_write_inode_trans(&trans, i.src_inode, &src_inode_u,
- inode_update_for_rename_fn, &i) ?:
- (i.dst_inode
- ? bch2_write_inode_trans(&trans, i.dst_inode, &dst_inode_u,
- inode_update_for_rename_fn, &i)
- : 0 ) ?:
- bch2_trans_commit(&trans, NULL, NULL,
- &journal_seq,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOUNLOCK);
- if (ret == -EINTR)
- goto retry;
+ bch2_trans_begin(trans);
+
+ ret = bch2_rename_trans(trans,
+ inode_inum(src_dir), &src_dir_u,
+ inode_inum(dst_dir), &dst_dir_u,
+ &src_inode_u,
+ &dst_inode_u,
+ &src_dentry->d_name,
+ &dst_dentry->d_name,
+ mode);
if (unlikely(ret))
+ goto err_tx_restart;
+
+ if (whiteout) {
+ whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u));
+ ret = PTR_ERR_OR_ZERO(whiteout_inode_u);
+ if (unlikely(ret))
+ goto err_tx_restart;
+ bch2_inode_init_early(c, whiteout_inode_u);
+
+ ret = bch2_create_trans(trans,
+ inode_inum(src_dir), &src_dir_u,
+ whiteout_inode_u,
+ &src_dentry->d_name,
+ from_kuid(i_user_ns(&src_dir->v), current_fsuid()),
+ from_kgid(i_user_ns(&src_dir->v), current_fsgid()),
+ S_IFCHR|WHITEOUT_MODE, 0,
+ NULL, NULL, (subvol_inum) { 0 }, 0) ?:
+ bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1,
+ KEY_TYPE_QUOTA_PREALLOC);
+ if (unlikely(ret))
+ goto err_tx_restart;
+ }
+
+ ret = bch2_trans_commit(trans, NULL, NULL, 0);
+ if (unlikely(ret)) {
+err_tx_restart:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
goto err;
+ }
- bch2_inode_update_after_write(c, i.src_dir, &src_dir_u,
+ BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
+ BUG_ON(dst_inode &&
+ dst_inode->v.i_ino != dst_inode_u.bi_inum);
+
+ bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
ATTR_MTIME|ATTR_CTIME);
- journal_seq_copy(i.src_dir, journal_seq);
- if (i.src_dir != i.dst_dir) {
- bch2_inode_update_after_write(c, i.dst_dir, &dst_dir_u,
+ if (src_dir != dst_dir)
+ bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
ATTR_MTIME|ATTR_CTIME);
- journal_seq_copy(i.dst_dir, journal_seq);
- }
- bch2_inode_update_after_write(c, i.src_inode, &src_inode_u,
+ bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
ATTR_CTIME);
- if (i.dst_inode)
- bch2_inode_update_after_write(c, i.dst_inode, &dst_inode_u,
+
+ if (dst_inode)
+ bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
ATTR_CTIME);
err:
- bch2_trans_exit(&trans);
-
- return ret;
+ bch2_trans_put(trans);
+
+ bch2_fs_quota_transfer(c, src_inode,
+ bch_qid(&src_inode->ei_inode),
+ 1 << QTYP_PRJ,
+ KEY_TYPE_QUOTA_NOCHECK);
+ if (dst_inode)
+ bch2_fs_quota_transfer(c, dst_inode,
+ bch_qid(&dst_inode->ei_inode),
+ 1 << QTYP_PRJ,
+ KEY_TYPE_QUOTA_NOCHECK);
+
+ bch2_unlock_inodes(INODE_UPDATE_LOCK,
+ src_dir,
+ dst_dir,
+ src_inode,
+ dst_inode);
+
+ return bch2_err_class(ret);
}
-static int inode_update_for_setattr_fn(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
+static void bch2_setattr_copy(struct mnt_idmap *idmap,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ struct iattr *attr)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct iattr *attr = p;
unsigned int ia_valid = attr->ia_valid;
+ kuid_t kuid;
+ kgid_t kgid;
- if (ia_valid & ATTR_UID)
- bi->bi_uid = from_kuid(inode->v.i_sb->s_user_ns, attr->ia_uid);
- if (ia_valid & ATTR_GID)
- bi->bi_gid = from_kgid(inode->v.i_sb->s_user_ns, attr->ia_gid);
+ if (ia_valid & ATTR_UID) {
+ kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
+ bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid);
+ }
+ if (ia_valid & ATTR_GID) {
+ kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
+ bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid);
+ }
+
+ if (ia_valid & ATTR_SIZE)
+ bi->bi_size = attr->ia_size;
if (ia_valid & ATTR_ATIME)
bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
@@ -792,184 +1037,231 @@ static int inode_update_for_setattr_fn(struct bch_inode_info *inode,
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
kgid_t gid = ia_valid & ATTR_GID
- ? attr->ia_gid
+ ? kgid
: inode->v.i_gid;
- if (!in_group_p(gid) &&
- !capable_wrt_inode_uidgid(&inode->v, CAP_FSETID))
+ if (!in_group_or_capable(idmap, &inode->v,
+ make_vfsgid(idmap, i_user_ns(&inode->v), gid)))
mode &= ~S_ISGID;
bi->bi_mode = mode;
}
-
- return 0;
}
-static int bch2_setattr_nonsize(struct bch_inode_info *inode, struct iattr *iattr)
+int bch2_setattr_nonsize(struct mnt_idmap *idmap,
+ struct bch_inode_info *inode,
+ struct iattr *attr)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_qid qid = inode->ei_qid;
- struct btree_trans trans;
+ struct bch_qid qid;
+ struct btree_trans *trans;
+ struct btree_iter inode_iter = { NULL };
struct bch_inode_unpacked inode_u;
struct posix_acl *acl = NULL;
- unsigned qtypes = 0;
+ kuid_t kuid;
+ kgid_t kgid;
int ret;
mutex_lock(&inode->ei_update_lock);
- if (c->opts.usrquota &&
- (iattr->ia_valid & ATTR_UID) &&
- !uid_eq(iattr->ia_uid, inode->v.i_uid)) {
- qid.q[QTYP_USR] = from_kuid(&init_user_ns, iattr->ia_uid),
- qtypes |= 1 << QTYP_USR;
- }
+ qid = inode->ei_qid;
- if (c->opts.grpquota &&
- (iattr->ia_valid & ATTR_GID) &&
- !gid_eq(iattr->ia_gid, inode->v.i_gid)) {
- qid.q[QTYP_GRP] = from_kgid(&init_user_ns, iattr->ia_gid);
- qtypes |= 1 << QTYP_GRP;
+ if (attr->ia_valid & ATTR_UID) {
+ kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
+ qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid);
}
- if (qtypes) {
- ret = bch2_quota_transfer(c, qtypes, qid, inode->ei_qid,
- inode->v.i_blocks +
- inode->ei_quota_reserved);
- if (ret)
- goto err;
+ if (attr->ia_valid & ATTR_GID) {
+ kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
+ qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid);
}
- bch2_trans_init(&trans, c);
+ ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
+ KEY_TYPE_QUOTA_PREALLOC);
+ if (ret)
+ goto err;
+
+ trans = bch2_trans_get(c);
retry:
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
kfree(acl);
acl = NULL;
- ret = bch2_write_inode_trans(&trans, inode, &inode_u,
- inode_update_for_setattr_fn, iattr) ?:
- (iattr->ia_valid & ATTR_MODE
- ? bch2_acl_chmod(&trans, inode, iattr->ia_mode, &acl)
- : 0) ?:
- bch2_trans_commit(&trans, NULL, NULL,
- &inode->ei_journal_seq,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOUNLOCK|
- BTREE_INSERT_NOFAIL);
- if (ret == -EINTR)
+ ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
+ BTREE_ITER_intent);
+ if (ret)
+ goto btree_err;
+
+ bch2_setattr_copy(idmap, inode, &inode_u, attr);
+
+ if (attr->ia_valid & ATTR_MODE) {
+ ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
+ inode_u.bi_mode, &acl);
+ if (ret)
+ goto btree_err;
+ }
+
+ ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc);
+btree_err:
+ bch2_trans_iter_exit(trans, &inode_iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
if (unlikely(ret))
goto err_trans;
- bch2_inode_update_after_write(c, inode, &inode_u, iattr->ia_valid);
+ bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);
if (acl)
set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
err_trans:
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
err:
mutex_unlock(&inode->ei_update_lock);
- return ret;
+ return bch2_err_class(ret);
}
-static int bch2_getattr(const struct path *path, struct kstat *stat,
+static int bch2_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned query_flags)
{
struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v);
stat->dev = inode->v.i_sb->s_dev;
stat->ino = inode->v.i_ino;
stat->mode = inode->v.i_mode;
stat->nlink = inode->v.i_nlink;
- stat->uid = inode->v.i_uid;
- stat->gid = inode->v.i_gid;
+ stat->uid = vfsuid_into_kuid(vfsuid);
+ stat->gid = vfsgid_into_kgid(vfsgid);
stat->rdev = inode->v.i_rdev;
stat->size = i_size_read(&inode->v);
- stat->atime = inode->v.i_atime;
- stat->mtime = inode->v.i_mtime;
- stat->ctime = inode->v.i_ctime;
+ stat->atime = inode_get_atime(&inode->v);
+ stat->mtime = inode_get_mtime(&inode->v);
+ stat->ctime = inode_get_ctime(&inode->v);
stat->blksize = block_bytes(c);
stat->blocks = inode->v.i_blocks;
+ stat->subvol = inode->ei_inum.subvol;
+ stat->result_mask |= STATX_SUBVOL;
+
+ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) {
+ stat->result_mask |= STATX_DIOALIGN;
+ /*
+ * this is incorrect; we should be tracking this in superblock,
+ * and checking the alignment of open devices
+ */
+ stat->dio_mem_align = SECTOR_SIZE;
+ stat->dio_offset_align = block_bytes(c);
+ }
+
if (request_mask & STATX_BTIME) {
stat->result_mask |= STATX_BTIME;
stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
}
- if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE)
+ if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
stat->attributes |= STATX_ATTR_IMMUTABLE;
- if (inode->ei_inode.bi_flags & BCH_INODE_APPEND)
+ stat->attributes_mask |= STATX_ATTR_IMMUTABLE;
+
+ if (inode->ei_inode.bi_flags & BCH_INODE_append)
stat->attributes |= STATX_ATTR_APPEND;
- if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP)
+ stat->attributes_mask |= STATX_ATTR_APPEND;
+
+ if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
stat->attributes |= STATX_ATTR_NODUMP;
+ stat->attributes_mask |= STATX_ATTR_NODUMP;
return 0;
}
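
Since getattr now reports the subvolume ID and sets STATX_SUBVOL, here is a small userspace sketch of reading it back; it is not part of the patch and assumes kernel headers and a libc new enough to define STATX_SUBVOL and struct statx.stx_subvol:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

static int print_subvol(const char *path)
{
	struct statx stx;

	if (statx(AT_FDCWD, path, 0, STATX_SUBVOL, &stx) != 0)
		return -1;

	if (stx.stx_mask & STATX_SUBVOL)
		printf("%s: subvolume %llu\n", path,
		       (unsigned long long) stx.stx_subvol);
	return 0;
}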
-static int bch2_setattr(struct dentry *dentry, struct iattr *iattr)
+static int bch2_setattr(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct iattr *iattr)
{
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
int ret;
lockdep_assert_held(&inode->v.i_rwsem);
- ret = setattr_prepare(dentry, iattr);
+ ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
+ setattr_prepare(idmap, dentry, iattr);
if (ret)
return ret;
return iattr->ia_valid & ATTR_SIZE
- ? bch2_truncate(inode, iattr)
- : bch2_setattr_nonsize(inode, iattr);
+ ? bchfs_truncate(idmap, inode, iattr)
+ : bch2_setattr_nonsize(idmap, inode, iattr);
}
-static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode)
+static int bch2_tmpfile(struct mnt_idmap *idmap,
+ struct inode *vdir, struct file *file, umode_t mode)
{
struct bch_inode_info *inode =
- __bch2_create(to_bch_ei(vdir), dentry, mode, 0, true);
+ __bch2_create(idmap, to_bch_ei(vdir),
+ file->f_path.dentry, mode, 0,
+ (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
if (IS_ERR(inode))
- return PTR_ERR(inode);
+ return bch2_err_class(PTR_ERR(inode));
- d_mark_tmpfile(dentry, &inode->v);
- d_instantiate(dentry, &inode->v);
- return 0;
+ d_mark_tmpfile(file, &inode->v);
+ d_instantiate(file->f_path.dentry, &inode->v);
+ return finish_open_simple(file, 0);
}
-static int bch2_fill_extent(struct fiemap_extent_info *info,
- const struct bkey_i *k, unsigned flags)
+static int bch2_fill_extent(struct bch_fs *c,
+ struct fiemap_extent_info *info,
+ struct bkey_s_c k, unsigned flags)
{
- if (bkey_extent_is_data(&k->k)) {
- struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
- const struct bch_extent_ptr *ptr;
- struct bch_extent_crc_unpacked crc;
+ if (bkey_extent_is_direct_data(k.k)) {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
int ret;
- extent_for_each_ptr_crc(e, ptr, crc) {
+ if (k.k->type == KEY_TYPE_reflink_v)
+ flags |= FIEMAP_EXTENT_SHARED;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
int flags2 = 0;
- u64 offset = ptr->offset;
+ u64 offset = p.ptr.offset;
+
+ if (p.ptr.unwritten)
+ flags2 |= FIEMAP_EXTENT_UNWRITTEN;
- if (crc.compression_type)
+ if (p.crc.compression_type)
flags2 |= FIEMAP_EXTENT_ENCODED;
else
- offset += crc.offset;
+ offset += p.crc.offset;
- if ((offset & (PAGE_SECTORS - 1)) ||
- (e.k->size & (PAGE_SECTORS - 1)))
+ if ((offset & (block_sectors(c) - 1)) ||
+ (k.k->size & (block_sectors(c) - 1)))
flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
ret = fiemap_fill_next_extent(info,
- bkey_start_offset(e.k) << 9,
- offset << 9,
- e.k->size << 9, flags|flags2);
+ bkey_start_offset(k.k) << 9,
+ offset << 9,
+ k.k->size << 9, flags|flags2);
if (ret)
return ret;
}
return 0;
- } else if (k->k.type == BCH_RESERVATION) {
+ } else if (bkey_extent_is_inline_data(k.k)) {
+ return fiemap_fill_next_extent(info,
+ bkey_start_offset(k.k) << 9,
+ 0, k.k->size << 9,
+ flags|
+ FIEMAP_EXTENT_DATA_INLINE);
+ } else if (k.k->type == KEY_TYPE_reservation) {
return fiemap_fill_next_extent(info,
- bkey_start_offset(&k->k) << 9,
- 0, k->k.size << 9,
+ bkey_start_offset(k.k) << 9,
+ 0, k.k->size << 9,
flags|
FIEMAP_EXTENT_DELALLOC|
FIEMAP_EXTENT_UNWRITTEN);
@@ -983,42 +1275,109 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
{
struct bch_fs *c = vinode->i_sb->s_fs_info;
struct bch_inode_info *ei = to_bch_ei(vinode);
+ struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
- BKEY_PADDED(k) tmp;
+ struct bkey_buf cur, prev;
bool have_extent = false;
int ret = 0;
+ ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
+ if (ret)
+ return ret;
+
+ struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
if (start + len < start)
return -EINVAL;
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
- POS(ei->v.i_ino, start >> 9), 0, k)
- if (bkey_extent_is_data(k.k) ||
- k.k->type == BCH_RESERVATION) {
- if (bkey_cmp(bkey_start_pos(k.k),
- POS(ei->v.i_ino, (start + len) >> 9)) >= 0)
- break;
+ start >>= 9;
- if (have_extent) {
- ret = bch2_fill_extent(info, &tmp.k, 0);
- if (ret)
- goto out;
- }
+ bch2_bkey_buf_init(&cur);
+ bch2_bkey_buf_init(&prev);
+ trans = bch2_trans_get(c);
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ POS(ei->v.i_ino, start), 0);
- bkey_reassemble(&tmp.k, k);
- have_extent = true;
+ while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ enum btree_id data_btree = BTREE_ID_extents;
+
+ bch2_trans_begin(trans);
+
+ u32 snapshot;
+ ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
+ if (ret)
+ continue;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
+
+ k = bch2_btree_iter_peek_max(&iter, end);
+ ret = bkey_err(k);
+ if (ret)
+ continue;
+
+ if (!k.k)
+ break;
+
+ if (!bkey_extent_is_data(k.k) &&
+ k.k->type != KEY_TYPE_reservation) {
+ bch2_btree_iter_advance(&iter);
+ continue;
}
- if (have_extent)
- ret = bch2_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST);
-out:
- bch2_btree_iter_unlock(&iter);
+ s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
+ unsigned sectors = k.k->size - offset_into_extent;
+
+ bch2_bkey_buf_reassemble(&cur, c, k);
+
+ ret = bch2_read_indirect_extent(trans, &data_btree,
+ &offset_into_extent, &cur);
+ if (ret)
+ continue;
+
+ k = bkey_i_to_s_c(cur.k);
+ bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
+
+ sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
+
+ bch2_cut_front(POS(k.k->p.inode,
+ bkey_start_offset(k.k) +
+ offset_into_extent),
+ cur.k);
+ bch2_key_resize(&cur.k->k, sectors);
+ cur.k->k.p = iter.pos;
+ cur.k->k.p.offset += cur.k->k.size;
+
+ if (have_extent) {
+ bch2_trans_unlock(trans);
+ ret = bch2_fill_extent(c, info,
+ bkey_i_to_s_c(prev.k), 0);
+ if (ret)
+ break;
+ }
+
+ bkey_copy(prev.k, cur.k);
+ have_extent = true;
+
+ bch2_btree_iter_set_pos(&iter,
+ POS(iter.pos.inode, iter.pos.offset + sectors));
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (!ret && have_extent) {
+ bch2_trans_unlock(trans);
+ ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
+ FIEMAP_EXTENT_LAST);
+ }
+
+ bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&cur, c);
+ bch2_bkey_buf_exit(&prev, c);
return ret < 0 ? ret : 0;
}
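
One unit detail worth calling out (editorial note): extent keys are addressed in 512-byte sectors while fiemap speaks bytes, hence the shifts on both ends of this path:

	start >>= 9;	/* bytes -> sectors, e.g. offset 4096 becomes sector 8 */

	/* and sectors -> bytes again when reporting, in bch2_fill_extent(): */
	fiemap_fill_next_extent(info, bkey_start_offset(k.k) << 9,
				offset << 9, k.k->size << 9, flags|flags2);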
static const struct vm_operations_struct bch_vm_ops = {
- .fault = filemap_fault,
+ .fault = bch2_page_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = bch2_page_mkwrite,
};
@@ -1041,25 +1400,50 @@ static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct bch_fs *c = file_inode(file)->i_sb->s_fs_info;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ int ret = bch2_readdir(c, inode_inum(inode), ctx);
- return bch2_readdir(c, file, ctx);
+ bch_err_fn(c, ret);
+ return bch2_err_class(ret);
+}
+
+static int bch2_open(struct inode *vinode, struct file *file)
+{
+ if (file->f_flags & (O_WRONLY|O_RDWR)) {
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+ int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol);
+ if (ret)
+ return ret;
+ }
+
+ file->f_mode |= FMODE_CAN_ODIRECT;
+
+ return generic_file_open(vinode, file);
}
static const struct file_operations bch_file_operations = {
+ .open = bch2_open,
.llseek = bch2_llseek,
- .read_iter = generic_file_read_iter,
+ .read_iter = bch2_read_iter,
.write_iter = bch2_write_iter,
.mmap = bch2_mmap,
- .open = generic_file_open,
+ .get_unmapped_area = thp_get_unmapped_area,
.fsync = bch2_fsync,
- .splice_read = generic_file_splice_read,
+ .splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = bch2_fallocate_dispatch,
.unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = bch2_compat_fs_ioctl,
#endif
+ .remap_file_range = bch2_remap_file_range,
};
static const struct inode_operations bch_file_inode_operations = {
@@ -1068,7 +1452,7 @@ static const struct inode_operations bch_file_inode_operations = {
.fiemap = bch2_fiemap,
.listxattr = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_acl = bch2_get_acl,
+ .get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
};
@@ -1080,7 +1464,7 @@ static const struct inode_operations bch_dir_inode_operations = {
.unlink = bch2_unlink,
.symlink = bch2_symlink,
.mkdir = bch2_mkdir,
- .rmdir = bch2_rmdir,
+ .rmdir = bch2_unlink,
.mknod = bch2_mknod,
.rename = bch2_rename2,
.getattr = bch2_getattr,
@@ -1088,7 +1472,7 @@ static const struct inode_operations bch_dir_inode_operations = {
.tmpfile = bch2_tmpfile,
.listxattr = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_acl = bch2_get_acl,
+ .get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
};
@@ -1096,7 +1480,7 @@ static const struct inode_operations bch_dir_inode_operations = {
static const struct file_operations bch_dir_file_operations = {
.llseek = bch2_dir_llseek,
.read = generic_read_dir,
- .iterate = bch2_vfs_readdir,
+ .iterate_shared = bch2_vfs_readdir,
.fsync = bch2_fsync,
.unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
@@ -1110,7 +1494,7 @@ static const struct inode_operations bch_symlink_inode_operations = {
.setattr = bch2_setattr,
.listxattr = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_acl = bch2_get_acl,
+ .get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
};
@@ -1120,85 +1504,280 @@ static const struct inode_operations bch_special_inode_operations = {
.setattr = bch2_setattr,
.listxattr = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_acl = bch2_get_acl,
+ .get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
};
static const struct address_space_operations bch_address_space_operations = {
- .writepage = bch2_writepage,
- .readpage = bch2_readpage,
+ .read_folio = bch2_read_folio,
.writepages = bch2_writepages,
- .readpages = bch2_readpages,
- .set_page_dirty = bch2_set_page_dirty,
+ .readahead = bch2_readahead,
+ .dirty_folio = filemap_dirty_folio,
.write_begin = bch2_write_begin,
.write_end = bch2_write_end,
- .invalidatepage = bch2_invalidatepage,
- .releasepage = bch2_releasepage,
- .direct_IO = bch2_direct_IO,
+ .invalidate_folio = bch2_invalidate_folio,
+ .release_folio = bch2_release_folio,
#ifdef CONFIG_MIGRATION
- .migratepage = bch2_migrate_page,
+ .migrate_folio = filemap_migrate_folio,
#endif
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
};
-static struct inode *bch2_nfs_get_inode(struct super_block *sb,
- u64 ino, u32 generation)
+struct bcachefs_fid {
+ u64 inum;
+ u32 subvol;
+ u32 gen;
+} __packed;
+
+struct bcachefs_fid_with_parent {
+ struct bcachefs_fid fid;
+ struct bcachefs_fid dir;
+} __packed;
+
+static int bcachefs_fid_valid(int fh_len, int fh_type)
{
- struct bch_fs *c = sb->s_fs_info;
- struct inode *vinode;
+ switch (fh_type) {
+ case FILEID_BCACHEFS_WITHOUT_PARENT:
+ return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
+ case FILEID_BCACHEFS_WITH_PARENT:
+ return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
+ default:
+ return false;
+ }
+}
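
For reference, the handle lengths these checks accept; fh_len is counted in 32-bit words, per the usual exportfs convention:

	/* struct bcachefs_fid:             u64 + u32 + u32 = 16 bytes -> fh_len == 4
	 * struct bcachefs_fid_with_parent: two of the above = 32 bytes -> fh_len == 8
	 */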
+
+static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
+{
+ return (struct bcachefs_fid) {
+ .inum = inode->ei_inum.inum,
+ .subvol = inode->ei_inum.subvol,
+ .gen = inode->ei_inode.bi_generation,
+ };
+}
- if (ino < BCACHEFS_ROOT_INO)
- return ERR_PTR(-ESTALE);
+static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
+ struct inode *vdir)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_inode_info *dir = to_bch_ei(vdir);
+ int min_len;
+
+ if (!S_ISDIR(inode->v.i_mode) && dir) {
+ struct bcachefs_fid_with_parent *fid = (void *) fh;
- vinode = bch2_vfs_inode_get(c, ino);
- if (IS_ERR(vinode))
- return ERR_CAST(vinode);
- if (generation && vinode->i_generation != generation) {
- /* we didn't find the right inode.. */
+ min_len = sizeof(*fid) / sizeof(u32);
+ if (*len < min_len) {
+ *len = min_len;
+ return FILEID_INVALID;
+ }
+
+ fid->fid = bch2_inode_to_fid(inode);
+ fid->dir = bch2_inode_to_fid(dir);
+
+ *len = min_len;
+ return FILEID_BCACHEFS_WITH_PARENT;
+ } else {
+ struct bcachefs_fid *fid = (void *) fh;
+
+ min_len = sizeof(*fid) / sizeof(u32);
+ if (*len < min_len) {
+ *len = min_len;
+ return FILEID_INVALID;
+ }
+ *fid = bch2_inode_to_fid(inode);
+
+ *len = min_len;
+ return FILEID_BCACHEFS_WITHOUT_PARENT;
+ }
+}
+
+static struct inode *bch2_nfs_get_inode(struct super_block *sb,
+ struct bcachefs_fid fid)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
+ .subvol = fid.subvol,
+ .inum = fid.inum,
+ });
+ if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
iput(vinode);
- return ERR_PTR(-ESTALE);
+ vinode = ERR_PTR(-ESTALE);
}
return vinode;
}
-static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid,
+static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
int fh_len, int fh_type)
{
- return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
- bch2_nfs_get_inode);
+ struct bcachefs_fid *fid = (void *) _fid;
+
+ if (!bcachefs_fid_valid(fh_len, fh_type))
+ return NULL;
+
+ return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
}
-static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid,
+static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
int fh_len, int fh_type)
{
- return generic_fh_to_parent(sb, fid, fh_len, fh_type,
- bch2_nfs_get_inode);
+ struct bcachefs_fid_with_parent *fid = (void *) _fid;
+
+ if (!bcachefs_fid_valid(fh_len, fh_type) ||
+ fh_type != FILEID_BCACHEFS_WITH_PARENT)
+ return NULL;
+
+ return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
+}
+
+static struct dentry *bch2_get_parent(struct dentry *child)
+{
+ struct bch_inode_info *inode = to_bch_ei(child->d_inode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ subvol_inum parent_inum = {
+ .subvol = inode->ei_inode.bi_parent_subvol ?:
+ inode->ei_inum.subvol,
+ .inum = inode->ei_inode.bi_dir,
+ };
+
+ return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
+}
+
+static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
+{
+ struct bch_inode_info *inode = to_bch_ei(child->d_inode);
+ struct bch_inode_info *dir = to_bch_ei(parent->d_inode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct btree_trans *trans;
+ struct btree_iter iter1;
+ struct btree_iter iter2;
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent d;
+ struct bch_inode_unpacked inode_u;
+ subvol_inum target;
+ u32 snapshot;
+ struct qstr dirent_name;
+ unsigned name_len = 0;
+ int ret;
+
+ if (!S_ISDIR(dir->v.i_mode))
+ return -EINVAL;
+
+ trans = bch2_trans_get(c);
+
+ bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
+ POS(dir->ei_inode.bi_inum, 0), 0);
+ bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
+ POS(dir->ei_inode.bi_inum, 0), 0);
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&iter1, snapshot);
+ bch2_btree_iter_set_snapshot(&iter2, snapshot);
+
+ ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
+ if (ret)
+ goto err;
+
+ if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
+ bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
+
+ k = bch2_btree_iter_peek_slot(&iter1);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_dirent) {
+ ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+ goto err;
+ }
+
+ d = bkey_s_c_to_dirent(k);
+ ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
+ if (ret > 0)
+ ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+ if (ret)
+ goto err;
+
+ if (subvol_inum_eq(target, inode->ei_inum))
+ goto found;
+ } else {
+ /*
+ * File with multiple hardlinks and our backref is to the wrong
+ * directory - linear search:
+ */
+ for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
+ if (k.k->p.inode > dir->ei_inode.bi_inum)
+ break;
+
+ if (k.k->type != KEY_TYPE_dirent)
+ continue;
+
+ d = bkey_s_c_to_dirent(k);
+ ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
+ if (ret < 0)
+ break;
+ if (ret)
+ continue;
+
+ if (subvol_inum_eq(target, inode->ei_inum))
+ goto found;
+ }
+ }
+
+ ret = -ENOENT;
+ goto err;
+found:
+ dirent_name = bch2_dirent_get_name(d);
+
+ name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
+ memcpy(name, dirent_name.name, name_len);
+ name[name_len] = '\0';
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_iter_exit(trans, &iter1);
+ bch2_trans_iter_exit(trans, &iter2);
+ bch2_trans_put(trans);
+
+ return ret;
}
static const struct export_operations bch_export_ops = {
+ .encode_fh = bch2_encode_fh,
.fh_to_dentry = bch2_fh_to_dentry,
.fh_to_parent = bch2_fh_to_parent,
- //.get_parent = bch2_get_parent,
+ .get_parent = bch2_get_parent,
+ .get_name = bch2_get_name,
};
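A note on the handle layout above, with a hedged userspace sketch: exportfs counts fh_len in 32-bit words, so the __packed structs encode to 4 words without a parent (inum + subvol + gen) and 8 words with one. The program below is illustrative only - the paths are assumptions, not part of this patch - but the calls map directly onto these export ops: name_to_handle_at() reaches bch2_encode_fh(), and open_by_handle_at() reaches bch2_fh_to_dentry().

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	/* argv[1]: a file on the bcachefs mount, argv[2]: the mount point */
	if (argc < 3)
		return 1;

	struct file_handle *fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
	int mount_id;

	fh->handle_bytes = MAX_HANDLE_SZ;
	if (name_to_handle_at(AT_FDCWD, argv[1], fh, &mount_id, 0)) {
		perror("name_to_handle_at");	/* -> bch2_encode_fh() */
		return 1;
	}
	printf("fh_type %d, %u bytes\n", fh->handle_type, fh->handle_bytes);

	int mntfd = open(argv[2], O_RDONLY);
	if (mntfd < 0)
		return 1;

	int fd = open_by_handle_at(mntfd, fh, O_RDONLY); /* -> bch2_fh_to_dentry() */
	if (fd < 0)
		perror("open_by_handle_at");	/* needs CAP_DAC_READ_SEARCH */
	else
		close(fd);

	close(mntfd);
	free(fh);
	return 0;
}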
-static void bch2_vfs_inode_init(struct bch_fs *c,
+static void bch2_vfs_inode_init(struct btree_trans *trans,
+ subvol_inum inum,
struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi)
+ struct bch_inode_unpacked *bi,
+ struct bch_subvolume *subvol)
{
- bch2_inode_update_after_write(c, inode, bi, ~0);
+ inode->v.i_ino = inum.inum;
+ inode->ei_inum = inum;
+ inode->ei_inode.bi_inum = inum.inum;
+ bch2_inode_update_after_write(trans, inode, bi, ~0);
inode->v.i_blocks = bi->bi_sectors;
- inode->v.i_ino = bi->bi_inum;
inode->v.i_rdev = bi->bi_dev;
inode->v.i_generation = bi->bi_generation;
inode->v.i_size = bi->bi_size;
- inode->ei_journal_seq = 0;
+ inode->ei_flags = 0;
inode->ei_quota_reserved = 0;
- inode->ei_str_hash = bch2_hash_info_init(c, bi);
+ inode->ei_qid = bch_qid(bi);
- bch2_inode_flags_to_vfs(inode);
+ if (BCH_SUBVOLUME_SNAP(subvol))
+ set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
inode->v.i_mapping->a_ops = &bch_address_space_operations;
@@ -1220,46 +1799,25 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
inode->v.i_op = &bch_special_inode_operations;
break;
}
-}
-static struct inode *bch2_alloc_inode(struct super_block *sb)
-{
- struct bch_inode_info *inode;
-
- inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
- if (!inode)
- return NULL;
-
- inode_init_once(&inode->v);
- mutex_init(&inode->ei_update_lock);
- mutex_init(&inode->ei_quota_lock);
- inode->ei_journal_seq = 0;
-
- return &inode->v;
-}
-
-static void bch2_i_callback(struct rcu_head *head)
-{
- struct inode *vinode = container_of(head, struct inode, i_rcu);
- struct bch_inode_info *inode = to_bch_ei(vinode);
-
- kmem_cache_free(bch2_inode_cache, inode);
+ mapping_set_large_folios(inode->v.i_mapping);
}
-static void bch2_destroy_inode(struct inode *vinode)
+static void bch2_free_inode(struct inode *vinode)
{
- call_rcu(&vinode->i_rcu, bch2_i_callback);
+ kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode));
}
-static int inode_update_times_fn(struct bch_inode_info *inode,
+static int inode_update_times_fn(struct btree_trans *trans,
+ struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
void *p)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- bi->bi_atime = timespec_to_bch2_time(c, inode->v.i_atime);
- bi->bi_mtime = timespec_to_bch2_time(c, inode->v.i_mtime);
- bi->bi_ctime = timespec_to_bch2_time(c, inode->v.i_ctime);
+ bi->bi_atime = timespec_to_bch2_time(c, inode_get_atime(&inode->v));
+ bi->bi_mtime = timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
+ bi->bi_ctime = timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
return 0;
}
@@ -1272,23 +1830,28 @@ static int bch2_vfs_write_inode(struct inode *vinode,
int ret;
mutex_lock(&inode->ei_update_lock);
- ret = __bch2_write_inode(c, inode, inode_update_times_fn, NULL,
- ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
+ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+ ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock);
- if (c->opts.journal_flush_disabled)
- return ret;
-
- if (!ret && wbc->sync_mode == WB_SYNC_ALL)
- ret = bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq);
-
- return ret;
+ return bch2_err_class(ret);
}
static void bch2_evict_inode(struct inode *vinode)
{
struct bch_fs *c = vinode->i_sb->s_fs_info;
struct bch_inode_info *inode = to_bch_ei(vinode);
+ bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v);
+
+ /*
+ * evict() has waited for outstanding writeback, we'll do no more IO
+ * through this inode: it's safe to remove from VFS inode hashtable here
+ *
+ * Do that now so that other threads aren't blocked from pulling it back
+ * in, there's no reason for them to be:
+ */
+ if (!delete)
+ bch2_inode_hash_remove(c, inode);
truncate_inode_pages_final(&inode->v.i_data);
@@ -1296,37 +1859,113 @@ static void bch2_evict_inode(struct inode *vinode)
BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
- if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
+ if (delete) {
bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
- BCH_QUOTA_WARN);
+ KEY_TYPE_QUOTA_WARN);
bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
- BCH_QUOTA_WARN);
- bch2_inode_rm(c, inode->v.i_ino);
+ KEY_TYPE_QUOTA_WARN);
+ bch2_inode_rm(c, inode_inum(inode));
+
+ /*
+ * If we are deleting, we need it present in the vfs hash table
+ * so that fsck can check if unlinked inodes are still open:
+ */
+ bch2_inode_hash_remove(c, inode);
+ }
+
+ mutex_lock(&c->vfs_inodes_lock);
+ list_del_init(&inode->ei_vfs_inode_list);
+ mutex_unlock(&c->vfs_inodes_lock);
+}
+
+void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
+{
+ struct bch_inode_info *inode;
+ DARRAY(struct bch_inode_info *) grabbed;
+ bool clean_pass = false, this_pass_clean;
+
+ /*
+ * Initially, we scan for inodes without I_DONTCACHE, then mark them to
+ * be pruned with d_mark_dontcache().
+ *
+ * Once we've had a clean pass where we didn't find any inodes without
+ * I_DONTCACHE, we wait for them to be freed:
+ */
+
+ darray_init(&grabbed);
+ darray_make_room(&grabbed, 1024);
+again:
+ cond_resched();
+ this_pass_clean = true;
+
+ mutex_lock(&c->vfs_inodes_lock);
+ list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
+ if (!snapshot_list_has_id(s, inode->ei_inum.subvol))
+ continue;
- WARN_ONCE(atomic_long_dec_return(&c->nr_inodes) < 0,
- "nr_inodes < 0");
+ if (!(inode->v.i_state & I_DONTCACHE) &&
+ !(inode->v.i_state & I_FREEING) &&
+ igrab(&inode->v)) {
+ this_pass_clean = false;
+
+ if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
+ iput(&inode->v);
+ break;
+ }
+ } else if (clean_pass && this_pass_clean) {
+ struct wait_bit_queue_entry wqe;
+ struct wait_queue_head *wq_head;
+
+ wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW);
+ prepare_to_wait_event(wq_head, &wqe.wq_entry,
+ TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&c->vfs_inodes_lock);
+
+ schedule();
+ finish_wait(wq_head, &wqe.wq_entry);
+ goto again;
+ }
}
+ mutex_unlock(&c->vfs_inodes_lock);
+
+ darray_for_each(grabbed, i) {
+ inode = *i;
+ d_mark_dontcache(&inode->v);
+ d_prune_aliases(&inode->v);
+ iput(&inode->v);
+ }
+ grabbed.nr = 0;
+
+ if (!clean_pass || !this_pass_clean) {
+ clean_pass = this_pass_clean;
+ goto again;
+ }
+
+ darray_exit(&grabbed);
}
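Stated in words, the loop above alternates passes over c->vfs_inodes_list: any inode in an affected subvolume that is not yet I_DONTCACHE gets grabbed, marked and pruned, and only once a whole pass finds nothing new to mark do we start sleeping on inodes that are still on their way out. A hedged restatement in pseudocode (not part of the patch):

/*
 *	do {
 *		this_pass_clean = true;
 *		for each vfs inode in an affected subvolume:
 *			if it is not yet I_DONTCACHE (and not I_FREEING):
 *				this_pass_clean = false;
 *				igrab(); d_mark_dontcache(); d_prune_aliases(); iput();
 *			else if the previous pass was already clean:
 *				sleep on the inode's __I_NEW waitqueue, then rescan;
 *	} while (we have not had two clean passes in a row);
 */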
static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
struct bch_fs *c = sb->s_fs_info;
- u64 fsid;
+ struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
+ unsigned shift = sb->s_blocksize_bits - 9;
+ /*
+ * this assumes inodes take up 64 bytes, which is a decent average
+ * number:
+ */
+ u64 avail_inodes = ((usage.capacity - usage.used) << 3);
buf->f_type = BCACHEFS_STATFS_MAGIC;
buf->f_bsize = sb->s_blocksize;
- buf->f_blocks = c->capacity >> PAGE_SECTOR_SHIFT;
- buf->f_bfree = bch2_fs_sectors_free(c, bch2_fs_usage_read(c)) >>
- PAGE_SECTOR_SHIFT;
- buf->f_bavail = buf->f_bfree;
- buf->f_files = atomic_long_read(&c->nr_inodes);
- buf->f_ffree = U64_MAX;
-
- fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
- le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
- buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
- buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+ buf->f_blocks = usage.capacity >> shift;
+ buf->f_bfree = usage.free >> shift;
+ buf->f_bavail = avail_factor(usage.free) >> shift;
+
+ buf->f_files = usage.nr_inodes + avail_inodes;
+ buf->f_ffree = avail_inodes;
+
+ buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b);
buf->f_namelen = BCH_NAME_MAX;
return 0;
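The estimate above is plain arithmetic on 512-byte sectors: shift = s_blocksize_bits - 9 converts sectors to filesystem blocks, and the << 3 comes from 512 / 64 = 8 inodes per free sector. A standalone sketch with made-up numbers, just to show the conversions:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t capacity_sectors = 2097152;	/* 1 GiB of 512-byte sectors (made up) */
	uint64_t used_sectors	  = 1048576;
	unsigned blocksize_bits	  = 12;		/* 4096-byte filesystem blocks */
	unsigned shift		  = blocksize_bits - 9;

	uint64_t f_blocks     = capacity_sectors >> shift;			/* 262144 */
	uint64_t avail_inodes = (capacity_sectors - used_sectors) << 3;	/* 8388608 */

	printf("f_blocks=%llu avail_inodes=%llu\n",
	       (unsigned long long) f_blocks,
	       (unsigned long long) avail_inodes);
	return 0;
}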
@@ -1335,249 +1974,251 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
static int bch2_sync_fs(struct super_block *sb, int wait)
{
struct bch_fs *c = sb->s_fs_info;
+ int ret;
+
+ trace_bch2_sync_fs(sb, wait);
+
+ if (c->opts.journal_flush_disabled)
+ return 0;
if (!wait) {
bch2_journal_flush_async(&c->journal, NULL);
return 0;
}
- return bch2_journal_flush(&c->journal);
+ ret = bch2_journal_flush(&c->journal);
+ return bch2_err_class(ret);
}
-static struct bch_fs *bch2_path_to_fs(const char *dev)
+static struct bch_fs *bch2_path_to_fs(const char *path)
{
struct bch_fs *c;
- struct block_device *bdev = lookup_bdev(dev);
-
- if (IS_ERR(bdev))
- return ERR_CAST(bdev);
-
- c = bch2_bdev_to_fs(bdev);
- bdput(bdev);
- return c ?: ERR_PTR(-ENOENT);
-}
-
-static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * const *devs,
- unsigned nr_devs, struct bch_opts opts)
-{
- struct bch_fs *c, *c1, *c2;
- size_t i;
-
- if (!nr_devs)
- return ERR_PTR(-EINVAL);
-
- c = bch2_fs_open(devs, nr_devs, opts);
-
- if (IS_ERR(c) && PTR_ERR(c) == -EBUSY) {
- /*
- * Already open?
- * Look up each block device, make sure they all belong to a
- * filesystem and they all belong to the _same_ filesystem
- */
-
- c1 = bch2_path_to_fs(devs[0]);
- if (!c1)
- return c;
-
- for (i = 1; i < nr_devs; i++) {
- c2 = bch2_path_to_fs(devs[i]);
- if (!IS_ERR(c2))
- closure_put(&c2->cl);
-
- if (c1 != c2) {
- closure_put(&c1->cl);
- return c;
- }
- }
-
- c = c1;
- }
-
- if (IS_ERR(c))
- return c;
+ dev_t dev;
+ int ret;
- mutex_lock(&c->state_lock);
+ ret = lookup_bdev(path, &dev);
+ if (ret)
+ return ERR_PTR(ret);
- if (!bch2_fs_running(c)) {
- mutex_unlock(&c->state_lock);
+ c = bch2_dev_to_fs(dev);
+ if (c)
closure_put(&c->cl);
- pr_err("err mounting %s: incomplete filesystem", dev_name);
- return ERR_PTR(-EINVAL);
- }
-
- mutex_unlock(&c->state_lock);
-
- set_bit(BCH_FS_BDEV_MOUNTED, &c->flags);
- return c;
-}
-
-static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name,
- struct bch_opts opts)
-{
- char *dev_name = NULL, **devs = NULL, *s;
- struct bch_fs *c = ERR_PTR(-ENOMEM);
- size_t i, nr_devs = 0;
-
- dev_name = kstrdup(_dev_name, GFP_KERNEL);
- if (!dev_name)
- goto err;
-
- for (s = dev_name; s; s = strchr(s + 1, ':'))
- nr_devs++;
-
- devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL);
- if (!devs)
- goto err;
-
- for (i = 0, s = dev_name;
- s;
- (s = strchr(s, ':')) && (*s++ = '\0'))
- devs[i++] = s;
-
- c = __bch2_open_as_blockdevs(_dev_name, devs, nr_devs, opts);
-err:
- kfree(devs);
- kfree(dev_name);
- return c;
+ return c ?: ERR_PTR(-ENOENT);
}
-static int bch2_remount(struct super_block *sb, int *flags, char *data)
+static int bch2_remount(struct super_block *sb, int *flags,
+ struct bch_opts opts)
{
struct bch_fs *c = sb->s_fs_info;
- struct bch_opts opts = bch2_opts_empty();
- int ret;
-
- opt_set(opts, read_only, (*flags & MS_RDONLY) != 0);
+ int ret = 0;
- ret = bch2_parse_mount_opts(&opts, data);
- if (ret)
- return ret;
+ opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
if (opts.read_only != c->opts.read_only) {
- const char *err = NULL;
-
- mutex_lock(&c->state_lock);
+ down_write(&c->state_lock);
if (opts.read_only) {
bch2_fs_read_only(c);
- sb->s_flags |= MS_RDONLY;
+ sb->s_flags |= SB_RDONLY;
} else {
- err = bch2_fs_read_write(c);
- if (err) {
- bch_err(c, "error going rw: %s", err);
- return -EINVAL;
+ ret = bch2_fs_read_write(c);
+ if (ret) {
+ bch_err(c, "error going rw: %i", ret);
+ up_write(&c->state_lock);
+ ret = -EINVAL;
+ goto err;
}
- sb->s_flags &= ~MS_RDONLY;
+ sb->s_flags &= ~SB_RDONLY;
}
c->opts.read_only = opts.read_only;
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
}
- if (opts.errors >= 0)
+ if (opt_defined(opts, errors))
c->opts.errors = opts.errors;
+err:
+ return bch2_err_class(ret);
+}
- return ret;
+static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
+{
+ struct bch_fs *c = root->d_sb->s_fs_info;
+ bool first = true;
+
+ for_each_online_member(c, ca) {
+ if (!first)
+ seq_putc(seq, ':');
+ first = false;
+ seq_puts(seq, ca->disk_sb.sb_name);
+ }
+
+ return 0;
}
static int bch2_show_options(struct seq_file *seq, struct dentry *root)
{
struct bch_fs *c = root->d_sb->s_fs_info;
- enum bch_opt_id i;
- char buf[512];
+ struct printbuf buf = PRINTBUF;
- for (i = 0; i < bch2_opts_nr; i++) {
- const struct bch_option *opt = &bch2_opt_table[i];
- u64 v = bch2_opt_get_by_id(&c->opts, i);
+ bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
+ OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
+ printbuf_nul_terminate(&buf);
+ seq_printf(seq, ",%s", buf.buf);
- if (opt->mode < OPT_MOUNT)
- continue;
+ int ret = buf.allocation_failure ? -ENOMEM : 0;
+ printbuf_exit(&buf);
+ return ret;
+}
- if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
- continue;
+static void bch2_put_super(struct super_block *sb)
+{
+ struct bch_fs *c = sb->s_fs_info;
- bch2_opt_to_text(c, buf, sizeof(buf), opt, v,
- OPT_SHOW_MOUNT_STYLE);
- seq_putc(seq, ',');
- seq_puts(seq, buf);
- }
+ __bch2_fs_stop(c);
+}
+/*
+ * bcachefs doesn't currently integrate intwrite freeze protection but the
+ * internal write references serve the same purpose. Therefore reuse the
+ * read-only transition code to perform the quiesce. The caveat is that we don't
+ * currently have the ability to block tasks that want a write reference while
+ * the superblock is frozen. This is fine for now, but we should either add
+ * blocking support or find a way to integrate sb_start_intwrite() and friends.
+ */
+static int bch2_freeze(struct super_block *sb)
+{
+ struct bch_fs *c = sb->s_fs_info;
+
+ down_write(&c->state_lock);
+ bch2_fs_read_only(c);
+ up_write(&c->state_lock);
return 0;
+}
+
+static int bch2_unfreeze(struct super_block *sb)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ int ret;
+ if (test_bit(BCH_FS_emergency_ro, &c->flags))
+ return 0;
+
+ down_write(&c->state_lock);
+ ret = bch2_fs_read_write(c);
+ up_write(&c->state_lock);
+ return ret;
}
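These two hooks are reachable from userspace through the generic FIFREEZE/FITHAW ioctls, the same path fsfreeze(8) takes. A hedged sketch - the mount point is an assumption, not part of this patch:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>		/* FIFREEZE, FITHAW */

int main(void)
{
	int fd = open("/mnt/bcachefs", O_RDONLY);	/* hypothetical mount point */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (ioctl(fd, FIFREEZE, 0))			/* ends up in bch2_freeze() */
		perror("FIFREEZE");
	else if (ioctl(fd, FITHAW, 0))			/* ends up in bch2_unfreeze() */
		perror("FITHAW");

	close(fd);
	return 0;
}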
static const struct super_operations bch_super_operations = {
.alloc_inode = bch2_alloc_inode,
- .destroy_inode = bch2_destroy_inode,
+ .free_inode = bch2_free_inode,
.write_inode = bch2_vfs_write_inode,
.evict_inode = bch2_evict_inode,
.sync_fs = bch2_sync_fs,
.statfs = bch2_statfs,
+ .show_devname = bch2_show_devname,
.show_options = bch2_show_options,
- .remount_fs = bch2_remount,
-#if 0
.put_super = bch2_put_super,
.freeze_fs = bch2_freeze,
.unfreeze_fs = bch2_unfreeze,
-#endif
};
-static int bch2_test_super(struct super_block *s, void *data)
-{
- return s->s_fs_info == data;
-}
-
static int bch2_set_super(struct super_block *s, void *data)
{
s->s_fs_info = data;
return 0;
}
-static struct dentry *bch2_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int bch2_noset_super(struct super_block *s, void *data)
+{
+ return -EBUSY;
+}
+
+typedef DARRAY(struct bch_fs *) darray_fs;
+
+static int bch2_test_super(struct super_block *s, void *data)
+{
+ struct bch_fs *c = s->s_fs_info;
+ darray_fs *d = data;
+
+ if (!c)
+ return false;
+
+ darray_for_each(*d, i)
+ if (c != *i)
+ return false;
+ return true;
+}
+
+static int bch2_fs_get_tree(struct fs_context *fc)
{
struct bch_fs *c;
- struct bch_dev *ca;
struct super_block *sb;
struct inode *vinode;
- struct bch_opts opts = bch2_opts_empty();
- unsigned i;
+ struct bch2_opts_parse *opts_parse = fc->fs_private;
+ struct bch_opts opts = opts_parse->opts;
+ darray_str devs;
+ darray_fs devs_to_fs = {};
int ret;
- opt_set(opts, read_only, (flags & MS_RDONLY) != 0);
+ opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
+ opt_set(opts, nostart, true);
- ret = bch2_parse_mount_opts(&opts, data);
- if (ret)
- return ERR_PTR(ret);
+ if (!fc->source || strlen(fc->source) == 0)
+ return -EINVAL;
- c = bch2_open_as_blockdevs(dev_name, opts);
- if (IS_ERR(c))
- return ERR_CAST(c);
+ ret = bch2_split_devs(fc->source, &devs);
+ if (ret)
+ return ret;
- sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|MS_NOSEC, c);
- if (IS_ERR(sb)) {
- closure_put(&c->cl);
- return ERR_CAST(sb);
+ darray_for_each(devs, i) {
+ ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
+ if (ret)
+ goto err;
}
- BUG_ON(sb->s_fs_info != c);
+ sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs);
+ if (!IS_ERR(sb))
+ goto got_sb;
- if (sb->s_root) {
- closure_put(&c->cl);
+ c = bch2_fs_open(devs.data, devs.nr, opts);
+ ret = PTR_ERR_OR_ZERO(c);
+ if (ret)
+ goto err;
- if ((flags ^ sb->s_flags) & MS_RDONLY) {
+ /* Some options can't be parsed until after the fs is started: */
+ opts = bch2_opts_empty();
+ ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf);
+ if (ret)
+ goto err_stop_fs;
+
+ bch2_opts_apply(&c->opts, opts);
+
+ ret = bch2_fs_start(c);
+ if (ret)
+ goto err_stop_fs;
+
+ sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);
+ ret = PTR_ERR_OR_ZERO(sb);
+ if (ret)
+ goto err_stop_fs;
+got_sb:
+ c = sb->s_fs_info;
+
+ if (sb->s_root) {
+ if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) {
ret = -EBUSY;
goto err_put_super;
}
goto out;
}
- /* XXX: blocksize */
- sb->s_blocksize = PAGE_SIZE;
- sb->s_blocksize_bits = PAGE_SHIFT;
+ sb->s_blocksize = block_bytes(c);
+ sb->s_blocksize_bits = ilog2(block_bytes(c));
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_op = &bch_super_operations;
sb->s_export_op = &bch_export_ops;
@@ -1587,19 +2228,22 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
#endif
sb->s_xattr = bch2_xattr_handlers;
sb->s_magic = BCACHEFS_STATFS_MAGIC;
- sb->s_time_gran = c->sb.time_precision;
+ sb->s_time_gran = c->sb.nsec_per_time_unit;
+ sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
+ sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
+ super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid));
+ super_set_sysfs_name_uuid(sb);
+ sb->s_shrink->seeks = 0;
c->vfs_sb = sb;
- strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
+ strscpy(sb->s_id, c->name, sizeof(sb->s_id));
ret = super_setup_bdi(sb);
if (ret)
goto err_put_super;
- sb->s_bdi->congested_fn = bch2_congested;
- sb->s_bdi->congested_data = c;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
struct block_device *bdev = ca->disk_sb.bdev;
/* XXX: create an anonymous device for multi device filesystems */
@@ -1609,30 +2253,54 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
break;
}
+ c->dev = sb->s_dev;
+
#ifdef CONFIG_BCACHEFS_POSIX_ACL
if (c->opts.acl)
- sb->s_flags |= MS_POSIXACL;
+ sb->s_flags |= SB_POSIXACL;
#endif
- vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO);
- if (IS_ERR(vinode)) {
- ret = PTR_ERR(vinode);
+ sb->s_shrink->seeks = 0;
+
+ vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
+ ret = PTR_ERR_OR_ZERO(vinode);
+ bch_err_msg(c, ret, "mounting: error getting root inode");
+ if (ret)
goto err_put_super;
- }
sb->s_root = d_make_root(vinode);
if (!sb->s_root) {
+ bch_err(c, "error mounting: error allocating root dentry");
ret = -ENOMEM;
goto err_put_super;
}
- sb->s_flags |= MS_ACTIVE;
+ sb->s_flags |= SB_ACTIVE;
out:
- return dget(sb->s_root);
+ fc->root = dget(sb->s_root);
+err:
+ darray_exit(&devs_to_fs);
+ bch2_darray_str_exit(&devs);
+ if (ret)
+ pr_err("error: %s", bch2_err_str(ret));
+ /*
+ * On an inconsistency error in recovery we might see an -EROFS derived
+ * errorcode (from the journal), but we don't want to return that to
+ * userspace as that causes util-linux to retry the mount RO - which is
+ * confusing:
+ */
+ if (bch2_err_matches(ret, EROFS) && ret != -EROFS)
+ ret = -EIO;
+ return bch2_err_class(ret);
+
+err_stop_fs:
+ bch2_fs_stop(c);
+ goto err;
err_put_super:
+ __bch2_fs_stop(c);
deactivate_locked_super(sb);
- return ERR_PTR(ret);
+ goto err;
}
static void bch2_kill_sb(struct super_block *sb)
@@ -1640,19 +2308,93 @@ static void bch2_kill_sb(struct super_block *sb)
struct bch_fs *c = sb->s_fs_info;
generic_shutdown_super(sb);
+ bch2_fs_free(c);
+}
- if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags))
- bch2_fs_stop(c);
- else
- closure_put(&c->cl);
+static void bch2_fs_context_free(struct fs_context *fc)
+{
+ struct bch2_opts_parse *opts = fc->fs_private;
+
+ if (opts) {
+ printbuf_exit(&opts->parse_later);
+ kfree(opts);
+ }
+}
+
+static int bch2_fs_parse_param(struct fs_context *fc,
+ struct fs_parameter *param)
+{
+ /*
+ * the "source" param, i.e., the name of the device(s) to mount,
+ * is handled by the VFS layer.
+ */
+ if (!strcmp(param->key, "source"))
+ return -ENOPARAM;
+
+ struct bch2_opts_parse *opts = fc->fs_private;
+ struct bch_fs *c = NULL;
+
+ /* for reconfigure, we already have a struct bch_fs */
+ if (fc->root)
+ c = fc->root->d_sb->s_fs_info;
+
+ int ret = bch2_parse_one_mount_opt(c, &opts->opts,
+ &opts->parse_later, param->key,
+ param->string);
+
+ return bch2_err_class(ret);
+}
+
+static int bch2_fs_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
+ struct bch2_opts_parse *opts = fc->fs_private;
+
+ return bch2_remount(sb, &fc->sb_flags, opts->opts);
+}
+
+static const struct fs_context_operations bch2_context_ops = {
+ .free = bch2_fs_context_free,
+ .parse_param = bch2_fs_parse_param,
+ .get_tree = bch2_fs_get_tree,
+ .reconfigure = bch2_fs_reconfigure,
+};
+
+static int bch2_init_fs_context(struct fs_context *fc)
+{
+ struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL);
+
+ if (!opts)
+ return -ENOMEM;
+
+ opts->parse_later = PRINTBUF;
+
+ fc->ops = &bch2_context_ops;
+ fc->fs_private = opts;
+
+ return 0;
+}
+
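For orientation, a hedged sketch of the new-mount-API sequence these fs_context hooks serve; the device paths, mount point and example option are assumptions, not part of this patch, and the raw syscalls assume headers that provide them. Every fsconfig() key except "source" lands in bch2_fs_parse_param() ("source" is bounced back to the VFS with -ENOPARAM), and FSCONFIG_CMD_CREATE is what triggers bch2_fs_get_tree():

#include <fcntl.h>		/* AT_FDCWD */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/mount.h>	/* FSCONFIG_*, MOVE_MOUNT_F_EMPTY_PATH */

int main(void)
{
	int fsfd = syscall(SYS_fsopen, "bcachefs", 0);
	if (fsfd < 0) {
		perror("fsopen");
		return 1;
	}

	/* each option lands in bch2_fs_parse_param(); "compression" is just an example */
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "source", "/dev/sda:/dev/sdb", 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "compression", "lz4", 0);

	/* FSCONFIG_CMD_CREATE triggers bch2_fs_get_tree() */
	if (syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
		perror("fsconfig(FSCONFIG_CMD_CREATE)");
		return 1;
	}

	int mfd = syscall(SYS_fsmount, fsfd, 0, 0);
	if (mfd >= 0)
		syscall(SYS_move_mount, mfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
	return 0;
}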
+void bch2_fs_vfs_exit(struct bch_fs *c)
+{
+ if (c->vfs_inodes_by_inum_table.ht.tbl)
+ rhltable_destroy(&c->vfs_inodes_by_inum_table);
+ if (c->vfs_inodes_table.tbl)
+ rhashtable_destroy(&c->vfs_inodes_table);
+}
+
+int bch2_fs_vfs_init(struct bch_fs *c)
+{
+ return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params) ?:
+ rhltable_init(&c->vfs_inodes_by_inum_table, &bch2_vfs_inodes_by_inum_params);
}
static struct file_system_type bcache_fs_type = {
- .owner = THIS_MODULE,
- .name = "bcachefs",
- .mount = bch2_mount,
- .kill_sb = bch2_kill_sb,
- .fs_flags = FS_REQUIRES_DEV,
+ .owner = THIS_MODULE,
+ .name = "bcachefs",
+ .init_fs_context = bch2_init_fs_context,
+ .kill_sb = bch2_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("bcachefs");
@@ -1660,15 +2402,15 @@ MODULE_ALIAS_FS("bcachefs");
void bch2_vfs_exit(void)
{
unregister_filesystem(&bcache_fs_type);
- if (bch2_inode_cache)
- kmem_cache_destroy(bch2_inode_cache);
+ kmem_cache_destroy(bch2_inode_cache);
}
int __init bch2_vfs_init(void)
{
int ret = -ENOMEM;
- bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0);
+ bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT |
+ SLAB_ACCOUNT);
if (!bch2_inode_cache)
goto err;
diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h
index e2fc2706..dd219854 100644
--- a/libbcachefs/fs.h
+++ b/libbcachefs/fs.h
@@ -1,74 +1,212 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FS_H
#define _BCACHEFS_FS_H
+#include "inode.h"
#include "opts.h"
#include "str_hash.h"
#include "quota_types.h"
+#include "two_state_shared_lock.h"
#include <linux/seqlock.h>
#include <linux/stat.h>
struct bch_inode_info {
struct inode v;
+ struct rhash_head hash;
+ struct rhlist_head by_inum_hash;
+ subvol_inum ei_inum;
+
+ struct list_head ei_vfs_inode_list;
+ unsigned long ei_flags;
struct mutex ei_update_lock;
- u64 ei_journal_seq;
u64 ei_quota_reserved;
unsigned long ei_last_dirtied;
+ two_state_lock_t ei_pagecache_lock;
struct mutex ei_quota_lock;
struct bch_qid ei_qid;
- struct bch_hash_info ei_str_hash;
+ /*
+ * When we've been doing nocow writes we'll need to issue flushes to the
+ * underlying block devices
+ *
+ * XXX: a device may have had a flush issued by some other codepath. It
+ * would be better to keep for each device a sequence number that's
+	 * incremented when we issue a cache flush, and track here the sequence
+ * number that needs flushing.
+ */
+ struct bch_devs_mask ei_devs_need_flush;
/* copy of inode in btree: */
struct bch_inode_unpacked ei_inode;
};
+#define bch2_pagecache_add_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 0)
+#define bch2_pagecache_add_tryget(i) bch2_two_state_trylock(&i->ei_pagecache_lock, 0)
+#define bch2_pagecache_add_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 0)
+
+#define bch2_pagecache_block_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 1)
+#define bch2_pagecache_block_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 1)
+
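A hedged sketch of how these wrappers pair up (illustrative, not taken from this patch): buffered IO paths take the shared "add" side so folios may be added to the page cache, while truncate- and fallocate-style operations take the exclusive "block" side:

/*
 *	bch2_pagecache_add_get(inode);		// e.g. in the buffered write path
 *	... add and dirty folios in inode->v.i_mapping ...
 *	bch2_pagecache_add_put(inode);
 *
 *	bch2_pagecache_block_get(inode);	// e.g. around truncate
 *	... shoot down the page cache for the affected range ...
 *	bch2_pagecache_block_put(inode);
 */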
+static inline subvol_inum inode_inum(struct bch_inode_info *inode)
+{
+ return inode->ei_inum;
+}
+
+/*
+ * Set if we've gotten a btree error for this inode, and thus the vfs inode and
+ * btree inode may be inconsistent:
+ */
+#define EI_INODE_ERROR 0
+
+/*
+ * Set if the inode is in a snapshot subvolume - we don't do quota accounting in
+ * those:
+ */
+#define EI_INODE_SNAPSHOT 1
+#define EI_INODE_HASHED 2
+
#define to_bch_ei(_inode) \
container_of_or_null(_inode, struct bch_inode_info, v)
+static inline int ptrcmp(void *l, void *r)
+{
+ return cmp_int(l, r);
+}
+
+enum bch_inode_lock_op {
+ INODE_PAGECACHE_BLOCK = (1U << 0),
+ INODE_UPDATE_LOCK = (1U << 1),
+};
+
+#define bch2_lock_inodes(_locks, ...) \
+do { \
+ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \
+ unsigned i; \
+ \
+ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \
+ \
+ for (i = 1; i < ARRAY_SIZE(a); i++) \
+ if (a[i] != a[i - 1]) { \
+ if ((_locks) & INODE_PAGECACHE_BLOCK) \
+ bch2_pagecache_block_get(a[i]);\
+ if ((_locks) & INODE_UPDATE_LOCK) \
+ mutex_lock_nested(&a[i]->ei_update_lock, i);\
+ } \
+} while (0)
+
+#define bch2_unlock_inodes(_locks, ...) \
+do { \
+ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \
+ unsigned i; \
+ \
+ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \
+ \
+ for (i = 1; i < ARRAY_SIZE(a); i++) \
+ if (a[i] != a[i - 1]) { \
+ if ((_locks) & INODE_PAGECACHE_BLOCK) \
+ bch2_pagecache_block_put(a[i]);\
+ if ((_locks) & INODE_UPDATE_LOCK) \
+ mutex_unlock(&a[i]->ei_update_lock); \
+ } \
+} while (0)
+
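Sorting by pointer value gives every caller the same global lock order, so two tasks locking overlapping sets of inodes cannot deadlock against each other. A hedged usage sketch (rename-style; the variable names are illustrative):

/*
 *	bch2_lock_inodes(INODE_UPDATE_LOCK,
 *			 src_dir, dst_dir, src_inode, dst_inode);
 *	... update both directories and both inodes in one transaction ...
 *	bch2_unlock_inodes(INODE_UPDATE_LOCK,
 *			   src_dir, dst_dir, src_inode, dst_inode);
 */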
static inline struct bch_inode_info *file_bch_inode(struct file *file)
{
return to_bch_ei(file_inode(file));
}
-static inline u8 mode_to_type(umode_t mode)
+static inline bool inode_attr_changing(struct bch_inode_info *dir,
+ struct bch_inode_info *inode,
+ enum inode_opt_id id)
{
- return (mode >> 12) & 15;
+ return !(inode->ei_inode.bi_fields_set & (1 << id)) &&
+ bch2_inode_opt_get(&dir->ei_inode, id) !=
+ bch2_inode_opt_get(&inode->ei_inode, id);
}
-static inline unsigned nlink_bias(umode_t mode)
+static inline bool inode_attrs_changing(struct bch_inode_info *dir,
+ struct bch_inode_info *inode)
{
- return S_ISDIR(mode) ? 2 : 1;
+ unsigned id;
+
+ for (id = 0; id < Inode_opt_nr; id++)
+ if (inode_attr_changing(dir, inode, id))
+ return true;
+
+ return false;
}
struct bch_inode_unpacked;
#ifndef NO_BCACHEFS_FS
+struct bch_inode_info *
+__bch2_create(struct mnt_idmap *, struct bch_inode_info *,
+ struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
+
+int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p);
+
+int bch2_fs_quota_transfer(struct bch_fs *,
+ struct bch_inode_info *,
+ struct bch_qid,
+ unsigned,
+ enum quota_acct_mode);
+
+static inline int bch2_set_projid(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ u32 projid)
+{
+ struct bch_qid qid = inode->ei_qid;
+
+ qid.q[QTYP_PRJ] = projid;
+
+ return bch2_fs_quota_transfer(c, inode, qid,
+ 1 << QTYP_PRJ,
+ KEY_TYPE_QUOTA_PREALLOC);
+}
+
+struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum);
+
/* returns 0 if we want to do the update, or error is passed up */
-typedef int (*inode_set_fn)(struct bch_inode_info *,
+typedef int (*inode_set_fn)(struct btree_trans *,
+ struct bch_inode_info *,
struct bch_inode_unpacked *, void *);
-void bch2_inode_update_after_write(struct bch_fs *,
+void bch2_inode_update_after_write(struct btree_trans *,
struct bch_inode_info *,
struct bch_inode_unpacked *,
unsigned);
-int __must_check bch2_write_inode_trans(struct btree_trans *,
- struct bch_inode_info *,
- struct bch_inode_unpacked *,
- inode_set_fn, void *);
-int __must_check __bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
- inode_set_fn, void *, unsigned);
-int __must_check bch2_write_inode(struct bch_fs *,
- struct bch_inode_info *);
+int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
+ inode_set_fn, void *, unsigned);
+
+int bch2_setattr_nonsize(struct mnt_idmap *,
+ struct bch_inode_info *,
+ struct iattr *);
+int __bch2_unlink(struct inode *, struct dentry *, bool);
+
+void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *);
+
+void bch2_fs_vfs_exit(struct bch_fs *);
+int bch2_fs_vfs_init(struct bch_fs *);
void bch2_vfs_exit(void);
int bch2_vfs_init(void);
#else
+#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); })
+
+static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; }
+
+static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
+ snapshot_id_list *s) {}
+
+static inline void bch2_fs_vfs_exit(struct bch_fs *c) {}
+static inline int bch2_fs_vfs_init(struct bch_fs *c) { return 0; }
+
static inline void bch2_vfs_exit(void) {}
static inline int bch2_vfs_init(void) { return 0; }
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index f6035cc7..1a5a0711 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -1,1362 +1,3433 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bcachefs_ioctl.h"
+#include "bkey_buf.h"
+#include "btree_cache.h"
#include "btree_update.h"
+#include "buckets.h"
+#include "darray.h"
#include "dirent.h"
#include "error.h"
#include "fs.h"
+#include "fs-common.h"
#include "fsck.h"
#include "inode.h"
#include "keylist.h"
+#include "recovery_passes.h"
+#include "snapshot.h"
#include "super.h"
+#include "thread_with_file.h"
#include "xattr.h"
+#include <linux/bsearch.h>
#include <linux/dcache.h> /* struct qstr */
-#include <linux/generic-radix-tree.h>
-#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
+ struct bkey_s_c_dirent d)
+{
+ return inode->bi_dir == d.k->p.inode &&
+ inode->bi_dir_offset == d.k->p.offset;
+}
+
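For orientation: bi_dir and bi_dir_offset are the backpointer an inode keeps to the dirent that references it, and they are simply the dirent key's btree position, so the check above is a straight comparison. Sketched out (field names as used in this file):

/*
 *	dirent key position:	POS(directory inode number, hash-derived offset)
 *	inode backpointer:	bi_dir        == d.k->p.inode
 *				bi_dir_offset == d.k->p.offset
 */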
+static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *inode)
+{
+ if (d.v->d_type == DT_SUBVOL
+ ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
+ : le64_to_cpu(d.v->d_inum) == inode->bi_inum)
+ return 0;
+ return -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+}
+
+static void dirent_inode_mismatch_msg(struct printbuf *out,
+ struct bch_fs *c,
+ struct bkey_s_c_dirent dirent,
+ struct bch_inode_unpacked *inode)
+{
+ prt_str(out, "inode points to dirent that does not point back:");
+ prt_newline(out);
+ bch2_bkey_val_to_text(out, c, dirent.s_c);
+ prt_newline(out);
+ bch2_inode_unpacked_to_text(out, inode);
+}
+
+static int dirent_points_to_inode(struct bch_fs *c,
+ struct bkey_s_c_dirent dirent,
+ struct bch_inode_unpacked *inode)
+{
+ int ret = dirent_points_to_inode_nowarn(dirent, inode);
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+ dirent_inode_mismatch_msg(&buf, c, dirent, inode);
+ bch_warn(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+ return ret;
+}
-static int remove_dirent(struct bch_fs *c, struct btree_iter *iter,
- struct bkey_s_c_dirent dirent)
+/*
+ * XXX: this is handling transaction restarts without returning
+ * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
+ */
+static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
+ u32 snapshot)
{
- struct qstr name;
+ u64 sectors = 0;
+
+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
+ SPOS(inum, 0, snapshot),
+ POS(inum, U64_MAX),
+ 0, k, ({
+ if (bkey_extent_is_allocation(k.k))
+ sectors += k.k->size;
+ 0;
+ }));
+
+ return ret ?: sectors;
+}
+
+static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
+ u32 snapshot)
+{
+ u64 subdirs = 0;
+
+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents,
+ SPOS(inum, 0, snapshot),
+ POS(inum, U64_MAX),
+ 0, k, ({
+ if (k.k->type == KEY_TYPE_dirent &&
+ bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
+ subdirs++;
+ 0;
+ }));
+
+ return ret ?: subdirs;
+}
+
+static int subvol_lookup(struct btree_trans *trans, u32 subvol,
+ u32 *snapshot, u64 *inum)
+{
+ struct bch_subvolume s;
+ int ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
+
+ *snapshot = le32_to_cpu(s.snapshot);
+ *inum = le64_to_cpu(s.inode);
+ return ret;
+}
+
+static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
+ struct bch_inode_unpacked *inode)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != inode_nr)
+ break;
+ if (!bkey_is_inode(k.k))
+ continue;
+ ret = bch2_inode_unpack(k, inode);
+ goto found;
+ }
+ ret = -BCH_ERR_ENOENT_inode;
+found:
+ bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot,
+ struct bch_inode_unpacked *inode)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, inode_nr, snapshot), 0);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ ret = bkey_is_inode(k.k)
+ ? bch2_inode_unpack(k, inode)
+ : -BCH_ERR_ENOENT_inode;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int lookup_dirent_in_snapshot(struct btree_trans *trans,
+ struct bch_hash_info hash_info,
+ subvol_inum dir, struct qstr *name,
+ u64 *target, unsigned *type, u32 snapshot)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
+ &hash_info, dir, name, 0, snapshot);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+ *target = le64_to_cpu(d.v->d_inum);
+ *type = d.v->d_type;
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+}
+
+static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
struct bch_inode_unpacked dir_inode;
struct bch_hash_info dir_hash_info;
- u64 dir_inum = dirent.k->p.inode;
int ret;
- char *buf;
- name.len = bch2_dirent_name_bytes(dirent);
- buf = kmalloc(name.len + 1, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
+ ret = lookup_first_inode(trans, pos.inode, &dir_inode);
+ if (ret)
+ goto err;
- memcpy(buf, dirent.v->d_name, name.len);
- buf[name.len] = '\0';
- name.name = buf;
+ dir_hash_info = bch2_hash_info_init(c, &dir_inode);
- /* Unlock iter so we don't deadlock, after copying name: */
- bch2_btree_iter_unlock(iter);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
- ret = bch2_inode_find_by_inum(c, dir_inum, &dir_inode);
- if (ret) {
- bch_err(c, "remove_dirent: err %i looking up directory inode", ret);
- goto err;
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+ &dir_hash_info, &iter,
+ BTREE_UPDATE_internal_snapshot_node);
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+/* Get lost+found, create if it doesn't exist: */
+static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
+ struct bch_inode_unpacked *lostfound,
+ u64 reattaching_inum)
+{
+ struct bch_fs *c = trans->c;
+ struct qstr lostfound_str = QSTR("lost+found");
+ struct btree_iter lostfound_iter = { NULL };
+ u64 inum = 0;
+ unsigned d_type = 0;
+ int ret;
+
+ struct bch_snapshot_tree st;
+ ret = bch2_snapshot_tree_lookup(trans,
+ bch2_snapshot_tree(c, snapshot), &st);
+ if (ret)
+ return ret;
+
+ subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
+
+ struct bch_subvolume subvol;
+ ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol),
+ false, 0, &subvol);
+ bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u",
+ le32_to_cpu(st.master_subvol), snapshot);
+ if (ret)
+ return ret;
+
+ if (!subvol.inode) {
+ struct btree_iter iter;
+ struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)),
+ 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(subvol);
+ if (ret)
+ return ret;
+
+ subvol->v.inode = cpu_to_le64(reattaching_inum);
+ bch2_trans_iter_exit(trans, &iter);
}
- dir_hash_info = bch2_hash_info_init(c, &dir_inode);
+ root_inum.inum = le64_to_cpu(subvol.inode);
+
+ struct bch_inode_unpacked root_inode;
+ struct bch_hash_info root_hash_info;
+ ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode);
+ bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
+ root_inum.inum, le32_to_cpu(st.master_subvol));
+ if (ret)
+ return ret;
+
+ root_hash_info = bch2_hash_info_init(c, &root_inode);
+
+ ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
+ &lostfound_str, &inum, &d_type, snapshot);
+ if (bch2_err_matches(ret, ENOENT))
+ goto create_lostfound;
+
+ bch_err_fn(c, ret);
+ if (ret)
+ return ret;
+
+ if (d_type != DT_DIR) {
+ bch_err(c, "error looking up lost+found: not a directory");
+ return -BCH_ERR_ENOENT_not_directory;
+ }
+
+ /*
+ * The bch2_check_dirents pass has already run, dangling dirents
+ * shouldn't exist here:
+ */
+ ret = lookup_inode(trans, inum, snapshot, lostfound);
+ bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
+ inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
+ return ret;
+
+create_lostfound:
+ /*
+ * we always create lost+found in the root snapshot; we don't want
+ * different branches of the snapshot tree to have different lost+found
+ */
+ snapshot = le32_to_cpu(st.root_snapshot);
+ /*
+ * XXX: we could have a nicer log message here if we had a nice way to
+ * walk backpointers to print a path
+ */
+ struct printbuf path = PRINTBUF;
+ ret = bch2_inum_to_path(trans, root_inum, &path);
+ if (ret)
+ goto err;
+
+ bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u",
+ path.buf, root_inum.subvol, snapshot);
+ printbuf_exit(&path);
+
+ u64 now = bch2_current_time(c);
+ u64 cpu = raw_smp_processor_id();
+
+ bch2_inode_init_early(c, lostfound);
+ bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
+ lostfound->bi_dir = root_inode.bi_inum;
+ lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);
- ret = bch2_dirent_delete(c, dir_inum, &dir_hash_info, &name, NULL);
+ root_inode.bi_nlink++;
+
+ ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
if (ret)
- bch_err(c, "remove_dirent: err %i deleting dirent", ret);
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
+ ret = bch2_btree_iter_traverse(&lostfound_iter);
+ if (ret)
+ goto err;
+
+ ret = bch2_dirent_create_snapshot(trans,
+ 0, root_inode.bi_inum, snapshot, &root_hash_info,
+ mode_to_type(lostfound->bi_mode),
+ &lostfound_str,
+ lostfound->bi_inum,
+ &lostfound->bi_dir_offset,
+ STR_HASH_must_create) ?:
+ bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
+ BTREE_UPDATE_internal_snapshot_node);
+err:
+ bch_err_msg(c, ret, "creating lost+found");
+ bch2_trans_iter_exit(trans, &lostfound_iter);
+ return ret;
+}
+
+static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
+{
+ if (inode->bi_inum == BCACHEFS_ROOT_INO &&
+ inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
+ return false;
+
+ return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
+}
+
+static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
+ SPOS(d_pos.inode, d_pos.offset, snapshot),
+ BTREE_ITER_intent|
+ BTREE_ITER_with_updates);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (bpos_eq(k.k->p, d_pos)) {
+ /*
+		 * bch2_btree_delete_at() doesn't work because the update path doesn't
+ * internally use BTREE_ITER_with_updates yet
+ */
+ struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
+ ret = PTR_ERR_OR_ZERO(k);
+ if (ret)
+ goto err;
+
+ bkey_init(&k->k);
+ k->k.type = KEY_TYPE_whiteout;
+ k->k.p = iter.pos;
+ ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node);
+ }
err:
- kfree(buf);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
-static int reattach_inode(struct bch_fs *c,
- struct bch_inode_unpacked *lostfound_inode,
- u64 inum)
+static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
{
- struct bch_hash_info lostfound_hash_info =
- bch2_hash_info_init(c, lostfound_inode);
- struct bkey_inode_buf packed;
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked lostfound;
char name_buf[20];
- struct qstr name;
int ret;
- snprintf(name_buf, sizeof(name_buf), "%llu", inum);
- name = (struct qstr) QSTR(name_buf);
+ u32 dirent_snapshot = inode->bi_snapshot;
+ if (inode->bi_subvol) {
+ inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
- lostfound_inode->bi_nlink++;
+ u64 root_inum;
+ ret = subvol_lookup(trans, inode->bi_parent_subvol,
+ &dirent_snapshot, &root_inum);
+ if (ret)
+ return ret;
- bch2_inode_pack(&packed, lostfound_inode);
+ snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
+ } else {
+ snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
+ }
- ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
- NULL, NULL, NULL,
- BTREE_INSERT_NOFAIL);
- if (ret) {
- bch_err(c, "error %i reattaching inode %llu while updating lost+found",
- ret, inum);
+ ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
+ if (ret)
return ret;
+
+ lostfound.bi_nlink += S_ISDIR(inode->bi_mode);
+
+ /* ensure lost+found inode is also present in inode snapshot */
+ if (!inode->bi_subvol) {
+ BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot));
+ lostfound.bi_snapshot = inode->bi_snapshot;
}
- ret = bch2_dirent_create(c, lostfound_inode->bi_inum,
- &lostfound_hash_info,
- DT_DIR, &name, inum, NULL,
- BTREE_INSERT_NOFAIL);
+ ret = __bch2_fsck_write_inode(trans, &lostfound);
+ if (ret)
+ return ret;
+
+ struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
+ struct qstr name = (struct qstr) QSTR(name_buf);
+
+ inode->bi_dir = lostfound.bi_inum;
+
+ ret = bch2_dirent_create_snapshot(trans,
+ inode->bi_parent_subvol, lostfound.bi_inum,
+ dirent_snapshot,
+ &dir_hash,
+ inode_d_type(inode),
+ &name,
+ inode->bi_subvol ?: inode->bi_inum,
+ &inode->bi_dir_offset,
+ STR_HASH_must_create);
if (ret) {
- bch_err(c, "error %i reattaching inode %llu while creating new dirent",
- ret, inum);
+ bch_err_msg(c, ret, "error creating dirent");
+ return ret;
+ }
+
+ ret = __bch2_fsck_write_inode(trans, inode);
+ if (ret)
return ret;
+
+ /*
+ * Fix up inodes in child snapshots: if they should also be reattached
+ * update the backpointer field, if they should not be we need to emit
+ * whiteouts for the dirent we just created.
+ */
+ if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
+ snapshot_id_list whiteouts_done;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ darray_init(&whiteouts_done);
+
+ for_each_btree_key_reverse_norestart(trans, iter,
+ BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
+ BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
+ if (k.k->p.offset != inode->bi_inum)
+ break;
+
+ if (!bkey_is_inode(k.k) ||
+ !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
+ snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
+ continue;
+
+ struct bch_inode_unpacked child_inode;
+ bch2_inode_unpack(k, &child_inode);
+
+ if (!inode_should_reattach(&child_inode)) {
+ ret = maybe_delete_dirent(trans,
+ SPOS(lostfound.bi_inum, inode->bi_dir_offset,
+ dirent_snapshot),
+ k.k->p.snapshot);
+ if (ret)
+ break;
+
+ ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
+ if (ret)
+ break;
+ } else {
+ iter.snapshot = k.k->p.snapshot;
+ child_inode.bi_dir = inode->bi_dir;
+ child_inode.bi_dir_offset = inode->bi_dir_offset;
+
+ ret = bch2_inode_write_flags(trans, &iter, &child_inode,
+ BTREE_UPDATE_internal_snapshot_node);
+ if (ret)
+ break;
+ }
+ }
+ darray_exit(&whiteouts_done);
+ bch2_trans_iter_exit(trans, &iter);
}
+
return ret;
}
-struct inode_walker {
- bool first_this_inode;
- bool have_inode;
- u64 cur_inum;
- struct bch_inode_unpacked inode;
-};
+static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos pos)
+{
+ return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
+}
-static struct inode_walker inode_walker_init(void)
+static int remove_backpointer(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode)
{
- return (struct inode_walker) {
- .cur_inum = -1,
- .have_inode = false,
- };
+ if (!inode->bi_dir)
+ return 0;
+
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter,
+ SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot));
+ int ret = bkey_err(d) ?:
+ dirent_points_to_inode(c, d, inode) ?:
+ __remove_dirent(trans, d.k->p);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
-static int walk_inode(struct bch_fs *c, struct inode_walker *w, u64 inum)
+static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
{
- w->first_this_inode = inum != w->cur_inum;
- w->cur_inum = inum;
+ struct bch_fs *c = trans->c;
- if (w->first_this_inode) {
- int ret = bch2_inode_find_by_inum(c, inum, &w->inode);
+ struct bch_inode_unpacked inode;
+ int ret = bch2_inode_find_by_inum_trans(trans,
+ (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
+ &inode);
+ if (ret)
+ return ret;
- if (ret && ret != -ENOENT)
+ ret = remove_backpointer(trans, &inode);
+ if (!bch2_err_matches(ret, ENOENT))
+ bch_err_msg(c, ret, "removing dirent");
+ if (ret)
+ return ret;
+
+ ret = reattach_inode(trans, &inode);
+ bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
+ return ret;
+}
+
+static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
+{
+ struct bch_fs *c = trans->c;
+
+ if (!bch2_snapshot_is_leaf(c, snapshotid)) {
+ bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
+
+ /*
+ * If inum isn't set, that means we're being called from check_dirents,
+ * not check_inodes - the root of this subvolume doesn't exist or we
+ * would have found it there:
+ */
+ if (!inum) {
+ struct btree_iter inode_iter = {};
+ struct bch_inode_unpacked new_inode;
+ u64 cpu = raw_smp_processor_id();
+
+ bch2_inode_init_early(c, &new_inode);
+ bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
+
+ new_inode.bi_subvol = subvolid;
+
+ int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
+ bch2_btree_iter_traverse(&inode_iter) ?:
+ bch2_inode_write(trans, &inode_iter, &new_inode);
+ bch2_trans_iter_exit(trans, &inode_iter);
+ if (ret)
return ret;
- w->have_inode = !ret;
+ inum = new_inode.bi_inum;
}
+ bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
+
+ struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
+ int ret = PTR_ERR_OR_ZERO(new_subvol);
+ if (ret)
+ return ret;
+
+ bkey_subvolume_init(&new_subvol->k_i);
+ new_subvol->k.p.offset = subvolid;
+ new_subvol->v.snapshot = cpu_to_le32(snapshotid);
+ new_subvol->v.inode = cpu_to_le64(inum);
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
+ if (ret)
+ return ret;
+
+ struct btree_iter iter;
+ struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_snapshots, POS(0, snapshotid),
+ 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(s);
+ bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
+ if (ret)
+ return ret;
+
+ u32 snapshot_tree = le32_to_cpu(s->v.tree);
+
+ s->v.subvol = cpu_to_le32(subvolid);
+ SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
+ bch2_trans_iter_exit(trans, &iter);
+
+ struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
+ 0, snapshot_tree);
+ ret = PTR_ERR_OR_ZERO(st);
+ bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
+ if (ret)
+ return ret;
+
+ if (!st->v.master_subvol)
+ st->v.master_subvol = cpu_to_le32(subvolid);
+
+ bch2_trans_iter_exit(trans, &iter);
return 0;
}
-struct hash_check {
- struct bch_hash_info info;
- struct btree_trans *trans;
+static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
+{
+ struct bch_fs *c = trans->c;
+ unsigned i_mode = S_IFREG;
+ u64 i_size = 0;
+
+ switch (btree) {
+ case BTREE_ID_extents: {
+ struct btree_iter iter = {};
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
+ struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0));
+ bch2_trans_iter_exit(trans, &iter);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
- /* start of current chain of hash collisions: */
- struct btree_iter *chain;
+ i_size = k.k->p.offset << 9;
+ break;
+ }
+ case BTREE_ID_dirents:
+ i_mode = S_IFDIR;
+ break;
+ case BTREE_ID_xattrs:
+ break;
+ default:
+ BUG();
+ }
- /* next offset in current chain of hash collisions: */
- u64 next;
+ struct bch_inode_unpacked new_inode;
+ bch2_inode_init_early(c, &new_inode);
+ bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
+ new_inode.bi_size = i_size;
+ new_inode.bi_inum = inum;
+ new_inode.bi_snapshot = snapshot;
+
+ return __bch2_fsck_write_inode(trans, &new_inode);
+}
+
+struct snapshots_seen {
+ struct bpos pos;
+ snapshot_id_list ids;
};
-static void hash_check_init(const struct bch_hash_desc desc,
- struct btree_trans *trans,
- struct hash_check *h)
+static inline void snapshots_seen_exit(struct snapshots_seen *s)
{
- h->trans = trans;
- h->chain = bch2_trans_get_iter(trans, desc.btree_id, POS_MIN, 0);
- h->next = -1;
+ darray_exit(&s->ids);
}
-static void hash_check_set_inode(struct hash_check *h, struct bch_fs *c,
- const struct bch_inode_unpacked *bi)
+static inline void snapshots_seen_init(struct snapshots_seen *s)
{
- h->info = bch2_hash_info_init(c, bi);
- h->next = -1;
+ memset(s, 0, sizeof(*s));
}
-static int hash_redo_key(const struct bch_hash_desc desc,
- struct hash_check *h, struct bch_fs *c,
- struct btree_iter *k_iter, struct bkey_s_c k,
- u64 hashed)
+static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
{
- struct bkey_i *tmp;
- int ret = 0;
-
- tmp = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
- if (!tmp)
- return -ENOMEM;
-
- bkey_reassemble(tmp, k);
+ u32 *i;
+ __darray_for_each(s->ids, i) {
+ if (*i == id)
+ return 0;
+ if (*i > id)
+ break;
+ }
- ret = bch2_btree_delete_at(k_iter, 0);
+ int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
if (ret)
- goto err;
+ bch_err(c, "error reallocating snapshots_seen table (size %zu)",
+ s->ids.size);
+ return ret;
+}
- bch2_btree_iter_unlock(k_iter);
+static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
+ enum btree_id btree_id, struct bpos pos)
+{
+ if (!bkey_eq(s->pos, pos))
+ s->ids.nr = 0;
+ s->pos = pos;
- bch2_hash_set(desc, &h->info, c, k_iter->pos.inode, NULL, tmp,
- BTREE_INSERT_NOFAIL|
- BCH_HASH_SET_MUST_CREATE);
-err:
- kfree(tmp);
- return ret;
+ return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
}
-/* fsck hasn't been converted to new transactions yet: */
-static int fsck_hash_delete_at(const struct bch_hash_desc desc,
- struct bch_hash_info *info,
- struct btree_iter *orig_iter)
+/**
+ * key_visible_in_snapshot - returns true if @id is a descendant of @ancestor,
+ * and @ancestor hasn't been overwritten in @seen
+ *
+ * @c: filesystem handle
+ * @seen: list of snapshot ids already seen at current position
+ * @id:		descendant snapshot id
+ * @ancestor: ancestor snapshot id
+ *
+ * Returns: whether key in @ancestor snapshot is visible in @id snapshot
+ */
+static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
+ u32 id, u32 ancestor)
{
- struct btree_trans trans;
- struct btree_iter *iter;
- int ret;
+ ssize_t i;
- bch2_btree_iter_unlock(orig_iter);
+ EBUG_ON(id > ancestor);
- bch2_trans_init(&trans, orig_iter->c);
-retry:
- bch2_trans_begin(&trans);
+ /* @ancestor should be the snapshot most recently added to @seen */
+ EBUG_ON(ancestor != seen->pos.snapshot);
+ EBUG_ON(ancestor != darray_last(seen->ids));
- iter = bch2_trans_copy_iter(&trans, orig_iter);
- if (IS_ERR(iter)) {
- ret = PTR_ERR(iter);
- goto err;
+ if (id == ancestor)
+ return true;
+
+ if (!bch2_snapshot_is_ancestor(c, id, ancestor))
+ return false;
+
+ /*
+ * We know that @id is a descendant of @ancestor, we're checking if
+	 * we've seen a key that overwrote @ancestor - i.e. also a descendant of
+	 * @ancestor and with @id as a descendant.
+ *
+ * But we already know that we're scanning IDs between @id and @ancestor
+ * numerically, since snapshot ID lists are kept sorted, so if we find
+ * an id that's an ancestor of @id we're done:
+ */
+
+ for (i = seen->ids.nr - 2;
+ i >= 0 && seen->ids.data[i] >= id;
+ --i)
+ if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
+ return false;
+
+ return true;
+}
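+
+/*
+ * Worked example (illustrative): say @seen->ids = { 20, 30 }, where 30 is the
+ * snapshot of the key at the current position, and we ask whether that key is
+ * visible in snapshot 10, a descendant of 30 (descendant IDs are smaller, per
+ * the assertions above).  The loop scans backwards from 20: if 20 is also an
+ * ancestor of 10, the key we already saw at snapshot 20 overwrote the one at
+ * 30 as far as snapshot 10 is concerned, so the result is false; otherwise
+ * it's true.
+ */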
+
+/**
+ * ref_visible - given a key with snapshot id @src that points to a key with
+ * snapshot id @dst, test whether there is some snapshot in which @dst is
+ * visible.
+ *
+ * @c: filesystem handle
+ * @s: list of snapshot IDs already seen at @src
+ * @src: snapshot ID of src key
+ * @dst: snapshot ID of dst key
+ * Returns: true if there is some snapshot in which @dst is visible
+ *
+ * Assumes we're visiting @src keys in natural key order
+ */
+static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
+ u32 src, u32 dst)
+{
+ return dst <= src
+ ? key_visible_in_snapshot(c, s, dst, src)
+ : bch2_snapshot_is_ancestor(c, src, dst);
+}
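+
+/*
+ * A note on the asymmetry above (illustrative): when @dst <= @src, @dst may
+ * be a descendant of @src, so we also need to know that @src wasn't
+ * overwritten in @s; when @dst > @src, @dst can only be an ancestor of @src,
+ * so plain ancestry suffices.
+ */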
+
+static int ref_visible2(struct bch_fs *c,
+ u32 src, struct snapshots_seen *src_seen,
+ u32 dst, struct snapshots_seen *dst_seen)
+{
+ if (dst > src) {
+ swap(dst, src);
+ swap(dst_seen, src_seen);
}
+ return key_visible_in_snapshot(c, src_seen, dst, src);
+}
- ret = bch2_hash_delete_at(&trans, desc, info, iter) ?:
- bch2_trans_commit(&trans, NULL, NULL, NULL,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOFAIL);
-err:
- if (ret == -EINTR)
- goto retry;
+#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \
+ for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \
+ (_i)->snapshot <= (_snapshot); _i++) \
+ if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
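+
+/*
+ * Usage sketch (illustrative): walk every inode version that can see a key at
+ * @_snapshot, e.g. when accounting sizes against each visible inode:
+ *
+ *	struct inode_walker_entry *i;
+ *
+ *	for_each_visible_inode(c, s, w, k.k->p.snapshot, i)
+ *		i->count += k.k->size;
+ */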
- bch2_trans_exit(&trans);
- return ret;
+struct inode_walker_entry {
+ struct bch_inode_unpacked inode;
+ u32 snapshot;
+ u64 count;
+};
+
+struct inode_walker {
+ bool first_this_inode;
+ bool have_inodes;
+ bool recalculate_sums;
+ struct bpos last_pos;
+
+ DARRAY(struct inode_walker_entry) inodes;
+};
+
+static void inode_walker_exit(struct inode_walker *w)
+{
+ darray_exit(&w->inodes);
}
-static int hash_check_duplicates(const struct bch_hash_desc desc,
- struct hash_check *h, struct bch_fs *c,
- struct btree_iter *k_iter, struct bkey_s_c k)
+static struct inode_walker inode_walker_init(void)
{
- struct btree_iter *iter;
- struct bkey_s_c k2;
- char buf[200];
- int ret = 0;
+ return (struct inode_walker) { 0, };
+}
- if (!bkey_cmp(h->chain->pos, k_iter->pos))
- return 0;
+static int add_inode(struct bch_fs *c, struct inode_walker *w,
+ struct bkey_s_c inode)
+{
+ struct bch_inode_unpacked u;
- iter = bch2_trans_copy_iter(h->trans, h->chain);
- BUG_ON(IS_ERR(iter));
+ BUG_ON(bch2_inode_unpack(inode, &u));
- for_each_btree_key_continue(iter, 0, k2) {
- if (bkey_cmp(k2.k->p, k.k->p) >= 0)
- break;
+ return darray_push(&w->inodes, ((struct inode_walker_entry) {
+ .inode = u,
+ .snapshot = inode.k->p.snapshot,
+ }));
+}
- if (fsck_err_on(k2.k->type == desc.key_type &&
- !desc.cmp_bkey(k, k2), c,
- "duplicate hash table keys:\n%s",
- (bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
- buf, sizeof(buf), k), buf))) {
- ret = fsck_hash_delete_at(desc, &h->info, k_iter);
- if (ret)
- return ret;
- ret = 1;
+static int get_inodes_all_snapshots(struct btree_trans *trans,
+ struct inode_walker *w, u64 inum)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ /*
+ * We no longer have inodes for w->last_pos; clear this to avoid
+ * screwing up check_i_sectors/check_subdir_count if we take a
+ * transaction restart here:
+ */
+ w->have_inodes = false;
+ w->recalculate_sums = false;
+ w->inodes.nr = 0;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != inum)
break;
- }
+
+ if (bkey_is_inode(k.k))
+ add_inode(c, w, k);
}
-fsck_err:
- bch2_trans_iter_free(h->trans, iter);
- return ret;
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ return ret;
+
+ w->first_this_inode = true;
+ w->have_inodes = true;
+ return 0;
}
-static int hash_check_key(const struct bch_hash_desc desc,
- struct hash_check *h, struct bch_fs *c,
- struct btree_iter *k_iter, struct bkey_s_c k)
+static struct inode_walker_entry *
+lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
{
- char buf[200];
- u64 hashed;
- int ret = 0;
+ bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
- if (k.k->type != desc.whiteout_type &&
- k.k->type != desc.key_type)
- return 0;
+ struct inode_walker_entry *i;
+ __darray_for_each(w->inodes, i)
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
+ goto found;
- if (k.k->p.offset != h->next)
- bch2_btree_iter_copy(h->chain, k_iter);
- h->next = k.k->p.offset + 1;
+ return NULL;
+found:
+ BUG_ON(k.k->p.snapshot > i->snapshot);
- if (k.k->type != desc.key_type)
- return 0;
+ if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
+ struct inode_walker_entry new = *i;
- hashed = desc.hash_bkey(&h->info, k);
+ new.snapshot = k.k->p.snapshot;
+ new.count = 0;
- if (fsck_err_on(hashed < h->chain->pos.offset ||
- hashed > k.k->p.offset, c,
- "hash table key at wrong offset: %llu, "
- "hashed to %llu chain starts at %llu\n%s",
- k.k->p.offset, hashed, h->chain->pos.offset,
- (bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
- buf, sizeof(buf), k), buf))) {
- ret = hash_redo_key(desc, h, c, k_iter, k, hashed);
- if (ret) {
- bch_err(c, "hash_redo_key err %i", ret);
- return ret;
- }
- return 1;
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
+ "unexpected because we should always update the inode when we update a key in that inode\n"
+ "%s",
+ w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
+ printbuf_exit(&buf);
+
+ while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
+ --i;
+
+ size_t pos = i - w->inodes.data;
+ int ret = darray_insert_item(&w->inodes, pos, new);
+ if (ret)
+ return ERR_PTR(ret);
+
+ i = w->inodes.data + pos;
}
- ret = hash_check_duplicates(desc, h, c, k_iter, k);
-fsck_err:
- return ret;
+ return i;
}
-/*
- * Walk extents: verify that extents have a corresponding S_ISREG inode, and
- * that i_size an i_sectors are consistent
- */
-noinline_for_stack
-static int check_extents(struct bch_fs *c)
+static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
+ struct inode_walker *w,
+ struct bkey_s_c k)
{
- struct inode_walker w = inode_walker_init();
+ if (w->last_pos.inode != k.k->p.inode) {
+ int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ w->last_pos = k.k->p;
+
+ return lookup_inode_for_snapshot(trans->c, w, k);
+}
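+
+/*
+ * Usage pattern (illustrative, mirroring the call sites below, where "inode"
+ * is the caller's struct inode_walker):
+ *
+ *	struct inode_walker_entry *i = walk_inode(trans, inode, k);
+ *	ret = PTR_ERR_OR_ZERO(i);
+ *	if (ret)
+ *		goto err;
+ *
+ * This reloads all snapshot versions of the inode only when the key crosses
+ * into a new inode number, then picks the entry matching the key's snapshot.
+ */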
+
+static int get_visible_inodes(struct btree_trans *trans,
+ struct inode_walker *w,
+ struct snapshots_seen *s,
+ u64 inum)
+{
+ struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
- u64 i_sectors;
- int ret = 0;
+ int ret;
- bch_verbose(c, "checking extents");
+ w->inodes.nr = 0;
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
- POS(BCACHEFS_ROOT_INO, 0), 0, k) {
- ret = walk_inode(c, &w, k.k->p.inode);
- if (ret)
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != inum)
break;
- if (fsck_err_on(!w.have_inode, c,
- "extent type %u for missing inode %llu",
- k.k->type, k.k->p.inode) ||
- fsck_err_on(w.have_inode &&
- !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c,
- "extent type %u for non regular file, inode %llu mode %o",
- k.k->type, k.k->p.inode, w.inode.bi_mode)) {
- bch2_btree_iter_unlock(&iter);
-
- ret = bch2_inode_truncate(c, k.k->p.inode, 0, NULL, NULL);
- if (ret)
- goto err;
+ if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
continue;
- }
-
- if (fsck_err_on(w.first_this_inode &&
- w.have_inode &&
- !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) &&
- w.inode.bi_sectors !=
- (i_sectors = bch2_count_inode_sectors(c, w.cur_inum)),
- c, "i_sectors wrong: got %llu, should be %llu",
- w.inode.bi_sectors, i_sectors)) {
- struct bkey_inode_buf p;
- w.inode.bi_sectors = i_sectors;
+ if (bkey_is_inode(k.k))
+ add_inode(c, w, k);
- bch2_btree_iter_unlock(&iter);
-
- bch2_inode_pack(&p, &w.inode);
+ if (k.k->p.snapshot >= s->pos.snapshot)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
- ret = bch2_btree_insert(c, BTREE_ID_INODES,
- &p.inode.k_i,
- NULL,
- NULL,
- NULL,
- BTREE_INSERT_NOFAIL);
- if (ret) {
- bch_err(c, "error in fs gc: error %i "
- "updating inode", ret);
- goto err;
- }
+ return ret;
+}
- /* revalidate iterator: */
- k = bch2_btree_iter_peek(&iter);
- }
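+
+/*
+ * Return convention (inferred from the callers below): <0 on error, 0 if the
+ * dirent's target is missing, nonzero if it exists.
+ */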
+static int dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d)
+{
+ if (d.v->d_type == DT_SUBVOL) {
+ u32 snap;
+ u64 inum;
+ int ret = subvol_lookup(trans, le32_to_cpu(d.v->d_child_subvol), &snap, &inum);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+ return !ret;
+ } else {
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
- if (fsck_err_on(w.have_inode &&
- !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
- k.k->type != BCH_RESERVATION &&
- k.k->p.offset > round_up(w.inode.bi_size, PAGE_SIZE) >> 9, c,
- "extent type %u offset %llu past end of inode %llu, i_size %llu",
- k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) {
- bch2_btree_iter_unlock(&iter);
-
- ret = bch2_inode_truncate(c, k.k->p.inode,
- round_up(w.inode.bi_size, PAGE_SIZE) >> 9,
- NULL, NULL);
- if (ret)
- goto err;
- continue;
- }
+ ret = bkey_is_inode(k.k);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
-err:
-fsck_err:
- return bch2_btree_iter_unlock(&iter) ?: ret;
}
/*
- * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
- * validate d_type
+ * Prefer to delete the first one, since that will be the one at the wrong
+ * offset:
+ * return value: 0 -> delete k1, 1 -> delete k2, 2 -> both point to valid
+ * inodes; rename k1 instead of dropping it
*/
-noinline_for_stack
-static int check_dirents(struct bch_fs *c)
+static int hash_pick_winner(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_s_c k1,
+ struct bkey_s_c k2)
{
- struct inode_walker w = inode_walker_init();
- struct hash_check h;
- struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_s_c k;
- unsigned name_len;
- char buf[200];
- int ret = 0;
-
- bch_verbose(c, "checking dirents");
-
- bch2_trans_init(&trans, c);
+ if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) &&
+ !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k)))
+ return 0;
- BUG_ON(bch2_trans_preload_iters(&trans));
+ switch (desc.btree_id) {
+ case BTREE_ID_dirents: {
+ int ret = dirent_has_target(trans, bkey_s_c_to_dirent(k1));
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ return 0;
- iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS,
- POS(BCACHEFS_ROOT_INO, 0), 0);
+ ret = dirent_has_target(trans, bkey_s_c_to_dirent(k2));
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ return 1;
+ return 2;
+ }
+ default:
+ return 0;
+ }
+}
- hash_check_init(bch2_dirent_hash_desc, &trans, &h);
+static int fsck_update_backpointers(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_i *new)
+{
+ if (new->k.type != KEY_TYPE_dirent)
+ return 0;
- for_each_btree_key_continue(iter, 0, k) {
- struct bkey_s_c_dirent d;
- struct bch_inode_unpacked target;
- bool have_target;
- u64 d_inum;
+ struct bkey_i_dirent *d = bkey_i_to_dirent(new);
+ struct inode_walker target = inode_walker_init();
+ int ret = 0;
- ret = walk_inode(c, &w, k.k->p.inode);
+ if (d->v.d_type == DT_SUBVOL) {
+ BUG();
+ } else {
+ ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
if (ret)
- break;
+ goto err;
- if (fsck_err_on(!w.have_inode, c,
- "dirent in nonexisting directory:\n%s",
- (bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
- buf, sizeof(buf), k), buf)) ||
- fsck_err_on(!S_ISDIR(w.inode.bi_mode), c,
- "dirent in non directory inode type %u:\n%s",
- mode_to_type(w.inode.bi_mode),
- (bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
- buf, sizeof(buf), k), buf))) {
- ret = bch2_btree_delete_at(iter, 0);
+ darray_for_each(target.inodes, i) {
+ i->inode.bi_dir_offset = d->k.p.offset;
+ ret = __bch2_fsck_write_inode(trans, &i->inode);
if (ret)
goto err;
- continue;
}
+ }
+err:
+ inode_walker_exit(&target);
+ return ret;
+}
- if (w.first_this_inode && w.have_inode)
- hash_check_set_inode(&h, c, &w.inode);
+static int fsck_rename_dirent(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_s_c_dirent old)
+{
+ struct qstr old_name = bch2_dirent_get_name(old);
+ struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32);
+ int ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
- ret = hash_check_key(bch2_dirent_hash_desc, &h, c, iter, k);
- if (ret > 0) {
- ret = 0;
- continue;
- }
+ bkey_dirent_init(&new->k_i);
+ dirent_copy_target(new, old);
+ new->k.p = old.k->p;
- if (ret)
- goto fsck_err;
+ for (unsigned i = 0; i < 1000; i++) {
+ unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u",
+ old_name.len, old_name.name, i);
+ unsigned u64s = BKEY_U64s + dirent_val_u64s(len);
- if (k.k->type != BCH_DIRENT)
- continue;
+ if (u64s > U8_MAX)
+ return -EINVAL;
- d = bkey_s_c_to_dirent(k);
- d_inum = le64_to_cpu(d.v->d_inum);
+ new->k.u64s = u64s;
- name_len = bch2_dirent_name_bytes(d);
+ ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
+ (subvol_inum) { 0, old.k->p.inode },
+ old.k->p.snapshot, &new->k_i,
+ BTREE_UPDATE_internal_snapshot_node);
+ if (!bch2_err_matches(ret, EEXIST))
+ break;
+ }
- if (fsck_err_on(!name_len, c, "empty dirent") ||
- fsck_err_on(name_len == 1 &&
- !memcmp(d.v->d_name, ".", 1), c,
- ". dirent") ||
- fsck_err_on(name_len == 2 &&
- !memcmp(d.v->d_name, "..", 2), c,
- ".. dirent")) {
- ret = remove_dirent(c, iter, d);
- if (ret)
- goto err;
- continue;
- }
+ if (ret)
+ return ret;
- if (fsck_err_on(d_inum == d.k->p.inode, c,
- "dirent points to own directory:\n%s",
- (bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
- buf, sizeof(buf), k), buf))) {
- ret = remove_dirent(c, iter, d);
- if (ret)
- goto err;
- continue;
- }
+ return fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
+}
+
+static int hash_check_key(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct btree_iter *k_iter, struct bkey_s_c hash_k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter = { NULL };
+ struct printbuf buf = PRINTBUF;
+ struct bkey_s_c k;
+ u64 hash;
+ int ret = 0;
+
+ if (hash_k.k->type != desc.key_type)
+ return 0;
+
+ hash = desc.hash_bkey(hash_info, hash_k);
+
+ if (likely(hash == hash_k.k->p.offset))
+ return 0;
+
+ if (hash_k.k->p.offset < hash)
+ goto bad_hash;
- ret = bch2_inode_find_by_inum(c, d_inum, &target);
- if (ret && ret != -ENOENT)
+ for_each_btree_key_norestart(trans, iter, desc.btree_id,
+ SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
+ BTREE_ITER_slots, k, ret) {
+ if (bkey_eq(k.k->p, hash_k.k->p))
break;
- have_target = !ret;
- ret = 0;
+ if (k.k->type == desc.key_type &&
+ !desc.cmp_bkey(k, hash_k))
+ goto duplicate_entries;
- if (fsck_err_on(!have_target, c,
- "dirent points to missing inode:\n%s",
- (bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
- buf, sizeof(buf), k), buf))) {
- ret = remove_dirent(c, iter, d);
- if (ret)
- goto err;
- continue;
+ if (bkey_deleted(k.k)) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto bad_hash;
}
+ }
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+bad_hash:
+ if (fsck_err(trans, hash_table_key_wrong_offset,
+ "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s",
+ bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, hash_info,
+ (subvol_inum) { 0, hash_k.k->p.inode },
+ hash_k.k->p.snapshot, new,
+ STR_HASH_must_create|
+ BTREE_ITER_with_updates|
+ BTREE_UPDATE_internal_snapshot_node);
+ ret = bkey_err(k);
+ if (ret)
+ goto out;
+ if (k.k)
+ goto duplicate_entries;
+
+ ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter,
+ BTREE_UPDATE_internal_snapshot_node) ?:
+ fsck_update_backpointers(trans, s, desc, hash_info, new) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto out;
+ }
+fsck_err:
+ goto out;
+duplicate_entries:
+ ret = hash_pick_winner(trans, desc, hash_info, hash_k, k);
+ if (ret < 0)
+ goto out;
+
+ if (!fsck_err(trans, hash_table_key_duplicate,
+ "duplicate hash table keys%s:\n%s",
+ ret != 2 ? "" : ", both point to valid inodes",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, hash_k),
+ prt_newline(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf)))
+ goto out;
+
+ switch (ret) {
+ case 0:
+ ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0);
+ break;
+ case 1:
+ ret = bch2_hash_delete_at(trans, desc, hash_info, &iter, 0);
+ break;
+ case 2:
+ ret = fsck_rename_dirent(trans, s, desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?:
+ bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0);
+ goto out;
+ }
- if (fsck_err_on(have_target &&
- d.v->d_type !=
- mode_to_type(target.bi_mode), c,
- "incorrect d_type: should be %u:\n%s",
- mode_to_type(target.bi_mode),
- (bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
- buf, sizeof(buf), k), buf))) {
- struct bkey_i_dirent *n;
-
- n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
- if (!n) {
- ret = -ENOMEM;
- goto err;
- }
+ ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto out;
+}
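+
+/*
+ * Call-site sketch (illustrative; "s", "hash_info", "iter" and "k" stand for
+ * the caller's snapshots_seen, per-inode hash info, iterator and key):
+ *
+ *	ret = hash_check_key(trans, s, bch2_dirent_hash_desc, hash_info, iter, k);
+ *	if (ret)
+ *		goto err;
+ *
+ * A repair that rewrote the hash table is reported back to the caller as
+ * -BCH_ERR_transaction_restart_nested, as above.
+ */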
- bkey_reassemble(&n->k_i, d.s_c);
- n->v.d_type = mode_to_type(target.bi_mode);
+static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode,
+ u32 *snapshot)
+{
+ if (inode->bi_subvol) {
+ u64 inum;
+ int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
+ if (ret)
+ return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
+ }
- ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
- BTREE_INSERT_NOFAIL,
- BTREE_INSERT_ENTRY(iter, &n->k_i));
- kfree(n);
- if (ret)
- goto err;
+ return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
+}
- }
+static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
+ int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int check_inode_dirent_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ bool *write_inode)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ u32 inode_snapshot = inode->bi_snapshot;
+ struct btree_iter dirent_iter = {};
+ struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
+ int ret = bkey_err(d);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (fsck_err_on(ret,
+ trans, inode_points_to_missing_dirent,
+ "inode points to missing dirent\n%s",
+ (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
+ fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode),
+ trans, inode_points_to_wrong_dirent,
+ "%s",
+ (printbuf_reset(&buf),
+ dirent_inode_mismatch_msg(&buf, c, d, inode),
+ buf.buf))) {
+ /*
+ * We just clear the backpointer fields for now. If we find a
+ * dirent that points to this inode in check_dirents(), we'll
+ * update it then; then when we get to check_path() if the
+ * backpointer is still 0 we'll reattach it.
+ */
+ inode->bi_dir = 0;
+ inode->bi_dir_offset = 0;
+ *write_inode = true;
}
-err:
+
+ ret = 0;
fsck_err:
- return bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_iter_exit(trans, &dirent_iter);
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
}
-/*
- * Walk xattrs: verify that they all have a corresponding inode
- */
-noinline_for_stack
-static int check_xattrs(struct bch_fs *c)
+static int get_snapshot_root_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *root,
+ u64 inum)
{
- struct inode_walker w = inode_walker_init();
- struct hash_check h;
- struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
- bch_verbose(c, "checking xattrs");
+ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
+ SPOS(0, inum, U32_MAX),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != inum)
+ break;
+ if (bkey_is_inode(k.k))
+ goto found_root;
+ }
+ if (ret)
+ goto err;
+ BUG();
+found_root:
+ BUG_ON(bch2_inode_unpack(k, root));
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
- bch2_trans_init(&trans, c);
+static int check_inode(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bch_inode_unpacked *snapshot_root,
+ struct snapshots_seen *s)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ struct bch_inode_unpacked u;
+ bool do_update = false;
+ int ret;
+
+ ret = bch2_check_key_has_snapshot(trans, iter, k);
+ if (ret < 0)
+ goto err;
+ if (ret)
+ return 0;
- BUG_ON(bch2_trans_preload_iters(&trans));
+ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
+ if (ret)
+ goto err;
- iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS,
- POS(BCACHEFS_ROOT_INO, 0), 0);
+ if (!bkey_is_inode(k.k))
+ return 0;
- hash_check_init(bch2_xattr_hash_desc, &trans, &h);
+ BUG_ON(bch2_inode_unpack(k, &u));
- for_each_btree_key_continue(iter, 0, k) {
- ret = walk_inode(c, &w, k.k->p.inode);
+ if (snapshot_root->bi_inum != u.bi_inum) {
+ ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum);
if (ret)
- break;
+ goto err;
+ }
+
+ if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed ||
+ INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root),
+ trans, inode_snapshot_mismatch,
+ "inodes in different snapshots don't match")) {
+ u.bi_hash_seed = snapshot_root->bi_hash_seed;
+ SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root));
+ do_update = true;
+ }
+
+ if (u.bi_dir || u.bi_dir_offset) {
+ ret = check_inode_dirent_inode(trans, &u, &do_update);
+ if (ret)
+ goto err;
+ }
+
+ if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked),
+ trans, inode_unlinked_but_has_dirent,
+ "inode unlinked but has dirent\n%s",
+ (printbuf_reset(&buf),
+ bch2_inode_unpacked_to_text(&buf, &u),
+ buf.buf))) {
+ u.bi_flags &= ~BCH_INODE_unlinked;
+ do_update = true;
+ }
+
+ if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) {
+ /* Check for this early so that check_unreachable_inode() will reattach it */
+
+ ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot);
+ if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty)
+ goto err;
+
+ fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty,
+ "dir unlinked but not empty\n%s",
+ (printbuf_reset(&buf),
+ bch2_inode_unpacked_to_text(&buf, &u),
+ buf.buf));
+ u.bi_flags &= ~BCH_INODE_unlinked;
+ do_update = true;
+ ret = 0;
+ }
+
+ ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+ if (ret < 0)
+ goto err;
+
+ if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot),
+ trans, inode_has_child_snapshots_wrong,
+ "inode has_child_snapshots flag wrong (should be %u)\n%s",
+ ret,
+ (printbuf_reset(&buf),
+ bch2_inode_unpacked_to_text(&buf, &u),
+ buf.buf))) {
+ if (ret)
+ u.bi_flags |= BCH_INODE_has_child_snapshot;
+ else
+ u.bi_flags &= ~BCH_INODE_has_child_snapshot;
+ do_update = true;
+ }
+ ret = 0;
+
+ if ((u.bi_flags & BCH_INODE_unlinked) &&
+ !(u.bi_flags & BCH_INODE_has_child_snapshot)) {
+ if (!test_bit(BCH_FS_started, &c->flags)) {
+ /*
+ * If we're not in online fsck, don't delete unlinked
+ * inodes, just make sure they're on the deleted list.
+ *
+ * They might be referred to by a logged operation -
+ * i.e. we might have crashed in the middle of a
+ * truncate on an unlinked but open file - so we want to
+ * let the delete_dead_inodes kill it after resuming
+ * logged ops.
+ */
+ ret = check_inode_deleted_list(trans, k.k->p);
+ if (ret < 0)
+ goto err_noprint;
+
+ fsck_err_on(!ret,
+ trans, unlinked_inode_not_on_deleted_list,
+ "inode %llu:%u unlinked, but not on deleted list",
+ u.bi_inum, k.k->p.snapshot);
- if (fsck_err_on(!w.have_inode, c,
- "xattr for missing inode %llu",
- k.k->p.inode)) {
- ret = bch2_btree_delete_at(iter, 0);
+ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1);
if (ret)
goto err;
- continue;
+ } else {
+ ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
+ if (ret < 0)
+ goto err;
+
+ if (fsck_err_on(!ret,
+ trans, inode_unlinked_and_not_open,
+ "inode %llu:%u unlinked and not open",
+ u.bi_inum, u.bi_snapshot)) {
+ ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
+ bch_err_msg(c, ret, "in fsck deleting inode");
+ goto err_noprint;
+ }
+ ret = 0;
}
+ }
- if (w.first_this_inode && w.have_inode)
- hash_check_set_inode(&h, c, &w.inode);
+ if (fsck_err_on(u.bi_parent_subvol &&
+ (u.bi_subvol == 0 ||
+ u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
+ trans, inode_bi_parent_nonzero,
+ "inode %llu:%u has subvol %u but nonzero parent subvol %u",
+ u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
+ u.bi_parent_subvol = 0;
+ do_update = true;
+ }
+
+ if (u.bi_subvol) {
+ struct bch_subvolume s;
+
+ ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
+
+ if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
+ ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
+ goto do_update;
+ }
+
+ if (fsck_err_on(ret,
+ trans, inode_bi_subvol_missing,
+ "inode %llu:%u bi_subvol points to missing subvolume %u",
+ u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
+ fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
+ !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
+ k.k->p.snapshot),
+ trans, inode_bi_subvol_wrong,
+ "inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
+ u.bi_inum, k.k->p.snapshot, u.bi_subvol,
+ le64_to_cpu(s.inode),
+ le32_to_cpu(s.snapshot))) {
+ u.bi_subvol = 0;
+ u.bi_parent_subvol = 0;
+ do_update = true;
+ }
+ }
- ret = hash_check_key(bch2_xattr_hash_desc, &h, c, iter, k);
+ if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal),
+ trans, inode_journal_seq_in_future,
+ "inode journal seq in future (currently at %llu)\n%s",
+ journal_cur_seq(&c->journal),
+ (printbuf_reset(&buf),
+ bch2_inode_unpacked_to_text(&buf, &u),
+ buf.buf))) {
+ u.bi_journal_seq = journal_cur_seq(&c->journal);
+ do_update = true;
+ }
+do_update:
+ if (do_update) {
+ ret = __bch2_fsck_write_inode(trans, &u);
+ bch_err_msg(c, ret, "in fsck updating inode");
if (ret)
- goto fsck_err;
+ goto err_noprint;
}
err:
fsck_err:
- return bch2_trans_exit(&trans) ?: ret;
+ bch_err_fn(c, ret);
+err_noprint:
+ printbuf_exit(&buf);
+ return ret;
}
-/* Get root directory, create if it doesn't exist: */
-static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
+int bch2_check_inodes(struct bch_fs *c)
{
- struct bkey_inode_buf packed;
- int ret;
+ struct bch_inode_unpacked snapshot_root = {};
+ struct snapshots_seen s;
- bch_verbose(c, "checking root directory");
+ snapshots_seen_init(&s);
- ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode);
- if (ret && ret != -ENOENT)
- return ret;
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
+ POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_inode(trans, &iter, k, &snapshot_root, &s)));
- if (fsck_err_on(ret, c, "root directory missing"))
- goto create_root;
+ snapshots_seen_exit(&s);
+ bch_err_fn(c, ret);
+ return ret;
+}
- if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c,
- "root inode not a directory"))
- goto create_root;
+static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
- return 0;
-fsck_err:
- return ret;
-create_root:
- bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO,
- 0, NULL);
- root_inode->bi_inum = BCACHEFS_ROOT_INO;
+ /*
+ * We look for inodes to reattach in natural key order, leaves first,
+ * but we should do the reattach at the oldest version that needs to be
+ * reattached:
+ */
+ for_each_btree_key_norestart(trans, iter,
+ BTREE_ID_inodes,
+ SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != inode->bi_inum)
+ break;
- bch2_inode_pack(&packed, root_inode);
+ if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot))
+ continue;
- return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
- NULL, NULL, NULL, BTREE_INSERT_NOFAIL);
-}
+ if (!bkey_is_inode(k.k))
+ break;
-/* Get lost+found, create if it doesn't exist: */
-static int check_lostfound(struct bch_fs *c,
- struct bch_inode_unpacked *root_inode,
- struct bch_inode_unpacked *lostfound_inode)
-{
- struct qstr lostfound = QSTR("lost+found");
- struct bch_hash_info root_hash_info =
- bch2_hash_info_init(c, root_inode);
- struct bkey_inode_buf packed;
- u64 inum;
- int ret;
+ struct bch_inode_unpacked parent_inode;
+ bch2_inode_unpack(k, &parent_inode);
- bch_verbose(c, "checking lost+found");
+ if (!inode_should_reattach(&parent_inode))
+ break;
- inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info,
- &lostfound);
- if (!inum) {
- bch_notice(c, "creating lost+found");
- goto create_lostfound;
+ *inode = parent_inode;
}
+ bch2_trans_iter_exit(trans, &iter);
- ret = bch2_inode_find_by_inum(c, inum, lostfound_inode);
- if (ret && ret != -ENOENT)
- return ret;
+ return ret;
+}
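+
+/*
+ * Example (illustrative): if inode 100 needs reattaching in snapshot 4 and
+ * the version in ancestor snapshot 8 needs reattaching too, the btree walk in
+ * check_unreachable_inode() below finds the snapshot-4 version first
+ * (descendants have smaller snapshot IDs), but the loop above moves us up to
+ * the snapshot-8 version, so the reattach is done at the oldest version that
+ * needs it.
+ */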
- if (fsck_err_on(ret, c, "lost+found missing"))
- goto create_lostfound;
+static int check_unreachable_inode(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
- if (fsck_err_on(!S_ISDIR(lostfound_inode->bi_mode), c,
- "lost+found inode not a directory"))
- goto create_lostfound;
+ if (!bkey_is_inode(k.k))
+ return 0;
- return 0;
-fsck_err:
- return ret;
-create_lostfound:
- root_inode->bi_nlink++;
+ struct bch_inode_unpacked inode;
+ BUG_ON(bch2_inode_unpack(k, &inode));
- bch2_inode_pack(&packed, root_inode);
+ if (!inode_should_reattach(&inode))
+ return 0;
- ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
- NULL, NULL, NULL, BTREE_INSERT_NOFAIL);
+ ret = find_oldest_inode_needs_reattach(trans, &inode);
if (ret)
return ret;
- bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO,
- 0, root_inode);
+ if (fsck_err(trans, inode_unreachable,
+ "unreachable inode:\n%s",
+ (bch2_inode_unpacked_to_text(&buf, &inode),
+ buf.buf)))
+ ret = reattach_inode(trans, &inode);
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
- ret = bch2_inode_create(c, lostfound_inode, BLOCKDEV_INODE_MAX, 0,
- &c->unused_inode_hint);
- if (ret)
- return ret;
+/*
+ * Reattach unreachable (but not unlinked) inodes
+ *
+ * Run after check_inodes() and check_dirents(), so we know that inode
+ * backpointer fields point to valid dirents, and every inode that has a dirent
+ * that points to it has its backpointer field set - so we're just looking for
+ * non-unlinked inodes without backpointers:
+ *
+ * XXX: this is racy w.r.t. hardlink removal in online fsck
+ */
+int bch2_check_unreachable_inodes(struct bch_fs *c)
+{
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
+ POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_unreachable_inode(trans, &iter, k)));
+ bch_err_fn(c, ret);
+ return ret;
+}
- ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR,
- &lostfound, lostfound_inode->bi_inum, NULL,
- BTREE_INSERT_NOFAIL);
+static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
+{
+ switch (btree) {
+ case BTREE_ID_extents:
+ return S_ISREG(mode) || S_ISLNK(mode);
+ case BTREE_ID_dirents:
+ return S_ISDIR(mode);
+ case BTREE_ID_xattrs:
+ return true;
+ default:
+ BUG();
+ }
+}
+
+static int check_key_has_inode(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct inode_walker *inode,
+ struct inode_walker_entry *i,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ int ret = PTR_ERR_OR_ZERO(i);
if (ret)
return ret;
- return 0;
-}
+ if (k.k->type == KEY_TYPE_whiteout)
+ goto out;
-struct inode_bitmap {
- unsigned long *bits;
- size_t size;
-};
+ if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
+ ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
-static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr)
-{
- return nr < b->size ? test_bit(nr, b->bits) : false;
+ inode->last_pos.inode--;
+ ret = -BCH_ERR_transaction_restart_nested;
+ goto err;
+ }
+
+ if (fsck_err_on(!i,
+ trans, key_in_missing_inode,
+ "key in missing inode:\n %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ goto delete;
+
+ if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
+ trans, key_in_wrong_inode_type,
+ "key for wrong inode mode %o:\n %s",
+ i->inode.bi_mode,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ goto delete;
+out:
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+delete:
+ ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
+ goto out;
}
-static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr)
+static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
{
- if (nr >= b->size) {
- size_t new_size = max_t(size_t, max_t(size_t,
- PAGE_SIZE * 8,
- b->size * 2),
- nr + 1);
- void *n;
+ struct bch_fs *c = trans->c;
+ int ret = 0;
+ s64 count2;
+
+ darray_for_each(w->inodes, i) {
+ if (i->inode.bi_sectors == i->count)
+ continue;
+
+ count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
- new_size = roundup_pow_of_two(new_size);
- n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO);
- if (!n) {
- return -ENOMEM;
+ if (w->recalculate_sums)
+ i->count = count2;
+
+ if (i->count != count2) {
+ bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
+ w->last_pos.inode, i->snapshot, i->count, count2);
+ i->count = count2;
}
- b->bits = n;
- b->size = new_size;
+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
+ trans, inode_i_sectors_wrong,
+ "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
+ w->last_pos.inode, i->snapshot,
+ i->inode.bi_sectors, i->count)) {
+ i->inode.bi_sectors = i->count;
+ ret = bch2_fsck_write_inode(trans, &i->inode);
+ if (ret)
+ break;
+ }
}
+fsck_err:
+ bch_err_fn(c, ret);
+ return ret;
+}
- __set_bit(nr, b->bits);
- return 0;
+static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
+{
+ u32 restart_count = trans->restart_count;
+ return check_i_sectors_notnested(trans, w) ?:
+ trans_was_restarted(trans, restart_count);
}
-struct pathbuf {
- size_t nr;
- size_t size;
+struct extent_end {
+ u32 snapshot;
+ u64 offset;
+ struct snapshots_seen seen;
+};
- struct pathbuf_entry {
- u64 inum;
- u64 offset;
- } *entries;
+struct extent_ends {
+ struct bpos last_pos;
+ DARRAY(struct extent_end) e;
};
-static int path_down(struct pathbuf *p, u64 inum)
+static void extent_ends_reset(struct extent_ends *extent_ends)
{
- if (p->nr == p->size) {
- size_t new_size = max_t(size_t, 256UL, p->size * 2);
- void *n = krealloc(p->entries,
- new_size * sizeof(p->entries[0]),
- GFP_KERNEL);
- if (!n)
- return -ENOMEM;
+ darray_for_each(extent_ends->e, i)
+ snapshots_seen_exit(&i->seen);
+ extent_ends->e.nr = 0;
+}
- p->entries = n;
- p->size = new_size;
- };
+static void extent_ends_exit(struct extent_ends *extent_ends)
+{
+ extent_ends_reset(extent_ends);
+ darray_exit(&extent_ends->e);
+}
- p->entries[p->nr++] = (struct pathbuf_entry) {
- .inum = inum,
- .offset = 0,
- };
- return 0;
+static void extent_ends_init(struct extent_ends *extent_ends)
+{
+ memset(extent_ends, 0, sizeof(*extent_ends));
}
-noinline_for_stack
-static int check_directory_structure(struct bch_fs *c,
- struct bch_inode_unpacked *lostfound_inode)
+static int extent_ends_at(struct bch_fs *c,
+ struct extent_ends *extent_ends,
+ struct snapshots_seen *seen,
+ struct bkey_s_c k)
{
- struct inode_bitmap dirs_done = { NULL, 0 };
- struct pathbuf path = { 0, 0, NULL };
- struct pathbuf_entry *e;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_dirent dirent;
- bool had_unreachable;
- u64 d_inum;
- int ret = 0;
+ struct extent_end *i, n = (struct extent_end) {
+ .offset = k.k->p.offset,
+ .snapshot = k.k->p.snapshot,
+ .seen = *seen,
+ };
- bch_verbose(c, "checking directory structure");
+ n.seen.ids.data = kmemdup(seen->ids.data,
+ sizeof(seen->ids.data[0]) * seen->ids.size,
+ GFP_KERNEL);
+ if (!n.seen.ids.data)
+ return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
- /* DFS: */
-restart_dfs:
- had_unreachable = false;
+ __darray_for_each(extent_ends->e, i) {
+ if (i->snapshot == k.k->p.snapshot) {
+ snapshots_seen_exit(&i->seen);
+ *i = n;
+ return 0;
+ }
- ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO);
- if (ret) {
- bch_err(c, "memory allocation failure in inode_bitmap_set()");
- goto err;
+ if (i->snapshot >= k.k->p.snapshot)
+ break;
}
- ret = path_down(&path, BCACHEFS_ROOT_INO);
- if (ret) {
- return ret;
- }
+ return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
+}
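+
+/*
+ * Note (illustrative): extent_ends tracks, per snapshot, where the last
+ * extent seen for this inode ended, along with a copy of the snapshots_seen
+ * list at that point.  check_overlapping_extents() below compares each new
+ * extent's start against these recorded ends: e.g. if an extent ended at
+ * offset 100 in snapshot 4, a later extent starting at offset 90 in a
+ * snapshot that can see snapshot 4's keys overlaps it and gets repaired.
+ */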
- while (path.nr) {
-next:
- e = &path.entries[path.nr - 1];
+static int overlapping_extents_found(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bpos pos1, struct snapshots_seen *pos1_seen,
+ struct bkey pos2,
+ bool *fixed,
+ struct extent_end *extent_end)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ struct btree_iter iter1, iter2 = { NULL };
+ struct bkey_s_c k1, k2;
+ int ret;
- if (e->offset == U64_MAX)
- goto up;
+ BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
- for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
- POS(e->inum, e->offset + 1), 0, k) {
- if (k.k->p.inode != e->inum)
- break;
+ bch2_trans_iter_init(trans, &iter1, btree, pos1,
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_not_extents);
+ k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX));
+ ret = bkey_err(k1);
+ if (ret)
+ goto err;
- e->offset = k.k->p.offset;
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k1);
- if (k.k->type != BCH_DIRENT)
- continue;
+ if (!bpos_eq(pos1, k1.k->p)) {
+ prt_str(&buf, "\n wanted\n ");
+ bch2_bpos_to_text(&buf, pos1);
+ prt_str(&buf, "\n ");
+ bch2_bkey_to_text(&buf, &pos2);
- dirent = bkey_s_c_to_dirent(k);
+ bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
+ __func__, buf.buf);
+ ret = -BCH_ERR_internal_fsck_err;
+ goto err;
+ }
- if (dirent.v->d_type != DT_DIR)
- continue;
+ bch2_trans_copy_iter(&iter2, &iter1);
- d_inum = le64_to_cpu(dirent.v->d_inum);
+ while (1) {
+ bch2_btree_iter_advance(&iter2);
- if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c,
- "directory %llu has multiple hardlinks",
- d_inum)) {
- ret = remove_dirent(c, &iter, dirent);
- if (ret)
- goto err;
- continue;
- }
+ k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX));
+ ret = bkey_err(k2);
+ if (ret)
+ goto err;
- ret = inode_bitmap_set(&dirs_done, d_inum);
- if (ret) {
- bch_err(c, "memory allocation failure in inode_bitmap_set()");
- goto err;
- }
+ if (bpos_ge(k2.k->p, pos2.p))
+ break;
+ }
- ret = path_down(&path, d_inum);
- if (ret) {
- goto err;
- }
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k2);
- bch2_btree_iter_unlock(&iter);
- goto next;
+ if (bpos_gt(k2.k->p, pos2.p) ||
+ pos2.size != k2.k->size) {
+ bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
+ __func__, buf.buf);
+ ret = -BCH_ERR_internal_fsck_err;
+ goto err;
+ }
+
+ prt_printf(&buf, "\n overwriting %s extent",
+ pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
+
+ if (fsck_err(trans, extent_overlapping,
+ "overlapping extents%s", buf.buf)) {
+ struct btree_iter *old_iter = &iter1;
+ struct disk_reservation res = { 0 };
+
+ if (pos1.snapshot < pos2.p.snapshot) {
+ old_iter = &iter2;
+ swap(k1, k2);
}
- ret = bch2_btree_iter_unlock(&iter);
- if (ret) {
- bch_err(c, "btree error %i in fsck", ret);
+
+ trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
+
+ ret = bch2_trans_update_extent_overwrite(trans, old_iter,
+ BTREE_UPDATE_internal_snapshot_node,
+ k1, k2) ?:
+ bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
+ bch2_disk_reservation_put(c, &res);
+
+ if (ret)
goto err;
+
+ *fixed = true;
+
+ if (pos1.snapshot == pos2.p.snapshot) {
+ /*
+ * We overwrote the first extent, and did the overwrite
+ * in the same snapshot:
+ */
+ extent_end->offset = bkey_start_offset(&pos2);
+ } else if (pos1.snapshot > pos2.p.snapshot) {
+ /*
+ * We overwrote the first extent in pos2's snapshot:
+ */
+ ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
+ } else {
+ /*
+ * We overwrote the second extent - restart
+ * check_extent() from the top:
+ */
+ ret = -BCH_ERR_transaction_restart_nested;
}
-up:
- path.nr--;
}
+fsck_err:
+err:
+ bch2_trans_iter_exit(trans, &iter2);
+ bch2_trans_iter_exit(trans, &iter1);
+ printbuf_exit(&buf);
+ return ret;
+}
- for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, 0, k) {
- if (k.k->type != BCH_INODE_FS)
- continue;
+static int check_overlapping_extents(struct btree_trans *trans,
+ struct snapshots_seen *seen,
+ struct extent_ends *extent_ends,
+ struct bkey_s_c k,
+ struct btree_iter *iter,
+ bool *fixed)
+{
+ struct bch_fs *c = trans->c;
+ int ret = 0;
+
+ /* transaction restart, running again */
+ if (bpos_eq(extent_ends->last_pos, k.k->p))
+ return 0;
- if (!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode)))
+ if (extent_ends->last_pos.inode != k.k->p.inode)
+ extent_ends_reset(extent_ends);
+
+ darray_for_each(extent_ends->e, i) {
+ if (i->offset <= bkey_start_offset(k.k))
continue;
- if (!bch2_empty_dir(c, k.k->p.inode))
+ if (!ref_visible2(c,
+ k.k->p.snapshot, seen,
+ i->snapshot, &i->seen))
continue;
- if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.inode), c,
- "unreachable directory found (inum %llu)",
- k.k->p.inode)) {
- bch2_btree_iter_unlock(&iter);
+ ret = overlapping_extents_found(trans, iter->btree_id,
+ SPOS(iter->pos.inode,
+ i->offset,
+ i->snapshot),
+ &i->seen,
+ *k.k, fixed, i);
+ if (ret)
+ goto err;
+ }
- ret = reattach_inode(c, lostfound_inode, k.k->p.inode);
- if (ret) {
- goto err;
- }
+ extent_ends->last_pos = k.k->p;
+err:
+ return ret;
+}
- had_unreachable = true;
+static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bch_extent_crc_unpacked crc;
+ const union bch_extent_entry *i;
+ unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
+
+ bkey_for_each_crc(k.k, ptrs, crc, i)
+ if (crc_is_encoded(crc) &&
+ crc.uncompressed_size > encoded_extent_max_sectors) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ return 0;
+}
+
+static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct inode_walker *inode,
+ struct snapshots_seen *s,
+ struct extent_ends *extent_ends,
+ struct disk_reservation *res)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ ret = bch2_check_key_has_snapshot(trans, iter, k);
+ if (ret) {
+ ret = ret < 0 ? ret : 0;
+ goto out;
+ }
+
+ if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) {
+ ret = check_i_sectors(trans, inode);
+ if (ret)
+ goto err;
+ }
+
+ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
+ if (ret)
+ goto err;
+
+ struct inode_walker_entry *extent_i = walk_inode(trans, inode, k);
+ ret = PTR_ERR_OR_ZERO(extent_i);
+ if (ret)
+ goto err;
+
+ ret = check_key_has_inode(trans, iter, inode, extent_i, k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_whiteout) {
+ ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
+ &inode->recalculate_sums);
+ if (ret)
+ goto err;
+
+ /*
+ * Check inodes in reverse order, from oldest snapshots to
+ * newest, starting from the inode that matches this extent's
+ * snapshot. If we didn't have one, iterate over all inodes:
+ */
+ for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
+ inode->inodes.data && i >= inode->inodes.data;
+ --i) {
+ if (i->snapshot > k.k->p.snapshot ||
+ !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
+ continue;
+
+ if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
+ !bkey_extent_is_reservation(k),
+ trans, extent_past_end_of_inode,
+ "extent type past end of inode %llu:%u, i_size %llu\n %s",
+ i->inode.bi_inum, i->snapshot, i->inode.bi_size,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ struct btree_iter iter2;
+
+ bch2_trans_copy_iter(&iter2, iter);
+ bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
+ ret = bch2_btree_iter_traverse(&iter2) ?:
+ bch2_btree_delete_at(trans, &iter2,
+ BTREE_UPDATE_internal_snapshot_node);
+ bch2_trans_iter_exit(trans, &iter2);
+ if (ret)
+ goto err;
+
+ iter->k.type = KEY_TYPE_whiteout;
+ break;
+ }
}
}
- ret = bch2_btree_iter_unlock(&iter);
+
+ ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto err;
- if (had_unreachable) {
- bch_info(c, "reattached unreachable directories, restarting pass to check for loops");
- kfree(dirs_done.bits);
- kfree(path.entries);
- memset(&dirs_done, 0, sizeof(dirs_done));
- memset(&path, 0, sizeof(path));
- goto restart_dfs;
+ if (bkey_extent_is_allocation(k.k)) {
+ for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
+ inode->inodes.data && i >= inode->inodes.data;
+ --i) {
+ if (i->snapshot > k.k->p.snapshot ||
+ !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
+ continue;
+
+ i->count += k.k->size;
+ }
}
+ if (k.k->type != KEY_TYPE_whiteout) {
+ ret = extent_ends_at(c, extent_ends, s, k);
+ if (ret)
+ goto err;
+ }
out:
- kfree(dirs_done.bits);
- kfree(path.entries);
- return ret;
err:
fsck_err:
- ret = bch2_btree_iter_unlock(&iter) ?: ret;
- goto out;
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
}
-struct nlink {
- u32 count;
- u32 dir_count;
-};
+/*
+ * Walk extents: verify that extents have a corresponding S_ISREG inode, and
+ * that i_size and i_sectors are consistent
+ */
+int bch2_check_extents(struct bch_fs *c)
+{
+ struct inode_walker w = inode_walker_init();
+ struct snapshots_seen s;
+ struct extent_ends extent_ends;
+ struct disk_reservation res = { 0 };
+
+ snapshots_seen_init(&s);
+ extent_ends_init(&extent_ends);
+
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_extents,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
+ bch2_disk_reservation_put(c, &res);
+ check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?:
+ check_extent_overbig(trans, &iter, k);
+ })) ?:
+ check_i_sectors_notnested(trans, &w));
+
+ bch2_disk_reservation_put(c, &res);
+ extent_ends_exit(&extent_ends);
+ inode_walker_exit(&w);
+ snapshots_seen_exit(&s);
+
+ bch_err_fn(c, ret);
+ return ret;
+}
-typedef GENRADIX(struct nlink) nlink_table;
+int bch2_check_indirect_extents(struct bch_fs *c)
+{
+ struct disk_reservation res = { 0 };
+
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
+ POS_MIN,
+ BTREE_ITER_prefetch, k,
+ &res, NULL,
+ BCH_TRANS_COMMIT_no_enospc, ({
+ bch2_disk_reservation_put(c, &res);
+ check_extent_overbig(trans, &iter, k);
+ })));
+
+ bch2_disk_reservation_put(c, &res);
+ bch_err_fn(c, ret);
+ return ret;
+}
-static void inc_link(struct bch_fs *c, nlink_table *links,
- u64 range_start, u64 *range_end,
- u64 inum, bool dir)
+static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
{
- struct nlink *link;
+ struct bch_fs *c = trans->c;
+ int ret = 0;
+ s64 count2;
- if (inum < range_start || inum >= *range_end)
- return;
+ darray_for_each(w->inodes, i) {
+ if (i->inode.bi_nlink == i->count)
+ continue;
- link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL);
- if (!link) {
- bch_verbose(c, "allocation failed during fs gc - will need another pass");
- *range_end = inum;
- return;
+ count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
+ if (count2 < 0)
+ return count2;
+
+ if (i->count != count2) {
+ bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
+ w->last_pos.inode, i->snapshot, i->count, count2);
+ i->count = count2;
+ if (i->inode.bi_nlink == i->count)
+ continue;
+ }
+
+ if (fsck_err_on(i->inode.bi_nlink != i->count,
+ trans, inode_dir_wrong_nlink,
+ "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
+ w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
+ i->inode.bi_nlink = i->count;
+ ret = bch2_fsck_write_inode(trans, &i->inode);
+ if (ret)
+ break;
+ }
}
+fsck_err:
+ bch_err_fn(c, ret);
+ return ret;
+}
- if (dir)
- link->dir_count++;
- else
- link->count++;
+static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+{
+ u32 restart_count = trans->restart_count;
+ return check_subdir_count_notnested(trans, w) ?:
+ trans_was_restarted(trans, restart_count);
}
noinline_for_stack
-static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links,
- u64 range_start, u64 *range_end)
+static int check_dirent_inode_dirent(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_dirent d;
- u64 d_inum;
- int ret;
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ struct btree_iter bp_iter = { NULL };
+ int ret = 0;
- inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false);
+ if (inode_points_to_dirent(target, d))
+ return 0;
+
+ if (!target->bi_dir &&
+ !target->bi_dir_offset) {
+ fsck_err_on(S_ISDIR(target->bi_mode),
+ trans, inode_dir_missing_backpointer,
+ "directory with missing backpointer\n%s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c),
+ prt_printf(&buf, "\n"),
+ bch2_inode_unpacked_to_text(&buf, target),
+ buf.buf));
+
+ fsck_err_on(target->bi_flags & BCH_INODE_unlinked,
+ trans, inode_unlinked_but_has_dirent,
+ "inode unlinked but has dirent\n%s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c),
+ prt_printf(&buf, "\n"),
+ bch2_inode_unpacked_to_text(&buf, target),
+ buf.buf));
+
+ target->bi_flags &= ~BCH_INODE_unlinked;
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
+ return __bch2_fsck_write_inode(trans, target);
+ }
- for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, 0, k) {
- switch (k.k->type) {
- case BCH_DIRENT:
- d = bkey_s_c_to_dirent(k);
- d_inum = le64_to_cpu(d.v->d_inum);
+ if (bch2_inode_should_have_single_bp(target) &&
+ !fsck_err(trans, inode_wrong_backpointer,
+ "dirent points to inode that does not point back:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, d.s_c),
+ prt_printf(&buf, "\n "),
+ bch2_inode_unpacked_to_text(&buf, target),
+ buf.buf)))
+ goto err;
- if (d.v->d_type == DT_DIR)
- inc_link(c, links, range_start, range_end,
- d.k->p.inode, true);
+ struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
+ SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot));
+ ret = bkey_err(bp_dirent);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
- inc_link(c, links, range_start, range_end,
- d_inum, false);
+ bool backpointer_exists = !ret;
+ ret = 0;
+
+ if (fsck_err_on(!backpointer_exists,
+ trans, inode_wrong_backpointer,
+ "inode %llu:%u has wrong backpointer:\n"
+ "got %llu:%llu\n"
+ "should be %llu:%llu",
+ target->bi_inum, target->bi_snapshot,
+ target->bi_dir,
+ target->bi_dir_offset,
+ d.k->p.inode,
+ d.k->p.offset)) {
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
+ ret = __bch2_fsck_write_inode(trans, target);
+ goto out;
+ }
- break;
- }
+ bch2_bkey_val_to_text(&buf, c, d.s_c);
+ prt_newline(&buf);
+ if (backpointer_exists)
+ bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
+
+ if (fsck_err_on(backpointer_exists &&
+ (S_ISDIR(target->bi_mode) ||
+ target->bi_subvol),
+ trans, inode_dir_multiple_links,
+ "%s %llu:%u with multiple links\n%s",
+ S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
+ target->bi_inum, target->bi_snapshot, buf.buf)) {
+ ret = __remove_dirent(trans, d.k->p);
+ goto out;
+ }
- bch2_btree_iter_cond_resched(&iter);
+ /*
+ * hardlinked file with nlink 0:
+ * We're just adjusting nlink here so check_nlinks() will pick
+ * it up, it ignores inodes with nlink 0
+ */
+ if (fsck_err_on(backpointer_exists && !target->bi_nlink,
+ trans, inode_multiple_links_but_nlink_0,
+ "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
+ target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
+ target->bi_nlink++;
+ target->bi_flags &= ~BCH_INODE_unlinked;
+ ret = __bch2_fsck_write_inode(trans, target);
+ if (ret)
+ goto err;
}
- ret = bch2_btree_iter_unlock(&iter);
+out:
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+noinline_for_stack
+static int check_dirent_target(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i_dirent *n;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ ret = check_dirent_inode_dirent(trans, iter, d, target);
if (ret)
- bch_err(c, "error in fs gc: btree error %i while walking dirents", ret);
+ goto err;
+ if (fsck_err_on(d.v->d_type != inode_d_type(target),
+ trans, dirent_d_type_wrong,
+ "incorrect d_type: got %s, should be %s:\n%s",
+ bch2_d_type_str(d.v->d_type),
+ bch2_d_type_str(inode_d_type(target)),
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
+ n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(&n->k_i, d.s_c);
+ n->v.d_type = inode_d_type(target);
+ if (n->v.d_type == DT_SUBVOL) {
+ n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
+ n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
+ } else {
+ n->v.d_inum = cpu_to_le64(target->bi_inum);
+ }
+
+ ret = bch2_trans_update(trans, iter, &n->k_i, 0);
+ if (ret)
+ goto err;
+
+ d = dirent_i_to_s_c(n);
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
return ret;
}
-s64 bch2_count_inode_sectors(struct bch_fs *c, u64 inum)
+/* find a subvolume that's a descendant of @snapshot: */
+static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
{
struct btree_iter iter;
struct bkey_s_c k;
- u64 sectors = 0;
+ int ret;
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inum, 0), 0, k) {
- if (k.k->p.inode != inum)
- break;
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_subvolume)
+ continue;
- if (bkey_extent_is_allocation(k.k))
- sectors += k.k->size;
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+ if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
+ bch2_trans_iter_exit(trans, &iter);
+ *subvolid = k.k->p.offset;
+ goto found;
+ }
}
-
- return bch2_btree_iter_unlock(&iter) ?: sectors;
+ if (!ret)
+ ret = -ENOENT;
+found:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
-static int check_inode_nlink(struct bch_fs *c,
- struct bch_inode_unpacked *lostfound_inode,
- struct bch_inode_unpacked *u,
- struct nlink *link,
- bool *do_update)
+noinline_for_stack
+static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c_dirent d)
{
- u32 i_nlink = u->bi_flags & BCH_INODE_UNLINKED
- ? 0
- : u->bi_nlink + nlink_bias(u->bi_mode);
- u32 real_i_nlink =
- link->count * nlink_bias(u->bi_mode) +
- link->dir_count;
+ struct bch_fs *c = trans->c;
+ struct btree_iter subvol_iter = {};
+ struct bch_inode_unpacked subvol_root;
+ u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
+ u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
+ u32 parent_snapshot;
+ u32 new_parent_subvol = 0;
+ u64 parent_inum;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- /*
- * These should have been caught/fixed by earlier passes, we don't
- * repair them here:
- */
- if (S_ISDIR(u->bi_mode) && link->count > 1) {
- need_fsck_err(c, "directory %llu with multiple hardlinks: %u",
- u->bi_inum, link->count);
- return 0;
+ ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (ret ||
+ (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
+ int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
+ if (ret2 && !bch2_err_matches(ret, ENOENT))
+ return ret2;
}
- if (S_ISDIR(u->bi_mode) && !link->count) {
- need_fsck_err(c, "unreachable directory found (inum %llu)",
- u->bi_inum);
- return 0;
+ if (ret &&
+ !new_parent_subvol &&
+ (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
+ /*
+ * Couldn't find a subvol for dirent's snapshot - but we lost
+ * subvols, so we need to reconstruct:
+ */
+ ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
+ if (ret)
+ return ret;
+
+ parent_snapshot = d.k->p.snapshot;
}
- if (!S_ISDIR(u->bi_mode) && link->dir_count) {
- need_fsck_err(c, "non directory with subdirectories",
- u->bi_inum);
- return 0;
+ if (fsck_err_on(ret,
+ trans, dirent_to_missing_parent_subvol,
+ "dirent parent_subvol points to missing subvolume\n%s",
+ (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
+ fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
+ trans, dirent_not_visible_in_parent_subvol,
+ "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
+ parent_snapshot,
+ (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
+ if (!new_parent_subvol) {
+ bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
+
+ struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
+ ret = PTR_ERR_OR_ZERO(new_dirent);
+ if (ret)
+ goto err;
+
+ new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
}
- if (!link->count &&
- !(u->bi_flags & BCH_INODE_UNLINKED) &&
- (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) {
- if (fsck_err(c, "unreachable inode %llu not marked as unlinked (type %u)",
- u->bi_inum, mode_to_type(u->bi_mode)) ==
- FSCK_ERR_IGNORE)
- return 0;
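+ /* Now look up the subvolume this dirent points to: */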
+ struct bkey_s_c_subvolume s =
+ bch2_bkey_get_iter_typed(trans, &subvol_iter,
+ BTREE_ID_subvolumes, POS(0, target_subvol),
+ 0, subvolume);
+ ret = bkey_err(s.s_c);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
- ret = reattach_inode(c, lostfound_inode, u->bi_inum);
+ if (ret) {
+ if (fsck_err(trans, dirent_to_missing_subvol,
+ "dirent points to missing subvolume\n%s",
+ (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
+ return __remove_dirent(trans, d.k->p);
+ ret = 0;
+ goto out;
+ }
+
+ if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
+ trans, subvol_fs_path_parent_wrong,
+ "subvol with wrong fs_path_parent, should be be %u\n%s",
+ parent_subvol,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ struct bkey_i_subvolume *n =
+ bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(n);
if (ret)
- return ret;
+ goto err;
- link->count = 1;
- real_i_nlink = nlink_bias(u->bi_mode) + link->dir_count;
- goto set_i_nlink;
+ n->v.fs_path_parent = cpu_to_le32(parent_subvol);
}
- if (i_nlink < link->count) {
- if (fsck_err(c, "inode %llu i_link too small (%u < %u, type %i)",
- u->bi_inum, i_nlink, link->count,
- mode_to_type(u->bi_mode)) == FSCK_ERR_IGNORE)
- return 0;
- goto set_i_nlink;
+ u64 target_inum = le64_to_cpu(s.v->inode);
+ u32 target_snapshot = le32_to_cpu(s.v->snapshot);
+
+ ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
+
+ if (ret) {
+ bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto err;
}
- if (i_nlink != real_i_nlink &&
- c->sb.clean) {
- if (fsck_err(c, "filesystem marked clean, "
- "but inode %llu has wrong i_nlink "
- "(type %u i_nlink %u, should be %u)",
- u->bi_inum, mode_to_type(u->bi_mode),
- i_nlink, real_i_nlink) == FSCK_ERR_IGNORE)
- return 0;
- goto set_i_nlink;
+ if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
+ trans, inode_bi_parent_wrong,
+ "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
+ target_inum,
+ subvol_root.bi_parent_subvol, parent_subvol)) {
+ subvol_root.bi_parent_subvol = parent_subvol;
+ subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot);
+ ret = __bch2_fsck_write_inode(trans, &subvol_root);
+ if (ret)
+ goto err;
}
- if (i_nlink != real_i_nlink &&
- (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) {
- if (fsck_err(c, "inode %llu has wrong i_nlink "
- "(type %u i_nlink %u, should be %u)",
- u->bi_inum, mode_to_type(u->bi_mode),
- i_nlink, real_i_nlink) == FSCK_ERR_IGNORE)
- return 0;
- goto set_i_nlink;
+ ret = check_dirent_target(trans, iter, d, &subvol_root);
+ if (ret)
+ goto err;
+out:
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &subvol_iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bch_hash_info *hash_info,
+ struct inode_walker *dir,
+ struct inode_walker *target,
+ struct snapshots_seen *s)
+{
+ struct bch_fs *c = trans->c;
+ struct inode_walker_entry *i;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ ret = bch2_check_key_has_snapshot(trans, iter, k);
+ if (ret) {
+ ret = ret < 0 ? ret : 0;
+ goto out;
}
- if (real_i_nlink && i_nlink != real_i_nlink)
- bch_verbose(c, "setting inode %llu nlink from %u to %u",
- u->bi_inum, i_nlink, real_i_nlink);
-set_i_nlink:
- if (i_nlink != real_i_nlink) {
- if (real_i_nlink) {
- u->bi_nlink = real_i_nlink - nlink_bias(u->bi_mode);
- u->bi_flags &= ~BCH_INODE_UNLINKED;
- } else {
- u->bi_nlink = 0;
- u->bi_flags |= BCH_INODE_UNLINKED;
+ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
+ if (ret)
+ goto err;
+
+ if (k.k->type == KEY_TYPE_whiteout)
+ goto out;
+
+ if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
+ ret = check_subdir_count(trans, dir);
+ if (ret)
+ goto err;
+ }
+
+ i = walk_inode(trans, dir, k);
+ ret = PTR_ERR_OR_ZERO(i);
+ if (ret < 0)
+ goto err;
+
+ ret = check_key_has_inode(trans, iter, dir, i, k);
+ if (ret)
+ goto err;
+
+ if (!i)
+ goto out;
+
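+ /*
+ * The dirent hash seed and hash type come from the directory inode, so
+ * recompute hash_info whenever we move to a new directory:
+ */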
+ if (dir->first_this_inode)
+ *hash_info = bch2_hash_info_init(c, &i->inode);
+ dir->first_this_inode = false;
+
+ ret = hash_check_key(trans, s, bch2_dirent_hash_desc, hash_info, iter, k);
+ if (ret < 0)
+ goto err;
+ if (ret) {
+ /* dirent has been deleted */
+ ret = 0;
+ goto out;
+ }
+
+ if (k.k->type != KEY_TYPE_dirent)
+ goto out;
+
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+
+ if (d.v->d_type == DT_SUBVOL) {
+ ret = check_dirent_to_subvol(trans, iter, d);
+ if (ret)
+ goto err;
+ } else {
+ ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
+ if (ret)
+ goto err;
+
+ if (fsck_err_on(!target->inodes.nr,
+ trans, dirent_to_missing_inode,
+ "dirent points to missing inode:\n%s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf))) {
+ ret = __remove_dirent(trans, d.k->p);
+ if (ret)
+ goto err;
}
- *do_update = true;
+ darray_for_each(target->inodes, i) {
+ ret = check_dirent_target(trans, iter, d, &i->inode);
+ if (ret)
+ goto err;
+ }
}
+
+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+
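+ /*
+ * Count subdirectory dirents against each visible version of the parent
+ * directory; check_subdir_count() later compares this against the
+ * directory inode's link count:
+ */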
+ if (d.v->d_type == DT_DIR)
+ for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
+ i->count++;
+out:
+err:
fsck_err:
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
return ret;
}
-static int check_inode(struct bch_fs *c,
- struct bch_inode_unpacked *lostfound_inode,
- struct btree_iter *iter,
- struct bkey_s_c_inode inode,
- struct nlink *link)
+/*
+ * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
+ * validate d_type
+ */
+int bch2_check_dirents(struct bch_fs *c)
{
- struct bch_inode_unpacked u;
- bool do_update = false;
+ struct inode_walker dir = inode_walker_init();
+ struct inode_walker target = inode_walker_init();
+ struct snapshots_seen s;
+ struct bch_hash_info hash_info;
+
+ snapshots_seen_init(&s);
+
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_dirents,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
+ check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
+ check_subdir_count_notnested(trans, &dir));
+
+ snapshots_seen_exit(&s);
+ inode_walker_exit(&dir);
+ inode_walker_exit(&target);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bch_hash_info *hash_info,
+ struct inode_walker *inode)
+{
+ struct bch_fs *c = trans->c;
+ struct inode_walker_entry *i;
+ int ret;
+
+ ret = bch2_check_key_has_snapshot(trans, iter, k);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+
+ i = walk_inode(trans, inode, k);
+ ret = PTR_ERR_OR_ZERO(i);
+ if (ret)
+ return ret;
+
+ ret = check_key_has_inode(trans, iter, inode, i, k);
+ if (ret)
+ return ret;
+
+ if (!i)
+ return 0;
+
+ if (inode->first_this_inode)
+ *hash_info = bch2_hash_info_init(c, &i->inode);
+ inode->first_this_inode = false;
+
+ ret = hash_check_key(trans, NULL, bch2_xattr_hash_desc, hash_info, iter, k);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+/*
+ * Walk xattrs: verify that they all have a corresponding inode
+ */
+int bch2_check_xattrs(struct bch_fs *c)
+{
+ struct inode_walker inode = inode_walker_init();
+ struct bch_hash_info hash_info;
int ret = 0;
- ret = bch2_inode_unpack(inode, &u);
- if (bch2_fs_inconsistent_on(ret, c,
- "error unpacking inode %llu in fsck",
- inode.k->p.inode))
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
+ k,
+ NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc,
+ check_xattr(trans, &iter, k, &hash_info, &inode)));
+
+ inode_walker_exit(&inode);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int check_root_trans(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked root_inode;
+ u32 snapshot;
+ u64 inum;
+ int ret;
+
+ ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
+ if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
- if (link) {
- ret = check_inode_nlink(c, lostfound_inode, &u, link,
- &do_update);
+ if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
+ "root subvol missing")) {
+ struct bkey_i_subvolume *root_subvol =
+ bch2_trans_kmalloc(trans, sizeof(*root_subvol));
+ ret = PTR_ERR_OR_ZERO(root_subvol);
if (ret)
- return ret;
- }
+ goto err;
- if (u.bi_flags & BCH_INODE_UNLINKED) {
- bch_verbose(c, "deleting inode %llu", u.bi_inum);
+ snapshot = U32_MAX;
+ inum = BCACHEFS_ROOT_INO;
- ret = bch2_inode_rm(c, u.bi_inum);
+ bkey_subvolume_init(&root_subvol->k_i);
+ root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
+ root_subvol->v.flags = 0;
+ root_subvol->v.snapshot = cpu_to_le32(snapshot);
+ root_subvol->v.inode = cpu_to_le64(inum);
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
+ bch_err_msg(c, ret, "writing root subvol");
if (ret)
- bch_err(c, "error in fs gc: error %i "
- "while deleting inode", ret);
+ goto err;
+ }
+
+ ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode);
+ if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
+
+ if (mustfix_fsck_err_on(ret,
+ trans, root_dir_missing,
+ "root directory missing") ||
+ mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
+ trans, root_inode_not_dir,
+ "root inode not a directory")) {
+ bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
+ 0, NULL);
+ root_inode.bi_inum = inum;
+ root_inode.bi_snapshot = snapshot;
+
+ ret = __bch2_fsck_write_inode(trans, &root_inode);
+ bch_err_msg(c, ret, "writing root inode");
}
+err:
+fsck_err:
+ return ret;
+}
- if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY) {
- fsck_err_on(c->sb.clean, c,
- "filesystem marked clean, "
- "but inode %llu has i_size dirty",
- u.bi_inum);
+/* Get root directory, create if it doesn't exist: */
+int bch2_check_root(struct bch_fs *c)
+{
+ int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_root_trans(trans));
+ bch_err_fn(c, ret);
+ return ret;
+}
- bch_verbose(c, "truncating inode %llu", u.bi_inum);
+typedef DARRAY(u32) darray_u32;
- /*
- * XXX: need to truncate partial blocks too here - or ideally
- * just switch units to bytes and that issue goes away
- */
+static bool darray_u32_has(darray_u32 *d, u32 v)
+{
+ darray_for_each(*d, i)
+ if (*i == v)
+ return true;
+ return false;
+}
- ret = bch2_inode_truncate(c, u.bi_inum,
- round_up(u.bi_size, PAGE_SIZE) >> 9,
- NULL, NULL);
- if (ret) {
- bch_err(c, "error in fs gc: error %i "
- "truncating inode", ret);
- return ret;
+static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter parent_iter = {};
+ darray_u32 subvol_path = {};
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (k.k->type != KEY_TYPE_subvolume)
+ return 0;
+
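+ /*
+ * Walk up the fs_path_parent chain towards the root subvolume, recording
+ * each subvolume we visit so that loops can be detected:
+ */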
+ while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
+ ret = darray_push(&subvol_path, k.k->p.offset);
+ if (ret)
+ goto err;
+
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+
+ struct bch_inode_unpacked subvol_root;
+ ret = bch2_inode_find_by_inum_trans(trans,
+ (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
+ &subvol_root);
+ if (ret)
+ break;
+
+ u32 parent = le32_to_cpu(s.v->fs_path_parent);
+
+ if (darray_u32_has(&subvol_path, parent)) {
+ if (fsck_err(c, subvol_loop, "subvolume loop"))
+ ret = reattach_subvol(trans, s);
+ break;
}
- /*
- * We truncated without our normal sector accounting hook, just
- * make sure we recalculate it:
- */
- u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
+ bch2_trans_iter_exit(trans, &parent_iter);
+ bch2_trans_iter_init(trans, &parent_iter,
+ BTREE_ID_subvolumes, POS(0, parent), 0);
+ k = bch2_btree_iter_peek_slot(&parent_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
- u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
- do_update = true;
+ if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
+ trans, subvol_unreachable,
+ "unreachable subvolume %s",
+ (bch2_bkey_val_to_text(&buf, c, s.s_c),
+ buf.buf))) {
+ ret = reattach_subvol(trans, s);
+ break;
+ }
}
+fsck_err:
+err:
+ printbuf_exit(&buf);
+ darray_exit(&subvol_path);
+ bch2_trans_iter_exit(trans, &parent_iter);
+ return ret;
+}
- if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY) {
- s64 sectors;
+int bch2_check_subvolume_structure(struct bch_fs *c)
+{
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_subvol_path(trans, &iter, k)));
+ bch_err_fn(c, ret);
+ return ret;
+}
- fsck_err_on(c->sb.clean, c,
- "filesystem marked clean, "
- "but inode %llu has i_sectors dirty",
- u.bi_inum);
+struct pathbuf_entry {
+ u64 inum;
+ u32 snapshot;
+};
- bch_verbose(c, "recounting sectors for inode %llu",
- u.bi_inum);
+typedef DARRAY(struct pathbuf_entry) pathbuf;
- sectors = bch2_count_inode_sectors(c, u.bi_inum);
- if (sectors < 0) {
- bch_err(c, "error in fs gc: error %i "
- "recounting inode sectors",
- (int) sectors);
- return sectors;
+static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
+{
+ darray_for_each(*p, i)
+ if (i->inum == inum &&
+ i->snapshot == snapshot)
+ return true;
+ return false;
+}
+
+static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter inode_iter = {};
+ struct bch_inode_unpacked inode;
+ struct printbuf buf = PRINTBUF;
+ u32 snapshot = inode_k.k->p.snapshot;
+ int ret = 0;
+
+ p->nr = 0;
+
+ BUG_ON(bch2_inode_unpack(inode_k, &inode));
+
+ if (!S_ISDIR(inode.bi_mode))
+ return 0;
+
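+ /*
+ * Walk up towards the subvolume root, one backpointer dirent at a time,
+ * recording each (inum, snapshot) visited so that loops can be detected:
+ */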
+ while (!inode.bi_subvol) {
+ struct btree_iter dirent_iter;
+ struct bkey_s_c_dirent d;
+ u32 parent_snapshot = snapshot;
+
+ d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
+ ret = bkey_err(d.s_c);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ break;
+
+ if (!ret && (ret = dirent_points_to_inode(c, d, &inode)))
+ bch2_trans_iter_exit(trans, &dirent_iter);
+
+ if (bch2_err_matches(ret, ENOENT)) {
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, inode_k);
+ bch_err(c, "unreachable inode in check_directory_structure: %s\n%s",
+ bch2_err_str(ret), buf.buf);
+ goto out;
}
- u.bi_sectors = sectors;
- u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
- do_update = true;
- }
+ bch2_trans_iter_exit(trans, &dirent_iter);
- if (do_update) {
- struct bkey_inode_buf p;
+ ret = darray_push(p, ((struct pathbuf_entry) {
+ .inum = inode.bi_inum,
+ .snapshot = snapshot,
+ }));
+ if (ret)
+ return ret;
+
+ snapshot = parent_snapshot;
+
+ bch2_trans_iter_exit(trans, &inode_iter);
+ inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
+ SPOS(0, inode.bi_dir, snapshot), 0);
+ ret = bkey_err(inode_k) ?:
+ !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
+ : bch2_inode_unpack(inode_k, &inode);
+ if (ret) {
+ /* Should have been caught in dirents pass */
+ bch_err_msg(c, ret, "error looking up parent directory");
+ break;
+ }
+
+ snapshot = inode_k.k->p.snapshot;
+
+ if (path_is_dup(p, inode.bi_inum, snapshot)) {
+ /* XXX print path */
+ bch_err(c, "directory structure loop");
+
+ darray_for_each(*p, i)
+ pr_err("%llu:%u", i->inum, i->snapshot);
+ pr_err("%llu:%u", inode.bi_inum, snapshot);
- bch2_inode_pack(&p, &u);
+ if (fsck_err(trans, dir_loop, "directory structure loop")) {
+ ret = remove_backpointer(trans, &inode);
+ bch_err_msg(c, ret, "removing dirent");
+ if (ret)
+ break;
- ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
- BTREE_INSERT_NOFAIL,
- BTREE_INSERT_ENTRY(iter, &p.inode.k_i));
- if (ret && ret != -EINTR)
- bch_err(c, "error in fs gc: error %i "
- "updating inode", ret);
+ ret = reattach_inode(trans, &inode);
+ bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
+ }
+ break;
+ }
}
+out:
fsck_err:
+ bch2_trans_iter_exit(trans, &inode_iter);
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
return ret;
}
-noinline_for_stack
-static int bch2_gc_walk_inodes(struct bch_fs *c,
- struct bch_inode_unpacked *lostfound_inode,
- nlink_table *links,
- u64 range_start, u64 range_end)
+/*
+ * Check for loops in the directory structure: all other connectivity issues
+ * have been fixed by prior passes
+ */
+int bch2_check_directory_structure(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct nlink *link, zero_links = { 0, 0 };
- struct genradix_iter nlinks_iter;
- int ret = 0, ret2 = 0;
- u64 nlinks_pos;
+ pathbuf path = { 0, };
+ int ret;
+
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
+ BTREE_ITER_intent|
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ if (!bkey_is_inode(k.k))
+ continue;
- bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(range_start, 0), 0);
- nlinks_iter = genradix_iter_init(links, 0);
+ if (bch2_inode_flags(k) & BCH_INODE_unlinked)
+ continue;
- while ((k = bch2_btree_iter_peek(&iter)).k &&
- !btree_iter_err(k)) {
-peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links);
+ check_path(trans, &path, k);
+ })));
+ darray_exit(&path);
- if (!link && (!k.k || iter.pos.inode >= range_end))
- break;
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+struct nlink_table {
+ size_t nr;
+ size_t size;
+
+ struct nlink {
+ u64 inum;
+ u32 snapshot;
+ u32 count;
+ } *d;
+};
- nlinks_pos = range_start + nlinks_iter.pos;
- if (iter.pos.inode > nlinks_pos) {
- /* Should have been caught by dirents pass: */
- need_fsck_err_on(link && link->count, c,
- "missing inode %llu (nlink %u)",
- nlinks_pos, link->count);
- genradix_iter_advance(&nlinks_iter, links);
- goto peek_nlinks;
+static int add_nlink(struct bch_fs *c, struct nlink_table *t,
+ u64 inum, u32 snapshot)
+{
+ if (t->nr == t->size) {
+ size_t new_size = max_t(size_t, 128UL, t->size * 2);
+ void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
+
+ if (!d) {
+ bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
+ new_size);
+ return -BCH_ERR_ENOMEM_fsck_add_nlink;
}
- if (iter.pos.inode < nlinks_pos || !link)
- link = &zero_links;
+ if (t->d)
+ memcpy(d, t->d, t->size * sizeof(t->d[0]));
+ kvfree(t->d);
+
+ t->d = d;
+ t->size = new_size;
+ }
+
+
+ t->d[t->nr++] = (struct nlink) {
+ .inum = inum,
+ .snapshot = snapshot,
+ };
+
+ return 0;
+}
+
+static int nlink_cmp(const void *_l, const void *_r)
+{
+ const struct nlink *l = _l;
+ const struct nlink *r = _r;
+
+ return cmp_int(l->inum, r->inum);
+}
+
+static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
+ struct nlink_table *links,
+ u64 range_start, u64 range_end, u64 inum, u32 snapshot)
+{
+ struct nlink *link, key = {
+ .inum = inum, .snapshot = U32_MAX,
+ };
+
+ if (inum < range_start || inum >= range_end)
+ return;
+
+ link = __inline_bsearch(&key, links->d, links->nr,
+ sizeof(links->d[0]), nlink_cmp);
+ if (!link)
+ return;
+
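+ /*
+ * nlink_cmp() compares only inum, so the bsearch may land on any entry
+ * for this inode: rewind to the first one, then bump each entry this
+ * dirent is visible in:
+ */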
+ while (link > links->d && link[0].inum == link[-1].inum)
+ --link;
+
+ for (; link < links->d + links->nr && link->inum == inum; link++)
+ if (ref_visible(c, s, snapshot, link->snapshot)) {
+ link->count++;
+ if (link->snapshot >= snapshot)
+ break;
+ }
+}
+
+noinline_for_stack
+static int check_nlinks_find_hardlinks(struct bch_fs *c,
+ struct nlink_table *t,
+ u64 start, u64 *end)
+{
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_inodes,
+ POS(0, start),
+ BTREE_ITER_intent|
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k, ({
+ if (!bkey_is_inode(k.k))
+ continue;
+
+ /* Should never fail, checked by bch2_inode_validate: */
+ struct bch_inode_unpacked u;
+ BUG_ON(bch2_inode_unpack(k, &u));
+
+ /*
+ * Backpointer and directory structure checks are sufficient for
+ * directories, since they can't have hardlinks:
+ */
+ if (S_ISDIR(u.bi_mode))
+ continue;
- if (k.k && k.k->type == BCH_INODE_FS) {
/*
- * Avoid potential deadlocks with iter for
- * truncate/rm/etc.:
+ * Previous passes ensured that bi_nlink is nonzero if
+ * it had multiple hardlinks:
*/
- bch2_btree_iter_unlock(&iter);
+ if (!u.bi_nlink)
+ continue;
- ret = check_inode(c, lostfound_inode, &iter,
- bkey_s_c_to_inode(k), link);
- BUG_ON(ret == -EINTR);
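+ /*
+ * Add to the nlink table; if it can't be grown, remember where we
+ * stopped so the next pass in bch2_check_nlinks() resumes from here:
+ */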
+ ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
+ if (ret) {
+ *end = k.k->p.offset;
+ ret = 0;
+ break;
+ }
+ 0;
+ })));
+
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+noinline_for_stack
+static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
+ u64 range_start, u64 range_end)
+{
+ struct snapshots_seen s;
+
+ snapshots_seen_init(&s);
+
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
+ BTREE_ITER_intent|
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k, ({
+ ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
if (ret)
break;
- if (link->count)
- atomic_long_inc(&c->nr_inodes);
- } else {
- /* Should have been caught by dirents pass: */
- need_fsck_err_on(link->count, c,
- "missing inode %llu (nlink %u)",
- nlinks_pos, link->count);
- }
+ if (k.k->type == KEY_TYPE_dirent) {
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+
+ if (d.v->d_type != DT_DIR &&
+ d.v->d_type != DT_SUBVOL)
+ inc_link(c, &s, links, range_start, range_end,
+ le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
+ }
+ 0;
+ })));
+
+ snapshots_seen_exit(&s);
+
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct nlink_table *links,
+ size_t *idx, u64 range_end)
+{
+ struct bch_inode_unpacked u;
+ struct nlink *link = &links->d[*idx];
+ int ret = 0;
+
+ if (k.k->p.offset >= range_end)
+ return 1;
+
+ if (!bkey_is_inode(k.k))
+ return 0;
+
+ BUG_ON(bch2_inode_unpack(k, &u));
+
+ if (S_ISDIR(u.bi_mode))
+ return 0;
+
+ if (!u.bi_nlink)
+ return 0;
- if (nlinks_pos == iter.pos.inode)
- genradix_iter_advance(&nlinks_iter, links);
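+ /*
+ * The table was filled in btree order, so it's sorted by (inum,
+ * snapshot); advance to the entry for this inode:
+ */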
+ while ((cmp_int(link->inum, k.k->p.offset) ?:
+ cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
+ BUG_ON(*idx == links->nr);
+ link = &links->d[++*idx];
+ }
- bch2_btree_iter_next(&iter);
- bch2_btree_iter_cond_resched(&iter);
+ if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
+ trans, inode_wrong_nlink,
+ "inode %llu type %s has wrong i_nlink (%u, should be %u)",
+ u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
+ bch2_inode_nlink_get(&u), link->count)) {
+ bch2_inode_nlink_set(&u, link->count);
+ ret = __bch2_fsck_write_inode(trans, &u);
}
fsck_err:
- ret2 = bch2_btree_iter_unlock(&iter);
- if (ret2)
- bch_err(c, "error in fs gc: btree error %i while walking inodes", ret2);
-
- return ret ?: ret2;
+ return ret;
}
noinline_for_stack
-static int check_inode_nlinks(struct bch_fs *c,
- struct bch_inode_unpacked *lostfound_inode)
+static int check_nlinks_update_hardlinks(struct bch_fs *c,
+ struct nlink_table *links,
+ u64 range_start, u64 range_end)
{
- nlink_table links;
- u64 this_iter_range_start, next_iter_range_start = 0;
- int ret = 0;
+ size_t idx = 0;
+
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
+ POS(0, range_start),
+ BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
+ if (ret < 0) {
+ bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
+ return ret;
+ }
- bch_verbose(c, "checking inode nlinks");
+ return 0;
+}
- genradix_init(&links);
+int bch2_check_nlinks(struct bch_fs *c)
+{
+ struct nlink_table links = { 0 };
+ u64 this_iter_range_start, next_iter_range_start = 0;
+ int ret = 0;
do {
this_iter_range_start = next_iter_range_start;
next_iter_range_start = U64_MAX;
- ret = bch2_gc_walk_dirents(c, &links,
+ ret = check_nlinks_find_hardlinks(c, &links,
+ this_iter_range_start,
+ &next_iter_range_start);
+
+ ret = check_nlinks_walk_dirents(c, &links,
this_iter_range_start,
- &next_iter_range_start);
+ next_iter_range_start);
if (ret)
break;
- ret = bch2_gc_walk_inodes(c, lostfound_inode, &links,
+ ret = check_nlinks_update_hardlinks(c, &links,
this_iter_range_start,
next_iter_range_start);
if (ret)
break;
- genradix_free(&links);
+ links.nr = 0;
} while (next_iter_range_start != U64_MAX);
- genradix_free(&links);
-
+ kvfree(links.d);
+ bch_err_fn(c, ret);
return ret;
}
-noinline_for_stack
-static int check_inodes_fast(struct bch_fs *c)
+static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_inode inode;
- unsigned long nr_inodes = 0;
- int ret = 0;
+ struct bkey_s_c_reflink_p p;
+ struct bkey_i_reflink_p *u;
- for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, 0, k) {
- if (k.k->type != BCH_INODE_FS)
- continue;
+ if (k.k->type != KEY_TYPE_reflink_p)
+ return 0;
- inode = bkey_s_c_to_inode(k);
+ p = bkey_s_c_to_reflink_p(k);
- if (!(inode.v->bi_flags & BCH_INODE_UNLINKED))
- nr_inodes++;
+ if (!p.v->front_pad && !p.v->back_pad)
+ return 0;
- if (inode.v->bi_flags &
- (BCH_INODE_I_SIZE_DIRTY|
- BCH_INODE_I_SECTORS_DIRTY|
- BCH_INODE_UNLINKED)) {
- fsck_err_on(c->sb.clean, c,
- "filesystem marked clean but found inode %llu with flags %x",
- inode.k->p.inode, inode.v->bi_flags);
- ret = check_inode(c, NULL, &iter, inode, NULL);
- BUG_ON(ret == -EINTR);
- if (ret)
- break;
- }
- }
- atomic_long_set(&c->nr_inodes, nr_inodes);
-fsck_err:
- return bch2_btree_iter_unlock(&iter) ?: ret;
+ u = bch2_trans_kmalloc(trans, sizeof(*u));
+ int ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(&u->k_i, k);
+ u->v.front_pad = 0;
+ u->v.back_pad = 0;
+
+ return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
}
-/*
- * Checks for inconsistencies that shouldn't happen, unless we have a bug.
- * Doesn't fix them yet, mainly because they haven't yet been observed:
- */
-static int bch2_fsck_full(struct bch_fs *c)
+int bch2_fix_reflink_p(struct bch_fs *c)
{
- struct bch_inode_unpacked root_inode, lostfound_inode;
- int ret;
+ if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
+ return 0;
+
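+ /*
+ * Walk all extents, clearing front_pad/back_pad on any reflink pointers
+ * that still have them set:
+ */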
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_extents, POS_MIN,
+ BTREE_ITER_intent|BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ fix_reflink_p_key(trans, &iter, k)));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+#ifndef NO_BCACHEFS_CHARDEV
+
+struct fsck_thread {
+ struct thread_with_stdio thr;
+ struct bch_fs *c;
+ struct bch_opts opts;
+};
+
+static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
+{
+ struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
+ kfree(thr);
+}
- bch_verbose(c, "starting fsck:");
- ret = check_extents(c) ?:
- check_dirents(c) ?:
- check_xattrs(c) ?:
- check_root(c, &root_inode) ?:
- check_lostfound(c, &root_inode, &lostfound_inode) ?:
- check_directory_structure(c, &lostfound_inode) ?:
- check_inode_nlinks(c, &lostfound_inode);
+static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
+{
+ struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
+ struct bch_fs *c = thr->c;
- bch2_flush_fsck_errs(c);
- bch_verbose(c, "fsck done");
+ int ret = PTR_ERR_OR_ZERO(c);
+ if (ret)
+ return ret;
+ ret = bch2_fs_start(thr->c);
+ if (ret)
+ goto err;
+
+ if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
+ bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
+ ret |= 1;
+ }
+ if (test_bit(BCH_FS_error, &c->flags)) {
+ bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
+ ret |= 4;
+ }
+err:
+ bch2_fs_stop(c);
return ret;
}
-static int bch2_fsck_inode_nlink(struct bch_fs *c)
+static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
+ .exit = bch2_fsck_thread_exit,
+ .fn = bch2_fsck_offline_thread_fn,
+};
+
+long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
{
- struct bch_inode_unpacked root_inode, lostfound_inode;
- int ret;
+ struct bch_ioctl_fsck_offline arg;
+ struct fsck_thread *thr = NULL;
+ darray_str(devs) = {};
+ long ret = 0;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ if (arg.flags)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
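+ /* Copy the array of device path pointers from userspace, then duplicate each path string: */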
+ for (size_t i = 0; i < arg.nr_devs; i++) {
+ u64 dev_u64;
+ ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
+ if (ret)
+ goto err;
+
+ char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
+ ret = PTR_ERR_OR_ZERO(dev_str);
+ if (ret)
+ goto err;
+
+ ret = darray_push(&devs, dev_str);
+ if (ret) {
+ kfree(dev_str);
+ goto err;
+ }
+ }
+
+ thr = kzalloc(sizeof(*thr), GFP_KERNEL);
+ if (!thr) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ thr->opts = bch2_opts_empty();
+
+ if (arg.opts) {
+ char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
+ ret = PTR_ERR_OR_ZERO(optstr) ?:
+ bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr);
+ if (!IS_ERR(optstr))
+ kfree(optstr);
- bch_verbose(c, "checking inode link counts:");
- ret = check_root(c, &root_inode) ?:
- check_lostfound(c, &root_inode, &lostfound_inode) ?:
- check_inode_nlinks(c, &lostfound_inode);
+ if (ret)
+ goto err;
+ }
+
+ opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
+ opt_set(thr->opts, read_only, 1);
+ opt_set(thr->opts, ratelimit_errors, 0);
+
+ /* We need request_key() to be called before we punt to kthread: */
+ opt_set(thr->opts, nostart, true);
- bch2_flush_fsck_errs(c);
- bch_verbose(c, "done");
+ bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);
+ thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);
+
+ if (!IS_ERR(thr->c) &&
+ thr->c->opts.errors == BCH_ON_ERROR_panic)
+ thr->c->opts.errors = BCH_ON_ERROR_ro;
+
+ ret = __bch2_run_thread_with_stdio(&thr->thr);
+out:
+ darray_for_each(devs, i)
+ kfree(*i);
+ darray_exit(&devs);
return ret;
+err:
+ if (thr)
+ bch2_fsck_thread_exit(&thr->thr);
+ pr_err("ret %s", bch2_err_str(ret));
+ goto out;
}
-static int bch2_fsck_walk_inodes_only(struct bch_fs *c)
+static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
{
- int ret;
+ struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
+ struct bch_fs *c = thr->c;
+
+ c->stdio_filter = current;
+ c->stdio = &thr->thr.stdio;
- bch_verbose(c, "walking inodes:");
- ret = check_inodes_fast(c);
+ /*
+ * XXX: can we figure out a way to do this without mucking with c->opts?
+ */
+ unsigned old_fix_errors = c->opts.fix_errors;
+ if (opt_defined(thr->opts, fix_errors))
+ c->opts.fix_errors = thr->opts.fix_errors;
+ else
+ c->opts.fix_errors = FSCK_FIX_ask;
- bch2_flush_fsck_errs(c);
- bch_verbose(c, "done");
+ c->opts.fsck = true;
+ set_bit(BCH_FS_fsck_running, &c->flags);
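+ /* Run the online fsck/recovery passes, starting from check_alloc_info: */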
+ c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
+ int ret = bch2_run_online_recovery_passes(c);
+
+ clear_bit(BCH_FS_fsck_running, &c->flags);
+ bch_err_fn(c, ret);
+
+ c->stdio = NULL;
+ c->stdio_filter = NULL;
+ c->opts.fix_errors = old_fix_errors;
+
+ up(&c->online_fsck_mutex);
+ bch2_ro_ref_put(c);
return ret;
}
-int bch2_fsck(struct bch_fs *c)
+static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
+ .exit = bch2_fsck_thread_exit,
+ .fn = bch2_fsck_online_thread_fn,
+};
+
+long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
{
- if (!c->opts.nofsck)
- return bch2_fsck_full(c);
+ struct fsck_thread *thr = NULL;
+ long ret = 0;
+
+ if (arg.flags)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!bch2_ro_ref_tryget(c))
+ return -EROFS;
+
+ if (down_trylock(&c->online_fsck_mutex)) {
+ bch2_ro_ref_put(c);
+ return -EAGAIN;
+ }
+
+ thr = kzalloc(sizeof(*thr), GFP_KERNEL);
+ if (!thr) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ thr->c = c;
+ thr->opts = bch2_opts_empty();
- if (!c->sb.clean &&
- !(c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK)))
- return bch2_fsck_inode_nlink(c);
+ if (arg.opts) {
+ char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
- return bch2_fsck_walk_inodes_only(c);
+ ret = PTR_ERR_OR_ZERO(optstr) ?:
+ bch2_parse_mount_opts(c, &thr->opts, NULL, optstr);
+ if (!IS_ERR(optstr))
+ kfree(optstr);
+
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
+err:
+ if (ret < 0) {
+ bch_err_fn(c, ret);
+ if (thr)
+ bch2_fsck_thread_exit(&thr->thr);
+ up(&c->online_fsck_mutex);
+ bch2_ro_ref_put(c);
+ }
+ return ret;
}
+
+#endif /* NO_BCACHEFS_CHARDEV */
diff --git a/libbcachefs/fsck.h b/libbcachefs/fsck.h
index bc9caaf2..4481b40a 100644
--- a/libbcachefs/fsck.h
+++ b/libbcachefs/fsck.h
@@ -1,7 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FSCK_H
#define _BCACHEFS_FSCK_H
-s64 bch2_count_inode_sectors(struct bch_fs *, u64);
-int bch2_fsck(struct bch_fs *);
+int bch2_check_inodes(struct bch_fs *);
+int bch2_check_extents(struct bch_fs *);
+int bch2_check_indirect_extents(struct bch_fs *);
+int bch2_check_dirents(struct bch_fs *);
+int bch2_check_xattrs(struct bch_fs *);
+int bch2_check_root(struct bch_fs *);
+int bch2_check_subvolume_structure(struct bch_fs *);
+int bch2_check_unreachable_inodes(struct bch_fs *);
+int bch2_check_directory_structure(struct bch_fs *);
+int bch2_check_nlinks(struct bch_fs *);
+int bch2_fix_reflink_p(struct bch_fs *);
+
+long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *);
+long bch2_ioctl_fsck_online(struct bch_fs *, struct bch_ioctl_fsck_online);
#endif /* _BCACHEFS_FSCK_H */
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index d4139faa..8818e418 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -1,53 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "btree_key_cache.h"
+#include "btree_write_buffer.h"
#include "bkey_methods.h"
#include "btree_update.h"
+#include "buckets.h"
+#include "compress.h"
+#include "dirent.h"
+#include "disk_accounting.h"
#include "error.h"
#include "extents.h"
+#include "extent_update.h"
+#include "fs.h"
#include "inode.h"
-#include "io.h"
-#include "keylist.h"
+#include "opts.h"
+#include "str_hash.h"
+#include "snapshot.h"
+#include "subvolume.h"
+#include "varint.h"
#include <linux/random.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
-#define FIELD_BYTES() \
-
-static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
-static const u8 bits_table[8] = {
- 1 * 8 - 1,
- 2 * 8 - 2,
- 3 * 8 - 3,
- 4 * 8 - 4,
- 6 * 8 - 5,
- 8 * 8 - 6,
- 10 * 8 - 7,
- 13 * 8 - 8,
+#define x(name, ...) #name,
+const char * const bch2_inode_opts[] = {
+ BCH_INODE_OPTS()
+ NULL,
};
-static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo)
-{
- __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), };
- unsigned shift, bytes, bits = likely(!hi)
- ? fls64(lo)
- : fls64(hi) + 64;
-
- for (shift = 1; shift <= 8; shift++)
- if (bits < bits_table[shift - 1])
- goto got_shift;
-
- BUG();
-got_shift:
- bytes = byte_table[shift - 1];
-
- BUG_ON(out + bytes > end);
+static const char * const bch2_inode_flag_strs[] = {
+ BCH_INODE_FLAGS()
+ NULL
+};
+#undef x
- memcpy(out, (u8 *) in + 16 - bytes, bytes);
- *out |= (1 << 8) >> shift;
+static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos);
- return bytes;
-}
+static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
static int inode_decode_field(const u8 *in, const u8 *end,
u64 out[2], unsigned *out_bits)
@@ -83,77 +74,101 @@ static int inode_decode_field(const u8 *in, const u8 *end,
return bytes;
}
-void bch2_inode_pack(struct bkey_inode_buf *packed,
- const struct bch_inode_unpacked *inode)
+static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed,
+ const struct bch_inode_unpacked *inode)
{
- u8 *out = packed->inode.v.fields;
+ struct bkey_i_inode_v3 *k = &packed->inode;
+ u8 *out = k->v.fields;
u8 *end = (void *) &packed[1];
u8 *last_nonzero_field = out;
unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
+ unsigned bytes;
+ int ret;
- bkey_inode_init(&packed->inode.k_i);
- packed->inode.k.p.inode = inode->bi_inum;
+ bkey_inode_v3_init(&packed->inode.k_i);
+ packed->inode.k.p.offset = inode->bi_inum;
+ packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq);
packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
- packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags);
- packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode);
+ packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags);
+ packed->inode.v.bi_sectors = cpu_to_le64(inode->bi_sectors);
+ packed->inode.v.bi_size = cpu_to_le64(inode->bi_size);
+ packed->inode.v.bi_version = cpu_to_le64(inode->bi_version);
+ SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode);
+ SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR);
-#define BCH_INODE_FIELD(_name, _bits) \
- out += inode_encode_field(out, end, 0, inode->_name); \
+
+#define x(_name, _bits) \
nr_fields++; \
\
if (inode->_name) { \
+ ret = bch2_varint_encode_fast(out, inode->_name); \
+ out += ret; \
+ \
+ if (_bits > 64) \
+ *out++ = 0; \
+ \
last_nonzero_field = out; \
last_nonzero_fieldnr = nr_fields; \
+ } else { \
+ *out++ = 0; \
+ \
+ if (_bits > 64) \
+ *out++ = 0; \
}
- BCH_INODE_FIELDS()
-#undef BCH_INODE_FIELD
+ BCH_INODE_FIELDS_v3()
+#undef x
+ BUG_ON(out > end);
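+ /* Trailing zero fields aren't stored; truncate to the last nonzero field: */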
out = last_nonzero_field;
nr_fields = last_nonzero_fieldnr;
- set_bkey_val_bytes(&packed->inode.k, out - (u8 *) &packed->inode.v);
- memset(out, 0,
- (u8 *) &packed->inode.v +
- bkey_val_bytes(&packed->inode.k) - out);
+ bytes = out - (u8 *) &packed->inode.v;
+ set_bkey_val_bytes(&packed->inode.k, bytes);
+ memset_u64s_tail(&packed->inode.v, 0, bytes);
- SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields);
+ SET_INODEv3_NR_FIELDS(&k->v, nr_fields);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
struct bch_inode_unpacked unpacked;
- int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode),
- &unpacked);
+ ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked);
BUG_ON(ret);
BUG_ON(unpacked.bi_inum != inode->bi_inum);
BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed);
+ BUG_ON(unpacked.bi_sectors != inode->bi_sectors);
+ BUG_ON(unpacked.bi_size != inode->bi_size);
+ BUG_ON(unpacked.bi_version != inode->bi_version);
BUG_ON(unpacked.bi_mode != inode->bi_mode);
-#define BCH_INODE_FIELD(_name, _bits) BUG_ON(unpacked._name != inode->_name);
- BCH_INODE_FIELDS()
-#undef BCH_INODE_FIELD
+#define x(_name, _bits) if (unpacked._name != inode->_name) \
+ panic("unpacked %llu should be %llu", \
+ (u64) unpacked._name, (u64) inode->_name);
+ BCH_INODE_FIELDS_v3()
+#undef x
}
}
-int bch2_inode_unpack(struct bkey_s_c_inode inode,
- struct bch_inode_unpacked *unpacked)
+void bch2_inode_pack(struct bkey_inode_buf *packed,
+ const struct bch_inode_unpacked *inode)
+{
+ bch2_inode_pack_inlined(packed, inode);
+}
+
+static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
+ struct bch_inode_unpacked *unpacked)
{
const u8 *in = inode.v->fields;
- const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k);
+ const u8 *end = bkey_val_end(inode);
u64 field[2];
unsigned fieldnr = 0, field_bits;
int ret;
- unpacked->bi_inum = inode.k->p.inode;
- unpacked->bi_hash_seed = inode.v->bi_hash_seed;
- unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
- unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
-
-#define BCH_INODE_FIELD(_name, _bits) \
- if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \
- memset(&unpacked->_name, 0, \
- sizeof(*unpacked) - \
- offsetof(struct bch_inode_unpacked, _name)); \
+#define x(_name, _bits) \
+ if (fieldnr++ == INODEv1_NR_FIELDS(inode.v)) { \
+ unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
+ memset((void *) unpacked + offset, 0, \
+ sizeof(*unpacked) - offset); \
return 0; \
} \
\
@@ -167,103 +182,639 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode,
unpacked->_name = field[1]; \
in += ret;
- BCH_INODE_FIELDS()
-#undef BCH_INODE_FIELD
+ BCH_INODE_FIELDS_v2()
+#undef x
/* XXX: signal if there were more fields than expected? */
+ return 0;
+}
+static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
+ const u8 *in, const u8 *end,
+ unsigned nr_fields)
+{
+ unsigned fieldnr = 0;
+ int ret;
+ u64 v[2];
+
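+ /*
+ * Each field is encoded as a varint (two varints for fields wider than
+ * 64 bits); fields past nr_fields decode as zero:
+ */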
+#define x(_name, _bits) \
+ if (fieldnr < nr_fields) { \
+ ret = bch2_varint_decode_fast(in, end, &v[0]); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ \
+ if (_bits > 64) { \
+ ret = bch2_varint_decode_fast(in, end, &v[1]); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ } else { \
+ v[1] = 0; \
+ } \
+ } else { \
+ v[0] = v[1] = 0; \
+ } \
+ \
+ unpacked->_name = v[0]; \
+ if (v[1] || v[0] != unpacked->_name) \
+ return -1; \
+ fieldnr++;
+
+ BCH_INODE_FIELDS_v2()
+#undef x
+
+ /* XXX: signal if there were more fields than expected? */
return 0;
}
-const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
+static int bch2_inode_unpack_v3(struct bkey_s_c k,
+ struct bch_inode_unpacked *unpacked)
{
- if (k.k->p.offset)
- return "nonzero offset";
+ struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
+ const u8 *in = inode.v->fields;
+ const u8 *end = bkey_val_end(inode);
+ unsigned nr_fields = INODEv3_NR_FIELDS(inode.v);
+ unsigned fieldnr = 0;
+ int ret;
+ u64 v[2];
+
+ unpacked->bi_inum = inode.k->p.offset;
+ unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
+ unpacked->bi_hash_seed = inode.v->bi_hash_seed;
+ unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
+ unpacked->bi_sectors = le64_to_cpu(inode.v->bi_sectors);
+ unpacked->bi_size = le64_to_cpu(inode.v->bi_size);
+ unpacked->bi_version = le64_to_cpu(inode.v->bi_version);
+ unpacked->bi_mode = INODEv3_MODE(inode.v);
+
+#define x(_name, _bits) \
+ if (fieldnr < nr_fields) { \
+ ret = bch2_varint_decode_fast(in, end, &v[0]); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ \
+ if (_bits > 64) { \
+ ret = bch2_varint_decode_fast(in, end, &v[1]); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ } else { \
+ v[1] = 0; \
+ } \
+ } else { \
+ v[0] = v[1] = 0; \
+ } \
+ \
+ unpacked->_name = v[0]; \
+ if (v[1] || v[0] != unpacked->_name) \
+ return -1; \
+ fieldnr++;
+
+ BCH_INODE_FIELDS_v3()
+#undef x
+
+ /* XXX: signal if there were more fields than expected? */
+ return 0;
+}
+
+static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
+ struct bch_inode_unpacked *unpacked)
+{
+ memset(unpacked, 0, sizeof(*unpacked));
+
+ unpacked->bi_snapshot = k.k->p.snapshot;
switch (k.k->type) {
- case BCH_INODE_FS: {
+ case KEY_TYPE_inode: {
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
- struct bch_inode_unpacked unpacked;
- if (bkey_val_bytes(k.k) < sizeof(struct bch_inode))
- return "incorrect value size";
+ unpacked->bi_inum = inode.k->p.offset;
+ unpacked->bi_journal_seq= 0;
+ unpacked->bi_hash_seed = inode.v->bi_hash_seed;
+ unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
+ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
+
+ if (INODEv1_NEW_VARINT(inode.v)) {
+ return bch2_inode_unpack_v2(unpacked, inode.v->fields,
+ bkey_val_end(inode),
+ INODEv1_NR_FIELDS(inode.v));
+ } else {
+ return bch2_inode_unpack_v1(inode, unpacked);
+ }
+ break;
+ }
+ case KEY_TYPE_inode_v2: {
+ struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
+
+ unpacked->bi_inum = inode.k->p.offset;
+ unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
+ unpacked->bi_hash_seed = inode.v->bi_hash_seed;
+ unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
+ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
+
+ return bch2_inode_unpack_v2(unpacked, inode.v->fields,
+ bkey_val_end(inode),
+ INODEv2_NR_FIELDS(inode.v));
+ }
+ default:
+ BUG();
+ }
+}
+
+int bch2_inode_unpack(struct bkey_s_c k,
+ struct bch_inode_unpacked *unpacked)
+{
+ unpacked->bi_snapshot = k.k->p.snapshot;
+
+ return likely(k.k->type == KEY_TYPE_inode_v3)
+ ? bch2_inode_unpack_v3(k, unpacked)
+ : bch2_inode_unpack_slowpath(k, unpacked);
+}
- if (k.k->p.inode < BLOCKDEV_INODE_MAX)
- return "fs inode in blockdev range";
+int __bch2_inode_peek(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode,
+ subvol_inum inum, unsigned flags,
+ bool warn)
+{
+ u32 snapshot;
+ int ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn);
+ if (ret)
+ return ret;
+
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes,
+ SPOS(0, inum.inum, snapshot),
+ flags|BTREE_ITER_cached);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
- if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
- return "invalid str hash type";
+ ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
+ if (ret)
+ goto err;
- if (bch2_inode_unpack(inode, &unpacked))
- return "invalid variable length fields";
+ ret = bch2_inode_unpack(k, inode);
+ if (ret)
+ goto err;
- if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
- return "invalid data checksum type";
+ return 0;
+err:
+ if (warn)
+ bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum);
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
+}
+
+int bch2_inode_write_flags(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct bkey_inode_buf *inode_p;
- if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
- return "invalid data checksum type";
+ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
+ if (IS_ERR(inode_p))
+ return PTR_ERR(inode_p);
+
+ bch2_inode_pack_inlined(inode_p, inode);
+ inode_p->inode.k.p.snapshot = iter->snapshot;
+ return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
+}
+
+int __bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
+{
+ struct bkey_inode_buf *inode_p =
+ bch2_trans_kmalloc(trans, sizeof(*inode_p));
+
+ if (IS_ERR(inode_p))
+ return PTR_ERR(inode_p);
+
+ bch2_inode_pack(inode_p, inode);
+ inode_p->inode.k.p.snapshot = inode->bi_snapshot;
+
+ return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
+ &inode_p->inode.k_i,
+ BTREE_UPDATE_internal_snapshot_node);
+}
+
+int bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
+{
+ int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ __bch2_fsck_write_inode(trans, inode));
+ bch_err_fn(trans->c, ret);
+ return ret;
+}
+
+struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
+{
+ struct bch_inode_unpacked u;
+ struct bkey_inode_buf *inode_p;
+ int ret;
+
+ if (!bkey_is_inode(&k->k))
+ return ERR_PTR(-ENOENT);
+
+ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
+ if (IS_ERR(inode_p))
+ return ERR_CAST(inode_p);
- if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
- unpacked.bi_nlink != 0)
- return "flagged as unlinked but bi_nlink != 0";
+ ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u);
+ if (ret)
+ return ERR_PTR(ret);
- return NULL;
+ bch2_inode_pack(inode_p, &u);
+ return &inode_p->inode.k_i;
+}
+
+static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ struct bch_inode_unpacked unpacked;
+ int ret = 0;
+
+ bkey_fsck_err_on(k.k->p.inode,
+ c, inode_pos_inode_nonzero,
+ "nonzero k.p.inode");
+
+ bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX,
+ c, inode_pos_blockdev_range,
+ "fs inode in blockdev range");
+
+ bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked),
+ c, inode_unpack_error,
+ "invalid variable length fields");
+
+ bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1,
+ c, inode_checksum_type_invalid,
+ "invalid data checksum type (%u >= %u",
+ unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
+
+ bkey_fsck_err_on(unpacked.bi_compression &&
+ !bch2_compression_opt_valid(unpacked.bi_compression - 1),
+ c, inode_compression_type_invalid,
+ "invalid compression opt %u", unpacked.bi_compression - 1);
+
+ bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) &&
+ unpacked.bi_nlink != 0,
+ c, inode_unlinked_but_nlink_nonzero,
+ "flagged as unlinked but bi_nlink != 0");
+
+ bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode),
+ c, inode_subvol_root_but_not_dir,
+ "subvolume root but not a directory");
+fsck_err:
+ return ret;
+}
+
+int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+ int ret = 0;
+
+ bkey_fsck_err_on(INODEv1_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
+ c, inode_str_hash_invalid,
+ "invalid str hash type (%llu >= %u)",
+ INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR);
+
+ ret = __bch2_inode_validate(c, k, from);
+fsck_err:
+ return ret;
+}
+
+int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
+ int ret = 0;
+
+ bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
+ c, inode_str_hash_invalid,
+ "invalid str hash type (%llu >= %u)",
+ INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
+
+ ret = __bch2_inode_validate(c, k, from);
+fsck_err:
+ return ret;
+}
+
+int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
+ int ret = 0;
+
+ bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
+ INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k),
+ c, inode_v3_fields_start_bad,
+ "invalid fields_start (got %llu, min %u max %zu)",
+ INODEv3_FIELDS_START(inode.v),
+ INODEv3_FIELDS_START_INITIAL,
+ bkey_val_u64s(inode.k));
+
+ bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
+ c, inode_str_hash_invalid,
+ "invalid str hash type (%llu >= %u)",
+ INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
+
+ ret = __bch2_inode_validate(c, k, from);
+fsck_err:
+ return ret;
+}
+
+static void __bch2_inode_unpacked_to_text(struct printbuf *out,
+ struct bch_inode_unpacked *inode)
+{
+ prt_printf(out, "\n");
+ printbuf_indent_add(out, 2);
+ prt_printf(out, "mode=%o\n", inode->bi_mode);
+
+ prt_str(out, "flags=");
+ prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
+ prt_printf(out, "(%x)\n", inode->bi_flags);
+
+ prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq);
+ prt_printf(out, "hash_seed=%llx\n", inode->bi_hash_seed);
+ prt_printf(out, "hash_type=");
+ bch2_prt_str_hash_type(out, INODE_STR_HASH(inode));
+ prt_newline(out);
+ prt_printf(out, "bi_size=%llu\n", inode->bi_size);
+ prt_printf(out, "bi_sectors=%llu\n", inode->bi_sectors);
+ prt_printf(out, "bi_version=%llu\n", inode->bi_version);
+
+#define x(_name, _bits) \
+ prt_printf(out, #_name "=%llu\n", (u64) inode->_name);
+ BCH_INODE_FIELDS_v3()
+#undef x
+
+ bch2_printbuf_strip_trailing_newline(out);
+ printbuf_indent_sub(out, 2);
+}
+
+void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
+{
+ prt_printf(out, "inum: %llu:%u ", inode->bi_inum, inode->bi_snapshot);
+ __bch2_inode_unpacked_to_text(out, inode);
+}
+
+void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bch_inode_unpacked inode;
+
+ if (bch2_inode_unpack(k, &inode)) {
+ prt_printf(out, "(unpack error)");
+ return;
}
- case BCH_INODE_BLOCKDEV:
- if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_blockdev))
- return "incorrect value size";
- if (k.k->p.inode >= BLOCKDEV_INODE_MAX)
- return "blockdev inode in fs range";
+ __bch2_inode_unpacked_to_text(out, &inode);
+}
- return NULL;
- case BCH_INODE_GENERATION:
- if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation))
- return "incorrect value size";
+static inline u64 bkey_inode_flags(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_inode:
+ return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
+ case KEY_TYPE_inode_v2:
+ return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
+ case KEY_TYPE_inode_v3:
+ return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
+ default:
+ return 0;
+ }
+}
- return NULL;
+static inline void bkey_inode_flags_set(struct bkey_s k, u64 f)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_inode:
+ bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f);
+ return;
+ case KEY_TYPE_inode_v2:
+ bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f);
+ return;
+ case KEY_TYPE_inode_v3:
+ bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f);
+ return;
default:
- return "invalid type";
+ BUG();
}
}
-void bch2_inode_to_text(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c k)
+static inline bool bkey_is_unlinked_inode(struct bkey_s_c k)
{
- char *out = buf, *end = out + size;
- struct bkey_s_c_inode inode;
- struct bch_inode_unpacked unpacked;
+ unsigned f = bkey_inode_flags(k) & BCH_INODE_unlinked;
- switch (k.k->type) {
- case BCH_INODE_FS:
- inode = bkey_s_c_to_inode(k);
- if (bch2_inode_unpack(inode, &unpacked)) {
- out += scnprintf(out, end - out, "(unpack error)");
+ return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot);
+}
+
+static struct bkey_s_c
+bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
+ enum btree_id btree, struct bpos pos,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key_max_norestart(trans, *iter, btree,
+ bpos_successor(pos),
+ SPOS(pos.inode, pos.offset, U32_MAX),
+ flags|BTREE_ITER_all_snapshots, k, ret)
+ if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot))
+ return k;
+
+ bch2_trans_iter_exit(trans, iter);
+ return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
+}
+
+static struct bkey_s_c
+bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos pos, unsigned flags)
+{
+ struct bkey_s_c k;
+again:
+ k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags);
+ if (!k.k ||
+ bkey_err(k) ||
+ bkey_is_inode(k.k))
+ return k;
+
+ bch2_trans_iter_exit(trans, iter);
+ pos = k.k->p;
+ goto again;
+}
+
+int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key_max_norestart(trans, iter,
+ BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos),
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_with_updates, k, ret)
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) &&
+ bkey_is_inode(k.k)) {
+ ret = 1;
break;
}
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
-#define BCH_INODE_FIELD(_name, _bits) \
- out += scnprintf(out, end - out, #_name ": %llu ", (u64) unpacked._name);
- BCH_INODE_FIELDS()
-#undef BCH_INODE_FIELD
- break;
+static int update_inode_has_children(struct btree_trans *trans,
+ struct bkey_s k,
+ bool have_child)
+{
+ if (!have_child) {
+ int ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+ if (ret)
+ return ret < 0 ? ret : 0;
}
+
+ u64 f = bkey_inode_flags(k.s_c);
+ if (have_child != !!(f & BCH_INODE_has_child_snapshot))
+ bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot);
+
+ return 0;
}
-void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
- uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
- struct bch_inode_unpacked *parent)
+static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos,
+ bool have_child)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans,
+ &iter, pos, BTREE_ITER_with_updates);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+ if (!k.k)
+ return 0;
+
+ if (!have_child) {
+ ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+ if (ret) {
+ ret = ret < 0 ? ret : 0;
+ goto err;
+ }
+ }
+
+ u64 f = bkey_inode_flags(k);
+ if (have_child != !!(f & BCH_INODE_has_child_snapshot)) {
+ struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k,
+ BTREE_UPDATE_internal_snapshot_node);
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
+
+ bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_trigger_inode(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_s new,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct bch_fs *c = trans->c;
+
+ if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
+ BUG_ON(!trans->journal_res.seq);
+ bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
+ }
+
+ s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
+ if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) {
+ struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes };
+ int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc);
+ if (ret)
+ return ret;
+ }
+
+ if (flags & BTREE_TRIGGER_transactional) {
+ int unlinked_delta = (int) bkey_is_unlinked_inode(new.s_c) -
+ (int) bkey_is_unlinked_inode(old);
+ if (unlinked_delta) {
+ int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
+ new.k->p, unlinked_delta > 0);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * If we're creating or deleting an inode at this snapshot ID,
+ * and there might be an inode in a parent snapshot ID, we might
+ * need to set or clear the has_child_snapshot flag on the
+ * parent.
+ */
+ int deleted_delta = (int) bkey_is_inode(new.k) -
+ (int) bkey_is_inode(old.k);
+ if (deleted_delta &&
+ bch2_snapshot_parent(c, new.k->p.snapshot)) {
+ int ret = update_parent_inode_has_children(trans, new.k->p,
+ deleted_delta > 0);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * When an inode is first updated in a new snapshot, we may need
+ * to clear has_child_snapshot
+ */
+ if (deleted_delta > 0) {
+ int ret = update_inode_has_children(trans, new, false);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
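A condensed summary of what the transactional part of bch2_trigger_inode() above does, keyed on the two deltas it computes; an illustrative note, not additional patch code:

	/*
	 * unlinked_delta > 0: inode became unlinked -> set its bit in
	 *                     BTREE_ID_deleted_inodes
	 * unlinked_delta < 0: inode no longer unlinked -> clear that bit
	 *
	 * deleted_delta > 0:  inode created at this snapshot -> set
	 *                     has_child_snapshot on the parent-snapshot inode,
	 *                     if one exists, and clear it on the new key when
	 *                     it has no child-snapshot copies of its own
	 * deleted_delta < 0:  inode deleted at this snapshot -> clear
	 *                     has_child_snapshot on the parent-snapshot inode,
	 *                     but only once no other child copies remain
	 */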
+
+int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(k.k->p.inode,
+ c, inode_pos_inode_nonzero,
+ "nonzero k.p.inode");
+fsck_err:
+ return ret;
+}
+
+void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k);
+
+ prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
+}
+
+void bch2_inode_init_early(struct bch_fs *c,
+ struct bch_inode_unpacked *inode_u)
{
- s64 now = timespec_to_bch2_time(c,
- timespec_trunc(current_kernel_time(),
- c->sb.time_precision));
+ enum bch_str_hash_type str_hash =
+ bch2_str_hash_opt_to_type(c, c->opts.str_hash);
memset(inode_u, 0, sizeof(*inode_u));
- /* ick */
- inode_u->bi_flags |= c->opts.str_hash << INODE_STR_HASH_OFFSET;
+ SET_INODE_STR_HASH(inode_u, str_hash);
get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed));
+}
+void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+ struct bch_inode_unpacked *parent)
+{
inode_u->bi_mode = mode;
inode_u->bi_uid = uid;
inode_u->bi_gid = gid;
@@ -273,129 +824,187 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
inode_u->bi_ctime = now;
inode_u->bi_otime = now;
+ if (parent && parent->bi_mode & S_ISGID) {
+ inode_u->bi_gid = parent->bi_gid;
+ if (S_ISDIR(mode))
+ inode_u->bi_mode |= S_ISGID;
+ }
+
if (parent) {
-#define BCH_INODE_FIELD(_name) inode_u->_name = parent->_name;
- BCH_INODE_FIELDS_INHERIT()
-#undef BCH_INODE_FIELD
+#define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name;
+ BCH_INODE_OPTS()
+#undef x
}
}
+void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+ struct bch_inode_unpacked *parent)
+{
+ bch2_inode_init_early(c, inode_u);
+ bch2_inode_init_late(inode_u, bch2_current_time(c),
+ uid, gid, mode, rdev, parent);
+}
+
static inline u32 bkey_generation(struct bkey_s_c k)
{
switch (k.k->type) {
- case BCH_INODE_BLOCKDEV:
- case BCH_INODE_FS:
+ case KEY_TYPE_inode:
+ case KEY_TYPE_inode_v2:
BUG();
- case BCH_INODE_GENERATION:
+ case KEY_TYPE_inode_generation:
return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
default:
return 0;
}
}
-int __bch2_inode_create(struct btree_trans *trans,
- struct bch_inode_unpacked *inode_u,
- u64 min, u64 max, u64 *hint)
+/*
+ * This just finds an empty slot:
+ */
+int bch2_inode_create(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode_u,
+ u32 snapshot, u64 cpu)
{
struct bch_fs *c = trans->c;
- struct bkey_inode_buf *inode_p;
- struct btree_iter *iter;
- u64 start;
- int ret;
-
- if (!max)
- max = ULLONG_MAX;
-
- if (c->opts.inodes_32bit)
- max = min_t(u64, max, U32_MAX);
+ struct bkey_s_c k;
+ u64 min, max, start, pos, *hint;
+ int ret = 0;
+ unsigned bits = (c->opts.inodes_32bit ? 31 : 63);
+
+ if (c->opts.shard_inode_numbers) {
+ bits -= c->inode_shard_bits;
+
+ min = (cpu << bits);
+ max = (cpu << bits) | ~(ULLONG_MAX << bits);
+
+ min = max_t(u64, min, BLOCKDEV_INODE_MAX);
+ hint = c->unused_inode_hints + cpu;
+ } else {
+ min = BLOCKDEV_INODE_MAX;
+ max = ~(ULLONG_MAX << bits);
+ hint = c->unused_inode_hints;
+ }
start = READ_ONCE(*hint);
if (start >= max || start < min)
start = min;
- inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
- if (IS_ERR(inode_p))
- return PTR_ERR(inode_p);
-
- iter = bch2_trans_get_iter(trans,
- BTREE_ID_INODES, POS(start, 0),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
+ pos = start;
+ bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_intent);
again:
- while (1) {
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
-
- ret = btree_iter_err(k);
- if (ret)
- return ret;
-
- switch (k.k->type) {
- case BCH_INODE_BLOCKDEV:
- case BCH_INODE_FS:
- /* slot used */
- if (iter->pos.inode >= max)
- goto out;
+ while ((k = bch2_btree_iter_peek(iter)).k &&
+ !(ret = bkey_err(k)) &&
+ bkey_lt(k.k->p, POS(0, max))) {
+ if (pos < iter->pos.offset)
+ goto found_slot;
+
+ /*
+ * We don't need to iterate over keys in every snapshot once
+ * we've found just one:
+ */
+ pos = iter->pos.offset + 1;
+ bch2_btree_iter_set_pos(iter, POS(0, pos));
+ }
- bch2_btree_iter_next_slot(iter);
- break;
+ if (!ret && pos < max)
+ goto found_slot;
- default:
- *hint = k.k->p.inode;
- inode_u->bi_inum = k.k->p.inode;
- inode_u->bi_generation = bkey_generation(k);
+ if (!ret && start == min)
+ ret = -BCH_ERR_ENOSPC_inode_create;
- bch2_inode_pack(inode_p, inode_u);
- bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
- return 0;
- }
+ if (ret) {
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
}
-out:
- if (start != min) {
- /* Retry from start */
- start = min;
- bch2_btree_iter_set_pos(iter, POS(start, 0));
- goto again;
+
+ /* Retry from start */
+ pos = start = min;
+ bch2_btree_iter_set_pos(iter, POS(0, pos));
+ goto again;
+found_slot:
+ bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret) {
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
}
- return -ENOSPC;
+ *hint = k.k->p.offset;
+ inode_u->bi_inum = k.k->p.offset;
+ inode_u->bi_generation = bkey_generation(k);
+ return 0;
}
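The sharded allocation above gives each CPU its own disjoint window of the inode-number space. A minimal standalone sketch of the same arithmetic, with inode_shard_bits = 3 and cpu = 5 as made-up example values (the real values come from struct bch_fs):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned bits = 63;		/* !inodes_32bit; 31 otherwise */
		unsigned inode_shard_bits = 3;	/* assumed for illustration */
		uint64_t cpu = 5;

		bits -= inode_shard_bits;	/* 60 usable bits per shard */

		uint64_t min = cpu << bits;
		uint64_t max = (cpu << bits) | ~(~0ULL << bits);

		/* bch2_inode_create() additionally clamps min to BLOCKDEV_INODE_MAX */
		printf("shard %llu allocates inums in [%llx, %llx]\n",
		       (unsigned long long) cpu,
		       (unsigned long long) min,
		       (unsigned long long) max);
		return 0;
	}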
-int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
- u64 min, u64 max, u64 *hint)
+static int bch2_inode_delete_keys(struct btree_trans *trans,
+ subvol_inum inum, enum btree_id id)
{
- return bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC,
- __bch2_inode_create(&trans, inode_u, min, max, hint));
-}
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i delete;
+ struct bpos end = POS(inum.inum, U64_MAX);
+ u32 snapshot;
+ int ret = 0;
-int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size,
- struct extent_insert_hook *hook, u64 *journal_seq)
-{
- return bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
- POS(inode_nr, new_size),
- POS(inode_nr + 1, 0),
- ZERO_VERSION, NULL, hook,
- journal_seq);
+ /*
+ * We're never going to be deleting partial extents, no need to use an
+ * extent iterator:
+ */
+ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
+ BTREE_ITER_intent);
+
+ while (1) {
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
+
+ k = bch2_btree_iter_peek_max(&iter, end);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k)
+ break;
+
+ bkey_init(&delete.k);
+ delete.k.p = iter.pos;
+
+ if (iter.flags & BTREE_ITER_is_extents)
+ bch2_key_resize(&delete.k,
+ bpos_min(end, k.k->p).offset -
+ iter.pos.offset);
+
+ ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc);
+err:
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
-int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
+int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
{
- struct btree_iter iter;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter = { NULL };
struct bkey_i_inode_generation delete;
+ struct bch_inode_unpacked inode_u;
+ struct bkey_s_c k;
+ u32 snapshot;
int ret;
- ret = bch2_inode_truncate(c, inode_nr, 0, NULL, NULL);
- if (ret < 0)
- return ret;
-
- ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS,
- POS(inode_nr, 0),
- POS(inode_nr + 1, 0),
- ZERO_VERSION, NULL, NULL, NULL);
- if (ret < 0)
- return ret;
-
/*
* If this was a directory, there shouldn't be any real dirents left -
* but there could be whiteouts (from hash collisions) that we should
@@ -404,115 +1013,402 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
 * XXX: the dirent code ideally would delete whiteouts when they're no
* longer needed
*/
- ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
- POS(inode_nr, 0),
- POS(inode_nr + 1, 0),
- ZERO_VERSION, NULL, NULL, NULL);
- if (ret < 0)
- return ret;
+ ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
+ bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
+ bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
+ if (ret)
+ goto err;
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, inum.inum, snapshot),
+ BTREE_ITER_intent|BTREE_ITER_cached);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!bkey_is_inode(k.k)) {
+ bch2_fs_inconsistent(c,
+ "inode %llu:%u not found when deleting",
+ inum.inum, snapshot);
+ ret = -EIO;
+ goto err;
+ }
- bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inode_nr, 0),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- do {
- struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
- u32 bi_generation = 0;
+ bch2_inode_unpack(k, &inode_u);
- ret = btree_iter_err(k);
- if (ret) {
- bch2_btree_iter_unlock(&iter);
- return ret;
- }
+ bkey_inode_generation_init(&delete.k_i);
+ delete.k.p = iter.pos;
+ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
- bch2_fs_inconsistent_on(k.k->type != BCH_INODE_FS, c,
- "inode %llu not found when deleting",
- inode_nr);
+ ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
- switch (k.k->type) {
- case BCH_INODE_FS: {
- struct bch_inode_unpacked inode_u;
+ if (ret)
+ goto err2;
- if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
- bi_generation = inode_u.bi_generation + 1;
- break;
- }
- case BCH_INODE_GENERATION: {
- struct bkey_s_c_inode_generation g =
- bkey_s_c_to_inode_generation(k);
- bi_generation = le32_to_cpu(g.v->bi_generation);
- break;
- }
- }
+ ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot));
+err2:
+ bch2_trans_put(trans);
+ return ret;
+}
- if (!bi_generation) {
- bkey_init(&delete.k);
- delete.k.p.inode = inode_nr;
- } else {
- bkey_inode_generation_init(&delete.k_i);
- delete.k.p.inode = inode_nr;
- delete.v.bi_generation = cpu_to_le32(bi_generation);
- }
+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
+ subvol_inum inum,
+ struct bch_inode_unpacked *inode)
+{
+ struct btree_iter iter;
+ int ret;
- ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOFAIL,
- BTREE_INSERT_ENTRY(&iter, &delete.k_i));
- } while (ret == -EINTR);
+ ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
+ if (!ret)
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
- bch2_btree_iter_unlock(&iter);
+int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
+ subvol_inum inum,
+ struct bch_inode_unpacked *inode)
+{
+ struct btree_iter iter;
+ int ret;
+
+ ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
+ if (!ret)
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
-int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
+int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
struct bch_inode_unpacked *inode)
{
+ return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode));
+}
+
+int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
+{
+ if (bi->bi_flags & BCH_INODE_unlinked)
+ bi->bi_flags &= ~BCH_INODE_unlinked;
+ else {
+ if (bi->bi_nlink == U32_MAX)
+ return -EINVAL;
+
+ bi->bi_nlink++;
+ }
+
+ return 0;
+}
+
+void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
+{
+ if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) {
+ bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
+ bi->bi_inum);
+ return;
+ }
+
+ if (bi->bi_flags & BCH_INODE_unlinked) {
+ bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
+ return;
+ }
+
+ if (bi->bi_nlink)
+ bi->bi_nlink--;
+ else
+ bi->bi_flags |= BCH_INODE_unlinked;
+}
+
+struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
+{
+ struct bch_opts ret = { 0 };
+#define x(_name, _bits) \
+ if (inode->bi_##_name) \
+ opt_set(ret, _name, inode->bi_##_name - 1);
+ BCH_INODE_OPTS()
+#undef x
+ return ret;
+}
+
+void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
+ struct bch_inode_unpacked *inode)
+{
+#define x(_name, _bits) \
+ if ((inode)->bi_##_name) { \
+ opts->_name = inode->bi_##_name - 1; \
+ opts->_name##_from_inode = true; \
+ } else { \
+ opts->_name = c->opts._name; \
+ }
+ BCH_INODE_OPTS()
+#undef x
+
+ bch2_io_opts_fixups(opts);
+}
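The per-inode option fields above use a "+1 bias": zero means "not set, fall back to the filesystem-wide option", any other value stores the option plus one. A minimal standalone sketch of that rule; the structs are illustrative stand-ins, not the real bch_inode_unpacked/bch_io_opts layouts:

	#include <stdint.h>
	#include <stdio.h>

	struct toy_inode   { uint8_t bi_compression; };	/* 0 = unset */
	struct toy_fs_opts { uint8_t compression; };

	/* same rule as inode_opt_get() / bch2_inode_opts_get(): biased by one */
	static uint8_t effective_compression(const struct toy_inode *inode,
					     const struct toy_fs_opts *fs)
	{
		return inode->bi_compression
			? inode->bi_compression - 1
			: fs->compression;
	}

	int main(void)
	{
		struct toy_fs_opts fs	 = { .compression = 2 };
		struct toy_inode   unset = { 0 };
		struct toy_inode   set	 = { .bi_compression = 1 };

		printf("%u %u\n",
		       effective_compression(&unset, &fs),	/* 2: inherited */
		       effective_compression(&set, &fs));	/* 0: stored 1 means option 0 */
		return 0;
	}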
+
+int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
+{
+ struct bch_inode_unpacked inode;
+ int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));
+
+ if (ret)
+ return ret;
+
+ bch2_inode_opts_get(opts, trans->c, &inode);
+ return 0;
+}
+
+static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter = { NULL };
+ struct bkey_i_inode_generation delete;
+ struct bch_inode_unpacked inode_u;
+ struct bkey_s_c k;
+ int ret;
+
+ do {
+ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL) ?:
+ bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL) ?:
+ bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL);
+ } while (ret == -BCH_ERR_transaction_restart_nested);
+ if (ret)
+ goto err;
+retry:
+ bch2_trans_begin(trans);
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, inum, snapshot), BTREE_ITER_intent);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!bkey_is_inode(k.k)) {
+ bch2_fs_inconsistent(c,
+ "inode %llu:%u not found when deleting",
+ inum, snapshot);
+ ret = -EIO;
+ goto err;
+ }
+
+ bch2_inode_unpack(k, &inode_u);
+
+ /* Subvolume root? */
+ if (inode_u.bi_subvol)
+ bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum);
+
+ bkey_inode_generation_init(&delete.k_i);
+ delete.k.p = iter.pos;
+ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
+
+ ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ return ret ?: -BCH_ERR_transaction_restart_nested;
+}
+
+/*
+ * After deleting an inode, there may be versions in older snapshots that should
+ * also be deleted - if they're not referenced by sibling snapshots and not open
+ * in other subvolumes:
+ */
+static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos)
+{
struct btree_iter iter;
struct bkey_s_c k;
- int ret = -ENOENT;
-
- for_each_btree_key(&iter, c, BTREE_ID_INODES,
- POS(inode_nr, 0),
- BTREE_ITER_SLOTS, k) {
- switch (k.k->type) {
- case BCH_INODE_FS:
- ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
- break;
- default:
- /* hole, not found */
- break;
- }
+ int ret;
+next_parent:
+ ret = lockrestart_do(trans,
+ bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0)));
+ if (ret || !k.k)
+ return ret;
- break;
+ bool unlinked = bkey_is_unlinked_inode(k);
+ pos = k.k->p;
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (!unlinked)
+ return 0;
+
+ ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos));
+ if (ret)
+ return ret < 0 ? ret : 0;
+
+ ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot);
+ if (ret)
+ return ret;
+ goto next_parent;
+}
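An illustration of the walk above with a hypothetical three-level snapshot chain (not part of the patch):

	/*
	 *	S1		oldest ancestor
	 *	 \
	 *	  S2
	 *	   \
	 *	    S3		snapshot the unlink happened in
	 *
	 * After deleting inode I at S3, next_parent visits I's copy at S2 and
	 * then at S1; each copy is deleted only if it is itself unlinked (and
	 * has no child-snapshot copies) and is not open in another subvolume.
	 * The walk stops at the first ancestor that fails those checks.
	 */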
+
+int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
+{
+ return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?:
+ delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot));
+}
+
+static int may_delete_deleted_inode(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos pos,
+ bool *need_another_pass)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter inode_iter;
+ struct bkey_s_c k;
+ struct bch_inode_unpacked inode;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
+ if (fsck_err_on(!bkey_is_inode(k.k),
+ trans, deleted_inode_missing,
+ "nonexistent inode %llu:%u in deleted_inodes btree",
+ pos.offset, pos.snapshot))
+ goto delete;
+
+ ret = bch2_inode_unpack(k, &inode);
+ if (ret)
+ goto out;
+
+ if (S_ISDIR(inode.bi_mode)) {
+ ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot);
+ if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY),
+ trans, deleted_inode_is_dir,
+ "non empty directory %llu:%u in deleted_inodes btree",
+ pos.offset, pos.snapshot))
+ goto delete;
+ if (ret)
+ goto out;
+ }
+
+ if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked),
+ trans, deleted_inode_not_unlinked,
+ "non-deleted inode %llu:%u in deleted_inodes btree",
+ pos.offset, pos.snapshot))
+ goto delete;
+
+ if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot,
+ trans, deleted_inode_has_child_snapshots,
+ "inode with child snapshots %llu:%u in deleted_inodes btree",
+ pos.offset, pos.snapshot))
+ goto delete;
+
+ ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+ if (ret < 0)
+ goto out;
+
+ if (ret) {
+ if (fsck_err(trans, inode_has_child_snapshots_wrong,
+ "inode has_child_snapshots flag wrong (should be set)\n%s",
+ (printbuf_reset(&buf),
+ bch2_inode_unpacked_to_text(&buf, &inode),
+ buf.buf))) {
+ inode.bi_flags |= BCH_INODE_has_child_snapshot;
+ ret = __bch2_fsck_write_inode(trans, &inode);
+ if (ret)
+ goto out;
+ }
+ goto delete;
}
- return bch2_btree_iter_unlock(&iter) ?: ret;
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_inode_pack_test(void)
-{
- struct bch_inode_unpacked *u, test_inodes[] = {
- {
- .bi_atime = U64_MAX,
- .bi_ctime = U64_MAX,
- .bi_mtime = U64_MAX,
- .bi_otime = U64_MAX,
- .bi_size = U64_MAX,
- .bi_sectors = U64_MAX,
- .bi_uid = U32_MAX,
- .bi_gid = U32_MAX,
- .bi_nlink = U32_MAX,
- .bi_generation = U32_MAX,
- .bi_dev = U32_MAX,
- },
- };
-
- for (u = test_inodes;
- u < test_inodes + ARRAY_SIZE(test_inodes);
- u++) {
- struct bkey_inode_buf p;
-
- bch2_inode_pack(&p, u);
+ if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
+ !fsck_err(trans, deleted_inode_but_clean,
+ "filesystem marked as clean but have deleted inode %llu:%u",
+ pos.offset, pos.snapshot)) {
+ ret = 0;
+ goto out;
}
+
+ ret = 1;
+out:
+fsck_err:
+ bch2_trans_iter_exit(trans, &inode_iter);
+ printbuf_exit(&buf);
+ return ret;
+delete:
+ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
+ goto out;
+}
+
+int bch2_delete_dead_inodes(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ bool need_another_pass;
+ int ret;
+again:
+ /*
+ * if we ran check_inodes() unlinked inodes will have already been
+ * cleaned up but the write buffer will be out of sync; therefore we
+ * alway need a write buffer flush
+ */
+ ret = bch2_btree_write_buffer_flush_sync(trans);
+ if (ret)
+ goto err;
+
+ need_another_pass = false;
+
+ /*
+ * Weird transaction restart handling here because on successful delete,
+ * bch2_inode_rm_snapshot() will return a nested transaction restart,
+ * but we can't retry because the btree write buffer won't have been
+ * flushed and we'd spin:
+ */
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass);
+ if (ret > 0) {
+ bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u",
+ k.k->p.offset, k.k->p.snapshot);
+
+ ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
+ /*
+ * We don't want to loop here: a transaction restart
+ * error here means we handled a transaction restart and
+ * we're actually done, but if we loop we'll retry the
+ * same key because the write buffer hasn't been flushed
+ * yet
+ */
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ ret = 0;
+ continue;
+ }
+ }
+
+ ret;
+ }));
+
+ if (!ret && need_another_pass)
+ goto again;
+err:
+ bch2_trans_put(trans);
+ return ret;
}
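The overall shape of the pass above, condensed into an outline; illustrative only, not additional patch code:

	/*
	 * bch2_delete_dead_inodes():
	 *   1. flush the btree write buffer so BTREE_ID_deleted_inodes is
	 *	current (check_inodes() may already have removed some inodes)
	 *   2. for each bit set in deleted_inodes:
	 *	 - may_delete_deleted_inode() re-verifies the inode really is
	 *	   unlinked, isn't a non-empty directory and has no child
	 *	   snapshots, clearing stale bits as it goes
	 *	 - bch2_inode_rm_snapshot() deletes it; a nested transaction
	 *	   restart from a successful delete is swallowed rather than
	 *	   retried, since the write buffer hasn't been re-flushed yet
	 *   3. repeat if a pass flagged that another one is needed
	 */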
-#endif
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index a47194ab..927c8759 100644
--- a/libbcachefs/inode.h
+++ b/libbcachefs/inode.h
@@ -1,115 +1,275 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_INODE_H
#define _BCACHEFS_INODE_H
+#include "bkey.h"
+#include "bkey_methods.h"
#include "opts.h"
+#include "snapshot.h"
-#include <linux/math64.h>
+extern const char * const bch2_inode_opts[];
-const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_inode_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+int bch2_inode_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+int bch2_inode_v2_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-#define bch2_bkey_inode_ops (struct bkey_ops) { \
- .key_invalid = bch2_inode_invalid, \
+int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos);
+
+static inline int bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
+{
+ return bch2_snapshot_is_leaf(trans->c, pos.snapshot) <= 0
+ ? __bch2_inode_has_child_snapshots(trans, pos)
+ : 0;
+}
+
+int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
+
+#define bch2_bkey_ops_inode ((struct bkey_ops) { \
+ .key_validate = bch2_inode_validate, \
+ .val_to_text = bch2_inode_to_text, \
+ .trigger = bch2_trigger_inode, \
+ .min_val_size = 16, \
+})
+
+#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \
+ .key_validate = bch2_inode_v2_validate, \
+ .val_to_text = bch2_inode_to_text, \
+ .trigger = bch2_trigger_inode, \
+ .min_val_size = 32, \
+})
+
+#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \
+ .key_validate = bch2_inode_v3_validate, \
.val_to_text = bch2_inode_to_text, \
+ .trigger = bch2_trigger_inode, \
+ .min_val_size = 48, \
+})
+
+static inline bool bkey_is_inode(const struct bkey *k)
+{
+ return k->type == KEY_TYPE_inode ||
+ k->type == KEY_TYPE_inode_v2 ||
+ k->type == KEY_TYPE_inode_v3;
}
+int bch2_inode_generation_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \
+ .key_validate = bch2_inode_generation_validate, \
+ .val_to_text = bch2_inode_generation_to_text, \
+ .min_val_size = 8, \
+})
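A small standalone check of what min_val_size in the ops tables above appears to correspond to: the fixed-size head of each on-disk value, before the variable-length fields[] array. The structs below are simplified stand-ins for the formats in inode_format.h, not the real definitions:

	#include <stdint.h>
	#include <stdio.h>

	struct toy_inode_v3 {		/* fixed head of struct bch_inode_v3 */
		uint64_t bi_journal_seq, bi_hash_seed, bi_flags,
			 bi_sectors, bi_size, bi_version;
		/* variable-length packed fields follow on disk */
	};

	struct toy_inode_generation {
		uint32_t bi_generation, pad;
	};

	int main(void)
	{
		/* prints "48 8", matching the min_val_size values above */
		printf("%zu %zu\n",
		       sizeof(struct toy_inode_v3),
		       sizeof(struct toy_inode_generation));
		return 0;
	}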
+
+#if 0
+typedef struct {
+ u64 lo;
+ u32 hi;
+} __packed __aligned(4) u96;
+#endif
+typedef u64 u96;
+
struct bch_inode_unpacked {
u64 bi_inum;
+ u32 bi_snapshot;
+ u64 bi_journal_seq;
__le64 bi_hash_seed;
+ u64 bi_size;
+ u64 bi_sectors;
+ u64 bi_version;
u32 bi_flags;
u16 bi_mode;
-#define BCH_INODE_FIELD(_name, _bits) u##_bits _name;
- BCH_INODE_FIELDS()
-#undef BCH_INODE_FIELD
+#define x(_name, _bits) u##_bits _name;
+ BCH_INODE_FIELDS_v3()
+#undef x
};
+BITMASK(INODE_STR_HASH, struct bch_inode_unpacked, bi_flags, 20, 24);
struct bkey_inode_buf {
- struct bkey_i_inode inode;
+ struct bkey_i_inode_v3 inode;
-#define BCH_INODE_FIELD(_name, _bits) + 8 + _bits / 8
- u8 _pad[0 + BCH_INODE_FIELDS()];
-#undef BCH_INODE_FIELD
-} __attribute__((packed, aligned(8)));
+#define x(_name, _bits) + 8 + _bits / 8
+ u8 _pad[0 + BCH_INODE_FIELDS_v3()];
+#undef x
+};
void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
-int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
+int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *);
+struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *);
+
+void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
+
+int __bch2_inode_peek(struct btree_trans *, struct btree_iter *,
+ struct bch_inode_unpacked *, subvol_inum, unsigned, bool);
+static inline int bch2_inode_peek_nowarn(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode,
+ subvol_inum inum, unsigned flags)
+{
+ return __bch2_inode_peek(trans, iter, inode, inum, flags, false);
+}
+
+static inline int bch2_inode_peek(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode,
+ subvol_inum inum, unsigned flags)
+{
+ return __bch2_inode_peek(trans, iter, inode, inum, flags, true);
+}
+
+int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *,
+ struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags);
+
+static inline int bch2_inode_write(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode)
+{
+ return bch2_inode_write_flags(trans, iter, inode, 0);
+}
+
+int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *);
+int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *);
+
+void bch2_inode_init_early(struct bch_fs *,
+ struct bch_inode_unpacked *);
+void bch2_inode_init_late(struct bch_inode_unpacked *, u64,
+ uid_t, gid_t, umode_t, dev_t,
+ struct bch_inode_unpacked *);
void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
uid_t, gid_t, umode_t, dev_t,
struct bch_inode_unpacked *);
-int __bch2_inode_create(struct btree_trans *,
- struct bch_inode_unpacked *,
- u64, u64, u64 *);
-int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *,
- u64, u64, u64 *);
+int bch2_inode_create(struct btree_trans *, struct btree_iter *,
+ struct bch_inode_unpacked *, u32, u64);
-int bch2_inode_truncate(struct bch_fs *, u64, u64,
- struct extent_insert_hook *, u64 *);
-int bch2_inode_rm(struct bch_fs *, u64);
+int bch2_inode_rm(struct bch_fs *, subvol_inum);
-int bch2_inode_find_by_inum(struct bch_fs *, u64,
- struct bch_inode_unpacked *);
+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *,
+ subvol_inum,
+ struct bch_inode_unpacked *);
+int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *);
+int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
+ struct bch_inode_unpacked *);
-static inline struct timespec bch2_time_to_timespec(struct bch_fs *c, u64 time)
+#define inode_opt_get(_c, _inode, _name) \
+ ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name)
+
+static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
+ enum inode_opt_id id, u64 v)
{
- return ns_to_timespec(time * c->sb.time_precision + c->sb.time_base_lo);
+ switch (id) {
+#define x(_name, ...) \
+ case Inode_opt_##_name: \
+ inode->bi_##_name = v; \
+ break;
+ BCH_INODE_OPTS()
+#undef x
+ default:
+ BUG();
+ }
}
-static inline u64 timespec_to_bch2_time(struct bch_fs *c, struct timespec ts)
+static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode,
+ enum inode_opt_id id)
{
- s64 ns = timespec_to_ns(&ts) - c->sb.time_base_lo;
+ switch (id) {
+#define x(_name, ...) \
+ case Inode_opt_##_name: \
+ return inode->bi_##_name;
+ BCH_INODE_OPTS()
+#undef x
+ default:
+ BUG();
+ }
+}
- if (c->sb.time_precision == 1)
- return ns;
+static inline u8 mode_to_type(umode_t mode)
+{
+ return (mode >> 12) & 15;
+}
- return div_s64(ns, c->sb.time_precision);
+static inline u8 inode_d_type(struct bch_inode_unpacked *inode)
+{
+ return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode);
}
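A note on the two helpers above (illustrative, not part of the patch): the shift works because the S_IFMT bits already encode the DT_* values.

	/*
	 * (mode >> 12) & 15:
	 *	S_IFREG 0100000 ->  8 == DT_REG
	 *	S_IFDIR 0040000 ->  4 == DT_DIR
	 *	S_IFLNK 0120000 -> 10 == DT_LNK
	 * so inode_d_type() only needs to special-case DT_SUBVOL for
	 * subvolume roots.
	 */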
-static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode)
+static inline u32 bch2_inode_flags(struct bkey_s_c k)
{
- struct bch_io_opts ret = { 0 };
+ switch (k.k->type) {
+ case KEY_TYPE_inode:
+ return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
+ case KEY_TYPE_inode_v2:
+ return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
+ case KEY_TYPE_inode_v3:
+ return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
+ default:
+ return 0;
+ }
+}
-#define BCH_INODE_OPT(_name, _bits) \
- if (inode->bi_##_name) \
- opt_set(ret, _name, inode->bi_##_name - 1);
- BCH_INODE_OPTS()
-#undef BCH_INODE_OPT
- return ret;
+/* i_nlink: */
+
+static inline unsigned nlink_bias(umode_t mode)
+{
+ return S_ISDIR(mode) ? 2 : 1;
}
-static inline void __bch2_inode_opt_set(struct bch_inode_unpacked *inode,
- enum bch_opt_id id, u64 v)
+static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi)
{
- switch (id) {
-#define BCH_INODE_OPT(_name, ...) \
- case Opt_##_name: \
- inode->bi_##_name = v; \
- break;
- BCH_INODE_OPTS()
-#undef BCH_INODE_OPT
- default:
- BUG();
+ return bi->bi_flags & BCH_INODE_unlinked
+ ? 0
+ : bi->bi_nlink + nlink_bias(bi->bi_mode);
+}
+
+static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
+ unsigned nlink)
+{
+ if (nlink) {
+ bi->bi_nlink = nlink - nlink_bias(bi->bi_mode);
+ bi->bi_flags &= ~BCH_INODE_unlinked;
+ } else {
+ bi->bi_nlink = 0;
+ bi->bi_flags |= BCH_INODE_unlinked;
}
}
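bi_nlink stores the link count minus a mode-dependent bias (2 for directories, 1 otherwise), with BCH_INODE_unlinked standing in for zero. A minimal standalone sketch of the round trip; the struct is a stand-in, not the real bch_inode_unpacked:

	#include <stdbool.h>
	#include <stdio.h>

	struct toy_inode {
		unsigned	nlink;		/* stored: count minus bias */
		bool		unlinked;	/* stands in for BCH_INODE_unlinked */
		bool		is_dir;
	};

	static unsigned bias(const struct toy_inode *i)
	{
		return i->is_dir ? 2 : 1;	/* same rule as nlink_bias() */
	}

	static unsigned toy_nlink_get(const struct toy_inode *i)
	{
		return i->unlinked ? 0 : i->nlink + bias(i);
	}

	static void toy_nlink_set(struct toy_inode *i, unsigned nlink)
	{
		if (nlink) {
			i->nlink    = nlink - bias(i);
			i->unlinked = false;
		} else {
			i->nlink    = 0;
			i->unlinked = true;
		}
	}

	int main(void)
	{
		struct toy_inode dir = { .is_dir = true };

		toy_nlink_set(&dir, 2);			/* fresh directory */
		printf("%u\n", toy_nlink_get(&dir));	/* 2, stored internally as 0 */

		toy_nlink_set(&dir, 0);			/* unlinked */
		printf("%u\n", toy_nlink_get(&dir));	/* 0 via the unlinked flag */
		return 0;
	}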
-static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
- enum bch_opt_id id, u64 v)
+int bch2_inode_nlink_inc(struct bch_inode_unpacked *);
+void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
+
+static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *inode)
{
- return __bch2_inode_opt_set(inode, id, v + 1);
+ bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset;
+
+ return S_ISDIR(inode->bi_mode) ||
+ (!inode->bi_nlink && inode_has_bp);
}
-static inline void bch2_inode_opt_clear(struct bch_inode_unpacked *inode,
- enum bch_opt_id id)
+struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
+void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
+ struct bch_inode_unpacked *);
+int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *);
+
+static inline struct bch_extent_rebalance
+bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode)
{
- return __bch2_inode_opt_set(inode, id, 0);
+ struct bch_io_opts io_opts;
+ bch2_inode_opts_get(&io_opts, c, inode);
+ return io_opts_to_rebalance_opts(&io_opts);
}
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_inode_pack_test(void);
-#else
-static inline void bch2_inode_pack_test(void) {}
-#endif
+int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
+int bch2_delete_dead_inodes(struct bch_fs *);
#endif /* _BCACHEFS_INODE_H */
diff --git a/libbcachefs/inode_format.h b/libbcachefs/inode_format.h
new file mode 100644
index 00000000..7928d0c6
--- /dev/null
+++ b/libbcachefs/inode_format.h
@@ -0,0 +1,167 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_INODE_FORMAT_H
+#define _BCACHEFS_INODE_FORMAT_H
+
+#define BLOCKDEV_INODE_MAX 4096
+#define BCACHEFS_ROOT_INO 4096
+
+struct bch_inode {
+ struct bch_val v;
+
+ __le64 bi_hash_seed;
+ __le32 bi_flags;
+ __le16 bi_mode;
+ __u8 fields[];
+} __packed __aligned(8);
+
+struct bch_inode_v2 {
+ struct bch_val v;
+
+ __le64 bi_journal_seq;
+ __le64 bi_hash_seed;
+ __le64 bi_flags;
+ __le16 bi_mode;
+ __u8 fields[];
+} __packed __aligned(8);
+
+struct bch_inode_v3 {
+ struct bch_val v;
+
+ __le64 bi_journal_seq;
+ __le64 bi_hash_seed;
+ __le64 bi_flags;
+ __le64 bi_sectors;
+ __le64 bi_size;
+ __le64 bi_version;
+ __u8 fields[];
+} __packed __aligned(8);
+
+#define INODEv3_FIELDS_START_INITIAL 6
+#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
+
+struct bch_inode_generation {
+ struct bch_val v;
+
+ __le32 bi_generation;
+ __le32 pad;
+} __packed __aligned(8);
+
+/*
+ * bi_subvol and bi_parent_subvol are only set for subvolume roots:
+ */
+
+#define BCH_INODE_FIELDS_v2() \
+ x(bi_atime, 96) \
+ x(bi_ctime, 96) \
+ x(bi_mtime, 96) \
+ x(bi_otime, 96) \
+ x(bi_size, 64) \
+ x(bi_sectors, 64) \
+ x(bi_uid, 32) \
+ x(bi_gid, 32) \
+ x(bi_nlink, 32) \
+ x(bi_generation, 32) \
+ x(bi_dev, 32) \
+ x(bi_data_checksum, 8) \
+ x(bi_compression, 8) \
+ x(bi_project, 32) \
+ x(bi_background_compression, 8) \
+ x(bi_data_replicas, 8) \
+ x(bi_promote_target, 16) \
+ x(bi_foreground_target, 16) \
+ x(bi_background_target, 16) \
+ x(bi_erasure_code, 16) \
+ x(bi_fields_set, 16) \
+ x(bi_dir, 64) \
+ x(bi_dir_offset, 64) \
+ x(bi_subvol, 32) \
+ x(bi_parent_subvol, 32)
+
+#define BCH_INODE_FIELDS_v3() \
+ x(bi_atime, 96) \
+ x(bi_ctime, 96) \
+ x(bi_mtime, 96) \
+ x(bi_otime, 96) \
+ x(bi_uid, 32) \
+ x(bi_gid, 32) \
+ x(bi_nlink, 32) \
+ x(bi_generation, 32) \
+ x(bi_dev, 32) \
+ x(bi_data_checksum, 8) \
+ x(bi_compression, 8) \
+ x(bi_project, 32) \
+ x(bi_background_compression, 8) \
+ x(bi_data_replicas, 8) \
+ x(bi_promote_target, 16) \
+ x(bi_foreground_target, 16) \
+ x(bi_background_target, 16) \
+ x(bi_erasure_code, 16) \
+ x(bi_fields_set, 16) \
+ x(bi_dir, 64) \
+ x(bi_dir_offset, 64) \
+ x(bi_subvol, 32) \
+ x(bi_parent_subvol, 32) \
+ x(bi_nocow, 8)
+
+/* subset of BCH_INODE_FIELDS */
+#define BCH_INODE_OPTS() \
+ x(data_checksum, 8) \
+ x(compression, 8) \
+ x(project, 32) \
+ x(background_compression, 8) \
+ x(data_replicas, 8) \
+ x(promote_target, 16) \
+ x(foreground_target, 16) \
+ x(background_target, 16) \
+ x(erasure_code, 16) \
+ x(nocow, 8)
+
+enum inode_opt_id {
+#define x(name, ...) \
+ Inode_opt_##name,
+ BCH_INODE_OPTS()
+#undef x
+ Inode_opt_nr,
+};
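The x-macro lists above are expanded differently by each consumer (struct fields in bch_inode_unpacked, the enum just defined, the option accessors in inode.h). A toy standalone example of the pattern, using a made-up two-entry list rather than the real BCH_INODE_OPTS():

	#include <stdio.h>

	#define TOY_OPTS()		\
		x(compression, 8)	\
		x(data_replicas, 8)

	/* expansion 1: an enum of option ids */
	enum toy_opt_id {
	#define x(_name, _bits)	Toy_opt_##_name,
		TOY_OPTS()
	#undef x
		Toy_opt_nr,
	};

	/* expansion 2: fields of the listed widths */
	struct toy_opts {
	#define x(_name, _bits)	unsigned _name : _bits;
		TOY_OPTS()
	#undef x
	};

	int main(void)
	{
		printf("%d options, struct is %zu bytes\n",
		       (int) Toy_opt_nr, sizeof(struct toy_opts));
		return 0;
	}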
+
+#define BCH_INODE_FLAGS() \
+ x(sync, 0) \
+ x(immutable, 1) \
+ x(append, 2) \
+ x(nodump, 3) \
+ x(noatime, 4) \
+ x(i_size_dirty, 5) \
+ x(i_sectors_dirty, 6) \
+ x(unlinked, 7) \
+ x(backptr_untrusted, 8) \
+ x(has_child_snapshot, 9)
+
+/* bits 20+ reserved for packed fields below: */
+
+enum bch_inode_flags {
+#define x(t, n) BCH_INODE_##t = 1U << n,
+ BCH_INODE_FLAGS()
+#undef x
+};
+
+enum __bch_inode_flags {
+#define x(t, n) __BCH_INODE_##t = n,
+ BCH_INODE_FLAGS()
+#undef x
+};
+
+LE32_BITMASK(INODEv1_STR_HASH, struct bch_inode, bi_flags, 20, 24);
+LE32_BITMASK(INODEv1_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
+LE32_BITMASK(INODEv1_NEW_VARINT,struct bch_inode, bi_flags, 31, 32);
+
+LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
+LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
+LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_FIELDS_START,
+ struct bch_inode_v3, bi_flags, 31, 36);
+LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
+
+#endif /* _BCACHEFS_INODE_FORMAT_H */
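How the bi_flags word is carved up by the flag list and bitmask helpers above, using the v3 layout; an illustrative summary, not part of the patch:

	/*
	 * bits  0..19	individual inode flags (BCH_INODE_FLAGS(): sync,
	 *		immutable, ..., has_child_snapshot; 10..19 unused)
	 * bits 20..23	INODEv3_STR_HASH	string hash type
	 * bits 24..30	INODEv3_NR_FIELDS	number of packed fields
	 * bits 31..35	INODEv3_FIELDS_START
	 * bits 36..51	INODEv3_MODE
	 *
	 * v1 and v2 share the 20..23 and 24..30 ranges; v1 additionally uses
	 * bit 31 as INODEv1_NEW_VARINT.
	 */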
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
deleted file mode 100644
index f26d4041..00000000
--- a/libbcachefs/io.c
+++ /dev/null
@@ -1,1853 +0,0 @@
-/*
- * Some low level IO code, and hacks for various block layer limitations
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc.h"
-#include "bset.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "compress.h"
-#include "clock.h"
-#include "debug.h"
-#include "disk_groups.h"
-#include "error.h"
-#include "extents.h"
-#include "io.h"
-#include "journal.h"
-#include "keylist.h"
-#include "move.h"
-#include "rebalance.h"
-#include "replicas.h"
-#include "super.h"
-#include "super-io.h"
-
-#include <linux/blkdev.h>
-#include <linux/random.h>
-
-#include <trace/events/bcachefs.h>
-
-static bool bch2_target_congested(struct bch_fs *c, u16 target)
-{
- const struct bch_devs_mask *devs;
- unsigned d, nr = 0, total = 0;
- u64 now = local_clock(), last;
- s64 congested;
- struct bch_dev *ca;
-
- if (!target)
- return false;
-
- rcu_read_lock();
- devs = bch2_target_to_mask(c, target);
- for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
- ca = rcu_dereference(c->devs[d]);
- if (!ca)
- continue;
-
- congested = atomic_read(&ca->congested);
- last = READ_ONCE(ca->congested_last);
- if (time_after64(now, last))
- congested -= (now - last) >> 12;
-
- total += max(congested, 0LL);
- nr++;
- }
- rcu_read_unlock();
-
- return bch2_rand_range(nr * CONGESTED_MAX) < total;
-}
-
-static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
- u64 now, int rw)
-{
- u64 latency_capable =
- ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
- /* ideally we'd be taking into account the device's variance here: */
- u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
- s64 latency_over = io_latency - latency_threshold;
-
- if (latency_threshold && latency_over > 0) {
- /*
- * bump up congested by approximately latency_over * 4 /
- * latency_threshold - we don't need much accuracy here so don't
- * bother with the divide:
- */
- if (atomic_read(&ca->congested) < CONGESTED_MAX)
- atomic_add(latency_over >>
- max_t(int, ilog2(latency_threshold) - 2, 0),
- &ca->congested);
-
- ca->congested_last = now;
- } else if (atomic_read(&ca->congested) > 0) {
- atomic_dec(&ca->congested);
- }
-}
-
-void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
-{
- atomic64_t *latency = &ca->cur_latency[rw];
- u64 now = local_clock();
- u64 io_latency = time_after64(now, submit_time)
- ? now - submit_time
- : 0;
- u64 old, new, v = atomic64_read(latency);
-
- do {
- old = v;
-
- /*
- * If the io latency was reasonably close to the current
- * latency, skip doing the update and atomic operation - most of
- * the time:
- */
- if (abs((int) (old - io_latency)) < (old >> 1) &&
- now & ~(~0 << 5))
- break;
-
- new = ewma_add(old, io_latency, 5);
- } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
-
- bch2_congested_acct(ca, io_latency, now, rw);
-
- __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
-}
-
-/* Allocate, free from mempool: */
-
-void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
-{
- struct bio_vec *bv;
- unsigned i;
-
- bio_for_each_segment_all(bv, bio, i)
- if (bv->bv_page != ZERO_PAGE(0))
- mempool_free(bv->bv_page, &c->bio_bounce_pages);
- bio->bi_vcnt = 0;
-}
-
-static void bch2_bio_alloc_page_pool(struct bch_fs *c, struct bio *bio,
- bool *using_mempool)
-{
- struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt++];
-
- if (likely(!*using_mempool)) {
- bv->bv_page = alloc_page(GFP_NOIO);
- if (unlikely(!bv->bv_page)) {
- mutex_lock(&c->bio_bounce_pages_lock);
- *using_mempool = true;
- goto pool_alloc;
-
- }
- } else {
-pool_alloc:
- bv->bv_page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
- }
-
- bv->bv_len = PAGE_SIZE;
- bv->bv_offset = 0;
-}
-
-void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
- size_t bytes)
-{
- bool using_mempool = false;
-
- BUG_ON(DIV_ROUND_UP(bytes, PAGE_SIZE) > bio->bi_max_vecs);
-
- bio->bi_iter.bi_size = bytes;
-
- while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE))
- bch2_bio_alloc_page_pool(c, bio, &using_mempool);
-
- if (using_mempool)
- mutex_unlock(&c->bio_bounce_pages_lock);
-}
-
-void bch2_bio_alloc_more_pages_pool(struct bch_fs *c, struct bio *bio,
- size_t bytes)
-{
- while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) {
- struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
-
- BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs);
-
- bv->bv_page = alloc_page(GFP_NOIO);
- if (!bv->bv_page) {
- /*
- * We already allocated from mempool, we can't allocate from it again
- * without freeing the pages we already allocated or else we could
- * deadlock:
- */
- bch2_bio_free_pages_pool(c, bio);
- bch2_bio_alloc_pages_pool(c, bio, bytes);
- return;
- }
-
- bv->bv_len = PAGE_SIZE;
- bv->bv_offset = 0;
- bio->bi_vcnt++;
- }
-
- bio->bi_iter.bi_size = bytes;
-}
-
-/* Writes */
-
-void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
- enum bch_data_type type,
- const struct bkey_i *k)
-{
- struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
- const struct bch_extent_ptr *ptr;
- struct bch_write_bio *n;
- struct bch_dev *ca;
-
- BUG_ON(c->opts.nochanges);
-
- extent_for_each_ptr(e, ptr) {
- BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
- !c->devs[ptr->dev]);
-
- ca = bch_dev_bkey_exists(c, ptr->dev);
-
- if (ptr + 1 < &extent_entry_last(e)->ptr) {
- n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
- &ca->replica_set));
-
- n->bio.bi_end_io = wbio->bio.bi_end_io;
- n->bio.bi_private = wbio->bio.bi_private;
- n->parent = wbio;
- n->split = true;
- n->bounce = false;
- n->put_bio = true;
- n->bio.bi_opf = wbio->bio.bi_opf;
- bio_inc_remaining(&wbio->bio);
- } else {
- n = wbio;
- n->split = false;
- }
-
- n->c = c;
- n->dev = ptr->dev;
- n->have_ioref = bch2_dev_get_ioref(ca, WRITE);
- n->submit_time = local_clock();
- n->bio.bi_iter.bi_sector = ptr->offset;
-
- if (!journal_flushes_device(ca))
- n->bio.bi_opf |= REQ_FUA;
-
- if (likely(n->have_ioref)) {
- this_cpu_add(ca->io_done->sectors[WRITE][type],
- bio_sectors(&n->bio));
-
- bio_set_dev(&n->bio, ca->disk_sb.bdev);
- submit_bio(&n->bio);
- } else {
- n->bio.bi_status = BLK_STS_REMOVED;
- bio_endio(&n->bio);
- }
- }
-}
-
-static void __bch2_write(struct closure *);
-
-static void bch2_write_done(struct closure *cl)
-{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bch_fs *c = op->c;
-
- if (!op->error && (op->flags & BCH_WRITE_FLUSH))
- op->error = bch2_journal_error(&c->journal);
-
- if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
- bch2_disk_reservation_put(c, &op->res);
- percpu_ref_put(&c->writes);
- bch2_keylist_free(&op->insert_keys, op->inline_keys);
-
- bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
-
- closure_return(cl);
-}
-
-int bch2_write_index_default(struct bch_write_op *op)
-{
- struct keylist *keys = &op->insert_keys;
- struct btree_iter iter;
- int ret;
-
- bch2_btree_iter_init(&iter, op->c, BTREE_ID_EXTENTS,
- bkey_start_pos(&bch2_keylist_front(keys)->k),
- BTREE_ITER_INTENT);
-
- ret = bch2_btree_insert_list_at(&iter, keys, &op->res,
- NULL, op_journal_seq(op),
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE);
- bch2_btree_iter_unlock(&iter);
-
- return ret;
-}
-
-/**
- * bch_write_index - after a write, update index to point to new data
- */
-static void __bch2_write_index(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct keylist *keys = &op->insert_keys;
- struct bkey_s_extent e;
- struct bch_extent_ptr *ptr;
- struct bkey_i *src, *dst = keys->keys, *n, *k;
- int ret;
-
- for (src = keys->keys; src != keys->top; src = n) {
- n = bkey_next(src);
- bkey_copy(dst, src);
-
- e = bkey_i_to_s_extent(dst);
- extent_for_each_ptr_backwards(e, ptr)
- if (test_bit(ptr->dev, op->failed.d))
- bch2_extent_drop_ptr(e, ptr);
-
- if (!bch2_extent_nr_ptrs(e.c)) {
- ret = -EIO;
- goto err;
- }
-
- if (!(op->flags & BCH_WRITE_NOMARK_REPLICAS)) {
- ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, e.s_c);
- if (ret)
- goto err;
- }
-
- dst = bkey_next(dst);
- }
-
- keys->top = dst;
-
- /*
- * probably not the ideal place to hook this in, but I don't
- * particularly want to plumb io_opts all the way through the btree
- * update stack right now
- */
- for_each_keylist_key(keys, k)
- bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
-
- if (!bch2_keylist_empty(keys)) {
- u64 sectors_start = keylist_sectors(keys);
- int ret = op->index_update_fn(op);
-
- BUG_ON(keylist_sectors(keys) && !ret);
-
- op->written += sectors_start - keylist_sectors(keys);
-
- if (ret) {
- __bcache_io_error(c, "btree IO error %i", ret);
- op->error = ret;
- }
- }
-out:
- bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets);
- return;
-err:
- keys->top = keys->keys;
- op->error = ret;
- goto out;
-}
-
-static void bch2_write_index(struct closure *cl)
-{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bch_fs *c = op->c;
-
- __bch2_write_index(op);
-
- if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
- bch2_journal_flush_seq_async(&c->journal,
- *op_journal_seq(op),
- cl);
- continue_at(cl, bch2_write_done, index_update_wq(op));
- } else {
- continue_at_nobarrier(cl, bch2_write_done, NULL);
- }
-}
-
-static void bch2_write_endio(struct bio *bio)
-{
- struct closure *cl = bio->bi_private;
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bch_write_bio *wbio = to_wbio(bio);
- struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
- struct bch_fs *c = wbio->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
-
- if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
- set_bit(wbio->dev, op->failed.d);
-
- if (wbio->have_ioref) {
- bch2_latency_acct(ca, wbio->submit_time, WRITE);
- percpu_ref_put(&ca->io_ref);
- }
-
- if (wbio->bounce)
- bch2_bio_free_pages_pool(c, bio);
-
- if (wbio->put_bio)
- bio_put(bio);
-
- if (parent)
- bio_endio(&parent->bio);
- else
- closure_put(cl);
-}
-
-static void init_append_extent(struct bch_write_op *op,
- struct write_point *wp,
- struct bversion version,
- struct bch_extent_crc_unpacked crc)
-{
- struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top);
-
- op->pos.offset += crc.uncompressed_size;
- e->k.p = op->pos;
- e->k.size = crc.uncompressed_size;
- e->k.version = version;
- bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED);
-
- bch2_extent_crc_append(e, crc);
- bch2_alloc_sectors_append_ptrs(op->c, wp, e, crc.compressed_size);
-
- bch2_keylist_push(&op->insert_keys);
-}
-
-static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
- struct write_point *wp,
- struct bio *src,
- bool *page_alloc_failed)
-{
- struct bch_write_bio *wbio;
- struct bio *bio;
- unsigned output_available =
- min(wp->sectors_free << 9, src->bi_iter.bi_size);
- unsigned pages = DIV_ROUND_UP(output_available, PAGE_SIZE);
-
- bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
- wbio = wbio_init(bio);
- wbio->bounce = true;
- wbio->put_bio = true;
- /* copy WRITE_SYNC flag */
- wbio->bio.bi_opf = src->bi_opf;
-
- /*
- * We can't use mempool for more than c->sb.encoded_extent_max
- * worth of pages, but we'd like to allocate more if we can:
- */
- while (bio->bi_iter.bi_size < output_available) {
- unsigned len = min_t(unsigned, PAGE_SIZE,
- output_available - bio->bi_iter.bi_size);
- struct page *p;
-
- p = alloc_page(GFP_NOIO);
- if (!p) {
- unsigned pool_max =
- min_t(unsigned, output_available,
- c->sb.encoded_extent_max << 9);
-
- if (bio_sectors(bio) < pool_max)
- bch2_bio_alloc_pages_pool(c, bio, pool_max);
- break;
- }
-
- bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) {
- .bv_page = p,
- .bv_len = len,
- .bv_offset = 0,
- };
- bio->bi_iter.bi_size += len;
- }
-
- *page_alloc_failed = bio->bi_vcnt < pages;
- return bio;
-}
-
-static int bch2_write_rechecksum(struct bch_fs *c,
- struct bch_write_op *op,
- unsigned new_csum_type)
-{
- struct bio *bio = &op->wbio.bio;
- struct bch_extent_crc_unpacked new_crc;
- int ret;
-
- /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
-
- if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
- bch2_csum_type_is_encryption(new_csum_type))
- new_csum_type = op->crc.csum_type;
-
- ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
- NULL, &new_crc,
- op->crc.offset, op->crc.live_size,
- new_csum_type);
- if (ret)
- return ret;
-
- bio_advance(bio, op->crc.offset << 9);
- bio->bi_iter.bi_size = op->crc.live_size << 9;
- op->crc = new_crc;
- return 0;
-}
-
-static int bch2_write_decrypt(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct nonce nonce = extent_nonce(op->version, op->crc);
- struct bch_csum csum;
-
- if (!bch2_csum_type_is_encryption(op->crc.csum_type))
- return 0;
-
- /*
- * If we need to decrypt data in the write path, we'll no longer be able
- * to verify the existing checksum (poly1305 mac, in this case) after
- * it's decrypted - this is the last point we'll be able to reverify the
- * checksum:
- */
- csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
- if (bch2_crc_cmp(op->crc.csum, csum))
- return -EIO;
-
- bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
- op->crc.csum_type = 0;
- op->crc.csum = (struct bch_csum) { 0, 0 };
- return 0;
-}
-
-static enum prep_encoded_ret {
- PREP_ENCODED_OK,
- PREP_ENCODED_ERR,
- PREP_ENCODED_CHECKSUM_ERR,
- PREP_ENCODED_DO_WRITE,
-} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
-{
- struct bch_fs *c = op->c;
- struct bio *bio = &op->wbio.bio;
-
- if (!(op->flags & BCH_WRITE_DATA_ENCODED))
- return PREP_ENCODED_OK;
-
- BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
-
- /* Can we just write the entire extent as is? */
- if (op->crc.uncompressed_size == op->crc.live_size &&
- op->crc.compressed_size <= wp->sectors_free &&
- op->crc.compression_type == op->compression_type) {
- if (!op->crc.compression_type &&
- op->csum_type != op->crc.csum_type &&
- bch2_write_rechecksum(c, op, op->csum_type))
- return PREP_ENCODED_CHECKSUM_ERR;
-
- return PREP_ENCODED_DO_WRITE;
- }
-
- /*
- * If the data is compressed and we couldn't write the entire extent as
- * is, we have to decompress it:
- */
- if (op->crc.compression_type) {
- struct bch_csum csum;
-
- if (bch2_write_decrypt(op))
- return PREP_ENCODED_CHECKSUM_ERR;
-
- /* Last point we can still verify checksum: */
- csum = bch2_checksum_bio(c, op->crc.csum_type,
- extent_nonce(op->version, op->crc),
- bio);
- if (bch2_crc_cmp(op->crc.csum, csum))
- return PREP_ENCODED_CHECKSUM_ERR;
-
- if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
- return PREP_ENCODED_ERR;
- }
-
- /*
- * No longer have compressed data after this point - data might be
- * encrypted:
- */
-
- /*
- * If the data is checksummed and we're only writing a subset,
- * rechecksum and adjust bio to point to currently live data:
- */
- if ((op->crc.live_size != op->crc.uncompressed_size ||
- op->crc.csum_type != op->csum_type) &&
- bch2_write_rechecksum(c, op, op->csum_type))
- return PREP_ENCODED_CHECKSUM_ERR;
-
- /*
- * If we want to compress the data, it has to be decrypted:
- */
- if ((op->compression_type ||
- bch2_csum_type_is_encryption(op->crc.csum_type) !=
- bch2_csum_type_is_encryption(op->csum_type)) &&
- bch2_write_decrypt(op))
- return PREP_ENCODED_CHECKSUM_ERR;
-
- return PREP_ENCODED_OK;
-}
-
-static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
-{
- struct bch_fs *c = op->c;
- struct bio *src = &op->wbio.bio, *dst = src;
- struct bvec_iter saved_iter;
- struct bkey_i *key_to_write;
- unsigned key_to_write_offset = op->insert_keys.top_p -
- op->insert_keys.keys_p;
- unsigned total_output = 0;
- bool bounce = false, page_alloc_failed = false;
- int ret, more = 0;
-
- BUG_ON(!bio_sectors(src));
-
- switch (bch2_write_prep_encoded_data(op, wp)) {
- case PREP_ENCODED_OK:
- break;
- case PREP_ENCODED_ERR:
- ret = -EIO;
- goto err;
- case PREP_ENCODED_CHECKSUM_ERR:
- goto csum_err;
- case PREP_ENCODED_DO_WRITE:
- init_append_extent(op, wp, op->version, op->crc);
- goto do_write;
- }
-
- if (op->compression_type ||
- (op->csum_type &&
- !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
- (bch2_csum_type_is_encryption(op->csum_type) &&
- !(op->flags & BCH_WRITE_PAGES_OWNED))) {
- dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed);
- bounce = true;
- }
-
- saved_iter = dst->bi_iter;
-
- do {
- struct bch_extent_crc_unpacked crc =
- (struct bch_extent_crc_unpacked) { 0 };
- struct bversion version = op->version;
- size_t dst_len, src_len;
-
- if (page_alloc_failed &&
- bio_sectors(dst) < wp->sectors_free &&
- bio_sectors(dst) < c->sb.encoded_extent_max)
- break;
-
- BUG_ON(op->compression_type &&
- (op->flags & BCH_WRITE_DATA_ENCODED) &&
- bch2_csum_type_is_encryption(op->crc.csum_type));
- BUG_ON(op->compression_type && !bounce);
-
- crc.compression_type = op->compression_type
- ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
- op->compression_type)
- : 0;
- if (!crc.compression_type) {
- dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
- dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
-
- if (op->csum_type)
- dst_len = min_t(unsigned, dst_len,
- c->sb.encoded_extent_max << 9);
-
- if (bounce) {
- swap(dst->bi_iter.bi_size, dst_len);
- bio_copy_data(dst, src);
- swap(dst->bi_iter.bi_size, dst_len);
- }
-
- src_len = dst_len;
- }
-
- BUG_ON(!src_len || !dst_len);
-
- if (bch2_csum_type_is_encryption(op->csum_type)) {
- if (bversion_zero(version)) {
- version.lo = atomic64_inc_return(&c->key_version) + 1;
- } else {
- crc.nonce = op->nonce;
- op->nonce += src_len >> 9;
- }
- }
-
- if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
- !crc.compression_type &&
- bch2_csum_type_is_encryption(op->crc.csum_type) ==
- bch2_csum_type_is_encryption(op->csum_type)) {
- /*
- * Note: when we're using rechecksum(), we need to be
- * checksumming @src because it has all the data our
- * existing checksum covers - if we bounced (because we
- * were trying to compress), @dst will only have the
- * part of the data the new checksum will cover.
- *
- * But normally we want to be checksumming post bounce,
- * because part of the reason for bouncing is so the
- * data can't be modified (by userspace) while it's in
- * flight.
- */
- if (bch2_rechecksum_bio(c, src, version, op->crc,
- &crc, &op->crc,
- src_len >> 9,
- bio_sectors(src) - (src_len >> 9),
- op->csum_type))
- goto csum_err;
- } else {
- if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
- bch2_rechecksum_bio(c, src, version, op->crc,
- NULL, &op->crc,
- src_len >> 9,
- bio_sectors(src) - (src_len >> 9),
- op->crc.csum_type))
- goto csum_err;
-
- crc.compressed_size = dst_len >> 9;
- crc.uncompressed_size = src_len >> 9;
- crc.live_size = src_len >> 9;
-
- swap(dst->bi_iter.bi_size, dst_len);
- bch2_encrypt_bio(c, op->csum_type,
- extent_nonce(version, crc), dst);
- crc.csum = bch2_checksum_bio(c, op->csum_type,
- extent_nonce(version, crc), dst);
- crc.csum_type = op->csum_type;
- swap(dst->bi_iter.bi_size, dst_len);
- }
-
- init_append_extent(op, wp, version, crc);
-
- if (dst != src)
- bio_advance(dst, dst_len);
- bio_advance(src, src_len);
- total_output += dst_len;
- } while (dst->bi_iter.bi_size &&
- src->bi_iter.bi_size &&
- wp->sectors_free &&
- !bch2_keylist_realloc(&op->insert_keys,
- op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- BKEY_EXTENT_U64s_MAX));
-
- more = src->bi_iter.bi_size != 0;
-
- dst->bi_iter = saved_iter;
-
- if (!bounce && more) {
- dst = bio_split(src, total_output >> 9,
- GFP_NOIO, &c->bio_write);
- wbio_init(dst)->put_bio = true;
- }
-
- dst->bi_iter.bi_size = total_output;
-
- /* Free unneeded pages after compressing: */
- if (bounce)
- while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE))
- mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page,
- &c->bio_bounce_pages);
-do_write:
- /* might have done a realloc... */
-
- key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
-
- dst->bi_end_io = bch2_write_endio;
- dst->bi_private = &op->cl;
- bio_set_op_attrs(dst, REQ_OP_WRITE, 0);
-
- closure_get(dst->bi_private);
-
- bch2_submit_wbio_replicas(to_wbio(dst), c, BCH_DATA_USER,
- key_to_write);
- return more;
-csum_err:
- bch_err(c, "error verifying existing checksum while "
- "rewriting existing data (memory corruption?)");
- ret = -EIO;
-err:
- if (bounce) {
- bch2_bio_free_pages_pool(c, dst);
- bio_put(dst);
- }
-
- return ret;
-}
-
-static void __bch2_write(struct closure *cl)
-{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bch_fs *c = op->c;
- struct write_point *wp;
- int ret;
-again:
- do {
- /* +1 for possible cache device: */
- if (op->open_buckets_nr + op->nr_replicas + 1 >
- ARRAY_SIZE(op->open_buckets))
- goto flush_io;
-
- if (bch2_keylist_realloc(&op->insert_keys,
- op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- BKEY_EXTENT_U64s_MAX))
- goto flush_io;
-
- wp = bch2_alloc_sectors_start(c,
- op->target,
- op->write_point,
- &op->devs_have,
- op->nr_replicas,
- op->nr_replicas_required,
- op->alloc_reserve,
- op->flags,
- (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
- EBUG_ON(!wp);
-
- if (unlikely(IS_ERR(wp))) {
- if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
- ret = PTR_ERR(wp);
- goto err;
- }
-
- goto flush_io;
- }
-
- ret = bch2_write_extent(op, wp);
-
- BUG_ON(op->open_buckets_nr + wp->nr_ptrs - wp->first_ptr >
- ARRAY_SIZE(op->open_buckets));
- bch2_open_bucket_get(c, wp,
- &op->open_buckets_nr,
- op->open_buckets);
- bch2_alloc_sectors_done(c, wp);
-
- if (ret < 0)
- goto err;
- } while (ret);
-
- continue_at(cl, bch2_write_index, index_update_wq(op));
- return;
-err:
- op->error = ret;
-
- continue_at(cl, !bch2_keylist_empty(&op->insert_keys)
- ? bch2_write_index
- : bch2_write_done, index_update_wq(op));
- return;
-flush_io:
- closure_sync(cl);
-
- if (!bch2_keylist_empty(&op->insert_keys)) {
- __bch2_write_index(op);
-
- if (op->error) {
- continue_at_nobarrier(cl, bch2_write_done, NULL);
- return;
- }
- }
-
- goto again;
-}
-
-/**
- * bch2_write - handle a write to a cache device or flash-only volume
- *
- * This is the starting point for any data to end up in a cache device; it could
- * be from a normal write, or a writeback write, or a write to a flash-only
- * volume - it's also used by the moving garbage collector to compact data in
- * mostly empty buckets.
- *
- * It first writes the data to the cache, creating a list of keys to be inserted
- * (if the data won't fit in a single open bucket, there will be multiple keys);
- * after the data is written the keys are journalled, and after the keys have
- * been added to the next journal write they're inserted into the btree.
- *
- * If op->discard is true, instead of inserting the data it invalidates the
- * region of the cache represented by op->bio and op->inode.
- */
-void bch2_write(struct closure *cl)
-{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bch_fs *c = op->c;
-
- BUG_ON(!op->nr_replicas);
- BUG_ON(!op->write_point.v);
- BUG_ON(!bkey_cmp(op->pos, POS_MAX));
- BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX);
-
- op->start_time = local_clock();
-
- memset(&op->failed, 0, sizeof(op->failed));
-
- bch2_keylist_init(&op->insert_keys, op->inline_keys);
- wbio_init(&op->wbio.bio)->put_bio = false;
-
- if (c->opts.nochanges ||
- !percpu_ref_tryget(&c->writes)) {
- __bcache_io_error(c, "read only");
- op->error = -EROFS;
- if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
- bch2_disk_reservation_put(c, &op->res);
- closure_return(cl);
- return;
- }
-
- bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE);
-
- continue_at_nobarrier(cl, __bch2_write, NULL);
-}
-
-/* Cache promotion on read */
-
-struct promote_op {
- struct closure cl;
- u64 start_time;
-
- struct rhash_head hash;
- struct bpos pos;
-
- struct migrate_write write;
- struct bio_vec bi_inline_vecs[0]; /* must be last */
-};
-
-static const struct rhashtable_params bch_promote_params = {
- .head_offset = offsetof(struct promote_op, hash),
- .key_offset = offsetof(struct promote_op, pos),
- .key_len = sizeof(struct bpos),
-};
-
-static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
- struct bpos pos,
- struct bch_io_opts opts,
- unsigned flags)
-{
- if (!opts.promote_target)
- return false;
-
- if (!(flags & BCH_READ_MAY_PROMOTE))
- return false;
-
- if (percpu_ref_is_dying(&c->writes))
- return false;
-
- if (!bkey_extent_is_data(k.k))
- return false;
-
- if (bch2_extent_has_target(c, bkey_s_c_to_extent(k), opts.promote_target))
- return false;
-
- if (bch2_target_congested(c, opts.promote_target))
- return false;
-
- if (rhashtable_lookup_fast(&c->promote_table, &pos,
- bch_promote_params))
- return false;
-
- return true;
-}
-
-static void promote_free(struct bch_fs *c, struct promote_op *op)
-{
- int ret;
-
- ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
- bch_promote_params);
- BUG_ON(ret);
- percpu_ref_put(&c->writes);
- kfree(op);
-}
-
-static void promote_done(struct closure *cl)
-{
- struct promote_op *op =
- container_of(cl, struct promote_op, cl);
- struct bch_fs *c = op->write.op.c;
-
- bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
- op->start_time);
-
- bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
- promote_free(c, op);
-}
-
-static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
-{
- struct bch_fs *c = rbio->c;
- struct closure *cl = &op->cl;
- struct bio *bio = &op->write.op.wbio.bio;
-
- trace_promote(&rbio->bio);
-
- /* we now own pages: */
- BUG_ON(!rbio->bounce);
- BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
-
- memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
- sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
- swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
-
- bch2_migrate_read_done(&op->write, rbio);
-
- closure_init(cl, NULL);
- closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
- closure_return_with_destructor(cl, promote_done);
-}
-
-noinline
-static struct promote_op *__promote_alloc(struct bch_fs *c,
- struct bpos pos,
- struct extent_pick_ptr *pick,
- struct bch_io_opts opts,
- unsigned rbio_sectors,
- struct bch_read_bio **rbio)
-{
- struct promote_op *op = NULL;
- struct bio *bio;
- unsigned rbio_pages = DIV_ROUND_UP(rbio_sectors, PAGE_SECTORS);
- /* data might have to be decompressed in the write path: */
- unsigned wbio_pages = DIV_ROUND_UP(pick->crc.uncompressed_size,
- PAGE_SECTORS);
- int ret;
-
- if (!percpu_ref_tryget(&c->writes))
- return NULL;
-
- op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * wbio_pages,
- GFP_NOIO);
- if (!op)
- goto err;
-
- op->start_time = local_clock();
- op->pos = pos;
-
- /*
- * promotes require bouncing, but if the extent isn't
- * checksummed/compressed it might be too big for the mempool:
- */
- if (rbio_sectors > c->sb.encoded_extent_max) {
- *rbio = kzalloc(sizeof(struct bch_read_bio) +
- sizeof(struct bio_vec) * rbio_pages,
- GFP_NOIO);
- if (!*rbio)
- goto err;
-
- rbio_init(&(*rbio)->bio, opts);
- bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs,
- rbio_pages);
-
- (*rbio)->bio.bi_iter.bi_size = rbio_sectors << 9;
- bch2_bio_map(&(*rbio)->bio, NULL);
-
- if (bch2_bio_alloc_pages(&(*rbio)->bio, GFP_NOIO))
- goto err;
-
- (*rbio)->bounce = true;
- (*rbio)->split = true;
- (*rbio)->kmalloc = true;
- }
-
- if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
- bch_promote_params))
- goto err;
-
- bio = &op->write.op.wbio.bio;
- bio_init(bio, bio->bi_inline_vecs, wbio_pages);
-
- ret = bch2_migrate_write_init(c, &op->write,
- writepoint_hashed((unsigned long) current),
- opts,
- DATA_PROMOTE,
- (struct data_opts) {
- .target = opts.promote_target
- },
- bkey_s_c_null);
- BUG_ON(ret);
-
- return op;
-err:
- if (*rbio)
- bio_free_pages(&(*rbio)->bio);
- kfree(*rbio);
- *rbio = NULL;
- kfree(op);
- percpu_ref_put(&c->writes);
- return NULL;
-}
-
-static inline struct promote_op *promote_alloc(struct bch_fs *c,
- struct bvec_iter iter,
- struct bkey_s_c k,
- struct extent_pick_ptr *pick,
- struct bch_io_opts opts,
- unsigned flags,
- struct bch_read_bio **rbio,
- bool *bounce,
- bool *read_full)
-{
- bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
- unsigned sectors = promote_full
- ? pick->crc.compressed_size
- : bvec_iter_sectors(iter);
- struct bpos pos = promote_full
- ? bkey_start_pos(k.k)
- : POS(k.k->p.inode, iter.bi_sector);
- struct promote_op *promote;
-
- if (!should_promote(c, k, pos, opts, flags))
- return NULL;
-
- promote = __promote_alloc(c, pos, pick, opts, sectors, rbio);
- if (!promote)
- return NULL;
-
- *bounce = true;
- *read_full = promote_full;
- return promote;
-}
-
-/* Read */
-
-#define READ_RETRY_AVOID 1
-#define READ_RETRY 2
-#define READ_ERR 3
-
-enum rbio_context {
- RBIO_CONTEXT_NULL,
- RBIO_CONTEXT_HIGHPRI,
- RBIO_CONTEXT_UNBOUND,
-};
-
-static inline struct bch_read_bio *
-bch2_rbio_parent(struct bch_read_bio *rbio)
-{
- return rbio->split ? rbio->parent : rbio;
-}
-
-__always_inline
-static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
- enum rbio_context context,
- struct workqueue_struct *wq)
-{
- if (context <= rbio->context) {
- fn(&rbio->work);
- } else {
- rbio->work.func = fn;
- rbio->context = context;
- queue_work(wq, &rbio->work);
- }
-}
-
-static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
-{
- BUG_ON(rbio->bounce && !rbio->split);
-
- if (rbio->promote)
- promote_free(rbio->c, rbio->promote);
- rbio->promote = NULL;
-
- if (rbio->bounce)
- bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
-
- if (rbio->split) {
- struct bch_read_bio *parent = rbio->parent;
-
- if (rbio->kmalloc)
- kfree(rbio);
- else
- bio_put(&rbio->bio);
-
- rbio = parent;
- }
-
- return rbio;
-}
-
-static void bch2_rbio_done(struct bch_read_bio *rbio)
-{
- bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
- rbio->start_time);
- bio_endio(&rbio->bio);
-}
-
-static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, u64 inode,
- struct bch_devs_mask *avoid, unsigned flags)
-{
- struct btree_iter iter;
- BKEY_PADDED(k) tmp;
- struct bkey_s_c k;
- int ret;
-
- flags &= ~BCH_READ_LAST_FRAGMENT;
-
- bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
- rbio->pos, BTREE_ITER_SLOTS);
-retry:
- rbio->bio.bi_status = 0;
-
- k = bch2_btree_iter_peek_slot(&iter);
- if (btree_iter_err(k)) {
- bch2_btree_iter_unlock(&iter);
- goto err;
- }
-
- bkey_reassemble(&tmp.k, k);
- k = bkey_i_to_s_c(&tmp.k);
- bch2_btree_iter_unlock(&iter);
-
- if (!bkey_extent_is_data(k.k) ||
- !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k),
- rbio->pick.ptr,
- rbio->pos.offset -
- rbio->pick.crc.offset)) {
- /* extent we wanted to read no longer exists: */
- rbio->hole = true;
- goto out;
- }
-
- ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags);
- if (ret == READ_RETRY)
- goto retry;
- if (ret)
- goto err;
- goto out;
-err:
- rbio->bio.bi_status = BLK_STS_IOERR;
-out:
- bch2_rbio_done(rbio);
-}
-
-static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, u64 inode,
- struct bch_devs_mask *avoid, unsigned flags)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- flags &= ~BCH_READ_LAST_FRAGMENT;
- flags |= BCH_READ_MUST_CLONE;
-retry:
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
- POS(inode, bvec_iter.bi_sector),
- BTREE_ITER_SLOTS, k) {
- BKEY_PADDED(k) tmp;
- unsigned bytes;
-
- bkey_reassemble(&tmp.k, k);
- k = bkey_i_to_s_c(&tmp.k);
- bch2_btree_iter_unlock(&iter);
-
- bytes = min_t(unsigned, bvec_iter.bi_size,
- (k.k->p.offset - bvec_iter.bi_sector) << 9);
- swap(bvec_iter.bi_size, bytes);
-
- ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags);
- switch (ret) {
- case READ_RETRY:
- goto retry;
- case READ_ERR:
- goto err;
- };
-
- if (bytes == bvec_iter.bi_size)
- goto out;
-
- swap(bvec_iter.bi_size, bytes);
- bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
- }
-
- /*
- * If we get here, it better have been because there was an error
- * reading a btree node
- */
- ret = bch2_btree_iter_unlock(&iter);
- BUG_ON(!ret);
- __bcache_io_error(c, "btree IO error %i", ret);
-err:
- rbio->bio.bi_status = BLK_STS_IOERR;
-out:
- bch2_rbio_done(rbio);
-}
-
-static void bch2_rbio_retry(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct bvec_iter iter = rbio->bvec_iter;
- unsigned flags = rbio->flags;
- u64 inode = rbio->pos.inode;
- struct bch_devs_mask avoid;
-
- trace_read_retry(&rbio->bio);
-
- memset(&avoid, 0, sizeof(avoid));
-
- if (rbio->retry == READ_RETRY_AVOID)
- __set_bit(rbio->pick.ptr.dev, avoid.d);
-
- rbio->bio.bi_status = 0;
-
- rbio = bch2_rbio_free(rbio);
-
- flags |= BCH_READ_IN_RETRY;
- flags &= ~BCH_READ_MAY_PROMOTE;
-
- if (flags & BCH_READ_NODECODE)
- bch2_read_retry_nodecode(c, rbio, iter, inode, &avoid, flags);
- else
- bch2_read_retry(c, rbio, iter, inode, &avoid, flags);
-}
-
-static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
- blk_status_t error)
-{
- rbio->retry = retry;
-
- if (rbio->flags & BCH_READ_IN_RETRY)
- return;
-
- if (retry == READ_ERR) {
- rbio = bch2_rbio_free(rbio);
-
- rbio->bio.bi_status = error;
- bch2_rbio_done(rbio);
- } else {
- bch2_rbio_punt(rbio, bch2_rbio_retry,
- RBIO_CONTEXT_UNBOUND, system_unbound_wq);
- }
-}
-
-static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
-{
- struct bch_fs *c = rbio->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_i_extent *e;
- BKEY_PADDED(k) new;
- struct bch_extent_crc_unpacked new_crc;
- unsigned offset;
- int ret;
-
- if (rbio->pick.crc.compression_type)
- return;
-
- bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, rbio->pos,
- BTREE_ITER_INTENT);
-retry:
- k = bch2_btree_iter_peek(&iter);
- if (IS_ERR_OR_NULL(k.k))
- goto out;
-
- if (!bkey_extent_is_data(k.k))
- goto out;
-
- bkey_reassemble(&new.k, k);
- e = bkey_i_to_extent(&new.k);
-
- if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e),
- rbio->pick.ptr,
- rbio->pos.offset -
- rbio->pick.crc.offset) ||
- bversion_cmp(e->k.version, rbio->version))
- goto out;
-
- /* Extent was merged? */
- if (bkey_start_offset(&e->k) < rbio->pos.offset ||
- e->k.p.offset > rbio->pos.offset + rbio->pick.crc.uncompressed_size)
- goto out;
-
- /* The extent might have been partially overwritten since we read it: */
- offset = rbio->pick.crc.offset + (bkey_start_offset(&e->k) - rbio->pos.offset);
-
- if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
- rbio->pick.crc, NULL, &new_crc,
- offset, e->k.size,
- rbio->pick.crc.csum_type)) {
- bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
- goto out;
- }
-
- if (!bch2_extent_narrow_crcs(e, new_crc))
- goto out;
-
- ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_NOWAIT,
- BTREE_INSERT_ENTRY(&iter, &e->k_i));
- if (ret == -EINTR)
- goto retry;
-out:
- bch2_btree_iter_unlock(&iter);
-}
-
-static bool should_narrow_crcs(struct bkey_s_c k,
- struct extent_pick_ptr *pick,
- unsigned flags)
-{
- return !(flags & BCH_READ_IN_RETRY) &&
- bkey_extent_is_data(k.k) &&
- bch2_can_narrow_extent_crcs(bkey_s_c_to_extent(k), pick->crc);
-}
-
-/* Inner part that may run in process context */
-static void __bch2_read_endio(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
- struct bio *src = &rbio->bio;
- struct bio *dst = &bch2_rbio_parent(rbio)->bio;
- struct bvec_iter dst_iter = rbio->bvec_iter;
- struct bch_extent_crc_unpacked crc = rbio->pick.crc;
- struct nonce nonce = extent_nonce(rbio->version, crc);
- struct bch_csum csum;
-
- /* Reset iterator for checksumming and copying bounced data: */
- if (rbio->bounce) {
- src->bi_iter.bi_size = crc.compressed_size << 9;
- src->bi_iter.bi_idx = 0;
- src->bi_iter.bi_bvec_done = 0;
- } else {
- src->bi_iter = rbio->bvec_iter;
- }
-
- csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
- if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
- goto csum_err;
-
- if (unlikely(rbio->narrow_crcs))
- bch2_rbio_narrow_crcs(rbio);
-
- if (rbio->flags & BCH_READ_NODECODE)
- goto nodecode;
-
- /* Adjust crc to point to subset of data we want: */
- crc.offset += rbio->bvec_iter.bi_sector - rbio->pos.offset;
- crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
-
- if (crc.compression_type != BCH_COMPRESSION_NONE) {
- bch2_encrypt_bio(c, crc.csum_type, nonce, src);
- if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
- goto decompression_err;
- } else {
- /* don't need to decrypt the entire bio: */
- nonce = nonce_add(nonce, crc.offset << 9);
- bio_advance(src, crc.offset << 9);
-
- BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
- src->bi_iter.bi_size = dst_iter.bi_size;
-
- bch2_encrypt_bio(c, crc.csum_type, nonce, src);
-
- if (rbio->bounce) {
- struct bvec_iter src_iter = src->bi_iter;
- bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
- }
- }
-
- if (rbio->promote) {
- /*
-		 * Re-encrypt data we decrypted, so it's consistent with
- * rbio->crc:
- */
- bch2_encrypt_bio(c, crc.csum_type, nonce, src);
- promote_start(rbio->promote, rbio);
- rbio->promote = NULL;
- }
-nodecode:
- if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
- rbio = bch2_rbio_free(rbio);
- bch2_rbio_done(rbio);
- }
- return;
-csum_err:
- /*
- * Checksum error: if the bio wasn't bounced, we may have been
- * reading into buffers owned by userspace (that userspace can
- * scribble over) - retry the read, bouncing it this time:
- */
- if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
- rbio->flags |= BCH_READ_MUST_BOUNCE;
- bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
- return;
- }
-
- bch2_dev_io_error(ca,
- "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
- rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
- rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
- csum.hi, csum.lo, crc.csum_type);
- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
- return;
-decompression_err:
- __bcache_io_error(c, "decompression error, inode %llu offset %llu",
- rbio->pos.inode,
- (u64) rbio->bvec_iter.bi_sector);
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
- return;
-}
-
-static void bch2_read_endio(struct bio *bio)
-{
- struct bch_read_bio *rbio =
- container_of(bio, struct bch_read_bio, bio);
- struct bch_fs *c = rbio->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
- struct workqueue_struct *wq = NULL;
- enum rbio_context context = RBIO_CONTEXT_NULL;
-
- if (rbio->have_ioref) {
- bch2_latency_acct(ca, rbio->submit_time, READ);
- percpu_ref_put(&ca->io_ref);
- }
-
- if (!rbio->split)
- rbio->bio.bi_end_io = rbio->end_io;
-
- if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
- bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
- return;
- }
-
- if (rbio->pick.ptr.cached &&
- (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
- ptr_stale(ca, &rbio->pick.ptr))) {
- atomic_long_inc(&c->read_realloc_races);
-
- if (rbio->flags & BCH_READ_RETRY_IF_STALE)
- bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
- else
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
- return;
- }
-
- if (rbio->narrow_crcs ||
- rbio->pick.crc.compression_type ||
- bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
- context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
- else if (rbio->pick.crc.csum_type)
- context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
-
- bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
-}
-
-int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
- struct bvec_iter iter, struct bkey_s_c k,
- struct bch_devs_mask *avoid, unsigned flags)
-{
- struct extent_pick_ptr pick;
- struct bch_read_bio *rbio = NULL;
- struct bch_dev *ca;
- struct promote_op *promote = NULL;
- bool bounce = false, read_full = false, narrow_crcs = false;
- struct bpos pos = bkey_start_pos(k.k);
- int pick_ret;
-
- pick_ret = bch2_extent_pick_ptr(c, k, avoid, &pick);
-
- /* hole or reservation - just zero fill: */
- if (!pick_ret)
- goto hole;
-
- if (pick_ret < 0)
- goto no_device;
-
- if (pick_ret > 0)
- ca = bch_dev_bkey_exists(c, pick.ptr.dev);
-
- if (flags & BCH_READ_NODECODE) {
- /*
- * can happen if we retry, and the extent we were going to read
- * has been merged in the meantime:
- */
- if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
- goto hole;
-
- iter.bi_sector = pos.offset;
- iter.bi_size = pick.crc.compressed_size << 9;
- goto noclone;
- }
-
- if (!(flags & BCH_READ_LAST_FRAGMENT) ||
- bio_flagged(&orig->bio, BIO_CHAIN))
- flags |= BCH_READ_MUST_CLONE;
-
- narrow_crcs = should_narrow_crcs(k, &pick, flags);
-
- if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
- flags |= BCH_READ_MUST_BOUNCE;
-
- EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector ||
- k.k->p.offset < bvec_iter_end_sector(iter));
-
- if (pick.crc.compression_type != BCH_COMPRESSION_NONE ||
- (pick.crc.csum_type != BCH_CSUM_NONE &&
- (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
- (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
- (flags & BCH_READ_USER_MAPPED)) ||
- (flags & BCH_READ_MUST_BOUNCE)))) {
- read_full = true;
- bounce = true;
- }
-
- promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
- &rbio, &bounce, &read_full);
-
- if (!read_full) {
- EBUG_ON(pick.crc.compression_type);
- EBUG_ON(pick.crc.csum_type &&
- (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
- bvec_iter_sectors(iter) != pick.crc.live_size ||
- pick.crc.offset ||
- iter.bi_sector != pos.offset));
-
- pick.ptr.offset += pick.crc.offset +
- (iter.bi_sector - pos.offset);
- pick.crc.compressed_size = bvec_iter_sectors(iter);
- pick.crc.uncompressed_size = bvec_iter_sectors(iter);
- pick.crc.offset = 0;
- pick.crc.live_size = bvec_iter_sectors(iter);
- pos.offset = iter.bi_sector;
- }
-
- if (rbio) {
- /* promote already allocated bounce rbio */
- } else if (bounce) {
- unsigned sectors = pick.crc.compressed_size;
-
- rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
- DIV_ROUND_UP(sectors, PAGE_SECTORS),
- &c->bio_read_split),
- orig->opts);
-
- bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
- rbio->bounce = true;
- rbio->split = true;
- } else if (flags & BCH_READ_MUST_CLONE) {
- /*
- * Have to clone if there were any splits, due to error
- * reporting issues (if a split errored, and retrying didn't
- * work, when it reports the error to its parent (us) we don't
- * know if the error was from our bio, and we should retry, or
- * from the whole bio, in which case we don't want to retry and
- * lose the error)
- */
- rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
- &c->bio_read_split),
- orig->opts);
- rbio->bio.bi_iter = iter;
- rbio->split = true;
- } else {
-noclone:
- rbio = orig;
- rbio->bio.bi_iter = iter;
- BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
- }
-
- BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
-
- rbio->c = c;
- rbio->submit_time = local_clock();
- if (rbio->split)
- rbio->parent = orig;
- else
- rbio->end_io = orig->bio.bi_end_io;
- rbio->bvec_iter = iter;
- rbio->flags = flags;
- rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
- rbio->narrow_crcs = narrow_crcs;
- rbio->hole = 0;
- rbio->retry = 0;
- rbio->context = 0;
- rbio->devs_have = bch2_bkey_devs(k);
- rbio->pick = pick;
- rbio->pos = pos;
- rbio->version = k.k->version;
- rbio->promote = promote;
- INIT_WORK(&rbio->work, NULL);
-
- rbio->bio.bi_opf = orig->bio.bi_opf;
- rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
- rbio->bio.bi_end_io = bch2_read_endio;
-
- if (rbio->bounce)
- trace_read_bounce(&rbio->bio);
-
- bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
-
- if (!rbio->have_ioref)
- goto no_device_postclone;
-
- percpu_down_read_preempt_disable(&c->usage_lock);
- bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
- percpu_up_read_preempt_enable(&c->usage_lock);
-
- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
- bio_sectors(&rbio->bio));
-
- bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
-
- if (likely(!(flags & BCH_READ_IN_RETRY))) {
- if (!(flags & BCH_READ_LAST_FRAGMENT)) {
- bio_inc_remaining(&orig->bio);
- trace_read_split(&orig->bio);
- }
-
- submit_bio(&rbio->bio);
- return 0;
- } else {
- int ret;
-
- submit_bio_wait(&rbio->bio);
-
- rbio->context = RBIO_CONTEXT_UNBOUND;
- bch2_read_endio(&rbio->bio);
-
- ret = rbio->retry;
- rbio = bch2_rbio_free(rbio);
-
- if (ret == READ_RETRY_AVOID) {
- __set_bit(pick.ptr.dev, avoid->d);
- ret = READ_RETRY;
- }
-
- return ret;
- }
-
-no_device_postclone:
- if (!rbio->split)
- rbio->bio.bi_end_io = rbio->end_io;
- bch2_rbio_free(rbio);
-no_device:
- __bcache_io_error(c, "no device to read from");
-
- if (likely(!(flags & BCH_READ_IN_RETRY))) {
- orig->bio.bi_status = BLK_STS_IOERR;
-
- if (flags & BCH_READ_LAST_FRAGMENT)
- bch2_rbio_done(orig);
- return 0;
- } else {
- return READ_ERR;
- }
-
-hole:
- /*
- * won't normally happen in the BCH_READ_NODECODE
- * (bch2_move_extent()) path, but if we retry and the extent we wanted
- * to read no longer exists we have to signal that:
- */
- if (flags & BCH_READ_NODECODE)
- orig->hole = true;
-
- zero_fill_bio_iter(&orig->bio, iter);
-
- if (flags & BCH_READ_LAST_FRAGMENT)
- bch2_rbio_done(orig);
- return 0;
-}
-
-void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- unsigned flags = BCH_READ_RETRY_IF_STALE|
- BCH_READ_MAY_PROMOTE|
- BCH_READ_USER_MAPPED;
- int ret;
-
- BUG_ON(rbio->_state);
- BUG_ON(flags & BCH_READ_NODECODE);
- BUG_ON(flags & BCH_READ_IN_RETRY);
-
- rbio->c = c;
- rbio->start_time = local_clock();
-
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
- POS(inode, rbio->bio.bi_iter.bi_sector),
- BTREE_ITER_SLOTS, k) {
- BKEY_PADDED(k) tmp;
- unsigned bytes;
-
- /*
- * Unlock the iterator while the btree node's lock is still in
- * cache, before doing the IO:
- */
- bkey_reassemble(&tmp.k, k);
- k = bkey_i_to_s_c(&tmp.k);
- bch2_btree_iter_unlock(&iter);
-
- bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size,
- (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9);
- swap(rbio->bio.bi_iter.bi_size, bytes);
-
- if (rbio->bio.bi_iter.bi_size == bytes)
- flags |= BCH_READ_LAST_FRAGMENT;
-
- bch2_read_extent(c, rbio, k, flags);
-
- if (flags & BCH_READ_LAST_FRAGMENT)
- return;
-
- swap(rbio->bio.bi_iter.bi_size, bytes);
- bio_advance(&rbio->bio, bytes);
- }
-
- /*
- * If we get here, it better have been because there was an error
- * reading a btree node
- */
- ret = bch2_btree_iter_unlock(&iter);
- BUG_ON(!ret);
- bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
- bch2_rbio_done(rbio);
-}
-
-void bch2_fs_io_exit(struct bch_fs *c)
-{
- if (c->promote_table.tbl)
- rhashtable_destroy(&c->promote_table);
- mempool_exit(&c->bio_bounce_pages);
- bioset_exit(&c->bio_write);
- bioset_exit(&c->bio_read_split);
- bioset_exit(&c->bio_read);
-}
-
-int bch2_fs_io_init(struct bch_fs *c)
-{
- if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
- BIOSET_NEED_BVECS) ||
- bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
- BIOSET_NEED_BVECS) ||
- bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
- BIOSET_NEED_BVECS) ||
- mempool_init_page_pool(&c->bio_bounce_pages,
- max_t(unsigned,
- c->opts.btree_node_size,
- c->sb.encoded_extent_max) /
- PAGE_SECTORS, 0) ||
- rhashtable_init(&c->promote_table, &bch_promote_params))
- return -ENOMEM;
-
- return 0;
-}
diff --git a/libbcachefs/io.h b/libbcachefs/io.h
deleted file mode 100644
index 68539c78..00000000
--- a/libbcachefs/io.h
+++ /dev/null
@@ -1,139 +0,0 @@
-#ifndef _BCACHEFS_IO_H
-#define _BCACHEFS_IO_H
-
-#include "alloc.h"
-#include "checksum.h"
-#include "io_types.h"
-
-#define to_wbio(_bio) \
- container_of((_bio), struct bch_write_bio, bio)
-
-#define to_rbio(_bio) \
- container_of((_bio), struct bch_read_bio, bio)
-
-void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
-void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
-void bch2_bio_alloc_more_pages_pool(struct bch_fs *, struct bio *, size_t);
-
-void bch2_latency_acct(struct bch_dev *, u64, int);
-
-void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
- enum bch_data_type, const struct bkey_i *);
-
-#define BLK_STS_REMOVED ((__force blk_status_t)128)
-
-enum bch_write_flags {
- BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
- BCH_WRITE_CACHED = (1 << 1),
- BCH_WRITE_FLUSH = (1 << 2),
- BCH_WRITE_DATA_ENCODED = (1 << 3),
- BCH_WRITE_PAGES_STABLE = (1 << 4),
- BCH_WRITE_PAGES_OWNED = (1 << 5),
- BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6),
- BCH_WRITE_NOPUT_RESERVATION = (1 << 7),
- BCH_WRITE_NOMARK_REPLICAS = (1 << 8),
-
- /* Internal: */
- BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9),
-};
-
-static inline u64 *op_journal_seq(struct bch_write_op *op)
-{
- return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR)
- ? op->journal_seq_p : &op->journal_seq;
-}
-
-static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq)
-{
- op->journal_seq_p = journal_seq;
- op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
-}
-
-static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
-{
- return op->alloc_reserve == RESERVE_MOVINGGC
- ? op->c->copygc_wq
- : op->c->wq;
-}
-
-int bch2_write_index_default(struct bch_write_op *);
-
-static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
- struct bch_io_opts opts)
-{
- op->c = c;
- op->io_wq = index_update_wq(op);
- op->flags = 0;
- op->written = 0;
- op->error = 0;
- op->csum_type = bch2_data_checksum_type(c, opts.data_checksum);
- op->compression_type = bch2_compression_opt_to_type[opts.compression];
- op->nr_replicas = 0;
- op->nr_replicas_required = c->opts.data_replicas_required;
- op->alloc_reserve = RESERVE_NONE;
- op->open_buckets_nr = 0;
- op->devs_have.nr = 0;
- op->target = 0;
- op->opts = opts;
- op->pos = POS_MAX;
- op->version = ZERO_VERSION;
- op->write_point = (struct write_point_specifier) { 0 };
- op->res = (struct disk_reservation) { 0 };
- op->journal_seq = 0;
- op->index_update_fn = bch2_write_index_default;
-}
-
-void bch2_write(struct closure *);
-
-static inline struct bch_write_bio *wbio_init(struct bio *bio)
-{
- struct bch_write_bio *wbio = to_wbio(bio);
-
- memset(wbio, 0, offsetof(struct bch_write_bio, bio));
- return wbio;
-}
-
-struct bch_devs_mask;
-struct cache_promote_op;
-struct extent_pick_ptr;
-
-int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
- struct bkey_s_c, struct bch_devs_mask *, unsigned);
-void bch2_read(struct bch_fs *, struct bch_read_bio *, u64);
-
-enum bch_read_flags {
- BCH_READ_RETRY_IF_STALE = 1 << 0,
- BCH_READ_MAY_PROMOTE = 1 << 1,
- BCH_READ_USER_MAPPED = 1 << 2,
- BCH_READ_NODECODE = 1 << 3,
- BCH_READ_LAST_FRAGMENT = 1 << 4,
-
- /* internal: */
- BCH_READ_MUST_BOUNCE = 1 << 5,
- BCH_READ_MUST_CLONE = 1 << 6,
- BCH_READ_IN_RETRY = 1 << 7,
-};
-
-static inline void bch2_read_extent(struct bch_fs *c,
- struct bch_read_bio *rbio,
- struct bkey_s_c k,
- unsigned flags)
-{
- __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, NULL, flags);
-}
-
-static inline struct bch_read_bio *rbio_init(struct bio *bio,
- struct bch_io_opts opts)
-{
- struct bch_read_bio *rbio = to_rbio(bio);
-
- rbio->_state = 0;
- rbio->promote = NULL;
- rbio->opts = opts;
- return rbio;
-}
-
-void bch2_fs_io_exit(struct bch_fs *);
-int bch2_fs_io_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_IO_H */
diff --git a/libbcachefs/io_misc.c b/libbcachefs/io_misc.c
new file mode 100644
index 00000000..53539791
--- /dev/null
+++ b/libbcachefs/io_misc.c
@@ -0,0 +1,542 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * io_misc.c - fallocate, fpunch, truncate, finsert/fcollapse:
+ */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "clock.h"
+#include "error.h"
+#include "extents.h"
+#include "extent_update.h"
+#include "inode.h"
+#include "io_misc.h"
+#include "io_write.h"
+#include "logged_ops.h"
+#include "rebalance.h"
+#include "subvolume.h"
+
+/* Overwrites whatever was present with zeroes: */
+int bch2_extent_fallocate(struct btree_trans *trans,
+ subvol_inum inum,
+ struct btree_iter *iter,
+ u64 sectors,
+ struct bch_io_opts opts,
+ s64 *i_sectors_delta,
+ struct write_point_specifier write_point)
+{
+ struct bch_fs *c = trans->c;
+ struct disk_reservation disk_res = { 0 };
+ struct closure cl;
+ struct open_buckets open_buckets = { 0 };
+ struct bkey_s_c k;
+ struct bkey_buf old, new;
+ unsigned sectors_allocated = 0, new_replicas;
+ bool unwritten = opts.nocow &&
+ c->sb.version >= bcachefs_metadata_version_unwritten_extents;
+ int ret;
+
+ bch2_bkey_buf_init(&old);
+ bch2_bkey_buf_init(&new);
+ closure_init_stack(&cl);
+
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
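+	/* Only reserve space for replicas the extent doesn't already have fully allocated: */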
+ new_replicas = max(0, (int) opts.data_replicas -
+ (int) bch2_bkey_nr_ptrs_fully_allocated(k));
+
+ /*
+ * Get a disk reservation before (in the nocow case) calling
+ * into the allocator:
+ */
+ ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
+ if (unlikely(ret))
+ goto err_noprint;
+
+ bch2_bkey_buf_reassemble(&old, c, k);
+
+ if (!unwritten) {
+ struct bkey_i_reservation *reservation;
+
+ bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
+ reservation = bkey_reservation_init(new.k);
+ reservation->k.p = iter->pos;
+ bch2_key_resize(&reservation->k, sectors);
+ reservation->v.nr_replicas = opts.data_replicas;
+ } else {
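+		/* nocow + unwritten extents: allocate space now, mark the new pointers unwritten: */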
+ struct bkey_i_extent *e;
+ struct bch_devs_list devs_have;
+ struct write_point *wp;
+
+ devs_have.nr = 0;
+
+ bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);
+
+ e = bkey_extent_init(new.k);
+ e->k.p = iter->pos;
+
+ ret = bch2_alloc_sectors_start_trans(trans,
+ opts.foreground_target,
+ false,
+ write_point,
+ &devs_have,
+ opts.data_replicas,
+ opts.data_replicas,
+ BCH_WATERMARK_normal, 0, &cl, &wp);
+ if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+ ret = -BCH_ERR_transaction_restart_nested;
+ if (ret)
+ goto err;
+
+ sectors = min_t(u64, sectors, wp->sectors_free);
+ sectors_allocated = sectors;
+
+ bch2_key_resize(&e->k, sectors);
+
+ bch2_open_bucket_get(c, wp, &open_buckets);
+ bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
+ bch2_alloc_sectors_done(c, wp);
+
+ extent_for_each_ptr(extent_i_to_s(e), ptr)
+ ptr->unwritten = true;
+ }
+
+ ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
+ 0, i_sectors_delta, true);
+err:
+ if (!ret && sectors_allocated)
+ bch2_increment_clock(c, sectors_allocated, WRITE);
+ if (should_print_err(ret)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9);
+ prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret));
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+err_noprint:
+ bch2_open_buckets_put(c, &open_buckets);
+ bch2_disk_reservation_put(c, &disk_res);
+ bch2_bkey_buf_exit(&new, c);
+ bch2_bkey_buf_exit(&old, c);
+
+ if (closure_nr_remaining(&cl) != 1) {
+ bch2_trans_unlock_long(trans);
+ bch2_wait_on_allocator(c, &cl);
+ }
+
+ return ret;
+}
+
+/*
+ * Returns -BCH_ERR_transaction_restart if we had to drop locks:
+ */
+int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
+ subvol_inum inum, u64 end,
+ s64 *i_sectors_delta)
+{
+ struct bch_fs *c = trans->c;
+ unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
+ struct bpos end_pos = POS(inum.inum, end);
+ struct bkey_s_c k;
+ int ret = 0, ret2 = 0;
+ u32 snapshot;
+
+ while (!ret ||
+ bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ struct disk_reservation disk_res =
+ bch2_disk_reservation_init(c, 0);
+ struct bkey_i delete;
+
+ if (ret)
+ ret2 = ret;
+
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ continue;
+
+ bch2_btree_iter_set_snapshot(iter, snapshot);
+
+ /*
+ * peek_max() doesn't have ideal semantics for extents:
+ */
+ k = bch2_btree_iter_peek_max(iter, end_pos);
+ if (!k.k)
+ break;
+
+ ret = bkey_err(k);
+ if (ret)
+ continue;
+
+ bkey_init(&delete.k);
+ delete.k.p = iter->pos;
+
+ /* create the biggest key we can */
+ bch2_key_resize(&delete.k, max_sectors);
+ bch2_cut_back(end_pos, &delete);
+
+ ret = bch2_extent_update(trans, inum, iter, &delete,
+ &disk_res, 0, i_sectors_delta, false);
+ bch2_disk_reservation_put(c, &disk_res);
+ }
+
+ return ret ?: ret2;
+}
+
+int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
+ s64 *i_sectors_delta)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ POS(inum.inum, start),
+ BTREE_ITER_intent);
+
+ ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta);
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
+
+ return ret;
+}
+
+/* truncate: */
+
+void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k);
+
+ prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));
+ prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
+ prt_printf(out, " new_i_size=%llu", le64_to_cpu(op.v->new_i_size));
+}
+
+static int truncate_set_isize(struct btree_trans *trans,
+ subvol_inum inum,
+ u64 new_i_size,
+ bool warn)
+{
+ struct btree_iter iter = { NULL };
+ struct bch_inode_unpacked inode_u;
+ int ret;
+
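+	/* The ?: chain stops at the first error; the middle term just sets the new size: */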
+ ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn) ?:
+ (inode_u.bi_size = new_i_size, 0) ?:
+ bch2_inode_write(trans, &iter, &inode_u);
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
+ struct bkey_i *op_k,
+ u64 *i_sectors_delta)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter fpunch_iter;
+ struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k);
+ subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
+ u64 new_i_size = le64_to_cpu(op->v.new_i_size);
+ bool warn_errors = i_sectors_delta != NULL;
+ int ret;
+
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ truncate_set_isize(trans, inum, new_i_size, i_sectors_delta != NULL));
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents,
+ POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9),
+ BTREE_ITER_intent);
+ ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta);
+ bch2_trans_iter_exit(trans, &fpunch_iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
+err:
+ if (warn_errors)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k)
+{
+ return __bch2_resume_logged_op_truncate(trans, op_k, NULL);
+}
+
+int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta)
+{
+ struct bkey_i_logged_op_truncate op;
+
+ bkey_logged_op_truncate_init(&op.k_i);
+ op.v.subvol = cpu_to_le32(inum.subvol);
+ op.v.inum = cpu_to_le64(inum.inum);
+ op.v.new_i_size = cpu_to_le64(new_i_size);
+
+ /*
+ * Logged ops aren't atomic w.r.t. snapshot creation: creating a
+ * snapshot while they're in progress, then crashing, will result in the
+ * resume only proceeding in one of the snapshots
+ */
+ down_read(&c->snapshot_create_lock);
+ struct btree_trans *trans = bch2_trans_get(c);
+ int ret = bch2_logged_op_start(trans, &op.k_i);
+ if (ret)
+ goto out;
+ ret = __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta);
+ ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret;
+out:
+ bch2_trans_put(trans);
+ up_read(&c->snapshot_create_lock);
+
+ return ret;
+}
+
+/* finsert/fcollapse: */
+
+void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k);
+
+ prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));
+ prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
+ prt_printf(out, " dst_offset=%lli", le64_to_cpu(op.v->dst_offset));
+ prt_printf(out, " src_offset=%llu", le64_to_cpu(op.v->src_offset));
+}
+
+static int adjust_i_size(struct btree_trans *trans, subvol_inum inum,
+ u64 offset, s64 len, bool warn)
+{
+ struct btree_iter iter;
+ struct bch_inode_unpacked inode_u;
+ int ret;
+
+ offset <<= 9;
+ len <<= 9;
+
+ ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn);
+ if (ret)
+ return ret;
+
+ if (len > 0) {
+ if (MAX_LFS_FILESIZE - inode_u.bi_size < len) {
+ ret = -EFBIG;
+ goto err;
+ }
+
+ if (offset >= inode_u.bi_size) {
+ ret = -EINVAL;
+ goto err;
+ }
+ }
+
+ inode_u.bi_size += len;
+ inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(trans->c);
+
+ ret = bch2_inode_write(trans, &iter, &inode_u);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
+ struct bkey_i *op_k,
+ u64 *i_sectors_delta)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
+ subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
+ struct bch_io_opts opts;
+ u64 dst_offset = le64_to_cpu(op->v.dst_offset);
+ u64 src_offset = le64_to_cpu(op->v.src_offset);
+ s64 shift = dst_offset - src_offset;
+ u64 len = abs(shift);
+ u64 pos = le64_to_cpu(op->v.pos);
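+	/* Positive shift: finsert (opening a hole); negative shift: fcollapse: */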
+ bool insert = shift > 0;
+ u32 snapshot;
+ bool warn_errors = i_sectors_delta != NULL;
+ int ret = 0;
+
+ ret = bch2_inum_opts_get(trans, inum, &opts);
+ if (ret)
+ return ret;
+
+ /*
+	 * Check for a missing subvolume before fpunch, since on resume we don't
+	 * want it to be a fatal error:
+ */
+ ret = lockrestart_do(trans, __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn_errors));
+ if (ret)
+ return ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ POS(inum.inum, 0),
+ BTREE_ITER_intent);
+
+ switch (op->v.state) {
+case LOGGED_OP_FINSERT_start:
+ op->v.state = LOGGED_OP_FINSERT_shift_extents;
+
+ if (insert) {
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ adjust_i_size(trans, inum, src_offset, len, warn_errors) ?:
+ bch2_logged_op_update(trans, &op->k_i));
+ if (ret)
+ goto err;
+ } else {
+ bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset));
+
+ ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto err;
+
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_logged_op_update(trans, &op->k_i));
+ }
+
+ fallthrough;
+case LOGGED_OP_FINSERT_shift_extents:
+ while (1) {
+ struct disk_reservation disk_res =
+ bch2_disk_reservation_init(c, 0);
+ struct bkey_i delete, *copy;
+ struct bkey_s_c k;
+ struct bpos src_pos = POS(inum.inum, src_offset);
+
+ bch2_trans_begin(trans);
+
+ ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot,
+ warn_errors);
+ if (ret)
+ goto btree_err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
+ bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot));
+
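+		/* finsert walks extents backwards (they're shifted up); fcollapse walks forwards: */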
+ k = insert
+ ? bch2_btree_iter_peek_prev_min(&iter, POS(inum.inum, 0))
+ : bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX));
+ if ((ret = bkey_err(k)))
+ goto btree_err;
+
+ if (!k.k ||
+ k.k->p.inode != inum.inum ||
+ bkey_le(k.k->p, POS(inum.inum, src_offset)))
+ break;
+
+ copy = bch2_bkey_make_mut_noupdate(trans, k);
+ if ((ret = PTR_ERR_OR_ZERO(copy)))
+ goto btree_err;
+
+ if (insert &&
+ bkey_lt(bkey_start_pos(k.k), src_pos)) {
+ bch2_cut_front(src_pos, copy);
+
+ /* Splitting compressed extent? */
+ bch2_disk_reservation_add(c, &disk_res,
+ copy->k.size *
+ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)),
+ BCH_DISK_RESERVATION_NOFAIL);
+ }
+
+ bkey_init(&delete.k);
+ delete.k.p = copy->k.p;
+ delete.k.p.snapshot = snapshot;
+ delete.k.size = copy->k.size;
+
+ copy->k.p.offset += shift;
+ copy->k.p.snapshot = snapshot;
+
+ op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
+
+ ret = bch2_bkey_set_needs_rebalance(c, &opts, copy) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
+ bch2_logged_op_update(trans, &op->k_i) ?:
+ bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc);
+btree_err:
+ bch2_disk_reservation_put(c, &disk_res);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+
+ pos = le64_to_cpu(op->v.pos);
+ }
+
+ op->v.state = LOGGED_OP_FINSERT_finish;
+
+ if (!insert) {
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ adjust_i_size(trans, inum, src_offset, shift, warn_errors) ?:
+ bch2_logged_op_update(trans, &op->k_i));
+ } else {
+ /* We need an inode update to update bi_journal_seq for fsync: */
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ adjust_i_size(trans, inum, 0, 0, warn_errors) ?:
+ bch2_logged_op_update(trans, &op->k_i));
+ }
+
+ break;
+case LOGGED_OP_FINSERT_finish:
+ break;
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ if (warn_errors)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k)
+{
+ return __bch2_resume_logged_op_finsert(trans, op_k, NULL);
+}
+
+int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum,
+ u64 offset, u64 len, bool insert,
+ s64 *i_sectors_delta)
+{
+ struct bkey_i_logged_op_finsert op;
+ s64 shift = insert ? len : -len;
+
+ bkey_logged_op_finsert_init(&op.k_i);
+ op.v.subvol = cpu_to_le32(inum.subvol);
+ op.v.inum = cpu_to_le64(inum.inum);
+ op.v.dst_offset = cpu_to_le64(offset + shift);
+ op.v.src_offset = cpu_to_le64(offset);
+ op.v.pos = cpu_to_le64(insert ? U64_MAX : offset);
+
+ /*
+ * Logged ops aren't atomic w.r.t. snapshot creation: creating a
+ * snapshot while they're in progress, then crashing, will result in the
+ * resume only proceeding in one of the snapshots
+ */
+ down_read(&c->snapshot_create_lock);
+ struct btree_trans *trans = bch2_trans_get(c);
+ int ret = bch2_logged_op_start(trans, &op.k_i);
+ if (ret)
+ goto out;
+ ret = __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta);
+ ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret;
+out:
+ bch2_trans_put(trans);
+ up_read(&c->snapshot_create_lock);
+
+ return ret;
+}
diff --git a/libbcachefs/io_misc.h b/libbcachefs/io_misc.h
new file mode 100644
index 00000000..9cb44a7c
--- /dev/null
+++ b/libbcachefs/io_misc.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_MISC_H
+#define _BCACHEFS_IO_MISC_H
+
+int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
+ u64, struct bch_io_opts, s64 *,
+ struct write_point_specifier);
+int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
+ subvol_inum, u64, s64 *);
+int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
+
+void bch2_logged_op_truncate_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_logged_op_truncate ((struct bkey_ops) { \
+ .val_to_text = bch2_logged_op_truncate_to_text, \
+ .min_val_size = 24, \
+})
+
+int bch2_resume_logged_op_truncate(struct btree_trans *, struct bkey_i *);
+
+int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *);
+
+void bch2_logged_op_finsert_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_logged_op_finsert ((struct bkey_ops) { \
+ .val_to_text = bch2_logged_op_finsert_to_text, \
+ .min_val_size = 24, \
+})
+
+int bch2_resume_logged_op_finsert(struct btree_trans *, struct bkey_i *);
+
+int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *);
+
+#endif /* _BCACHEFS_IO_MISC_H */
diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c
new file mode 100644
index 00000000..34a3569d
--- /dev/null
+++ b/libbcachefs/io_read.c
@@ -0,0 +1,1320 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Some low level IO code, and hacks for various block layer limitations
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "clock.h"
+#include "compress.h"
+#include "data_update.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "io_read.h"
+#include "io_misc.h"
+#include "io_write.h"
+#include "reflink.h"
+#include "subvolume.h"
+#include "trace.h"
+
+#include <linux/sched/mm.h>
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
+{
+ const struct bch_devs_mask *devs;
+ unsigned d, nr = 0, total = 0;
+ u64 now = local_clock(), last;
+ s64 congested;
+ struct bch_dev *ca;
+
+ if (!target)
+ return false;
+
+ rcu_read_lock();
+ devs = bch2_target_to_mask(c, target) ?:
+ &c->rw_devs[BCH_DATA_user];
+
+ for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
+ ca = rcu_dereference(c->devs[d]);
+ if (!ca)
+ continue;
+
+ congested = atomic_read(&ca->congested);
+ last = READ_ONCE(ca->congested_last);
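+		/* Decay the congestion count based on time since it was last updated: */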
+ if (time_after64(now, last))
+ congested -= (now - last) >> 12;
+
+ total += max(congested, 0LL);
+ nr++;
+ }
+ rcu_read_unlock();
+
+ return bch2_rand_range(nr * CONGESTED_MAX) < total;
+}
+
+#else
+
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
+{
+ return false;
+}
+
+#endif
+
+/* Cache promotion on read */
+
+struct promote_op {
+ struct rcu_head rcu;
+ u64 start_time;
+
+ struct rhash_head hash;
+ struct bpos pos;
+
+ struct data_update write;
+ struct bio_vec bi_inline_vecs[]; /* must be last */
+};
+
+static const struct rhashtable_params bch_promote_params = {
+ .head_offset = offsetof(struct promote_op, hash),
+ .key_offset = offsetof(struct promote_op, pos),
+ .key_len = sizeof(struct bpos),
+ .automatic_shrinking = true,
+};
+
+static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
+ struct bpos pos,
+ struct bch_io_opts opts,
+ unsigned flags,
+ struct bch_io_failures *failed)
+{
+ if (!failed) {
+ BUG_ON(!opts.promote_target);
+
+ if (!(flags & BCH_READ_MAY_PROMOTE))
+ return -BCH_ERR_nopromote_may_not;
+
+ if (bch2_bkey_has_target(c, k, opts.promote_target))
+ return -BCH_ERR_nopromote_already_promoted;
+
+ if (bkey_extent_is_unwritten(k))
+ return -BCH_ERR_nopromote_unwritten;
+
+ if (bch2_target_congested(c, opts.promote_target))
+ return -BCH_ERR_nopromote_congested;
+ }
+
+ if (rhashtable_lookup_fast(&c->promote_table, &pos,
+ bch_promote_params))
+ return -BCH_ERR_nopromote_in_flight;
+
+ return 0;
+}
+
+static void promote_free(struct bch_fs *c, struct promote_op *op)
+{
+ int ret;
+
+ bch2_data_update_exit(&op->write);
+
+ ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
+ bch_promote_params);
+ BUG_ON(ret);
+ bch2_write_ref_put(c, BCH_WRITE_REF_promote);
+ kfree_rcu(op, rcu);
+}
+
+static void promote_done(struct bch_write_op *wop)
+{
+ struct promote_op *op =
+ container_of(wop, struct promote_op, write.op);
+ struct bch_fs *c = op->write.op.c;
+
+ bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
+ op->start_time);
+ promote_free(c, op);
+}
+
+static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
+{
+ struct bio *bio = &op->write.op.wbio.bio;
+
+ trace_and_count(op->write.op.c, read_promote, &rbio->bio);
+
+ /* we now own pages: */
+ BUG_ON(!rbio->bounce);
+ BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
+
+ memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
+ sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
+ swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
+
+ bch2_data_update_read_done(&op->write, rbio->pick.crc);
+}
+
+static struct promote_op *__promote_alloc(struct btree_trans *trans,
+ enum btree_id btree_id,
+ struct bkey_s_c k,
+ struct bpos pos,
+ struct extent_ptr_decoded *pick,
+ struct bch_io_opts opts,
+ unsigned sectors,
+ struct bch_read_bio **rbio,
+ struct bch_io_failures *failed)
+{
+ struct bch_fs *c = trans->c;
+ struct promote_op *op = NULL;
+ struct bio *bio;
+ unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+ int ret;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
+ return ERR_PTR(-BCH_ERR_nopromote_no_writes);
+
+ op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
+ if (!op) {
+ ret = -BCH_ERR_nopromote_enomem;
+ goto err;
+ }
+
+ op->start_time = local_clock();
+ op->pos = pos;
+
+ /*
+ * We don't use the mempool here because extents that aren't
+ * checksummed or compressed can be too big for the mempool:
+ */
+ *rbio = kzalloc(sizeof(struct bch_read_bio) +
+ sizeof(struct bio_vec) * pages,
+ GFP_KERNEL);
+ if (!*rbio) {
+ ret = -BCH_ERR_nopromote_enomem;
+ goto err;
+ }
+
+ rbio_init(&(*rbio)->bio, opts);
+ bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
+
+ if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) {
+ ret = -BCH_ERR_nopromote_enomem;
+ goto err;
+ }
+
+ (*rbio)->bounce = true;
+ (*rbio)->split = true;
+ (*rbio)->kmalloc = true;
+
+ if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
+ bch_promote_params)) {
+ ret = -BCH_ERR_nopromote_in_flight;
+ goto err;
+ }
+
+ bio = &op->write.op.wbio.bio;
+ bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
+
+ struct data_update_opts update_opts = {};
+
+ if (!failed) {
+ update_opts.target = opts.promote_target;
+ update_opts.extra_replicas = 1;
+ update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED;
+ } else {
+ update_opts.target = opts.foreground_target;
+
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned ptr_bit = 1;
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (bch2_dev_io_failures(failed, ptr->dev))
+ update_opts.rewrite_ptrs |= ptr_bit;
+ ptr_bit <<= 1;
+ }
+ }
+
+ ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
+ writepoint_hashed((unsigned long) current),
+ opts,
+ update_opts,
+ btree_id, k);
+ /*
+ * possible errors: -BCH_ERR_nocow_lock_blocked,
+ * -BCH_ERR_ENOSPC_disk_reservation:
+ */
+ if (ret) {
+ BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
+ bch_promote_params));
+ goto err;
+ }
+
+ op->write.op.end_io = promote_done;
+
+ return op;
+err:
+ if (*rbio)
+ bio_free_pages(&(*rbio)->bio);
+ kfree(*rbio);
+ *rbio = NULL;
+ /* We may have added to the rhashtable and thus need rcu freeing: */
+ kfree_rcu(op, rcu);
+ bch2_write_ref_put(c, BCH_WRITE_REF_promote);
+ return ERR_PTR(ret);
+}
+
+noinline
+static struct promote_op *promote_alloc(struct btree_trans *trans,
+ struct bvec_iter iter,
+ struct bkey_s_c k,
+ struct extent_ptr_decoded *pick,
+ struct bch_io_opts opts,
+ unsigned flags,
+ struct bch_read_bio **rbio,
+ bool *bounce,
+ bool *read_full,
+ struct bch_io_failures *failed)
+{
+ struct bch_fs *c = trans->c;
+ /*
+ * if failed != NULL we're not actually doing a promote, we're
+ * recovering from an io/checksum error
+ */
+ bool promote_full = (failed ||
+ *read_full ||
+ READ_ONCE(c->opts.promote_whole_extents));
+ /* data might have to be decompressed in the write path: */
+ unsigned sectors = promote_full
+ ? max(pick->crc.compressed_size, pick->crc.live_size)
+ : bvec_iter_sectors(iter);
+ struct bpos pos = promote_full
+ ? bkey_start_pos(k.k)
+ : POS(k.k->p.inode, iter.bi_sector);
+ struct promote_op *promote;
+ int ret;
+
+ ret = should_promote(c, k, pos, opts, flags, failed);
+ if (ret)
+ goto nopromote;
+
+ promote = __promote_alloc(trans,
+ k.k->type == KEY_TYPE_reflink_v
+ ? BTREE_ID_reflink
+ : BTREE_ID_extents,
+ k, pos, pick, opts, sectors, rbio, failed);
+ ret = PTR_ERR_OR_ZERO(promote);
+ if (ret)
+ goto nopromote;
+
+ *bounce = true;
+ *read_full = promote_full;
+ return promote;
+nopromote:
+ trace_read_nopromote(c, ret);
+ return NULL;
+}
+
+/* Read */
+
+static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
+ struct bch_read_bio *rbio, struct bpos read_pos)
+{
+ return bch2_inum_offset_err_msg_trans(trans, out,
+ (subvol_inum) { rbio->subvol, read_pos.inode },
+ read_pos.offset << 9);
+}
+
+static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
+ struct bch_read_bio *rbio, struct bpos read_pos)
+{
+ bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
+}
+
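+/*
+ * Read retry codes: READ_RETRY_AVOID retries while marking the device that
+ * failed so it isn't picked again, READ_RETRY retries as-is, READ_ERR gives
+ * up and completes the read with an error:
+ */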
+#define READ_RETRY_AVOID 1
+#define READ_RETRY 2
+#define READ_ERR 3
+
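+/*
+ * Completion contexts, in increasing order of what they're allowed to do;
+ * bch2_rbio_punt() runs work inline when the current context already
+ * suffices, otherwise it punts to the given workqueue:
+ */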
+enum rbio_context {
+ RBIO_CONTEXT_NULL,
+ RBIO_CONTEXT_HIGHPRI,
+ RBIO_CONTEXT_UNBOUND,
+};
+
+static inline struct bch_read_bio *
+bch2_rbio_parent(struct bch_read_bio *rbio)
+{
+ return rbio->split ? rbio->parent : rbio;
+}
+
+__always_inline
+static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
+ enum rbio_context context,
+ struct workqueue_struct *wq)
+{
+ if (context <= rbio->context) {
+ fn(&rbio->work);
+ } else {
+ rbio->work.func = fn;
+ rbio->context = context;
+ queue_work(wq, &rbio->work);
+ }
+}
+
+static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
+{
+ BUG_ON(rbio->bounce && !rbio->split);
+
+ if (rbio->promote)
+ promote_free(rbio->c, rbio->promote);
+ rbio->promote = NULL;
+
+ if (rbio->bounce)
+ bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
+
+ if (rbio->split) {
+ struct bch_read_bio *parent = rbio->parent;
+
+ if (rbio->kmalloc)
+ kfree(rbio);
+ else
+ bio_put(&rbio->bio);
+
+ rbio = parent;
+ }
+
+ return rbio;
+}
+
+/*
+ * Only called on a top level bch_read_bio to complete an entire read request,
+ * not a split:
+ */
+static void bch2_rbio_done(struct bch_read_bio *rbio)
+{
+ if (rbio->start_time)
+ bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
+ rbio->start_time);
+ bio_endio(&rbio->bio);
+}
+
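+/*
+ * Retry path for BCH_READ_NODECODE reads: re-look up the extent and resubmit
+ * without decoding; if the extent we wanted has since gone away, flag the
+ * read as a hole:
+ */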
+static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter,
+ struct bch_io_failures *failed,
+ unsigned flags)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_buf sk;
+ struct bkey_s_c k;
+ int ret;
+
+ flags &= ~BCH_READ_LAST_FRAGMENT;
+ flags |= BCH_READ_MUST_CLONE;
+
+ bch2_bkey_buf_init(&sk);
+
+ bch2_trans_iter_init(trans, &iter, rbio->data_btree,
+ rbio->read_pos, BTREE_ITER_slots);
+retry:
+ bch2_trans_begin(trans);
+ rbio->bio.bi_status = 0;
+
+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ if (ret)
+ goto err;
+
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
+
+ if (!bch2_bkey_matches_ptr(c, k,
+ rbio->pick.ptr,
+ rbio->data_pos.offset -
+ rbio->pick.crc.offset)) {
+ /* extent we wanted to read no longer exists: */
+ rbio->hole = true;
+ goto out;
+ }
+
+ ret = __bch2_read_extent(trans, rbio, bvec_iter,
+ rbio->read_pos,
+ rbio->data_btree,
+ k, 0, failed, flags);
+ if (ret == READ_RETRY)
+ goto retry;
+ if (ret)
+ goto err;
+out:
+ bch2_rbio_done(rbio);
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&sk, c);
+ return;
+err:
+ rbio->bio.bi_status = BLK_STS_IOERR;
+ goto out;
+}
+
+static void bch2_rbio_retry(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bch_fs *c = rbio->c;
+ struct bvec_iter iter = rbio->bvec_iter;
+ unsigned flags = rbio->flags;
+ subvol_inum inum = {
+ .subvol = rbio->subvol,
+ .inum = rbio->read_pos.inode,
+ };
+ struct bch_io_failures failed = { .nr = 0 };
+
+ trace_and_count(c, read_retry, &rbio->bio);
+
+ if (rbio->retry == READ_RETRY_AVOID)
+ bch2_mark_io_failure(&failed, &rbio->pick);
+
+ rbio->bio.bi_status = 0;
+
+ rbio = bch2_rbio_free(rbio);
+
+ flags |= BCH_READ_IN_RETRY;
+ flags &= ~BCH_READ_MAY_PROMOTE;
+
+ if (flags & BCH_READ_NODECODE) {
+ bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
+ } else {
+ flags &= ~BCH_READ_LAST_FRAGMENT;
+ flags |= BCH_READ_MUST_CLONE;
+
+ __bch2_read(c, rbio, iter, inum, &failed, flags);
+ }
+}
+
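+/*
+ * Record how a failed read should be handled: READ_ERR completes the read
+ * with an error, anything else punts to bch2_rbio_retry(); if we're already
+ * in the retry path, just stash the retry code for the caller:
+ */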
+static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
+ blk_status_t error)
+{
+ rbio->retry = retry;
+
+ if (rbio->flags & BCH_READ_IN_RETRY)
+ return;
+
+ if (retry == READ_ERR) {
+ rbio = bch2_rbio_free(rbio);
+
+ rbio->bio.bi_status = error;
+ bch2_rbio_done(rbio);
+ } else {
+ bch2_rbio_punt(rbio, bch2_rbio_retry,
+ RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+ }
+}
+
+static void bch2_read_io_err(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bio *bio = &rbio->bio;
+ struct bch_fs *c = rbio->c;
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
+ prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));
+
+ if (ca) {
+ bch2_io_error(ca, BCH_MEMBER_ERROR_read);
+ bch_err_ratelimited(ca, "%s", buf.buf);
+ } else {
+ bch_err_ratelimited(c, "%s", buf.buf);
+ }
+
+ printbuf_exit(&buf);
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
+}
+
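+/*
+ * Checksum narrowing: after reading and verifying a whole extent whose
+ * checksum covers more than the key's live data, recompute a checksum over
+ * just the live range and update the key so future reads don't have to read
+ * the entire checksummed region:
+ */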
+static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
+ struct bch_read_bio *rbio)
+{
+ struct bch_fs *c = rbio->c;
+ u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
+ struct bch_extent_crc_unpacked new_crc;
+ struct btree_iter iter;
+ struct bkey_i *new;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ if (crc_is_compressed(rbio->pick.crc))
+ return 0;
+
+ k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
+ BTREE_ITER_slots|BTREE_ITER_intent);
+ if ((ret = bkey_err(k)))
+ goto out;
+
+ if (bversion_cmp(k.k->bversion, rbio->version) ||
+ !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
+ goto out;
+
+ /* Extent was merged? */
+ if (bkey_start_offset(k.k) < data_offset ||
+ k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
+ goto out;
+
+ if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
+ rbio->pick.crc, NULL, &new_crc,
+ bkey_start_offset(k.k) - data_offset, k.k->size,
+ rbio->pick.crc.csum_type)) {
+ bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * going to be temporarily appending another checksum entry:
+ */
+ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
+ sizeof(struct bch_extent_crc128));
+ if ((ret = PTR_ERR_OR_ZERO(new)))
+ goto out;
+
+ bkey_reassemble(new, k);
+
+ if (!bch2_bkey_narrow_crcs(new, new_crc))
+ goto out;
+
+ ret = bch2_trans_update(trans, &iter, new,
+ BTREE_UPDATE_internal_snapshot_node);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
+{
+ bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ __bch2_rbio_narrow_crcs(trans, rbio));
+}
+
+static void bch2_read_csum_err(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bch_fs *c = rbio->c;
+ struct bio *src = &rbio->bio;
+ struct bch_extent_crc_unpacked crc = rbio->pick.crc;
+ struct nonce nonce = extent_nonce(rbio->version, crc);
+ struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
+ struct printbuf buf = PRINTBUF;
+
+ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
+ prt_str(&buf, "data ");
+ bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
+
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
+ if (ca) {
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+ bch_err_ratelimited(ca, "%s", buf.buf);
+ } else {
+ bch_err_ratelimited(c, "%s", buf.buf);
+ }
+
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ printbuf_exit(&buf);
+}
+
+static void bch2_read_decompress_err(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bch_fs *c = rbio->c;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
+ prt_str(&buf, "decompression error");
+
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
+ if (ca)
+ bch_err_ratelimited(ca, "%s", buf.buf);
+ else
+ bch_err_ratelimited(c, "%s", buf.buf);
+
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ printbuf_exit(&buf);
+}
+
+static void bch2_read_decrypt_err(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bch_fs *c = rbio->c;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
+ prt_str(&buf, "decrypt error");
+
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
+ if (ca)
+ bch_err_ratelimited(ca, "%s", buf.buf);
+ else
+ bch_err_ratelimited(c, "%s", buf.buf);
+
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ printbuf_exit(&buf);
+}
+
+/* Inner part that may run in process context */
+static void __bch2_read_endio(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bch_fs *c = rbio->c;
+ struct bio *src = &rbio->bio;
+ struct bio *dst = &bch2_rbio_parent(rbio)->bio;
+ struct bvec_iter dst_iter = rbio->bvec_iter;
+ struct bch_extent_crc_unpacked crc = rbio->pick.crc;
+ struct nonce nonce = extent_nonce(rbio->version, crc);
+ unsigned nofs_flags;
+ struct bch_csum csum;
+ int ret;
+
+ nofs_flags = memalloc_nofs_save();
+
+ /* Reset iterator for checksumming and copying bounced data: */
+ if (rbio->bounce) {
+ src->bi_iter.bi_size = crc.compressed_size << 9;
+ src->bi_iter.bi_idx = 0;
+ src->bi_iter.bi_bvec_done = 0;
+ } else {
+ src->bi_iter = rbio->bvec_iter;
+ }
+
+ csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
+ if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
+ goto csum_err;
+
+ /*
+ * XXX
+ * We need to rework the narrow_crcs path to deliver the read completion
+ * first, and then punt to a different workqueue, otherwise we're
+ * holding up reads while doing btree updates which is bad for memory
+ * reclaim.
+ */
+ if (unlikely(rbio->narrow_crcs))
+ bch2_rbio_narrow_crcs(rbio);
+
+ if (rbio->flags & BCH_READ_NODECODE)
+ goto nodecode;
+
+ /* Adjust crc to point to subset of data we want: */
+ crc.offset += rbio->offset_into_extent;
+ crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
+
+ if (crc_is_compressed(crc)) {
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (ret)
+ goto decrypt_err;
+
+ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
+ !c->opts.no_data_io)
+ goto decompression_err;
+ } else {
+ /* don't need to decrypt the entire bio: */
+ nonce = nonce_add(nonce, crc.offset << 9);
+ bio_advance(src, crc.offset << 9);
+
+ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
+ src->bi_iter.bi_size = dst_iter.bi_size;
+
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (ret)
+ goto decrypt_err;
+
+ if (rbio->bounce) {
+ struct bvec_iter src_iter = src->bi_iter;
+
+ bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+ }
+ }
+
+ if (rbio->promote) {
+ /*
+ * Re-encrypt data we decrypted, so it's consistent with
+ * rbio->crc:
+ */
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (ret)
+ goto decrypt_err;
+
+ promote_start(rbio->promote, rbio);
+ rbio->promote = NULL;
+ }
+nodecode:
+ if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
+ rbio = bch2_rbio_free(rbio);
+ bch2_rbio_done(rbio);
+ }
+out:
+ memalloc_nofs_restore(nofs_flags);
+ return;
+csum_err:
+ /*
+ * Checksum error: if the bio wasn't bounced, we may have been
+ * reading into buffers owned by userspace (that userspace can
+ * scribble over) - retry the read, bouncing it this time:
+ */
+ if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
+ rbio->flags |= BCH_READ_MUST_BOUNCE;
+ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
+ goto out;
+ }
+
+ bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+ goto out;
+decompression_err:
+ bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+ goto out;
+decrypt_err:
+ bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+ goto out;
+}
+
+static void bch2_read_endio(struct bio *bio)
+{
+ struct bch_read_bio *rbio =
+ container_of(bio, struct bch_read_bio, bio);
+ struct bch_fs *c = rbio->c;
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
+ struct workqueue_struct *wq = NULL;
+ enum rbio_context context = RBIO_CONTEXT_NULL;
+
+ if (rbio->have_ioref) {
+ bch2_latency_acct(ca, rbio->submit_time, READ);
+ percpu_ref_put(&ca->io_ref);
+ }
+
+ if (!rbio->split)
+ rbio->bio.bi_end_io = rbio->end_io;
+
+ if (unlikely(bio->bi_status)) {
+ bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+ return;
+ }
+
+ if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
+ (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
+ trace_and_count(c, read_reuse_race, &rbio->bio);
+
+ if (rbio->flags & BCH_READ_RETRY_IF_STALE)
+ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
+ else
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
+ return;
+ }
+
+ if (rbio->narrow_crcs ||
+ rbio->promote ||
+ crc_is_compressed(rbio->pick.crc) ||
+ bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
+ context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
+ else if (rbio->pick.crc.csum_type)
+ context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
+
+ bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
+}
+
+static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
+ struct bch_dev *ca,
+ struct bkey_s_c k,
+ struct bch_extent_ptr ptr)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ PTR_BUCKET_POS(ca, &ptr),
+ BTREE_ITER_cached);
+
+ int gen = bucket_gen_get(ca, iter.pos.offset);
+ if (gen >= 0) {
+ prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
+ printbuf_indent_add(&buf, 2);
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "memory gen: %u", gen);
+
+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ if (!ret) {
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ }
+ } else {
+ prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
+ iter.pos.inode, iter.pos.offset);
+ printbuf_indent_add(&buf, 2);
+
+ prt_printf(&buf, "first bucket %u nbuckets %llu\n",
+ ca->mi.first_bucket, ca->mi.nbuckets);
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+ }
+
+ bch2_fs_inconsistent(c, "%s", buf.buf);
+
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+}
+
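+/*
+ * Read a single extent: pick a replica, decide whether to bounce and/or
+ * clone the bio, optionally set up a promote, then submit the IO - or zero
+ * fill for holes, reservations and inline data:
+ */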
+int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
+ struct bvec_iter iter, struct bpos read_pos,
+ enum btree_id data_btree, struct bkey_s_c k,
+ unsigned offset_into_extent,
+ struct bch_io_failures *failed, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct extent_ptr_decoded pick;
+ struct bch_read_bio *rbio = NULL;
+ struct promote_op *promote = NULL;
+ bool bounce = false, read_full = false, narrow_crcs = false;
+ struct bpos data_pos = bkey_start_pos(k.k);
+ int pick_ret;
+
+ if (bkey_extent_is_inline_data(k.k)) {
+ unsigned bytes = min_t(unsigned, iter.bi_size,
+ bkey_inline_data_bytes(k.k));
+
+ swap(iter.bi_size, bytes);
+ memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
+ swap(iter.bi_size, bytes);
+ bio_advance_iter(&orig->bio, &iter, bytes);
+ zero_fill_bio_iter(&orig->bio, iter);
+ goto out_read_done;
+ }
+retry_pick:
+ pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
+
+ /* hole or reservation - just zero fill: */
+ if (!pick_ret)
+ goto hole;
+
+ if (unlikely(pick_ret < 0)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
+ prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret));
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ goto err;
+ }
+
+ if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) {
+ struct printbuf buf = PRINTBUF;
+ bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
+ prt_printf(&buf, "attempting to read encrypted data without encryption key\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ goto err;
+ }
+
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+
+ /*
+ * Stale dirty pointers are treated as IO errors, but @failed isn't
+ * allocated unless we're in the retry path - so if we're not in the
+ * retry path, don't check here, it'll be caught in bch2_read_endio()
+ * and we'll end up in the retry path:
+ */
+ if ((flags & BCH_READ_IN_RETRY) &&
+ !pick.ptr.cached &&
+ ca &&
+ unlikely(dev_ptr_stale(ca, &pick.ptr))) {
+ read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
+ bch2_mark_io_failure(failed, &pick);
+ percpu_ref_put(&ca->io_ref);
+ goto retry_pick;
+ }
+
+ /*
+ * Unlock the iterator while the btree node's lock is still in
+ * cache, before doing the IO:
+ */
+ bch2_trans_unlock(trans);
+
+ if (flags & BCH_READ_NODECODE) {
+ /*
+ * can happen if we retry, and the extent we were going to read
+ * has been merged in the meantime:
+ */
+ if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) {
+ if (ca)
+ percpu_ref_put(&ca->io_ref);
+ goto hole;
+ }
+
+ iter.bi_size = pick.crc.compressed_size << 9;
+ goto get_bio;
+ }
+
+ if (!(flags & BCH_READ_LAST_FRAGMENT) ||
+ bio_flagged(&orig->bio, BIO_CHAIN))
+ flags |= BCH_READ_MUST_CLONE;
+
+ narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
+ bch2_can_narrow_extent_crcs(k, pick.crc);
+
+ if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
+ flags |= BCH_READ_MUST_BOUNCE;
+
+ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
+
+ if (crc_is_compressed(pick.crc) ||
+ (pick.crc.csum_type != BCH_CSUM_none &&
+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+ (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
+ (flags & BCH_READ_USER_MAPPED)) ||
+ (flags & BCH_READ_MUST_BOUNCE)))) {
+ read_full = true;
+ bounce = true;
+ }
+
+ if (orig->opts.promote_target)// || failed)
+ promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
+ &rbio, &bounce, &read_full, failed);
+
+ if (!read_full) {
+ EBUG_ON(crc_is_compressed(pick.crc));
+ EBUG_ON(pick.crc.csum_type &&
+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+ bvec_iter_sectors(iter) != pick.crc.live_size ||
+ pick.crc.offset ||
+ offset_into_extent));
+
+ data_pos.offset += offset_into_extent;
+ pick.ptr.offset += pick.crc.offset +
+ offset_into_extent;
+ offset_into_extent = 0;
+ pick.crc.compressed_size = bvec_iter_sectors(iter);
+ pick.crc.uncompressed_size = bvec_iter_sectors(iter);
+ pick.crc.offset = 0;
+ pick.crc.live_size = bvec_iter_sectors(iter);
+ }
+get_bio:
+ if (rbio) {
+ /*
+ * promote already allocated bounce rbio:
+ * promote needs to allocate a bio big enough for uncompressing
+ * data in the write path, but we're not going to use it all
+ * here:
+ */
+ EBUG_ON(rbio->bio.bi_iter.bi_size <
+ pick.crc.compressed_size << 9);
+ rbio->bio.bi_iter.bi_size =
+ pick.crc.compressed_size << 9;
+ } else if (bounce) {
+ unsigned sectors = pick.crc.compressed_size;
+
+ rbio = rbio_init(bio_alloc_bioset(NULL,
+ DIV_ROUND_UP(sectors, PAGE_SECTORS),
+ 0,
+ GFP_NOFS,
+ &c->bio_read_split),
+ orig->opts);
+
+ bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
+ rbio->bounce = true;
+ rbio->split = true;
+ } else if (flags & BCH_READ_MUST_CLONE) {
+ /*
+ * Have to clone if there were any splits, due to error
+ * reporting issues (if a split errored, and retrying didn't
+ * work, when it reports the error to its parent (us) we don't
+ * know if the error was from our bio, and we should retry, or
+ * from the whole bio, in which case we don't want to retry and
+ * lose the error)
+ */
+ rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
+ &c->bio_read_split),
+ orig->opts);
+ rbio->bio.bi_iter = iter;
+ rbio->split = true;
+ } else {
+ rbio = orig;
+ rbio->bio.bi_iter = iter;
+ EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
+ }
+
+ EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
+
+ rbio->c = c;
+ rbio->submit_time = local_clock();
+ if (rbio->split)
+ rbio->parent = orig;
+ else
+ rbio->end_io = orig->bio.bi_end_io;
+ rbio->bvec_iter = iter;
+ rbio->offset_into_extent = offset_into_extent;
+ rbio->flags = flags;
+ rbio->have_ioref = ca != NULL;
+ rbio->narrow_crcs = narrow_crcs;
+ rbio->hole = 0;
+ rbio->retry = 0;
+ rbio->context = 0;
+ /* XXX: only initialize this if needed */
+ rbio->devs_have = bch2_bkey_devs(k);
+ rbio->pick = pick;
+ rbio->subvol = orig->subvol;
+ rbio->read_pos = read_pos;
+ rbio->data_btree = data_btree;
+ rbio->data_pos = data_pos;
+ rbio->version = k.k->bversion;
+ rbio->promote = promote;
+ INIT_WORK(&rbio->work, NULL);
+
+ if (flags & BCH_READ_NODECODE)
+ orig->pick = pick;
+
+ rbio->bio.bi_opf = orig->bio.bi_opf;
+ rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
+ rbio->bio.bi_end_io = bch2_read_endio;
+
+ if (rbio->bounce)
+ trace_and_count(c, read_bounce, &rbio->bio);
+
+ this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
+ bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
+
+ /*
+ * If it's being moved internally, we don't want to flag it as a cache
+ * hit:
+ */
+ if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE))
+ bch2_bucket_io_time_reset(trans, pick.ptr.dev,
+ PTR_BUCKET_NR(ca, &pick.ptr), READ);
+
+ if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
+ bio_inc_remaining(&orig->bio);
+ trace_and_count(c, read_split, &orig->bio);
+ }
+
+ if (!rbio->pick.idx) {
+ if (unlikely(!rbio->have_ioref)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
+ prt_printf(&buf, "no device to read from:\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ goto out;
+ }
+
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
+ bio_sectors(&rbio->bio));
+ bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
+
+ if (unlikely(c->opts.no_data_io)) {
+ if (likely(!(flags & BCH_READ_IN_RETRY)))
+ bio_endio(&rbio->bio);
+ } else {
+ if (likely(!(flags & BCH_READ_IN_RETRY)))
+ submit_bio(&rbio->bio);
+ else
+ submit_bio_wait(&rbio->bio);
+ }
+
+ /*
+ * We just submitted IO which may block, we expect relock fail
+ * events and shouldn't count them:
+ */
+ trans->notrace_relock_fail = true;
+ } else {
+ /* Attempting reconstruct read: */
+ if (bch2_ec_read_extent(trans, rbio, k)) {
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ goto out;
+ }
+
+ if (likely(!(flags & BCH_READ_IN_RETRY)))
+ bio_endio(&rbio->bio);
+ }
+out:
+ if (likely(!(flags & BCH_READ_IN_RETRY))) {
+ return 0;
+ } else {
+ int ret;
+
+ rbio->context = RBIO_CONTEXT_UNBOUND;
+ bch2_read_endio(&rbio->bio);
+
+ ret = rbio->retry;
+ rbio = bch2_rbio_free(rbio);
+
+ if (ret == READ_RETRY_AVOID) {
+ bch2_mark_io_failure(failed, &pick);
+ ret = READ_RETRY;
+ }
+
+ if (!ret)
+ goto out_read_done;
+
+ return ret;
+ }
+
+err:
+ if (flags & BCH_READ_IN_RETRY)
+ return READ_ERR;
+
+ orig->bio.bi_status = BLK_STS_IOERR;
+ goto out_read_done;
+
+hole:
+ /*
+ * won't normally happen in the BCH_READ_NODECODE
+ * (bch2_move_extent()) path, but if we retry and the extent we wanted
+ * to read no longer exists we have to signal that:
+ */
+ if (flags & BCH_READ_NODECODE)
+ orig->hole = true;
+
+ zero_fill_bio_iter(&orig->bio, iter);
+out_read_done:
+ if (flags & BCH_READ_LAST_FRAGMENT)
+ bch2_rbio_done(orig);
+ return 0;
+}
+
+void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter, subvol_inum inum,
+ struct bch_io_failures *failed, unsigned flags)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_buf sk;
+ struct bkey_s_c k;
+ int ret;
+
+ BUG_ON(flags & BCH_READ_NODECODE);
+
+ bch2_bkey_buf_init(&sk);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ POS(inum.inum, bvec_iter.bi_sector),
+ BTREE_ITER_slots);
+
+ while (1) {
+ enum btree_id data_btree = BTREE_ID_extents;
+
+ bch2_trans_begin(trans);
+
+ u32 snapshot;
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
+
+ bch2_btree_iter_set_pos(&iter,
+ POS(inum.inum, bvec_iter.bi_sector));
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ s64 offset_into_extent = iter.pos.offset -
+ bkey_start_offset(k.k);
+ unsigned sectors = k.k->size - offset_into_extent;
+
+ bch2_bkey_buf_reassemble(&sk, c, k);
+
+ ret = bch2_read_indirect_extent(trans, &data_btree,
+ &offset_into_extent, &sk);
+ if (ret)
+ goto err;
+
+ k = bkey_i_to_s_c(sk.k);
+
+ /*
+ * With indirect extents, the amount of data to read is the min
+ * of the original extent and the indirect extent:
+ */
+ sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
+
+ unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
+ swap(bvec_iter.bi_size, bytes);
+
+ if (bvec_iter.bi_size == bytes)
+ flags |= BCH_READ_LAST_FRAGMENT;
+
+ ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
+ data_btree, k,
+ offset_into_extent, failed, flags);
+ if (ret)
+ goto err;
+
+ if (flags & BCH_READ_LAST_FRAGMENT)
+ break;
+
+ swap(bvec_iter.bi_size, bytes);
+ bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
+err:
+ if (ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
+ ret != READ_RETRY &&
+ ret != READ_RETRY_AVOID)
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9);
+ prt_printf(&buf, "read error %i from btree lookup", ret);
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+
+ rbio->bio.bi_status = BLK_STS_IOERR;
+ bch2_rbio_done(rbio);
+ }
+
+ bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&sk, c);
+}
+
+void bch2_fs_io_read_exit(struct bch_fs *c)
+{
+ if (c->promote_table.tbl)
+ rhashtable_destroy(&c->promote_table);
+ bioset_exit(&c->bio_read_split);
+ bioset_exit(&c->bio_read);
+}
+
+int bch2_fs_io_read_init(struct bch_fs *c)
+{
+ if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_bio_read_init;
+
+ if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_bio_read_split_init;
+
+ if (rhashtable_init(&c->promote_table, &bch_promote_params))
+ return -BCH_ERR_ENOMEM_promote_table_init;
+
+ return 0;
+}
diff --git a/libbcachefs/io_read.h b/libbcachefs/io_read.h
new file mode 100644
index 00000000..a82e8a94
--- /dev/null
+++ b/libbcachefs/io_read.h
@@ -0,0 +1,172 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_READ_H
+#define _BCACHEFS_IO_READ_H
+
+#include "bkey_buf.h"
+#include "reflink.h"
+
+struct bch_read_bio {
+ struct bch_fs *c;
+ u64 start_time;
+ u64 submit_time;
+
+ /*
+ * Reads will often have to be split, and if the extent being read from
+ * was checksummed or compressed we'll also have to allocate bounce
+ * buffers and copy the data back into the original bio.
+ *
+ * If we didn't have to split, we have to save and restore the original
+ * bi_end_io - @split below indicates which:
+ */
+ union {
+ struct bch_read_bio *parent;
+ bio_end_io_t *end_io;
+ };
+
+ /*
+ * Saved copy of bio->bi_iter, from submission time - allows us to
+ * resubmit on IO error, and also to copy data back to the original bio
+ * when we're bouncing:
+ */
+ struct bvec_iter bvec_iter;
+
+ unsigned offset_into_extent;
+
+ u16 flags;
+ union {
+ struct {
+ u16 bounce:1,
+ split:1,
+ kmalloc:1,
+ have_ioref:1,
+ narrow_crcs:1,
+ hole:1,
+ retry:2,
+ context:2;
+ };
+ u16 _state;
+ };
+
+ struct bch_devs_list devs_have;
+
+ struct extent_ptr_decoded pick;
+
+ /*
+ * pos we read from - different from data_pos for indirect extents:
+ */
+ u32 subvol;
+ struct bpos read_pos;
+
+ /*
+ * start pos of data we read (may not be pos of data we want) - for
+ * promote, narrow extents paths:
+ */
+ enum btree_id data_btree;
+ struct bpos data_pos;
+ struct bversion version;
+
+ struct promote_op *promote;
+
+ struct bch_io_opts opts;
+
+ struct work_struct work;
+
+ struct bio bio;
+};
+
+#define to_rbio(_bio) container_of((_bio), struct bch_read_bio, bio)
+
+struct bch_devs_mask;
+struct cache_promote_op;
+struct extent_ptr_decoded;
+
+static inline int bch2_read_indirect_extent(struct btree_trans *trans,
+ enum btree_id *data_btree,
+ s64 *offset_into_extent,
+ struct bkey_buf *extent)
+{
+ if (extent->k->k.type != KEY_TYPE_reflink_p)
+ return 0;
+
+ *data_btree = BTREE_ID_reflink;
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter,
+ offset_into_extent,
+ bkey_i_to_s_c_reflink_p(extent->k),
+ true, 0);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (bkey_deleted(k.k)) {
+ bch2_trans_iter_exit(trans, &iter);
+ return -BCH_ERR_missing_indirect_extent;
+ }
+
+ bch2_bkey_buf_reassemble(extent, trans->c, k);
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+}
+
+enum bch_read_flags {
+ BCH_READ_RETRY_IF_STALE = 1 << 0,
+ BCH_READ_MAY_PROMOTE = 1 << 1,
+ BCH_READ_USER_MAPPED = 1 << 2,
+ BCH_READ_NODECODE = 1 << 3,
+ BCH_READ_LAST_FRAGMENT = 1 << 4,
+
+ /* internal: */
+ BCH_READ_MUST_BOUNCE = 1 << 5,
+ BCH_READ_MUST_CLONE = 1 << 6,
+ BCH_READ_IN_RETRY = 1 << 7,
+};
+
+int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
+ struct bvec_iter, struct bpos, enum btree_id,
+ struct bkey_s_c, unsigned,
+ struct bch_io_failures *, unsigned);
+
+static inline void bch2_read_extent(struct btree_trans *trans,
+ struct bch_read_bio *rbio, struct bpos read_pos,
+ enum btree_id data_btree, struct bkey_s_c k,
+ unsigned offset_into_extent, unsigned flags)
+{
+ __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
+ data_btree, k, offset_into_extent, NULL, flags);
+}
+
+void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
+ subvol_inum, struct bch_io_failures *, unsigned flags);
+
+static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
+ subvol_inum inum)
+{
+ struct bch_io_failures failed = { .nr = 0 };
+
+ BUG_ON(rbio->_state);
+
+ rbio->c = c;
+ rbio->start_time = local_clock();
+ rbio->subvol = inum.subvol;
+
+ __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
+ BCH_READ_RETRY_IF_STALE|
+ BCH_READ_MAY_PROMOTE|
+ BCH_READ_USER_MAPPED);
+}
+
+static inline struct bch_read_bio *rbio_init(struct bio *bio,
+ struct bch_io_opts opts)
+{
+ struct bch_read_bio *rbio = to_rbio(bio);
+
+ rbio->_state = 0;
+ rbio->promote = NULL;
+ rbio->opts = opts;
+ return rbio;
+}
+
+void bch2_fs_io_read_exit(struct bch_fs *);
+int bch2_fs_io_read_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_IO_READ_H */
diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h
deleted file mode 100644
index 28281ea6..00000000
--- a/libbcachefs/io_types.h
+++ /dev/null
@@ -1,145 +0,0 @@
-#ifndef _BCACHEFS_IO_TYPES_H
-#define _BCACHEFS_IO_TYPES_H
-
-#include "alloc_types.h"
-#include "btree_types.h"
-#include "buckets_types.h"
-#include "extents_types.h"
-#include "keylist_types.h"
-#include "opts.h"
-#include "super_types.h"
-
-#include <linux/llist.h>
-#include <linux/workqueue.h>
-
-struct bch_read_bio {
- struct bch_fs *c;
- u64 start_time;
- u64 submit_time;
-
- /*
- * Reads will often have to be split, and if the extent being read from
- * was checksummed or compressed we'll also have to allocate bounce
- * buffers and copy the data back into the original bio.
- *
- * If we didn't have to split, we have to save and restore the original
- * bi_end_io - @split below indicates which:
- */
- union {
- struct bch_read_bio *parent;
- bio_end_io_t *end_io;
- };
-
- /*
- * Saved copy of bio->bi_iter, from submission time - allows us to
- * resubmit on IO error, and also to copy data back to the original bio
- * when we're bouncing:
- */
- struct bvec_iter bvec_iter;
-
- u16 flags;
- union {
- struct {
- u16 bounce:1,
- split:1,
- kmalloc:1,
- have_ioref:1,
- narrow_crcs:1,
- hole:1,
- retry:2,
- context:2;
- };
- u16 _state;
- };
-
- struct bch_devs_list devs_have;
-
- struct extent_pick_ptr pick;
- /* start pos of data we read (may not be pos of data we want) */
- struct bpos pos;
- struct bversion version;
-
- struct promote_op *promote;
-
- struct bch_io_opts opts;
-
- struct work_struct work;
-
- struct bio bio;
-};
-
-struct bch_write_bio {
- struct bch_fs *c;
- struct bch_write_bio *parent;
-
- u64 submit_time;
-
- struct bch_devs_list failed;
- u8 order;
- u8 dev;
-
- unsigned split:1,
- bounce:1,
- put_bio:1,
- have_ioref:1,
- used_mempool:1;
-
- struct bio bio;
-};
-
-struct bch_write_op {
- struct closure cl;
- struct bch_fs *c;
- struct workqueue_struct *io_wq;
- u64 start_time;
-
- unsigned written; /* sectors */
- u16 flags;
- s16 error; /* dio write path expects it to hold -ERESTARTSYS... */
-
- unsigned csum_type:4;
- unsigned compression_type:4;
- unsigned nr_replicas:4;
- unsigned nr_replicas_required:4;
- unsigned alloc_reserve:4;
-
- u8 open_buckets_nr;
- struct bch_devs_list devs_have;
- u16 target;
- u16 nonce;
-
- struct bch_io_opts opts;
-
- struct bpos pos;
- struct bversion version;
-
- /* For BCH_WRITE_DATA_ENCODED: */
- struct bch_extent_crc_unpacked crc;
-
- struct write_point_specifier write_point;
-
- struct disk_reservation res;
-
- u8 open_buckets[16];
-
- /*
- * If caller wants to flush but hasn't passed us a journal_seq ptr, we
- * still need to stash the journal_seq somewhere:
- */
- union {
- u64 *journal_seq_p;
- u64 journal_seq;
- };
-
- int (*index_update_fn)(struct bch_write_op *);
-
- struct bch_devs_mask failed;
-
- struct keylist insert_keys;
- u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2];
-
- /* Must be last: */
- struct bch_write_bio wbio;
-};
-
-#endif /* _BCACHEFS_IO_TYPES_H */
diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c
new file mode 100644
index 00000000..20da357e
--- /dev/null
+++ b/libbcachefs/io_write.c
@@ -0,0 +1,1707 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "bset.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "clock.h"
+#include "compress.h"
+#include "debug.h"
+#include "ec.h"
+#include "error.h"
+#include "extent_update.h"
+#include "inode.h"
+#include "io_write.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "nocow_locking.h"
+#include "rebalance.h"
+#include "subvolume.h"
+#include "super.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/blkdev.h>
+#include <linux/prefetch.h>
+#include <linux/random.h>
+#include <linux/sched/mm.h>
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+
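+/*
+ * Congestion accounting: bump the device's congested counter when an IO
+ * takes much longer than the device's typical latency, decay it otherwise:
+ */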
+static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
+ u64 now, int rw)
+{
+ u64 latency_capable =
+ ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
+ /* ideally we'd be taking into account the device's variance here: */
+ u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
+ s64 latency_over = io_latency - latency_threshold;
+
+ if (latency_threshold && latency_over > 0) {
+ /*
+ * bump up congested by approximately latency_over * 4 /
+ * latency_threshold - we don't need much accuracy here so don't
+ * bother with the divide:
+ */
+ if (atomic_read(&ca->congested) < CONGESTED_MAX)
+ atomic_add(latency_over >>
+ max_t(int, ilog2(latency_threshold) - 2, 0),
+ &ca->congested);
+
+ ca->congested_last = now;
+ } else if (atomic_read(&ca->congested) > 0) {
+ atomic_dec(&ca->congested);
+ }
+}
+
+void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
+{
+ atomic64_t *latency = &ca->cur_latency[rw];
+ u64 now = local_clock();
+ u64 io_latency = time_after64(now, submit_time)
+ ? now - submit_time
+ : 0;
+ u64 old, new;
+
+ old = atomic64_read(latency);
+ do {
+ /*
+ * If the io latency was reasonably close to the current
+ * latency, skip doing the update and atomic operation - most of
+ * the time:
+ */
+ if (abs((int) (old - io_latency)) < (old >> 1) &&
+ now & ~(~0U << 5))
+ break;
+
+ new = ewma_add(old, io_latency, 5);
+ } while (!atomic64_try_cmpxchg(latency, &old, new));
+
+ bch2_congested_acct(ca, io_latency, now, rw);
+
+ __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
+}
+
+#endif
+
+/* Allocate, free from mempool: */
+
+void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
+{
+ struct bvec_iter_all iter;
+ struct bio_vec *bv;
+
+ bio_for_each_segment_all(bv, bio, iter)
+ if (bv->bv_page != ZERO_PAGE(0))
+ mempool_free(bv->bv_page, &c->bio_bounce_pages);
+ bio->bi_vcnt = 0;
+}
+
+static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
+{
+ struct page *page;
+
+ if (likely(!*using_mempool)) {
+ page = alloc_page(GFP_NOFS);
+ if (unlikely(!page)) {
+ mutex_lock(&c->bio_bounce_pages_lock);
+ *using_mempool = true;
+ goto pool_alloc;
+
+ }
+ } else {
+pool_alloc:
+ page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
+ }
+
+ return page;
+}
+
+void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
+ size_t size)
+{
+ bool using_mempool = false;
+
+ while (size) {
+ struct page *page = __bio_alloc_page_pool(c, &using_mempool);
+ unsigned len = min_t(size_t, PAGE_SIZE, size);
+
+ BUG_ON(!bio_add_page(bio, page, len, 0));
+ size -= len;
+ }
+
+ if (using_mempool)
+ mutex_unlock(&c->bio_bounce_pages_lock);
+}
+
+/* Extent update path: */
+
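+/*
+ * Walk the existing extents that @new will overwrite and compute the
+ * resulting change to i_sectors and to on-disk sector usage, plus whether
+ * overall usage would increase:
+ */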
+int bch2_sum_sector_overwrites(struct btree_trans *trans,
+ struct btree_iter *extent_iter,
+ struct bkey_i *new,
+ bool *usage_increasing,
+ s64 *i_sectors_delta,
+ s64 *disk_sectors_delta)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c old;
+ unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
+ bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
+ int ret = 0;
+
+ *usage_increasing = false;
+ *i_sectors_delta = 0;
+ *disk_sectors_delta = 0;
+
+ bch2_trans_copy_iter(&iter, extent_iter);
+
+ for_each_btree_key_max_continue_norestart(iter,
+ new->k.p, BTREE_ITER_slots, old, ret) {
+ s64 sectors = min(new->k.p.offset, old.k->p.offset) -
+ max(bkey_start_offset(&new->k),
+ bkey_start_offset(old.k));
+
+ *i_sectors_delta += sectors *
+ (bkey_extent_is_allocation(&new->k) -
+ bkey_extent_is_allocation(old.k));
+
+ *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new));
+ *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot
+ ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
+ : 0;
+
+ if (!*usage_increasing &&
+ (new->k.p.snapshot != old.k->p.snapshot ||
+ new_replicas > bch2_bkey_replicas(c, old) ||
+ (!new_compressed && bch2_bkey_sectors_compressed(old))))
+ *usage_increasing = true;
+
+ if (bkey_ge(old.k->p, new->k.p))
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
+ struct btree_iter *extent_iter,
+ u64 new_i_size,
+ s64 i_sectors_delta)
+{
+ /*
+ * Crazy performance optimization:
+ * Every extent update needs to also update the inode: the inode trigger
+ * will set bi->journal_seq to the journal sequence number of this
+ * transaction - for fsync.
+ *
+ * But if that's the only reason we're updating the inode (we're not
+ * updating bi_size or bi_sectors), then we don't need the inode update
+ * to be journalled - if we crash, the bi_journal_seq update will be
+ * lost, but that's fine.
+ */
+ unsigned inode_update_flags = BTREE_UPDATE_nojournal;
+
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0,
+ extent_iter->pos.inode,
+ extent_iter->snapshot),
+ BTREE_ITER_intent|
+ BTREE_ITER_cached);
+ int ret = bkey_err(k);
+ if (unlikely(ret))
+ return ret;
+
+ /*
+ * varint_decode_fast(), in the inode .invalid method, reads up to 7
+ * bytes past the end of the buffer:
+ */
+ struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8);
+ ret = PTR_ERR_OR_ZERO(k_mut);
+ if (unlikely(ret))
+ goto err;
+
+ bkey_reassemble(k_mut, k);
+
+ if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) {
+ k_mut = bch2_inode_to_v3(trans, k_mut);
+ ret = PTR_ERR_OR_ZERO(k_mut);
+ if (unlikely(ret))
+ goto err;
+ }
+
+ struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut);
+
+ if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
+ new_i_size > le64_to_cpu(inode->v.bi_size)) {
+ inode->v.bi_size = cpu_to_le64(new_i_size);
+ inode_update_flags = 0;
+ }
+
+ if (i_sectors_delta) {
+ le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
+ inode_update_flags = 0;
+ }
+
+ if (inode->k.p.snapshot != iter.snapshot) {
+ inode->k.p.snapshot = iter.snapshot;
+ inode_update_flags = 0;
+ }
+
+ ret = bch2_trans_update(trans, &iter, &inode->k_i,
+ BTREE_UPDATE_internal_snapshot_node|
+ inode_update_flags);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_extent_update(struct btree_trans *trans,
+ subvol_inum inum,
+ struct btree_iter *iter,
+ struct bkey_i *k,
+ struct disk_reservation *disk_res,
+ u64 new_i_size,
+ s64 *i_sectors_delta_total,
+ bool check_enospc)
+{
+ struct bpos next_pos;
+ bool usage_increasing;
+ s64 i_sectors_delta = 0, disk_sectors_delta = 0;
+ int ret;
+
+ /*
+ * This traverses the iterator without changing iter->path->pos to
+ * search_key() (which is pos + 1 for extents): we want there to be a
+ * path already traversed at iter->pos because
+ * bch2_trans_extent_update() will use it to attempt extent merging
+ */
+ ret = __bch2_btree_iter_traverse(iter);
+ if (ret)
+ return ret;
+
+ ret = bch2_extent_trim_atomic(trans, iter, k);
+ if (ret)
+ return ret;
+
+ next_pos = k->k.p;
+
+ ret = bch2_sum_sector_overwrites(trans, iter, k,
+ &usage_increasing,
+ &i_sectors_delta,
+ &disk_sectors_delta);
+ if (ret)
+ return ret;
+
+ if (disk_res &&
+ disk_sectors_delta > (s64) disk_res->sectors) {
+ ret = bch2_disk_reservation_add(trans->c, disk_res,
+ disk_sectors_delta - disk_res->sectors,
+ !check_enospc || !usage_increasing
+ ? BCH_DISK_RESERVATION_NOFAIL : 0);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Note:
+ * We always have to do an inode update - even when i_size/i_sectors
+ * aren't changing - for fsync to work properly; fsync relies on
+ * inode->bi_journal_seq which is updated by the trigger code:
+ */
+ ret = bch2_extent_update_i_size_sectors(trans, iter,
+ min(k->k.p.offset << 9, new_i_size),
+ i_sectors_delta) ?:
+ bch2_trans_update(trans, iter, k, 0) ?:
+ bch2_trans_commit(trans, disk_res, NULL,
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc);
+ if (unlikely(ret))
+ return ret;
+
+ if (i_sectors_delta_total)
+ *i_sectors_delta_total += i_sectors_delta;
+ bch2_btree_iter_set_pos(iter, next_pos);
+ return 0;
+}
+
+static int bch2_write_index_default(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct bkey_buf sk;
+ struct keylist *keys = &op->insert_keys;
+ struct bkey_i *k = bch2_keylist_front(keys);
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ subvol_inum inum = {
+ .subvol = op->subvol,
+ .inum = k->k.p.inode,
+ };
+ int ret;
+
+ BUG_ON(!inum.subvol);
+
+ bch2_bkey_buf_init(&sk);
+
+ do {
+ bch2_trans_begin(trans);
+
+ k = bch2_keylist_front(keys);
+ bch2_bkey_buf_copy(&sk, c, k);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol,
+ &sk.k->k.p.snapshot);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ bkey_start_pos(&sk.k->k),
+ BTREE_ITER_slots|BTREE_ITER_intent);
+
+ ret = bch2_bkey_set_needs_rebalance(c, &op->opts, sk.k) ?:
+ bch2_extent_update(trans, inum, &iter, sk.k,
+ &op->res,
+ op->new_i_size, &op->i_sectors_delta,
+ op->flags & BCH_WRITE_CHECK_ENOSPC);
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+
+ if (bkey_ge(iter.pos, k->k.p))
+ bch2_keylist_pop_front(&op->insert_keys);
+ else
+ bch2_cut_front(iter.pos, k);
+ } while (!bch2_keylist_empty(keys));
+
+ bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&sk, c);
+
+ return ret;
+}
+
+/* Writes */
+
+static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op,
+ u64 offset)
+{
+ bch2_inum_offset_err_msg(op->c, out,
+ (subvol_inum) { op->subvol, op->pos.inode, },
+ offset << 9);
+ prt_printf(out, "write error%s: ",
+ op->flags & BCH_WRITE_MOVE ? "(internal move)" : "");
+}
+
+static void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op)
+{
+ __bch2_write_op_error(out, op, op->pos.offset);
+}
+
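+/*
+ * Submit a write to every pointer in @k, cloning the bio for all but the
+ * last replica; pointers to devices we couldn't get an io ref for complete
+ * immediately with BLK_STS_REMOVED:
+ */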
+void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
+ enum bch_data_type type,
+ const struct bkey_i *k,
+ bool nocow)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
+ struct bch_write_bio *n;
+
+ BUG_ON(c->opts.nochanges);
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = nocow
+ ? bch2_dev_have_ref(c, ptr->dev)
+ : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE);
+
+ if (to_entry(ptr + 1) < ptrs.end) {
+ n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set));
+
+ n->bio.bi_end_io = wbio->bio.bi_end_io;
+ n->bio.bi_private = wbio->bio.bi_private;
+ n->parent = wbio;
+ n->split = true;
+ n->bounce = false;
+ n->put_bio = true;
+ n->bio.bi_opf = wbio->bio.bi_opf;
+ bio_inc_remaining(&wbio->bio);
+ } else {
+ n = wbio;
+ n->split = false;
+ }
+
+ n->c = c;
+ n->dev = ptr->dev;
+ n->have_ioref = ca != NULL;
+ n->nocow = nocow;
+ n->submit_time = local_clock();
+ n->inode_offset = bkey_start_offset(&k->k);
+ if (nocow)
+ n->nocow_bucket = PTR_BUCKET_NR(ca, ptr);
+ n->bio.bi_iter.bi_sector = ptr->offset;
+
+ if (likely(n->have_ioref)) {
+ this_cpu_add(ca->io_done->sectors[WRITE][type],
+ bio_sectors(&n->bio));
+
+ bio_set_dev(&n->bio, ca->disk_sb.bdev);
+
+ if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) {
+ bio_endio(&n->bio);
+ continue;
+ }
+
+ submit_bio(&n->bio);
+ } else {
+ n->bio.bi_status = BLK_STS_REMOVED;
+ bio_endio(&n->bio);
+ }
+ }
+}
+
+static void __bch2_write(struct bch_write_op *);
+
+static void bch2_write_done(struct closure *cl)
+{
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct bch_fs *c = op->c;
+
+ EBUG_ON(op->open_buckets.nr);
+
+ bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
+ bch2_disk_reservation_put(c, &op->res);
+
+ if (!(op->flags & BCH_WRITE_MOVE))
+ bch2_write_ref_put(c, BCH_WRITE_REF_write);
+ bch2_keylist_free(&op->insert_keys, op->inline_keys);
+
+ EBUG_ON(cl->parent);
+ closure_debug_destroy(cl);
+ if (op->end_io)
+ op->end_io(op);
+}
+
+static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
+{
+ struct keylist *keys = &op->insert_keys;
+ struct bkey_i *src, *dst = keys->keys, *n;
+
+ for (src = keys->keys; src != keys->top; src = n) {
+ n = bkey_next(src);
+
+ if (bkey_extent_is_direct_data(&src->k)) {
+ bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
+ test_bit(ptr->dev, op->failed.d));
+
+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
+ return -EIO;
+ }
+
+ if (dst != src)
+ memmove_u64s_down(dst, src, src->k.u64s);
+ dst = bkey_next(dst);
+ }
+
+ keys->top = dst;
+ return 0;
+}
+
+/**
+ * __bch2_write_index - after a write, update index to point to new data
+ * @op: bch_write_op to process
+ */
+static void __bch2_write_index(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct keylist *keys = &op->insert_keys;
+ unsigned dev;
+ int ret = 0;
+
+ if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
+ ret = bch2_write_drop_io_error_ptrs(op);
+ if (ret)
+ goto err;
+ }
+
+ if (!bch2_keylist_empty(keys)) {
+ u64 sectors_start = keylist_sectors(keys);
+
+ ret = !(op->flags & BCH_WRITE_MOVE)
+ ? bch2_write_index_default(op)
+ : bch2_data_update_index_update(op);
+
+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
+ BUG_ON(keylist_sectors(keys) && !ret);
+
+ op->written += sectors_start - keylist_sectors(keys);
+
+ if (unlikely(ret && !bch2_err_matches(ret, EROFS))) {
+ struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
+
+ struct printbuf buf = PRINTBUF;
+ __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k));
+ prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ if (ret)
+ goto err;
+ }
+out:
+ /* If some bucket wasn't written, we can't erasure code it: */
+ for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
+ bch2_open_bucket_write_error(c, &op->open_buckets, dev);
+
+ bch2_open_buckets_put(c, &op->open_buckets);
+ return;
+err:
+ keys->top = keys->keys;
+ op->error = ret;
+ op->flags |= BCH_WRITE_SUBMITTED;
+ goto out;
+}
+
+static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
+{
+ if (state != wp->state) {
+ u64 now = ktime_get_ns();
+
+ if (wp->last_state_change &&
+ time_after64(now, wp->last_state_change))
+ wp->time[wp->state] += now - wp->last_state_change;
+ wp->state = state;
+ wp->last_state_change = now;
+ }
+}
+
+static inline void wp_update_state(struct write_point *wp, bool running)
+{
+ enum write_point_state state;
+
+ state = running ? WRITE_POINT_running :
+ !list_empty(&wp->writes) ? WRITE_POINT_waiting_io
+ : WRITE_POINT_stopped;
+
+ __wp_update_state(wp, state);
+}
+
+static CLOSURE_CALLBACK(bch2_write_index)
+{
+ closure_type(op, struct bch_write_op, cl);
+ struct write_point *wp = op->wp;
+ struct workqueue_struct *wq = index_update_wq(op);
+ unsigned long flags;
+
+ if ((op->flags & BCH_WRITE_SUBMITTED) &&
+ (op->flags & BCH_WRITE_MOVE))
+ bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
+
+ spin_lock_irqsave(&wp->writes_lock, flags);
+ if (wp->state == WRITE_POINT_waiting_io)
+ __wp_update_state(wp, WRITE_POINT_waiting_work);
+ list_add_tail(&op->wp_list, &wp->writes);
+ spin_unlock_irqrestore(&wp->writes_lock, flags);
+
+ queue_work(wq, &wp->index_update_work);
+}
+
+static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
+{
+ op->wp = wp;
+
+ if (wp->state == WRITE_POINT_stopped) {
+ spin_lock_irq(&wp->writes_lock);
+ __wp_update_state(wp, WRITE_POINT_waiting_io);
+ spin_unlock_irq(&wp->writes_lock);
+ }
+}
+
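+/*
+ * Worker that drains a write point's queue of completed data writes, running
+ * their index updates and resubmitting any writes that aren't fully
+ * submitted yet:
+ */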
+void bch2_write_point_do_index_updates(struct work_struct *work)
+{
+ struct write_point *wp =
+ container_of(work, struct write_point, index_update_work);
+ struct bch_write_op *op;
+
+ while (1) {
+ spin_lock_irq(&wp->writes_lock);
+ op = list_pop_entry(&wp->writes, struct bch_write_op, wp_list);
+ wp_update_state(wp, op != NULL);
+ spin_unlock_irq(&wp->writes_lock);
+
+ if (!op)
+ break;
+
+ op->flags |= BCH_WRITE_IN_WORKER;
+
+ __bch2_write_index(op);
+
+ if (!(op->flags & BCH_WRITE_SUBMITTED))
+ __bch2_write(op);
+ else
+ bch2_write_done(&op->cl);
+ }
+}
+
+static void bch2_write_endio(struct bio *bio)
+{
+ struct closure *cl = bio->bi_private;
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct bch_write_bio *wbio = to_wbio(bio);
+ struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
+ struct bch_fs *c = wbio->c;
+ struct bch_dev *ca = wbio->have_ioref
+ ? bch2_dev_have_ref(c, wbio->dev)
+ : NULL;
+
+ if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ op->pos.inode,
+ wbio->inode_offset << 9,
+ "data write error: %s",
+ bch2_blk_status_to_str(bio->bi_status))) {
+ set_bit(wbio->dev, op->failed.d);
+ op->flags |= BCH_WRITE_IO_ERROR;
+ }
+
+ if (wbio->nocow) {
+ bch2_bucket_nocow_unlock(&c->nocow_locks,
+ POS(ca->dev_idx, wbio->nocow_bucket),
+ BUCKET_NOCOW_LOCK_UPDATE);
+ set_bit(wbio->dev, op->devs_need_flush->d);
+ }
+
+ if (wbio->have_ioref) {
+ bch2_latency_acct(ca, wbio->submit_time, WRITE);
+ percpu_ref_put(&ca->io_ref);
+ }
+
+ if (wbio->bounce)
+ bch2_bio_free_pages_pool(c, bio);
+
+ if (wbio->put_bio)
+ bio_put(bio);
+
+ if (parent)
+ bio_endio(&parent->bio);
+ else
+ closure_put(cl);
+}
+
+static void init_append_extent(struct bch_write_op *op,
+ struct write_point *wp,
+ struct bversion version,
+ struct bch_extent_crc_unpacked crc)
+{
+ struct bkey_i_extent *e;
+
+ op->pos.offset += crc.uncompressed_size;
+
+ e = bkey_extent_init(op->insert_keys.top);
+ e->k.p = op->pos;
+ e->k.size = crc.uncompressed_size;
+ e->k.bversion = version;
+
+ if (crc.csum_type ||
+ crc.compression_type ||
+ crc.nonce)
+ bch2_extent_crc_append(&e->k_i, crc);
+
+ bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
+ op->flags & BCH_WRITE_CACHED);
+
+ bch2_keylist_push(&op->insert_keys);
+}
+
+static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
+ struct write_point *wp,
+ struct bio *src,
+ bool *page_alloc_failed,
+ void *buf)
+{
+ struct bch_write_bio *wbio;
+ struct bio *bio;
+ unsigned output_available =
+ min(wp->sectors_free << 9, src->bi_iter.bi_size);
+ unsigned pages = DIV_ROUND_UP(output_available +
+ (buf
+ ? ((unsigned long) buf & (PAGE_SIZE - 1))
+ : 0), PAGE_SIZE);
+
+ pages = min(pages, BIO_MAX_VECS);
+
+ bio = bio_alloc_bioset(NULL, pages, 0,
+ GFP_NOFS, &c->bio_write);
+ wbio = wbio_init(bio);
+ wbio->put_bio = true;
+ /* copy WRITE_SYNC flag */
+ wbio->bio.bi_opf = src->bi_opf;
+
+ if (buf) {
+ bch2_bio_map(bio, buf, output_available);
+ return bio;
+ }
+
+ wbio->bounce = true;
+
+ /*
+ * We can't use the mempool for more than c->opts.encoded_extent_max
+ * worth of pages, but we'd like to allocate more if we can:
+ */
+ bch2_bio_alloc_pages_pool(c, bio,
+ min_t(unsigned, output_available,
+ c->opts.encoded_extent_max));
+
+ if (bio->bi_iter.bi_size < output_available)
+ *page_alloc_failed =
+ bch2_bio_alloc_pages(bio,
+ output_available -
+ bio->bi_iter.bi_size,
+ GFP_NOFS) != 0;
+
+ return bio;
+}
+
+static int bch2_write_rechecksum(struct bch_fs *c,
+ struct bch_write_op *op,
+ unsigned new_csum_type)
+{
+ struct bio *bio = &op->wbio.bio;
+ struct bch_extent_crc_unpacked new_crc;
+ int ret;
+
+ /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
+
+ if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
+ bch2_csum_type_is_encryption(new_csum_type))
+ new_csum_type = op->crc.csum_type;
+
+ ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
+ NULL, &new_crc,
+ op->crc.offset, op->crc.live_size,
+ new_csum_type);
+ if (ret)
+ return ret;
+
+ bio_advance(bio, op->crc.offset << 9);
+ bio->bi_iter.bi_size = op->crc.live_size << 9;
+ op->crc = new_crc;
+ return 0;
+}
+
+static int bch2_write_decrypt(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct nonce nonce = extent_nonce(op->version, op->crc);
+ struct bch_csum csum;
+ int ret;
+
+ if (!bch2_csum_type_is_encryption(op->crc.csum_type))
+ return 0;
+
+ /*
+ * If we need to decrypt data in the write path, we'll no longer be able
+ * to verify the existing checksum (poly1305 mac, in this case) after
+ * it's decrypted - this is the last point we'll be able to reverify the
+ * checksum:
+ */
+ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
+ if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
+ return -EIO;
+
+ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
+ op->crc.csum_type = 0;
+ op->crc.csum = (struct bch_csum) { 0, 0 };
+ return ret;
+}
+
+static enum prep_encoded_ret {
+ PREP_ENCODED_OK,
+ PREP_ENCODED_ERR,
+ PREP_ENCODED_CHECKSUM_ERR,
+ PREP_ENCODED_DO_WRITE,
+} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
+{
+ struct bch_fs *c = op->c;
+ struct bio *bio = &op->wbio.bio;
+
+ if (!(op->flags & BCH_WRITE_DATA_ENCODED))
+ return PREP_ENCODED_OK;
+
+ BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
+
+ /* Can we just write the entire extent as is? */
+ if (op->crc.uncompressed_size == op->crc.live_size &&
+ op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 &&
+ op->crc.compressed_size <= wp->sectors_free &&
+ (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
+ op->incompressible)) {
+ if (!crc_is_compressed(op->crc) &&
+ op->csum_type != op->crc.csum_type &&
+ bch2_write_rechecksum(c, op, op->csum_type) &&
+ !c->opts.no_data_io)
+ return PREP_ENCODED_CHECKSUM_ERR;
+
+ return PREP_ENCODED_DO_WRITE;
+ }
+
+ /*
+ * If the data is compressed and we couldn't write the entire extent as
+ * is, we have to decompress it:
+ */
+ if (crc_is_compressed(op->crc)) {
+ struct bch_csum csum;
+
+ if (bch2_write_decrypt(op))
+ return PREP_ENCODED_CHECKSUM_ERR;
+
+ /* Last point we can still verify checksum: */
+ csum = bch2_checksum_bio(c, op->crc.csum_type,
+ extent_nonce(op->version, op->crc),
+ bio);
+ if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
+ return PREP_ENCODED_CHECKSUM_ERR;
+
+ if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
+ return PREP_ENCODED_ERR;
+ }
+
+ /*
+ * No longer have compressed data after this point - data might be
+ * encrypted:
+ */
+
+ /*
+ * If the data is checksummed and we're only writing a subset,
+ * rechecksum and adjust bio to point to currently live data:
+ */
+ if ((op->crc.live_size != op->crc.uncompressed_size ||
+ op->crc.csum_type != op->csum_type) &&
+ bch2_write_rechecksum(c, op, op->csum_type) &&
+ !c->opts.no_data_io)
+ return PREP_ENCODED_CHECKSUM_ERR;
+
+ /*
+ * If we want to compress the data, it has to be decrypted:
+ */
+ if ((op->compression_opt ||
+ bch2_csum_type_is_encryption(op->crc.csum_type) !=
+ bch2_csum_type_is_encryption(op->csum_type)) &&
+ bch2_write_decrypt(op))
+ return PREP_ENCODED_CHECKSUM_ERR;
+
+ return PREP_ENCODED_OK;
+}
+
+static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
+ struct bio **_dst)
+{
+ struct bch_fs *c = op->c;
+ struct bio *src = &op->wbio.bio, *dst = src;
+ struct bvec_iter saved_iter;
+ void *ec_buf;
+ unsigned total_output = 0, total_input = 0;
+ bool bounce = false;
+ bool page_alloc_failed = false;
+ int ret, more = 0;
+
+ BUG_ON(!bio_sectors(src));
+
+ ec_buf = bch2_writepoint_ec_buf(c, wp);
+
+ switch (bch2_write_prep_encoded_data(op, wp)) {
+ case PREP_ENCODED_OK:
+ break;
+ case PREP_ENCODED_ERR:
+ ret = -EIO;
+ goto err;
+ case PREP_ENCODED_CHECKSUM_ERR:
+ goto csum_err;
+ case PREP_ENCODED_DO_WRITE:
+ /* XXX look for bug here */
+ if (ec_buf) {
+ dst = bch2_write_bio_alloc(c, wp, src,
+ &page_alloc_failed,
+ ec_buf);
+ bio_copy_data(dst, src);
+ bounce = true;
+ }
+ init_append_extent(op, wp, op->version, op->crc);
+ goto do_write;
+ }
+
+ if (ec_buf ||
+ op->compression_opt ||
+ (op->csum_type &&
+ !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
+ (bch2_csum_type_is_encryption(op->csum_type) &&
+ !(op->flags & BCH_WRITE_PAGES_OWNED))) {
+ dst = bch2_write_bio_alloc(c, wp, src,
+ &page_alloc_failed,
+ ec_buf);
+ bounce = true;
+ }
+
+ saved_iter = dst->bi_iter;
+
+ do {
+ struct bch_extent_crc_unpacked crc = { 0 };
+ struct bversion version = op->version;
+ size_t dst_len = 0, src_len = 0;
+
+ if (page_alloc_failed &&
+ dst->bi_iter.bi_size < (wp->sectors_free << 9) &&
+ dst->bi_iter.bi_size < c->opts.encoded_extent_max)
+ break;
+
+ BUG_ON(op->compression_opt &&
+ (op->flags & BCH_WRITE_DATA_ENCODED) &&
+ bch2_csum_type_is_encryption(op->crc.csum_type));
+ BUG_ON(op->compression_opt && !bounce);
+
+ crc.compression_type = op->incompressible
+ ? BCH_COMPRESSION_TYPE_incompressible
+ : op->compression_opt
+ ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
+ op->compression_opt)
+ : 0;
+ if (!crc_is_compressed(crc)) {
+ dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+ dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
+
+ if (op->csum_type)
+ dst_len = min_t(unsigned, dst_len,
+ c->opts.encoded_extent_max);
+
+ if (bounce) {
+ swap(dst->bi_iter.bi_size, dst_len);
+ bio_copy_data(dst, src);
+ swap(dst->bi_iter.bi_size, dst_len);
+ }
+
+ src_len = dst_len;
+ }
+
+ BUG_ON(!src_len || !dst_len);
+
+ if (bch2_csum_type_is_encryption(op->csum_type)) {
+ if (bversion_zero(version)) {
+ version.lo = atomic64_inc_return(&c->key_version);
+ } else {
+ crc.nonce = op->nonce;
+ op->nonce += src_len >> 9;
+ }
+ }
+
+ if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+ !crc_is_compressed(crc) &&
+ bch2_csum_type_is_encryption(op->crc.csum_type) ==
+ bch2_csum_type_is_encryption(op->csum_type)) {
+ u8 compression_type = crc.compression_type;
+ u16 nonce = crc.nonce;
+ /*
+ * Note: when we're using rechecksum(), we need to be
+ * checksumming @src because it has all the data our
+ * existing checksum covers - if we bounced (because we
+ * were trying to compress), @dst will only have the
+ * part of the data the new checksum will cover.
+ *
+ * But normally we want to be checksumming post bounce,
+ * because part of the reason for bouncing is so the
+ * data can't be modified (by userspace) while it's in
+ * flight.
+ */
+ if (bch2_rechecksum_bio(c, src, version, op->crc,
+ &crc, &op->crc,
+ src_len >> 9,
+ bio_sectors(src) - (src_len >> 9),
+ op->csum_type))
+ goto csum_err;
+ /*
+ * bch2_rechecksum_bio() sets compression_type on crc from op->crc;
+ * this isn't always correct, as sometimes we're changing an extent
+ * from uncompressed to incompressible.
+ */
+ crc.compression_type = compression_type;
+ crc.nonce = nonce;
+ } else {
+ if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+ bch2_rechecksum_bio(c, src, version, op->crc,
+ NULL, &op->crc,
+ src_len >> 9,
+ bio_sectors(src) - (src_len >> 9),
+ op->crc.csum_type))
+ goto csum_err;
+
+ crc.compressed_size = dst_len >> 9;
+ crc.uncompressed_size = src_len >> 9;
+ crc.live_size = src_len >> 9;
+
+ swap(dst->bi_iter.bi_size, dst_len);
+ ret = bch2_encrypt_bio(c, op->csum_type,
+ extent_nonce(version, crc), dst);
+ if (ret)
+ goto err;
+
+ crc.csum = bch2_checksum_bio(c, op->csum_type,
+ extent_nonce(version, crc), dst);
+ crc.csum_type = op->csum_type;
+ swap(dst->bi_iter.bi_size, dst_len);
+ }
+
+ init_append_extent(op, wp, version, crc);
+
+ if (dst != src)
+ bio_advance(dst, dst_len);
+ bio_advance(src, src_len);
+ total_output += dst_len;
+ total_input += src_len;
+ } while (dst->bi_iter.bi_size &&
+ src->bi_iter.bi_size &&
+ wp->sectors_free &&
+ !bch2_keylist_realloc(&op->insert_keys,
+ op->inline_keys,
+ ARRAY_SIZE(op->inline_keys),
+ BKEY_EXTENT_U64s_MAX));
+
+ more = src->bi_iter.bi_size != 0;
+
+ dst->bi_iter = saved_iter;
+
+ if (dst == src && more) {
+ BUG_ON(total_output != total_input);
+
+ dst = bio_split(src, total_input >> 9,
+ GFP_NOFS, &c->bio_write);
+ wbio_init(dst)->put_bio = true;
+ /* copy WRITE_SYNC flag */
+ dst->bi_opf = src->bi_opf;
+ }
+
+ dst->bi_iter.bi_size = total_output;
+do_write:
+ *_dst = dst;
+ return more;
+csum_err:
+ {
+ struct printbuf buf = PRINTBUF;
+ bch2_write_op_error(&buf, op);
+ prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)");
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ ret = -EIO;
+err:
+ if (to_wbio(dst)->bounce)
+ bch2_bio_free_pages_pool(c, dst);
+ if (to_wbio(dst)->put_bio)
+ bio_put(dst);
+
+ return ret;
+}
+
+static bool bch2_extent_is_writeable(struct bch_write_op *op,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = op->c;
+ struct bkey_s_c_extent e;
+ struct extent_ptr_decoded p;
+ const union bch_extent_entry *entry;
+ unsigned replicas = 0;
+
+ if (k.k->type != KEY_TYPE_extent)
+ return false;
+
+ e = bkey_s_c_to_extent(k);
+
+ rcu_read_lock();
+ extent_for_each_ptr_decode(e, p, entry) {
+ if (crc_is_encoded(p.crc) || p.has_ec) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ replicas += bch2_extent_ptr_durability(c, &p);
+ }
+ rcu_read_unlock();
+
+ return replicas >= op->opts.data_replicas;
+}
+
+static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *orig,
+ struct bkey_s_c k,
+ u64 new_i_size)
+{
+ if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) {
+ /* trace this */
+ return 0;
+ }
+
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ int ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
+
+ bch2_cut_front(bkey_start_pos(&orig->k), new);
+ bch2_cut_back(orig->k.p, new);
+
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_for_each_ptr(ptrs, ptr)
+ ptr->unwritten = 0;
+
+ /*
+ * Note that we're not calling bch2_subvolume_get_snapshot() in this path -
+ * that was done when we kicked off the write, and here it's important
+ * that we update the extent that we wrote to - even if a snapshot has
+ * since been created. The write is still outstanding, so we're ok
+ * w.r.t. snapshot atomicity:
+ */
+ return bch2_extent_update_i_size_sectors(trans, iter,
+ min(new->k.p.offset << 9, new_i_size), 0) ?:
+ bch2_trans_update(trans, iter, new,
+ BTREE_UPDATE_internal_snapshot_node);
+}
+
+static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct btree_trans *trans = bch2_trans_get(c);
+
+ for_each_keylist_key(&op->insert_keys, orig) {
+ int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
+ bkey_start_pos(&orig->k), orig->k.p,
+ BTREE_ITER_intent, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
+ }));
+
+ if (ret && !bch2_err_matches(ret, EROFS)) {
+ struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
+
+ struct printbuf buf = PRINTBUF;
+ __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k));
+ prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ if (ret) {
+ op->error = ret;
+ break;
+ }
+ }
+
+ bch2_trans_put(trans);
+}
+
+static void __bch2_nocow_write_done(struct bch_write_op *op)
+{
+ if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
+ op->error = -EIO;
+ } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
+ bch2_nocow_write_convert_unwritten(op);
+}
+
+static CLOSURE_CALLBACK(bch2_nocow_write_done)
+{
+ closure_type(op, struct bch_write_op, cl);
+
+ __bch2_nocow_write_done(op);
+ bch2_write_done(cl);
+}
+
+struct bucket_to_lock {
+ struct bpos b;
+ unsigned gen;
+ struct nocow_lock_bucket *l;
+};
+
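+/*
+ * In-place (nocow) write path: look up the existing extent, grab device iorefs
+ * and per-bucket nocow locks for each pointer, recheck that the bucket gens
+ * haven't changed (i.e. the buckets haven't been reused), then write directly
+ * to the existing pointers. Unwritten extents are converted to written once
+ * the IO completes; if anything goes wrong we fall back to the normal COW
+ * write path:
+ */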
+static void bch2_nocow_write(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
+ u32 snapshot;
+ struct bucket_to_lock *stale_at;
+ int stale, ret;
+
+ if (op->flags & BCH_WRITE_MOVE)
+ return;
+
+ darray_init(&buckets);
+ trans = bch2_trans_get(c);
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot);
+ if (unlikely(ret))
+ goto err;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ SPOS(op->pos.inode, op->pos.offset, snapshot),
+ BTREE_ITER_slots);
+ while (1) {
+ struct bio *bio = &op->wbio.bio;
+
+ buckets.nr = 0;
+
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ break;
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ break;
+
+ /* fall back to normal cow write path? */
+ if (unlikely(k.k->p.snapshot != snapshot ||
+ !bch2_extent_is_writeable(op, k)))
+ break;
+
+ if (bch2_keylist_realloc(&op->insert_keys,
+ op->inline_keys,
+ ARRAY_SIZE(op->inline_keys),
+ k.k->u64s))
+ break;
+
+ /* Get iorefs before dropping btree locks: */
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
+ if (unlikely(!ca))
+ goto err_get_ioref;
+
+ struct bpos b = PTR_BUCKET_POS(ca, ptr);
+ struct nocow_lock_bucket *l =
+ bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b));
+ prefetch(l);
+
+ /* XXX allocating memory with btree locks held - rare */
+ darray_push_gfp(&buckets, ((struct bucket_to_lock) {
+ .b = b, .gen = ptr->gen, .l = l,
+ }), GFP_KERNEL|__GFP_NOFAIL);
+
+ if (ptr->unwritten)
+ op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
+ }
+
+ /* Unlock before taking nocow locks, doing IO: */
+ bkey_reassemble(op->insert_keys.top, k);
+ bch2_trans_unlock(trans);
+
+ bch2_cut_front(op->pos, op->insert_keys.top);
+ if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
+ bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
+
+ darray_for_each(buckets, i) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode);
+
+ __bch2_bucket_nocow_lock(&c->nocow_locks, i->l,
+ bucket_to_u64(i->b),
+ BUCKET_NOCOW_LOCK_UPDATE);
+
+ int gen = bucket_gen_get(ca, i->b.offset);
+ stale = gen < 0 ? gen : gen_after(gen, i->gen);
+ if (unlikely(stale)) {
+ stale_at = i;
+ goto err_bucket_stale;
+ }
+ }
+
+ bio = &op->wbio.bio;
+ if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) {
+ bio = bio_split(bio, k.k->p.offset - op->pos.offset,
+ GFP_KERNEL, &c->bio_write);
+ wbio_init(bio)->put_bio = true;
+ bio->bi_opf = op->wbio.bio.bi_opf;
+ } else {
+ op->flags |= BCH_WRITE_SUBMITTED;
+ }
+
+ op->pos.offset += bio_sectors(bio);
+ op->written += bio_sectors(bio);
+
+ bio->bi_end_io = bch2_write_endio;
+ bio->bi_private = &op->cl;
+ bio->bi_opf |= REQ_OP_WRITE;
+ closure_get(&op->cl);
+ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
+ op->insert_keys.top, true);
+
+ bch2_keylist_push(&op->insert_keys);
+ if (op->flags & BCH_WRITE_SUBMITTED)
+ break;
+ bch2_btree_iter_advance(&iter);
+ }
+out:
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+ bch2_write_op_error(&buf, op);
+ prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ op->error = ret;
+ op->flags |= BCH_WRITE_SUBMITTED;
+ }
+
+ bch2_trans_put(trans);
+ darray_exit(&buckets);
+
+ /* fall back to cow write path? */
+ if (!(op->flags & BCH_WRITE_SUBMITTED)) {
+ closure_sync(&op->cl);
+ __bch2_nocow_write_done(op);
+ op->insert_keys.top = op->insert_keys.keys;
+ } else if (op->flags & BCH_WRITE_SYNC) {
+ closure_sync(&op->cl);
+ bch2_nocow_write_done(&op->cl.work);
+ } else {
+ /*
+ * XXX
+ * needs to run out of process context because ei_quota_lock is
+ * a mutex
+ */
+ continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
+ }
+ return;
+err_get_ioref:
+ darray_for_each(buckets, i)
+ percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref);
+
+ /* Fall back to COW path: */
+ goto out;
+err_bucket_stale:
+ darray_for_each(buckets, i) {
+ bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE);
+ if (i == stale_at)
+ break;
+ }
+
+ struct printbuf buf = PRINTBUF;
+ if (bch2_fs_inconsistent_on(stale < 0, c,
+ "pointer to invalid bucket in nocow path on device %llu\n %s",
+ stale_at->b.inode,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = -EIO;
+ } else {
+ /* We can retry this: */
+ ret = -BCH_ERR_transaction_restart;
+ }
+ printbuf_exit(&buf);
+
+ goto err_get_ioref;
+}
+
+static void __bch2_write(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct write_point *wp = NULL;
+ struct bio *bio = NULL;
+ unsigned nofs_flags;
+ int ret;
+
+ nofs_flags = memalloc_nofs_save();
+
+ if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
+ bch2_nocow_write(op);
+ if (op->flags & BCH_WRITE_SUBMITTED)
+ goto out_nofs_restore;
+ }
+again:
+ memset(&op->failed, 0, sizeof(op->failed));
+
+ do {
+ struct bkey_i *key_to_write;
+ unsigned key_to_write_offset = op->insert_keys.top_p -
+ op->insert_keys.keys_p;
+
+ /* +1 for possible cache device: */
+ if (op->open_buckets.nr + op->nr_replicas + 1 >
+ ARRAY_SIZE(op->open_buckets.v))
+ break;
+
+ if (bch2_keylist_realloc(&op->insert_keys,
+ op->inline_keys,
+ ARRAY_SIZE(op->inline_keys),
+ BKEY_EXTENT_U64s_MAX))
+ break;
+
+ /*
+ * The copygc thread is now global, which means it's no longer
+ * freeing up space on specific disks, which means that
+ * allocations for specific disks may hang arbitrarily long:
+ */
+ ret = bch2_trans_run(c, lockrestart_do(trans,
+ bch2_alloc_sectors_start_trans(trans,
+ op->target,
+ op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
+ op->write_point,
+ &op->devs_have,
+ op->nr_replicas,
+ op->nr_replicas_required,
+ op->watermark,
+ op->flags,
+ &op->cl, &wp)));
+ if (unlikely(ret)) {
+ if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+ break;
+
+ goto err;
+ }
+
+ EBUG_ON(!wp);
+
+ bch2_open_bucket_get(c, wp, &op->open_buckets);
+ ret = bch2_write_extent(op, wp, &bio);
+
+ bch2_alloc_sectors_done_inlined(c, wp);
+err:
+ if (ret <= 0) {
+ op->flags |= BCH_WRITE_SUBMITTED;
+
+ if (unlikely(ret < 0)) {
+ if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_write_op_error(&buf, op);
+ prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret));
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+ op->error = ret;
+ break;
+ }
+ }
+
+ bio->bi_end_io = bch2_write_endio;
+ bio->bi_private = &op->cl;
+ bio->bi_opf |= REQ_OP_WRITE;
+
+ closure_get(bio->bi_private);
+
+ key_to_write = (void *) (op->insert_keys.keys_p +
+ key_to_write_offset);
+
+ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
+ key_to_write, false);
+ } while (ret);
+
+ /*
+ * Sync or no?
+ *
+ * If we're running asynchronously, we may still want to block
+ * synchronously here if we weren't able to submit all of the IO at
+ * once, as that signals backpressure to the caller.
+ */
+ if ((op->flags & BCH_WRITE_SYNC) ||
+ (!(op->flags & BCH_WRITE_SUBMITTED) &&
+ !(op->flags & BCH_WRITE_IN_WORKER))) {
+ bch2_wait_on_allocator(c, &op->cl);
+
+ __bch2_write_index(op);
+
+ if (!(op->flags & BCH_WRITE_SUBMITTED))
+ goto again;
+ bch2_write_done(&op->cl);
+ } else {
+ bch2_write_queue(op, wp);
+ continue_at(&op->cl, bch2_write_index, NULL);
+ }
+out_nofs_restore:
+ memalloc_nofs_restore(nofs_flags);
+}
+
+static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
+{
+ struct bio *bio = &op->wbio.bio;
+ struct bvec_iter iter;
+ struct bkey_i_inline_data *id;
+ unsigned sectors;
+ int ret;
+
+ memset(&op->failed, 0, sizeof(op->failed));
+
+ op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
+ op->flags |= BCH_WRITE_SUBMITTED;
+
+ bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
+
+ ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
+ ARRAY_SIZE(op->inline_keys),
+ BKEY_U64s + DIV_ROUND_UP(data_len, 8));
+ if (ret) {
+ op->error = ret;
+ goto err;
+ }
+
+ sectors = bio_sectors(bio);
+ op->pos.offset += sectors;
+
+ id = bkey_inline_data_init(op->insert_keys.top);
+ id->k.p = op->pos;
+ id->k.bversion = op->version;
+ id->k.size = sectors;
+
+ iter = bio->bi_iter;
+ iter.bi_size = data_len;
+ memcpy_from_bio(id->v.data, bio, iter);
+
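+ /* zero-pad to a multiple of 8 bytes, since bkey values are sized in u64s: */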
+ while (data_len & 7)
+ id->v.data[data_len++] = '\0';
+ set_bkey_val_bytes(&id->k, data_len);
+ bch2_keylist_push(&op->insert_keys);
+
+ __bch2_write_index(op);
+err:
+ bch2_write_done(&op->cl);
+}
+
+/**
+ * bch2_write() - handle a write to a cache device or flash only volume
+ * @cl: &bch_write_op->cl
+ *
+ * This is the starting point for any data to end up in a cache device; it could
+ * be from a normal write, or a writeback write, or a write to a flash only
+ * volume - it's also used by the moving garbage collector to compact data in
+ * mostly empty buckets.
+ *
+ * It first writes the data to the cache, creating a list of keys to be inserted
+ * (if the data won't fit in a single open bucket, there will be multiple keys);
+ * after the data is written it calls bch_journal, and after the keys have been
+ * added to the next journal write they're inserted into the btree.
+ *
+ * If op->discard is true, instead of inserting the data it invalidates the
+ * region of the cache represented by op->bio and op->inode.
+ */
+CLOSURE_CALLBACK(bch2_write)
+{
+ closure_type(op, struct bch_write_op, cl);
+ struct bio *bio = &op->wbio.bio;
+ struct bch_fs *c = op->c;
+ unsigned data_len;
+
+ EBUG_ON(op->cl.parent);
+ BUG_ON(!op->nr_replicas);
+ BUG_ON(!op->write_point.v);
+ BUG_ON(bkey_eq(op->pos, POS_MAX));
+
+ if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
+ op->flags |= BCH_WRITE_ALLOC_NOWAIT;
+
+ op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
+ op->start_time = local_clock();
+ bch2_keylist_init(&op->insert_keys, op->inline_keys);
+ wbio_init(bio)->put_bio = false;
+
+ if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) {
+ struct printbuf buf = PRINTBUF;
+ bch2_write_op_error(&buf, op);
+ prt_printf(&buf, "misaligned write");
+ printbuf_exit(&buf);
+ op->error = -EIO;
+ goto err;
+ }
+
+ if (c->opts.nochanges) {
+ op->error = -BCH_ERR_erofs_no_writes;
+ goto err;
+ }
+
+ if (!(op->flags & BCH_WRITE_MOVE) &&
+ !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
+ op->error = -BCH_ERR_erofs_no_writes;
+ goto err;
+ }
+
+ this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
+ bch2_increment_clock(c, bio_sectors(bio), WRITE);
+
+ data_len = min_t(u64, bio->bi_iter.bi_size,
+ op->new_i_size - (op->pos.offset << 9));
+
+ if (c->opts.inline_data &&
+ data_len <= min(block_bytes(c) / 2, 1024U)) {
+ bch2_write_data_inline(op, data_len);
+ return;
+ }
+
+ __bch2_write(op);
+ return;
+err:
+ bch2_disk_reservation_put(c, &op->res);
+
+ closure_debug_destroy(&op->cl);
+ if (op->end_io)
+ op->end_io(op);
+}
+
+static const char * const bch2_write_flags[] = {
+#define x(f) #f,
+ BCH_WRITE_FLAGS()
+#undef x
+ NULL
+};
+
+void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
+{
+ prt_str(out, "pos: ");
+ bch2_bpos_to_text(out, op->pos);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_str(out, "started: ");
+ bch2_pr_time_units(out, local_clock() - op->start_time);
+ prt_newline(out);
+
+ prt_str(out, "flags: ");
+ prt_bitflags(out, bch2_write_flags, op->flags);
+ prt_newline(out);
+
+ prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl));
+
+ printbuf_indent_sub(out, 2);
+}
+
+void bch2_fs_io_write_exit(struct bch_fs *c)
+{
+ mempool_exit(&c->bio_bounce_pages);
+ bioset_exit(&c->replica_set);
+ bioset_exit(&c->bio_write);
+}
+
+int bch2_fs_io_write_init(struct bch_fs *c)
+{
+ if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) ||
+ bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0))
+ return -BCH_ERR_ENOMEM_bio_write_init;
+
+ if (mempool_init_page_pool(&c->bio_bounce_pages,
+ max_t(unsigned,
+ c->opts.btree_node_size,
+ c->opts.encoded_extent_max) /
+ PAGE_SIZE, 0))
+ return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
+
+ return 0;
+}
diff --git a/libbcachefs/io_write.h b/libbcachefs/io_write.h
new file mode 100644
index 00000000..5400ce94
--- /dev/null
+++ b/libbcachefs/io_write.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_WRITE_H
+#define _BCACHEFS_IO_WRITE_H
+
+#include "checksum.h"
+#include "io_write_types.h"
+
+#define to_wbio(_bio) \
+ container_of((_bio), struct bch_write_bio, bio)
+
+void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
+void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+void bch2_latency_acct(struct bch_dev *, u64, int);
+#else
+static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
+#endif
+
+void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
+ enum bch_data_type, const struct bkey_i *, bool);
+
+#define BCH_WRITE_FLAGS() \
+ x(ALLOC_NOWAIT) \
+ x(CACHED) \
+ x(DATA_ENCODED) \
+ x(PAGES_STABLE) \
+ x(PAGES_OWNED) \
+ x(ONLY_SPECIFIED_DEVS) \
+ x(WROTE_DATA_INLINE) \
+ x(FROM_INTERNAL) \
+ x(CHECK_ENOSPC) \
+ x(SYNC) \
+ x(MOVE) \
+ x(IN_WORKER) \
+ x(SUBMITTED) \
+ x(IO_ERROR) \
+ x(CONVERT_UNWRITTEN)
+
+enum __bch_write_flags {
+#define x(f) __BCH_WRITE_##f,
+ BCH_WRITE_FLAGS()
+#undef x
+};
+
+enum bch_write_flags {
+#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f),
+ BCH_WRITE_FLAGS()
+#undef x
+};
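+
+/*
+ * The x-macros above expand to one enum constant per flag; e.g. for SYNC
+ * (illustrative expansion, not additional code):
+ *
+ *   __BCH_WRITE_SYNC,                          in enum __bch_write_flags
+ *   BCH_WRITE_SYNC = BIT(__BCH_WRITE_SYNC),    in enum bch_write_flags
+ */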
+
+static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
+{
+ return op->watermark == BCH_WATERMARK_copygc
+ ? op->c->copygc_wq
+ : op->c->btree_update_wq;
+}
+
+int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, bool *, s64 *, s64 *);
+int bch2_extent_update(struct btree_trans *, subvol_inum,
+ struct btree_iter *, struct bkey_i *,
+ struct disk_reservation *, u64, s64 *, bool);
+
+static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
+ struct bch_io_opts opts)
+{
+ op->c = c;
+ op->end_io = NULL;
+ op->flags = 0;
+ op->written = 0;
+ op->error = 0;
+ op->csum_type = bch2_data_checksum_type(c, opts);
+ op->compression_opt = opts.compression;
+ op->nr_replicas = 0;
+ op->nr_replicas_required = c->opts.data_replicas_required;
+ op->watermark = BCH_WATERMARK_normal;
+ op->incompressible = 0;
+ op->open_buckets.nr = 0;
+ op->devs_have.nr = 0;
+ op->target = 0;
+ op->opts = opts;
+ op->subvol = 0;
+ op->pos = POS_MAX;
+ op->version = ZERO_VERSION;
+ op->write_point = (struct write_point_specifier) { 0 };
+ op->res = (struct disk_reservation) { 0 };
+ op->new_i_size = U64_MAX;
+ op->i_sectors_delta = 0;
+ op->devs_need_flush = NULL;
+}
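+
+/*
+ * Illustrative usage sketch (hypothetical caller, not part of this patch):
+ *
+ *   bch2_write_op_init(&op, c, opts);
+ *   op.nr_replicas = opts.data_replicas;
+ *   op.subvol      = inum.subvol;
+ *   op.pos         = POS(inum.inum, offset >> 9);
+ *   op.end_io      = my_write_done;
+ *   closure_call(&op.cl, bch2_write, NULL, NULL);
+ *
+ * The data to write is the bio embedded at op.wbio.bio; inum, offset and
+ * my_write_done are placeholder names.
+ */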
+
+CLOSURE_CALLBACK(bch2_write);
+void bch2_write_point_do_index_updates(struct work_struct *);
+
+static inline struct bch_write_bio *wbio_init(struct bio *bio)
+{
+ struct bch_write_bio *wbio = to_wbio(bio);
+
+ memset(&wbio->wbio, 0, sizeof(wbio->wbio));
+ return wbio;
+}
+
+void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *);
+
+void bch2_fs_io_write_exit(struct bch_fs *);
+int bch2_fs_io_write_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_IO_WRITE_H */
diff --git a/libbcachefs/io_write_types.h b/libbcachefs/io_write_types.h
new file mode 100644
index 00000000..6e878a6f
--- /dev/null
+++ b/libbcachefs/io_write_types.h
@@ -0,0 +1,97 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_WRITE_TYPES_H
+#define _BCACHEFS_IO_WRITE_TYPES_H
+
+#include "alloc_types.h"
+#include "btree_types.h"
+#include "buckets_types.h"
+#include "extents_types.h"
+#include "keylist_types.h"
+#include "opts.h"
+#include "super_types.h"
+
+#include <linux/llist.h>
+#include <linux/workqueue.h>
+
+struct bch_write_bio {
+ struct_group(wbio,
+ struct bch_fs *c;
+ struct bch_write_bio *parent;
+
+ u64 submit_time;
+ u64 inode_offset;
+ u64 nocow_bucket;
+
+ struct bch_devs_list failed;
+ u8 dev;
+
+ unsigned split:1,
+ bounce:1,
+ put_bio:1,
+ have_ioref:1,
+ nocow:1,
+ used_mempool:1,
+ first_btree_write:1;
+ );
+
+ struct bio bio;
+};
+
+struct bch_write_op {
+ struct closure cl;
+ struct bch_fs *c;
+ void (*end_io)(struct bch_write_op *);
+ u64 start_time;
+
+ unsigned written; /* sectors */
+ u16 flags;
+ s16 error; /* dio write path expects it to hold -ERESTARTSYS... */
+
+ unsigned compression_opt:8;
+ unsigned csum_type:4;
+ unsigned nr_replicas:4;
+ unsigned nr_replicas_required:4;
+ unsigned watermark:3;
+ unsigned incompressible:1;
+ unsigned stripe_waited:1;
+
+ struct bch_devs_list devs_have;
+ u16 target;
+ u16 nonce;
+ struct bch_io_opts opts;
+
+ u32 subvol;
+ struct bpos pos;
+ struct bversion version;
+
+ /* For BCH_WRITE_DATA_ENCODED: */
+ struct bch_extent_crc_unpacked crc;
+
+ struct write_point_specifier write_point;
+
+ struct write_point *wp;
+ struct list_head wp_list;
+
+ struct disk_reservation res;
+
+ struct open_buckets open_buckets;
+
+ u64 new_i_size;
+ s64 i_sectors_delta;
+
+ struct bch_devs_mask failed;
+
+ struct keylist insert_keys;
+ u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2];
+
+ /*
+ * Bitmask of devices that have had nocow writes issued to them since
+ * last flush:
+ */
+ struct bch_devs_mask *devs_need_flush;
+
+ /* Must be last: */
+ struct bch_write_bio wbio;
+};
+
+#endif /* _BCACHEFS_IO_WRITE_TYPES_H */
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index b4fe27f8..dc665219 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* bcachefs journalling code, for btree insertions
*
@@ -5,341 +6,540 @@
*/
#include "bcachefs.h"
-#include "alloc.h"
+#include "alloc_foreground.h"
#include "bkey_methods.h"
#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
+#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
+#include "journal_sb.h"
#include "journal_seq_blacklist.h"
-#include "super-io.h"
+#include "trace.h"
-#include <trace/events/bcachefs.h>
+static const char * const bch2_journal_errors[] = {
+#define x(n) #n,
+ JOURNAL_ERRORS()
+#undef x
+ NULL
+};
+
+static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
+{
+ return seq > j->seq_ondisk;
+}
+
+static bool __journal_entry_is_open(union journal_res_state state)
+{
+ return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
+}
+
+static inline unsigned nr_unwritten_journal_entries(struct journal *j)
+{
+ return atomic64_read(&j->seq) - j->seq_ondisk;
+}
static bool journal_entry_is_open(struct journal *j)
{
- return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
+ return __journal_entry_is_open(j->reservations);
}
-void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
+static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
{
- struct journal_buf *w = journal_prev_buf(j);
-
- atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count);
-
- if (!need_write_just_set &&
- test_bit(JOURNAL_NEED_WRITE, &j->flags))
- bch2_time_stats_update(j->delay_time,
- j->need_write_time);
-#if 0
- closure_call(&j->io, bch2_journal_write, NULL, NULL);
-#else
- /* Shut sparse up: */
- closure_init(&j->io, NULL);
- set_closure_fn(&j->io, bch2_journal_write, NULL);
- bch2_journal_write(&j->io);
-#endif
+ union journal_res_state s = READ_ONCE(j->reservations);
+ unsigned i = seq & JOURNAL_BUF_MASK;
+ struct journal_buf *buf = j->buf + i;
+
+ prt_printf(out, "seq:\t%llu\n", seq);
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i));
+
+ prt_printf(out, "size:\t");
+ prt_human_readable_u64(out, vstruct_bytes(buf->data));
+ prt_newline(out);
+
+ prt_printf(out, "expires:\t");
+ prt_printf(out, "%li jiffies\n", buf->expires - jiffies);
+
+ prt_printf(out, "flags:\t");
+ if (buf->noflush)
+ prt_str(out, "noflush ");
+ if (buf->must_flush)
+ prt_str(out, "must_flush ");
+ if (buf->separate_flush)
+ prt_str(out, "separate_flush ");
+ if (buf->need_flush_to_write_buffer)
+ prt_str(out, "need_flush_to_write_buffer ");
+ if (buf->write_started)
+ prt_str(out, "write_started ");
+ if (buf->write_allocated)
+ prt_str(out, "write_allocated ");
+ if (buf->write_done)
+ prt_str(out, "write_done");
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
}
-static void journal_pin_new_entry(struct journal *j, int count)
+static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
{
- struct journal_entry_pin_list *p;
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 24);
+
+ for (u64 seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j);
+ seq++)
+ bch2_journal_buf_to_text(out, j, seq);
+ prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed");
+}
- /*
- * The fifo_push() needs to happen at the same time as j->seq is
- * incremented for journal_last_seq() to be calculated correctly
- */
- atomic64_inc(&j->seq);
- p = fifo_push_ref(&j->pin);
+static inline struct journal_buf *
+journal_seq_to_buf(struct journal *j, u64 seq)
+{
+ struct journal_buf *buf = NULL;
+
+ EBUG_ON(seq > journal_cur_seq(j));
- INIT_LIST_HEAD(&p->list);
+ if (journal_seq_unwritten(j, seq)) {
+ buf = j->buf + (seq & JOURNAL_BUF_MASK);
+ EBUG_ON(le64_to_cpu(buf->data->seq) != seq);
+ }
+ return buf;
+}
+
+static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(p->list); i++)
+ INIT_LIST_HEAD(&p->list[i]);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, count);
p->devs.nr = 0;
}
-static void bch2_journal_buf_init(struct journal *j)
+/*
+ * Detect stuck journal conditions and trigger shutdown. Technically the journal
+ * can end up stuck for a variety of reasons, such as a blocked I/O, journal
+ * reservation lockup, etc. Since this is a fatal error with potentially
+ * unpredictable characteristics, we want to be fairly conservative before we
+ * decide to shut things down.
+ *
+ * Consider the journal stuck when it appears full with no ability to commit
+ * btree transactions, to discard journal buckets, or to acquire a priority
+ * (reserved watermark) reservation.
+ */
+static inline bool
+journal_error_check_stuck(struct journal *j, int error, unsigned flags)
{
- struct journal_buf *buf = journal_cur_buf(j);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ bool stuck = false;
+ struct printbuf buf = PRINTBUF;
- memset(buf->has_inode, 0, sizeof(buf->has_inode));
+ if (!(error == JOURNAL_ERR_journal_full ||
+ error == JOURNAL_ERR_journal_pin_full) ||
+ nr_unwritten_journal_entries(j) ||
+ (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim)
+ return stuck;
- memset(buf->data, 0, sizeof(*buf->data));
- buf->data->seq = cpu_to_le64(journal_cur_seq(j));
- buf->data->u64s = 0;
+ spin_lock(&j->lock);
+
+ if (j->can_discard) {
+ spin_unlock(&j->lock);
+ return stuck;
+ }
+
+ stuck = true;
+
+ /*
+ * The journal shutdown path will set ->err_seq, but do it here first to
+ * serialize against concurrent failures and avoid duplicate error
+ * reports.
+ */
+ if (j->err_seq) {
+ spin_unlock(&j->lock);
+ return stuck;
+ }
+ j->err_seq = journal_cur_seq(j);
+ spin_unlock(&j->lock);
+
+ bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)",
+ bch2_journal_errors[error]);
+ bch2_journal_debug_to_text(&buf, j);
+ bch_err(c, "%s", buf.buf);
+
+ printbuf_reset(&buf);
+ bch2_journal_pins_to_text(&buf, j);
+ bch_err(c, "Journal pins:\n%s", buf.buf);
+ printbuf_exit(&buf);
+
+ bch2_fatal_error(c);
+ dump_stack();
+
+ return stuck;
}
-static inline size_t journal_entry_u64s_reserve(struct journal_buf *buf)
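+/*
+ * Kick off journal writes in sequence order: skip entries whose writes are
+ * already underway, stop at an entry whose write is still being set up, and
+ * start the write for the first not-yet-started entry once it has no
+ * outstanding reservations:
+ */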
+void bch2_journal_do_writes(struct journal *j)
{
- return BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
+ for (u64 seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j);
+ seq++) {
+ unsigned idx = seq & JOURNAL_BUF_MASK;
+ struct journal_buf *w = j->buf + idx;
+
+ if (w->write_started && !w->write_allocated)
+ break;
+ if (w->write_started)
+ continue;
+
+ if (!journal_state_count(j->reservations, idx)) {
+ w->write_started = true;
+ closure_call(&w->io, bch2_journal_write, j->wq, NULL);
+ }
+
+ break;
+ }
}
-static inline bool journal_entry_empty(struct jset *j)
+/*
+ * Final processing when the last reference of a journal buffer has been
+ * dropped. Drop the pin list reference acquired at journal entry open and write
+ * the buffer, if requested.
+ */
+void bch2_journal_buf_put_final(struct journal *j, u64 seq)
{
- struct jset_entry *i;
+ lockdep_assert_held(&j->lock);
- if (j->seq != j->last_seq)
- return false;
+ if (__bch2_journal_pin_put(j, seq))
+ bch2_journal_reclaim_fast(j);
+ bch2_journal_do_writes(j);
- vstruct_for_each(j, i)
- if (i->type || i->u64s)
- return false;
- return true;
+ /*
+ * for __bch2_next_write_buffer_flush_journal_buf(), when quiescing an
+ * open journal entry
+ */
+ wake_up(&j->wait);
}
-static enum {
- JOURNAL_ENTRY_ERROR,
- JOURNAL_ENTRY_INUSE,
- JOURNAL_ENTRY_CLOSED,
- JOURNAL_UNLOCKED,
-} journal_buf_switch(struct journal *j, bool need_write_just_set)
+/*
+ * Close the currently open journal entry:
+ *
+ * We don't close a journal_buf until the next journal_buf is finished writing,
+ * and can be opened again - this also initializes the next journal_buf:
+ */
+static void __journal_entry_close(struct journal *j, unsigned closed_val, bool trace)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *buf;
+ struct journal_buf *buf = journal_cur_buf(j);
union journal_res_state old, new;
- u64 v = atomic64_read(&j->reservations.counter);
+ unsigned sectors;
+
+ BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL &&
+ closed_val != JOURNAL_ENTRY_ERROR_VAL);
lockdep_assert_held(&j->lock);
+ old.v = atomic64_read(&j->reservations.counter);
do {
- old.v = new.v = v;
- if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
- return JOURNAL_ENTRY_CLOSED;
+ new.v = old.v;
+ new.cur_entry_offset = closed_val;
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return JOURNAL_ENTRY_ERROR;
-
- if (new.prev_buf_unwritten)
- return JOURNAL_ENTRY_INUSE;
+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ||
+ old.cur_entry_offset == new.cur_entry_offset)
+ return;
+ } while (!atomic64_try_cmpxchg(&j->reservations.counter,
+ &old.v, new.v));
- /*
- * avoid race between setting buf->data->u64s and
- * journal_res_put starting write:
- */
- journal_state_inc(&new);
+ if (!__journal_entry_is_open(old))
+ return;
- new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
- new.idx++;
- new.prev_buf_unwritten = 1;
+ if (old.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL)
+ old.cur_entry_offset = j->cur_entry_offset_if_blocked;
- BUG_ON(journal_state_count(new, new.idx));
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
+ /* Close out old buffer: */
+ buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
- clear_bit(JOURNAL_NEED_WRITE, &j->flags);
+ if (trace_journal_entry_close_enabled() && trace) {
+ struct printbuf pbuf = PRINTBUF;
+ pbuf.atomic++;
- buf = &j->buf[old.idx];
- buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
+ prt_str(&pbuf, "entry size: ");
+ prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data));
+ prt_newline(&pbuf);
+ bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT);
+ trace_journal_entry_close(c, pbuf.buf);
+ printbuf_exit(&pbuf);
+ }
- j->prev_buf_sectors =
- vstruct_blocks_plus(buf->data, c->block_bits,
- journal_entry_u64s_reserve(buf)) *
- c->opts.block_size;
- BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
+ sectors = vstruct_blocks_plus(buf->data, c->block_bits,
+ buf->u64s_reserved) << c->block_bits;
+ BUG_ON(sectors > buf->sectors);
+ buf->sectors = sectors;
- bch2_journal_reclaim_fast(j);
- /* XXX: why set this here, and not in bch2_journal_write()? */
- buf->data->last_seq = cpu_to_le64(journal_last_seq(j));
+ /*
+ * We have to set last_seq here, _before_ opening a new journal entry:
+ *
+ * A thread may replace an old pin with a new pin on its current
+ * journal reservation - the expectation being that the journal will
+ * contain either what the old pin protected or what the new pin
+ * protects.
+ *
+ * After the old pin is dropped journal_last_seq() won't include the old
+ * pin, so we can only write the updated last_seq on the entry that
+ * contains whatever the new pin protects.
+ *
+ * Restated, we can _not_ update last_seq for a given entry if there
+ * could be a newer entry open with reservations/pins that have been
+ * taken against it.
+ *
+ * Hence, we want to update/set last_seq on the current journal entry right
+ * before we open a new one:
+ */
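+ /*
+ * Illustrative (hypothetical) scenario: entries 8-9 are closed but
+ * unwritten and entry 10 is open; a thread re-pins data from seq 7
+ * into its reservation in entry 10 and drops the old pin, so
+ * journal_last_seq() advances past 7. If that newer last_seq were
+ * recorded in entry 9 and we crashed before entry 10 reached disk,
+ * replay would start past the old copy while the new copy was never
+ * written - hence last_seq is only captured here, at close time.
+ */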
+ buf->last_seq = journal_last_seq(j);
+ buf->data->last_seq = cpu_to_le64(buf->last_seq);
+ BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq));
- if (journal_entry_empty(buf->data))
- clear_bit(JOURNAL_NOT_EMPTY, &j->flags);
- else
- set_bit(JOURNAL_NOT_EMPTY, &j->flags);
+ cancel_delayed_work(&j->write_work);
- journal_pin_new_entry(j, 1);
+ bch2_journal_space_available(j);
- bch2_journal_buf_init(j);
+ __bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq));
+}
- cancel_delayed_work(&j->write_work);
+void bch2_journal_halt(struct journal *j)
+{
+ spin_lock(&j->lock);
+ __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true);
+ if (!j->err_seq)
+ j->err_seq = journal_cur_seq(j);
+ journal_wake(j);
spin_unlock(&j->lock);
+}
- if (c->bucket_journal_seq > 1 << 14) {
- c->bucket_journal_seq = 0;
- bch2_bucket_seq_cleanup(c);
- }
+static bool journal_entry_want_write(struct journal *j)
+{
+ bool ret = !journal_entry_is_open(j) ||
+ journal_cur_seq(j) == journal_last_unwritten_seq(j);
- c->bucket_journal_seq++;
+ /* Don't close it yet if we already have a write in flight: */
+ if (ret)
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
+ else if (nr_unwritten_journal_entries(j)) {
+ struct journal_buf *buf = journal_cur_buf(j);
- /* ugh - might be called from __journal_res_get() under wait_event() */
- __set_current_state(TASK_RUNNING);
- bch2_journal_buf_put(j, old.idx, need_write_just_set);
+ if (!buf->flush_time) {
+ buf->flush_time = local_clock() ?: 1;
+ buf->expires = jiffies;
+ }
+ }
- return JOURNAL_UNLOCKED;
+ return ret;
}
-void bch2_journal_halt(struct journal *j)
+bool bch2_journal_entry_close(struct journal *j)
{
- union journal_res_state old, new;
- u64 v = atomic64_read(&j->reservations.counter);
-
- do {
- old.v = new.v = v;
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return;
+ bool ret;
- new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
+ spin_lock(&j->lock);
+ ret = journal_entry_want_write(j);
+ spin_unlock(&j->lock);
- journal_wake(j);
- closure_wake_up(&journal_cur_buf(j)->wait);
- closure_wake_up(&journal_prev_buf(j)->wait);
+ return ret;
}
/*
 * should _only_ be called from journal_res_get() - when we actually want a
* journal reservation - journal entry is open means journal is dirty:
- *
- * returns:
- * 1: success
- * 0: journal currently full (must wait)
- * -EROFS: insufficient rw devices
- * -EIO: journal error
*/
static int journal_entry_open(struct journal *j)
{
- struct journal_buf *buf = journal_cur_buf(j);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_buf *buf = j->buf +
+ ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK);
union journal_res_state old, new;
- ssize_t u64s;
- int sectors;
- u64 v;
+ int u64s;
lockdep_assert_held(&j->lock);
BUG_ON(journal_entry_is_open(j));
+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+
+ if (j->blocked)
+ return JOURNAL_ERR_blocked;
+
+ if (j->cur_entry_error)
+ return j->cur_entry_error;
+
+ if (bch2_journal_error(j))
+ return JOURNAL_ERR_insufficient_devices; /* -EROFS */
if (!fifo_free(&j->pin))
- return 0;
+ return JOURNAL_ERR_journal_pin_full;
+
+ if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
+ return JOURNAL_ERR_max_in_flight;
+
+ if (bch2_fs_fatal_err_on(journal_cur_seq(j) >= JOURNAL_SEQ_MAX,
+ c, "cannot start: journal seq overflow"))
+ return JOURNAL_ERR_insufficient_devices; /* -EROFS */
- sectors = bch2_journal_entry_sectors(j);
- if (sectors <= 0)
- return sectors;
+ BUG_ON(!j->cur_entry_sectors);
- buf->disk_sectors = sectors;
+ buf->expires =
+ (journal_cur_seq(j) == j->flushed_seq_ondisk
+ ? jiffies
+ : j->last_flush_write) +
+ msecs_to_jiffies(c->opts.journal_flush_delay);
- sectors = min_t(unsigned, sectors, buf->size >> 9);
- j->cur_buf_sectors = sectors;
+ buf->u64s_reserved = j->entry_u64s_reserved;
+ buf->disk_sectors = j->cur_entry_sectors;
+ buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9);
- u64s = (sectors << 9) / sizeof(u64);
+ u64s = (int) (buf->sectors << 9) / sizeof(u64) -
+ journal_entry_overhead(j);
+ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
+
+ if (u64s <= (ssize_t) j->early_journal_entries.nr)
+ return JOURNAL_ERR_journal_full;
+
+ if (fifo_empty(&j->pin) && j->reclaim_thread)
+ wake_up_process(j->reclaim_thread);
- /* Subtract the journal header */
- u64s -= sizeof(struct jset) / sizeof(u64);
/*
- * Btree roots, prio pointers don't get added until right before we do
- * the write:
+ * The fifo_push() needs to happen at the same time as j->seq is
+ * incremented for journal_last_seq() to be calculated correctly
*/
- u64s -= journal_entry_u64s_reserve(buf);
- u64s = max_t(ssize_t, 0L, u64s);
+ atomic64_inc(&j->seq);
+ journal_pin_list_init(fifo_push_ref(&j->pin), 1);
- BUG_ON(u64s >= JOURNAL_ENTRY_CLOSED_VAL);
+ BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
- if (u64s <= le32_to_cpu(buf->data->u64s))
- return 0;
+ BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
+
+ bkey_extent_init(&buf->key);
+ buf->noflush = false;
+ buf->must_flush = false;
+ buf->separate_flush = false;
+ buf->flush_time = 0;
+ buf->need_flush_to_write_buffer = true;
+ buf->write_started = false;
+ buf->write_allocated = false;
+ buf->write_done = false;
+
+ memset(buf->data, 0, sizeof(*buf->data));
+ buf->data->seq = cpu_to_le64(journal_cur_seq(j));
+ buf->data->u64s = 0;
+
+ if (j->early_journal_entries.nr) {
+ memcpy(buf->data->_data, j->early_journal_entries.data,
+ j->early_journal_entries.nr * sizeof(u64));
+ le32_add_cpu(&buf->data->u64s, j->early_journal_entries.nr);
+ }
/*
* Must be set before marking the journal entry as open:
*/
j->cur_entry_u64s = u64s;
- v = atomic64_read(&j->reservations.counter);
+ old.v = atomic64_read(&j->reservations.counter);
do {
- old.v = new.v = v;
+ new.v = old.v;
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return -EIO;
+ BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL);
+
+ new.idx++;
+ BUG_ON(journal_state_count(new, new.idx));
+ BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK));
+
+ journal_state_inc(&new);
/* Handle any already added entries */
new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
-
- if (j->res_get_blocked_start)
- bch2_time_stats_update(j->blocked_time,
- j->res_get_blocked_start);
- j->res_get_blocked_start = 0;
+ } while (!atomic64_try_cmpxchg(&j->reservations.counter,
+ &old.v, new.v));
- mod_delayed_work(system_freezable_wq,
- &j->write_work,
- msecs_to_jiffies(j->write_delay_ms));
+ if (nr_unwritten_journal_entries(j) == 1)
+ mod_delayed_work(j->wq,
+ &j->write_work,
+ msecs_to_jiffies(c->opts.journal_flush_delay));
journal_wake(j);
- return 1;
+
+ if (j->early_journal_entries.nr)
+ darray_exit(&j->early_journal_entries);
+ return 0;
}
-/*
- * returns true if there's nothing to flush and no journal write still in flight
- */
-static bool journal_flush_write(struct journal *j)
+static bool journal_quiesced(struct journal *j)
{
- bool ret;
-
- spin_lock(&j->lock);
- ret = !j->reservations.prev_buf_unwritten;
-
- if (!journal_entry_is_open(j)) {
- spin_unlock(&j->lock);
- return ret;
- }
+ bool ret = atomic64_read(&j->seq) == j->seq_ondisk;
- set_bit(JOURNAL_NEED_WRITE, &j->flags);
- if (journal_buf_switch(j, false) == JOURNAL_UNLOCKED)
- ret = false;
- else
- spin_unlock(&j->lock);
+ if (!ret)
+ bch2_journal_entry_close(j);
return ret;
}
-static void journal_write_work(struct work_struct *work)
+static void journal_quiesce(struct journal *j)
{
- struct journal *j = container_of(work, struct journal, write_work.work);
-
- journal_flush_write(j);
+ wait_event(j->wait, journal_quiesced(j));
}
-/*
- * Given an inode number, if that inode number has data in the journal that
- * hasn't yet been flushed, return the journal sequence number that needs to be
- * flushed:
- */
-u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
+static void journal_write_work(struct work_struct *work)
{
- size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
- u64 seq = 0;
-
- if (!test_bit(h, j->buf[0].has_inode) &&
- !test_bit(h, j->buf[1].has_inode))
- return 0;
+ struct journal *j = container_of(work, struct journal, write_work.work);
spin_lock(&j->lock);
- if (test_bit(h, journal_cur_buf(j)->has_inode))
- seq = journal_cur_seq(j);
- else if (test_bit(h, journal_prev_buf(j)->has_inode))
- seq = journal_cur_seq(j) - 1;
- spin_unlock(&j->lock);
+ if (__journal_entry_is_open(j->reservations)) {
+ long delta = journal_cur_buf(j)->expires - jiffies;
- return seq;
+ if (delta > 0)
+ mod_delayed_work(j->wq, &j->write_work, delta);
+ else
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
+ }
+ spin_unlock(&j->lock);
}
static int __journal_res_get(struct journal *j, struct journal_res *res,
- unsigned u64s_min, unsigned u64s_max)
+ unsigned flags)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf;
+ bool can_discard;
int ret;
retry:
- ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
- if (ret)
- return ret;
+ if (journal_res_get_fast(j, res, flags))
+ return 0;
+
+ if (bch2_journal_error(j))
+ return -BCH_ERR_erofs_journal_err;
+
+ if (j->blocked)
+ return -BCH_ERR_journal_res_get_blocked;
+
+ if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
+ ret = JOURNAL_ERR_journal_full;
+ can_discard = j->can_discard;
+ goto out;
+ }
+
+ if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
+ ret = JOURNAL_ERR_max_in_flight;
+ goto out;
+ }
spin_lock(&j->lock);
+
/*
* Recheck after taking the lock, so we don't race with another thread
- * that just did journal_entry_open() and call journal_entry_close()
+ * that just did journal_entry_open() and call bch2_journal_entry_close()
* unnecessarily
*/
- ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
- if (ret) {
- spin_unlock(&j->lock);
- return 1;
+ if (journal_res_get_fast(j, res, flags)) {
+ ret = 0;
+ goto unlock;
}
/*
@@ -349,51 +549,56 @@ retry:
*/
buf = journal_cur_buf(j);
if (journal_entry_is_open(j) &&
- buf->size >> 9 < buf->disk_sectors &&
- buf->size < JOURNAL_ENTRY_SIZE_MAX)
- j->buf_size_want = max(j->buf_size_want, buf->size << 1);
-
- /*
- * Close the current journal entry if necessary, then try to start a new
- * one:
- */
- switch (journal_buf_switch(j, false)) {
- case JOURNAL_ENTRY_ERROR:
- spin_unlock(&j->lock);
- return -EROFS;
- case JOURNAL_ENTRY_INUSE:
- /* haven't finished writing out the previous one: */
- spin_unlock(&j->lock);
- trace_journal_entry_full(c);
- goto blocked;
- case JOURNAL_ENTRY_CLOSED:
- break;
- case JOURNAL_UNLOCKED:
+ buf->buf_size >> 9 < buf->disk_sectors &&
+ buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
+ j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
+
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
+ ret = journal_entry_open(j) ?: JOURNAL_ERR_retry;
+unlock:
+ can_discard = j->can_discard;
+ spin_unlock(&j->lock);
+out:
+ if (ret == JOURNAL_ERR_retry)
goto retry;
- }
+ if (!ret)
+ return 0;
- /* We now have a new, closed journal buf - see if we can open it: */
- ret = journal_entry_open(j);
- spin_unlock(&j->lock);
+ if (journal_error_check_stuck(j, ret, flags))
+ ret = -BCH_ERR_journal_res_get_blocked;
- if (ret < 0)
- return ret;
- if (ret)
- goto retry;
+ if (ret == JOURNAL_ERR_max_in_flight &&
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) {
- /* Journal's full, we have to wait */
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
+ bch2_journal_bufs_to_text(&buf, j);
+ trace_journal_entry_full(c, buf.buf);
+ printbuf_exit(&buf);
+ count_event(c, journal_entry_full);
+ }
/*
- * Direct reclaim - can't rely on reclaim from work item
- * due to freezing..
+ * Journal is full - can't rely on reclaim from work item due to
+ * freezing:
*/
- bch2_journal_reclaim_work(&j->reclaim_work.work);
+ if ((ret == JOURNAL_ERR_journal_full ||
+ ret == JOURNAL_ERR_journal_pin_full) &&
+ !(flags & JOURNAL_RES_GET_NONBLOCK)) {
+ if (can_discard) {
+ bch2_journal_do_discards(j);
+ goto retry;
+ }
- trace_journal_full(c);
-blocked:
- if (!j->res_get_blocked_start)
- j->res_get_blocked_start = local_clock() ?: 1;
- return 0;
+ if (mutex_trylock(&j->reclaim_lock)) {
+ bch2_journal_reclaim(j);
+ mutex_unlock(&j->reclaim_lock);
+ }
+ }
+
+ return ret == JOURNAL_ERR_insufficient_devices
+ ? -BCH_ERR_erofs_journal_err
+ : -BCH_ERR_journal_res_get_blocked;
}
/*
@@ -407,298 +612,373 @@ blocked:
* btree node write locks.
*/
int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
- unsigned u64s_min, unsigned u64s_max)
-{
- int ret;
-
- wait_event(j->wait,
- (ret = __journal_res_get(j, res, u64s_min,
- u64s_max)));
- return ret < 0 ? ret : 0;
-}
-
-u64 bch2_journal_last_unwritten_seq(struct journal *j)
-{
- u64 seq;
-
- spin_lock(&j->lock);
- seq = journal_cur_seq(j);
- if (j->reservations.prev_buf_unwritten)
- seq--;
- spin_unlock(&j->lock);
-
- return seq;
-}
-
-/**
- * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't
- * open yet, or wait if we cannot
- *
- * used by the btree interior update machinery, when it needs to write a new
- * btree root - every journal entry contains the roots of all the btrees, so it
- * doesn't need to bother with getting a journal reservation
- */
-int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *parent)
+ unsigned flags)
{
int ret;
- spin_lock(&j->lock);
- BUG_ON(seq > journal_cur_seq(j));
-
- if (seq < journal_cur_seq(j) ||
- journal_entry_is_open(j)) {
- spin_unlock(&j->lock);
- return 1;
- }
-
- ret = journal_entry_open(j);
- if (!ret)
- closure_wait(&j->async_wait, parent);
- spin_unlock(&j->lock);
-
- if (!ret)
- bch2_journal_reclaim_work(&j->reclaim_work.work);
+ if (closure_wait_event_timeout(&j->async_wait,
+ (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
+ (flags & JOURNAL_RES_GET_NONBLOCK),
+ HZ * 10))
+ return ret;
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct printbuf buf = PRINTBUF;
+ bch2_journal_debug_to_text(&buf, j);
+ bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s",
+ buf.buf);
+ printbuf_exit(&buf);
+
+ closure_wait_event(&j->async_wait,
+ (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
+ (flags & JOURNAL_RES_GET_NONBLOCK));
return ret;
}
-/**
- * bch2_journal_wait_on_seq - wait for a journal entry to be written
- *
- * does _not_ cause @seq to be written immediately - if there is no other
- * activity to cause the relevant journal entry to be filled up or flushed it
- * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is
- * configurable).
- */
-void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent)
-{
- spin_lock(&j->lock);
+/* journal_entry_res: */
- BUG_ON(seq > journal_cur_seq(j));
+void bch2_journal_entry_res_resize(struct journal *j,
+ struct journal_entry_res *res,
+ unsigned new_u64s)
+{
+ union journal_res_state state;
+ int d = new_u64s - res->u64s;
- if (bch2_journal_error(j)) {
- spin_unlock(&j->lock);
- return;
- }
+ spin_lock(&j->lock);
- if (seq == journal_cur_seq(j)) {
- if (!closure_wait(&journal_cur_buf(j)->wait, parent))
- BUG();
- } else if (seq + 1 == journal_cur_seq(j) &&
- j->reservations.prev_buf_unwritten) {
- if (!closure_wait(&journal_prev_buf(j)->wait, parent))
- BUG();
+ j->entry_u64s_reserved += d;
+ if (d <= 0)
+ goto out;
- smp_mb();
+ j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d);
+ smp_mb();
+ state = READ_ONCE(j->reservations);
- /* check if raced with write completion (or failure) */
- if (!j->reservations.prev_buf_unwritten ||
- bch2_journal_error(j))
- closure_wake_up(&journal_prev_buf(j)->wait);
+ if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL &&
+ state.cur_entry_offset > j->cur_entry_u64s) {
+ j->cur_entry_u64s += d;
+ /*
+ * Not enough room in current journal entry, have to flush it:
+ */
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
+ } else {
+ journal_cur_buf(j)->u64s_reserved += d;
}
-
+out:
spin_unlock(&j->lock);
+ res->u64s += d;
}
+/* journal flushing: */
+
/**
* bch2_journal_flush_seq_async - wait for a journal entry to be written
+ * @j: journal object
+ * @seq: seq to flush
+ * @parent: closure object to wait with
+ * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed,
+ * -BCH_ERR_journal_flush_err if @seq will never be flushed
*
- * like bch2_journal_wait_on_seq, except that it triggers a write immediately if
+ * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if
* necessary
*/
-void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent)
+int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
+ struct closure *parent)
{
struct journal_buf *buf;
+ int ret = 0;
+
+ if (seq <= j->flushed_seq_ondisk)
+ return 1;
spin_lock(&j->lock);
- BUG_ON(seq > journal_cur_seq(j));
+ if (WARN_ONCE(seq > journal_cur_seq(j),
+ "requested to flush journal seq %llu, but currently at %llu",
+ seq, journal_cur_seq(j)))
+ goto out;
- if (bch2_journal_error(j)) {
- spin_unlock(&j->lock);
- return;
+ /* Recheck under lock: */
+ if (j->err_seq && seq >= j->err_seq) {
+ ret = -BCH_ERR_journal_flush_err;
+ goto out;
}
- if (seq == journal_cur_seq(j)) {
- bool set_need_write = false;
+ if (seq <= j->flushed_seq_ondisk) {
+ ret = 1;
+ goto out;
+ }
- buf = journal_cur_buf(j);
+ /* if seq was written, but not flushed - flush a newer one instead */
+ seq = max(seq, journal_last_unwritten_seq(j));
- if (parent && !closure_wait(&buf->wait, parent))
- BUG();
+recheck_need_open:
+ if (seq > journal_cur_seq(j)) {
+ struct journal_res res = { 0 };
- if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) {
- j->need_write_time = local_clock();
- set_need_write = true;
- }
+ if (journal_entry_is_open(j))
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
- switch (journal_buf_switch(j, set_need_write)) {
- case JOURNAL_ENTRY_ERROR:
- if (parent)
- closure_wake_up(&buf->wait);
- break;
- case JOURNAL_ENTRY_CLOSED:
- /*
- * Journal entry hasn't been opened yet, but caller
- * claims it has something
- */
- BUG();
- case JOURNAL_ENTRY_INUSE:
- break;
- case JOURNAL_UNLOCKED:
- return;
+ spin_unlock(&j->lock);
+
+ /*
+ * We're called from bch2_journal_flush_seq() -> wait_event();
+ * but this might block. We won't usually block, so we won't
+ * livelock:
+ */
+ sched_annotate_sleep();
+ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+ if (ret)
+ return ret;
+
+ seq = res.seq;
+ buf = journal_seq_to_buf(j, seq);
+ buf->must_flush = true;
+
+ if (!buf->flush_time) {
+ buf->flush_time = local_clock() ?: 1;
+ buf->expires = jiffies;
}
- } else if (parent &&
- seq + 1 == journal_cur_seq(j) &&
- j->reservations.prev_buf_unwritten) {
- buf = journal_prev_buf(j);
- if (!closure_wait(&buf->wait, parent))
+ if (parent && !closure_wait(&buf->wait, parent))
BUG();
- smp_mb();
+ bch2_journal_res_put(j, &res);
- /* check if raced with write completion (or failure) */
- if (!j->reservations.prev_buf_unwritten ||
- bch2_journal_error(j))
- closure_wake_up(&buf->wait);
+ spin_lock(&j->lock);
+ goto want_write;
}
+ /*
+ * If the write was kicked off without a flush, or if we promised it
+ * wouldn't be a flush, flush the next sequence number instead
+ */
+ buf = journal_seq_to_buf(j, seq);
+ if (buf->noflush) {
+ seq++;
+ goto recheck_need_open;
+ }
+
+ buf->must_flush = true;
+
+ if (parent && !closure_wait(&buf->wait, parent))
+ BUG();
+want_write:
+ if (seq == journal_cur_seq(j))
+ journal_entry_want_write(j);
+out:
spin_unlock(&j->lock);
+ return ret;
}
-static int journal_seq_flushed(struct journal *j, u64 seq)
+int bch2_journal_flush_seq(struct journal *j, u64 seq, unsigned task_state)
{
- struct journal_buf *buf;
- int ret = 1;
-
- spin_lock(&j->lock);
- BUG_ON(seq > journal_cur_seq(j));
-
- if (seq == journal_cur_seq(j)) {
- bool set_need_write = false;
+ u64 start_time = local_clock();
+ int ret, ret2;
- ret = 0;
+ /*
+ * Don't update time_stats when @seq is already flushed:
+ */
+ if (seq <= j->flushed_seq_ondisk)
+ return 0;
- buf = journal_cur_buf(j);
+ ret = wait_event_state(j->wait,
+ (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)),
+ task_state);
- if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) {
- j->need_write_time = local_clock();
- set_need_write = true;
- }
+ if (!ret)
+ bch2_time_stats_update(j->flush_seq_time, start_time);
- switch (journal_buf_switch(j, set_need_write)) {
- case JOURNAL_ENTRY_ERROR:
- ret = -EIO;
- break;
- case JOURNAL_ENTRY_CLOSED:
- /*
- * Journal entry hasn't been opened yet, but caller
- * claims it has something
- */
- BUG();
- case JOURNAL_ENTRY_INUSE:
- break;
- case JOURNAL_UNLOCKED:
- return 0;
- }
- } else if (seq + 1 == journal_cur_seq(j) &&
- j->reservations.prev_buf_unwritten) {
- ret = bch2_journal_error(j);
- }
+ return ret ?: ret2 < 0 ? ret2 : 0;
+}
- spin_unlock(&j->lock);
+/*
+ * bch2_journal_flush_async - if there is an open journal entry, or a journal
+ * still being written, write it and wait for the write to complete
+ */
+void bch2_journal_flush_async(struct journal *j, struct closure *parent)
+{
+ bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent);
+}
- return ret;
+int bch2_journal_flush(struct journal *j)
+{
+ return bch2_journal_flush_seq(j, atomic64_read(&j->seq), TASK_UNINTERRUPTIBLE);
}
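+/*
+ * Usage sketch (illustrative only; the surrounding caller is hypothetical): a
+ * caller that needs a specific update to be durable records the sequence
+ * number from its journal reservation and flushes just that entry rather than
+ * the whole journal:
+ *
+ *	u64 seq = res.seq;
+ *	bch2_journal_res_put(j, &res);
+ *	ret = bch2_journal_flush_seq(j, seq, TASK_UNINTERRUPTIBLE);
+ */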
-int bch2_journal_flush_seq(struct journal *j, u64 seq)
+/*
+ * bch2_journal_noflush_seq - tell the journal not to issue any flushes before
+ * @seq
+ */
+bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
{
- u64 start_time = local_clock();
- int ret, ret2;
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ u64 unwritten_seq;
+ bool ret = false;
- ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq)));
+ if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush)))
+ return false;
- bch2_time_stats_update(j->flush_seq_time, start_time);
+ if (seq <= c->journal.flushed_seq_ondisk)
+ return false;
- return ret ?: ret2 < 0 ? ret2 : 0;
+ spin_lock(&j->lock);
+ if (seq <= c->journal.flushed_seq_ondisk)
+ goto out;
+
+ for (unwritten_seq = journal_last_unwritten_seq(j);
+ unwritten_seq < seq;
+ unwritten_seq++) {
+ struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
+
+ /* journal flush already in flight, or flush requested */
+ if (buf->must_flush)
+ goto out;
+
+ buf->noflush = true;
+ }
+
+ ret = true;
+out:
+ spin_unlock(&j->lock);
+ return ret;
}
-/**
- * bch2_journal_meta_async - force a journal entry to be written
- */
-void bch2_journal_meta_async(struct journal *j, struct closure *parent)
+static int __bch2_journal_meta(struct journal *j)
{
- struct journal_res res;
- unsigned u64s = jset_u64s(0);
+ struct journal_res res = {};
+ int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+ if (ret)
+ return ret;
- memset(&res, 0, sizeof(res));
+ struct journal_buf *buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
+ buf->must_flush = true;
+
+ if (!buf->flush_time) {
+ buf->flush_time = local_clock() ?: 1;
+ buf->expires = jiffies;
+ }
- bch2_journal_res_get(j, &res, u64s, u64s);
bch2_journal_res_put(j, &res);
- bch2_journal_flush_seq_async(j, res.seq, parent);
+ return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE);
}
int bch2_journal_meta(struct journal *j)
{
- struct journal_res res;
- unsigned u64s = jset_u64s(0);
- int ret;
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
- memset(&res, 0, sizeof(res));
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal))
+ return -EROFS;
- ret = bch2_journal_res_get(j, &res, u64s, u64s);
- if (ret)
- return ret;
+ int ret = __bch2_journal_meta(j);
+ bch2_write_ref_put(c, BCH_WRITE_REF_journal);
+ return ret;
+}
- bch2_journal_res_put(j, &res);
+/* block/unlock the journal: */
- return bch2_journal_flush_seq(j, res.seq);
+void bch2_journal_unblock(struct journal *j)
+{
+ spin_lock(&j->lock);
+ if (!--j->blocked &&
+ j->cur_entry_offset_if_blocked < JOURNAL_ENTRY_CLOSED_VAL &&
+ j->reservations.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) {
+ union journal_res_state old, new;
+
+ old.v = atomic64_read(&j->reservations.counter);
+ do {
+ new.v = old.v;
+ new.cur_entry_offset = j->cur_entry_offset_if_blocked;
+ } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v));
+ }
+ spin_unlock(&j->lock);
+
+ journal_wake(j);
}
-/*
- * bch2_journal_flush_async - if there is an open journal entry, or a journal
- * still being written, write it and wait for the write to complete
- */
-void bch2_journal_flush_async(struct journal *j, struct closure *parent)
+static void __bch2_journal_block(struct journal *j)
{
- u64 seq, journal_seq;
+ if (!j->blocked++) {
+ union journal_res_state old, new;
- spin_lock(&j->lock);
- journal_seq = journal_cur_seq(j);
+ old.v = atomic64_read(&j->reservations.counter);
+ do {
+ j->cur_entry_offset_if_blocked = old.cur_entry_offset;
- if (journal_entry_is_open(j)) {
- seq = journal_seq;
- } else if (journal_seq) {
- seq = journal_seq - 1;
- } else {
- spin_unlock(&j->lock);
- return;
+ if (j->cur_entry_offset_if_blocked >= JOURNAL_ENTRY_CLOSED_VAL)
+ break;
+
+ new.v = old.v;
+ new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL;
+ } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v));
+
+ journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset);
}
+}
+
+void bch2_journal_block(struct journal *j)
+{
+ spin_lock(&j->lock);
+ __bch2_journal_block(j);
spin_unlock(&j->lock);
- bch2_journal_flush_seq_async(j, seq, parent);
+ journal_quiesce(j);
}
-int bch2_journal_flush(struct journal *j)
+static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j,
+ u64 max_seq, bool *blocked)
{
- u64 seq, journal_seq;
+ struct journal_buf *ret = NULL;
+ /* We're inside wait_event(), but using mutex_lock(): */
+ sched_annotate_sleep();
+ mutex_lock(&j->buf_lock);
spin_lock(&j->lock);
- journal_seq = journal_cur_seq(j);
+ max_seq = min(max_seq, journal_cur_seq(j));
- if (journal_entry_is_open(j)) {
- seq = journal_seq;
- } else if (journal_seq) {
- seq = journal_seq - 1;
- } else {
- spin_unlock(&j->lock);
- return 0;
+ for (u64 seq = journal_last_unwritten_seq(j);
+ seq <= max_seq;
+ seq++) {
+ unsigned idx = seq & JOURNAL_BUF_MASK;
+ struct journal_buf *buf = j->buf + idx;
+
+ if (buf->need_flush_to_write_buffer) {
+ union journal_res_state s;
+ s.v = atomic64_read_acquire(&j->reservations.counter);
+
+ unsigned open = seq == journal_cur_seq(j) && __journal_entry_is_open(s);
+
+ if (open && !*blocked) {
+ __bch2_journal_block(j);
+ *blocked = true;
+ }
+
+ ret = journal_state_count(s, idx) > open
+ ? ERR_PTR(-EAGAIN)
+ : buf;
+ break;
+ }
}
+
spin_unlock(&j->lock);
+ if (IS_ERR_OR_NULL(ret))
+ mutex_unlock(&j->buf_lock);
+ return ret;
+}
+
+struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j,
+ u64 max_seq, bool *blocked)
+{
+ struct journal_buf *ret;
+ *blocked = false;
+
+ wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j,
+ max_seq, blocked)) != ERR_PTR(-EAGAIN));
+ if (IS_ERR_OR_NULL(ret) && *blocked)
+ bch2_journal_unblock(j);
- return bch2_journal_flush_seq(j, seq);
+ return ret;
}
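+/*
+ * Note on the locking contract implied by the code above: when a journal_buf
+ * is returned, j->buf_lock is still held (it is only dropped on the NULL and
+ * error paths), and if *blocked was set the journal remains blocked; the
+ * caller is expected to unlock and unblock once it is done with the buffer.
+ */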
/* allocate journal on a device: */
@@ -708,101 +988,127 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
{
struct bch_fs *c = ca->fs;
struct journal_device *ja = &ca->journal;
- struct bch_sb_field_journal *journal_buckets;
u64 *new_bucket_seq = NULL, *new_buckets = NULL;
+ struct open_bucket **ob = NULL;
+ long *bu = NULL;
+ unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr;
int ret = 0;
- /* don't handle reducing nr of buckets yet: */
- if (nr <= ja->nr)
- return 0;
+ BUG_ON(nr <= ja->nr);
- ret = -ENOMEM;
- new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL);
- new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL);
- if (!new_buckets || !new_bucket_seq)
- goto err;
+ bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL);
+ ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL);
+ new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL);
+ new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL);
+ if (!bu || !ob || !new_buckets || !new_bucket_seq) {
+ ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets;
+ goto err_free;
+ }
- journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
- nr + sizeof(*journal_buckets) / sizeof(u64));
- if (!journal_buckets)
- goto err;
+ for (nr_got = 0; nr_got < nr_want; nr_got++) {
+ enum bch_watermark watermark = new_fs
+ ? BCH_WATERMARK_btree
+ : BCH_WATERMARK_normal;
- if (c)
- spin_lock(&c->journal.lock);
+ ob[nr_got] = bch2_bucket_alloc(c, ca, watermark,
+ BCH_DATA_journal, cl);
+ ret = PTR_ERR_OR_ZERO(ob[nr_got]);
+ if (ret)
+ break;
+
+ if (!new_fs) {
+ ret = bch2_trans_run(c,
+ bch2_trans_mark_metadata_bucket(trans, ca,
+ ob[nr_got]->bucket, BCH_DATA_journal,
+ ca->mi.bucket_size, BTREE_TRIGGER_transactional));
+ if (ret) {
+ bch2_open_bucket_put(c, ob[nr_got]);
+ bch_err_msg(c, ret, "marking new journal buckets");
+ break;
+ }
+ }
+
+ bu[nr_got] = ob[nr_got]->bucket;
+ }
+
+ if (!nr_got)
+ goto err_free;
+
+ /* Don't return an error if we successfully allocated some buckets: */
+ ret = 0;
+
+ if (c) {
+ bch2_journal_flush_all_pins(&c->journal);
+ bch2_journal_block(&c->journal);
+ mutex_lock(&c->sb_lock);
+ }
memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
- swap(new_buckets, ja->buckets);
- swap(new_bucket_seq, ja->bucket_seq);
- if (c)
- spin_unlock(&c->journal.lock);
+ BUG_ON(ja->discard_idx > ja->nr);
- while (ja->nr < nr) {
- struct open_bucket *ob = NULL;
- long bucket;
+ pos = ja->discard_idx ?: ja->nr;
- if (new_fs) {
- percpu_down_read_preempt_disable(&c->usage_lock);
- bucket = bch2_bucket_alloc_new_fs(ca);
- percpu_up_read_preempt_enable(&c->usage_lock);
+ memmove(new_buckets + pos + nr_got,
+ new_buckets + pos,
+ sizeof(new_buckets[0]) * (ja->nr - pos));
+ memmove(new_bucket_seq + pos + nr_got,
+ new_bucket_seq + pos,
+ sizeof(new_bucket_seq[0]) * (ja->nr - pos));
- if (bucket < 0) {
- ret = -ENOSPC;
- goto err;
- }
- } else {
- int ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, cl);
- if (ob_idx < 0) {
- ret = cl ? -EAGAIN : -ENOSPC;
- goto err;
- }
+ for (i = 0; i < nr_got; i++) {
+ new_buckets[pos + i] = bu[i];
+ new_bucket_seq[pos + i] = 0;
+ }
- ob = c->open_buckets + ob_idx;
- bucket = sector_to_bucket(ca, ob->ptr.offset);
- }
+ nr = ja->nr + nr_got;
- if (c) {
- percpu_down_read_preempt_disable(&c->usage_lock);
- spin_lock(&c->journal.lock);
- }
+ ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr);
+ if (ret)
+ goto err_unblock;
- __array_insert_item(ja->buckets, ja->nr, ja->last_idx);
- __array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx);
- __array_insert_item(journal_buckets->buckets, ja->nr, ja->last_idx);
+ bch2_write_super(c);
- ja->buckets[ja->last_idx] = bucket;
- ja->bucket_seq[ja->last_idx] = 0;
- journal_buckets->buckets[ja->last_idx] = cpu_to_le64(bucket);
+ /* Commit: */
+ if (c)
+ spin_lock(&c->journal.lock);
- if (ja->last_idx < ja->nr) {
- if (ja->cur_idx >= ja->last_idx)
- ja->cur_idx++;
- ja->last_idx++;
- }
- ja->nr++;
-
- bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
- ca->mi.bucket_size,
- gc_phase(GC_PHASE_SB),
- new_fs
- ? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE
- : 0);
-
- if (c) {
- spin_unlock(&c->journal.lock);
- percpu_up_read_preempt_enable(&c->usage_lock);
- }
+ swap(new_buckets, ja->buckets);
+ swap(new_bucket_seq, ja->bucket_seq);
+ ja->nr = nr;
+
+ if (pos <= ja->discard_idx)
+ ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr;
+ if (pos <= ja->dirty_idx_ondisk)
+ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr;
+ if (pos <= ja->dirty_idx)
+ ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr;
+ if (pos <= ja->cur_idx)
+ ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr;
- if (!new_fs)
- bch2_open_bucket_put(c, ob);
+ if (c)
+ spin_unlock(&c->journal.lock);
+err_unblock:
+ if (c) {
+ bch2_journal_unblock(&c->journal);
+ mutex_unlock(&c->sb_lock);
}
- ret = 0;
-err:
+ if (ret && !new_fs)
+ for (i = 0; i < nr_got; i++)
+ bch2_trans_run(c,
+ bch2_trans_mark_metadata_bucket(trans, ca,
+ bu[i], BCH_DATA_free, 0,
+ BTREE_TRIGGER_transactional));
+err_free:
+ for (i = 0; i < nr_got; i++)
+ bch2_open_bucket_put(c, ob[i]);
+
kfree(new_bucket_seq);
kfree(new_buckets);
-
+ kfree(ob);
+ kfree(bu);
return ret;
}
@@ -815,77 +1121,111 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
{
struct journal_device *ja = &ca->journal;
struct closure cl;
- unsigned current_nr;
- int ret;
+ int ret = 0;
closure_init_stack(&cl);
- do {
- struct disk_reservation disk_res = { 0, 0 };
+ down_write(&c->state_lock);
- closure_sync(&cl);
+ /* don't handle reducing nr of buckets yet: */
+ if (nr < ja->nr)
+ goto unlock;
- mutex_lock(&c->sb_lock);
- current_nr = ja->nr;
+ while (ja->nr < nr) {
+ struct disk_reservation disk_res = { 0, 0, 0 };
/*
* note: journal buckets aren't really counted as _sectors_ used yet, so
* we don't need the disk reservation to avoid the BUG_ON() in buckets.c
* when space used goes up without a reservation - but we do need the
* reservation to ensure we'll actually be able to allocate:
+ *
+ * XXX: that's not right, disk reservations only ensure a
+ * filesystem-wide allocation will succeed, this is a device
+ * specific allocation - we can hang here:
*/
- if (bch2_disk_reservation_get(c, &disk_res,
- bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
- mutex_unlock(&c->sb_lock);
- return -ENOSPC;
- }
+ ret = bch2_disk_reservation_get(c, &disk_res,
+ bucket_to_sector(ca, nr - ja->nr), 1, 0);
+ if (ret)
+ break;
ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
bch2_disk_reservation_put(c, &disk_res);
- if (ja->nr != current_nr)
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- } while (ret == -EAGAIN);
+ closure_sync(&cl);
+ if (ret && ret != -BCH_ERR_bucket_alloc_blocked)
+ break;
+ }
+
+ bch_err_fn(c, ret);
+unlock:
+ up_write(&c->state_lock);
return ret;
}
-int bch2_dev_journal_alloc(struct bch_dev *ca)
+int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
{
unsigned nr;
+ int ret;
- if (dynamic_fault("bcachefs:add:journal_alloc"))
- return -ENOMEM;
+ if (dynamic_fault("bcachefs:add:journal_alloc")) {
+ ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets;
+ goto err;
+ }
+
+ /* 1/128th of the device by default: */
+ nr = ca->mi.nbuckets >> 7;
/*
- * clamp journal size to 1024 buckets or 512MB (in sectors), whichever
+ * clamp journal size to 8192 buckets or 8GB (in sectors), whichever
* is smaller:
*/
- nr = clamp_t(unsigned, ca->mi.nbuckets >> 8,
+ nr = clamp_t(unsigned, nr,
BCH_JOURNAL_BUCKETS_MIN,
- min(1 << 10,
- (1 << 20) / ca->mi.bucket_size));
+ min(1 << 13,
+ (1 << 24) / ca->mi.bucket_size));
- return __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
+ ret = __bch2_set_nr_journal_buckets(ca, nr, new_fs, NULL);
+err:
+ bch_err_fn(ca, ret);
+ return ret;
+}
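+/*
+ * Sizing example (illustrative): a device with 2^20 buckets of 1MiB (2048
+ * sectors) gets 2^20 >> 7 = 8192 journal buckets by default; the upper clamp,
+ * min(1 << 13, (1 << 24) / 2048) = 8192 buckets, i.e. 8GiB of journal, leaves
+ * that unchanged.
+ */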
+
+int bch2_fs_journal_alloc(struct bch_fs *c)
+{
+ for_each_online_member(c, ca) {
+ if (ca->journal.nr)
+ continue;
+
+ int ret = bch2_dev_journal_alloc(ca, true);
+ if (ret) {
+ percpu_ref_put(&ca->io_ref);
+ return ret;
+ }
+ }
+
+ return 0;
}
/* startup/shutdown: */
static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
{
- union journal_res_state state;
- struct journal_buf *w;
- bool ret;
+ bool ret = false;
+ u64 seq;
spin_lock(&j->lock);
- state = READ_ONCE(j->reservations);
- w = j->buf + !state.idx;
+ for (seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j) && !ret;
+ seq++) {
+ struct journal_buf *buf = journal_seq_to_buf(j, seq);
- ret = state.prev_buf_unwritten &&
- bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), dev_idx);
+ if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx))
+ ret = true;
+ }
spin_unlock(&j->lock);
return ret;
@@ -893,245 +1233,416 @@ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
{
- spin_lock(&j->lock);
- bch2_extent_drop_device(bkey_i_to_s_extent(&j->key), ca->dev_idx);
- spin_unlock(&j->lock);
-
wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx));
}
void bch2_fs_journal_stop(struct journal *j)
{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ if (!test_bit(JOURNAL_running, &j->flags))
+ return;
- wait_event(j->wait, journal_flush_write(j));
+ bch2_journal_reclaim_stop(j);
+ bch2_journal_flush_all_pins(j);
- /* do we need to write another journal entry? */
- if (test_bit(JOURNAL_NOT_EMPTY, &j->flags) ||
- c->btree_roots_dirty)
- bch2_journal_meta(j);
+ wait_event(j->wait, bch2_journal_entry_close(j));
- BUG_ON(!bch2_journal_error(j) &&
- test_bit(JOURNAL_NOT_EMPTY, &j->flags));
+ /*
+ * Always write a new journal entry, to make sure the clock hands are up
+ * to date (and match the superblock)
+ */
+ __bch2_journal_meta(j);
+ journal_quiesce(j);
cancel_delayed_work_sync(&j->write_work);
- cancel_delayed_work_sync(&j->reclaim_work);
+
+ WARN(!bch2_journal_error(j) &&
+ test_bit(JOURNAL_replay_done, &j->flags) &&
+ j->last_empty_seq != journal_cur_seq(j),
+ "journal shutdown error: cur seq %llu but last empty seq %llu",
+ journal_cur_seq(j), j->last_empty_seq);
+
+ if (!bch2_journal_error(j))
+ clear_bit(JOURNAL_running, &j->flags);
}
-void bch2_fs_journal_start(struct journal *j)
+int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
{
- struct journal_seq_blacklist *bl;
- u64 blacklist = 0;
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_entry_pin_list *p;
+ struct journal_replay *i, **_i;
+ struct genradix_iter iter;
+ bool had_entries = false;
+ u64 last_seq = cur_seq, nr, seq;
+
+ if (cur_seq >= JOURNAL_SEQ_MAX) {
+ bch_err(c, "cannot start: journal seq overflow");
+ return -EINVAL;
+ }
+
+ genradix_for_each_reverse(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (journal_replay_ignore(i))
+ continue;
+
+ last_seq = le64_to_cpu(i->j.last_seq);
+ break;
+ }
+
+ nr = cur_seq - last_seq;
+
+ if (nr + 1 > j->pin.size) {
+ free_fifo(&j->pin);
+ init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL);
+ if (!j->pin.data) {
+ bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
+ return -BCH_ERR_ENOMEM_journal_pin_fifo;
+ }
+ }
+
+ j->replay_journal_seq = last_seq;
+ j->replay_journal_seq_end = cur_seq;
+ j->last_seq_ondisk = last_seq;
+ j->flushed_seq_ondisk = cur_seq - 1;
+ j->seq_ondisk = cur_seq - 1;
+ j->pin.front = last_seq;
+ j->pin.back = cur_seq;
+ atomic64_set(&j->seq, cur_seq - 1);
+
+ fifo_for_each_entry_ptr(p, &j->pin, seq)
+ journal_pin_list_init(p, 1);
+
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (journal_replay_ignore(i))
+ continue;
+
+ seq = le64_to_cpu(i->j.seq);
+ BUG_ON(seq >= cur_seq);
+
+ if (seq < last_seq)
+ continue;
+
+ if (journal_entry_empty(&i->j))
+ j->last_empty_seq = le64_to_cpu(i->j.seq);
+
+ p = journal_seq_pin(j, seq);
- list_for_each_entry(bl, &j->seq_blacklist, list)
- blacklist = max(blacklist, bl->end);
+ p->devs.nr = 0;
+ darray_for_each(i->ptrs, ptr)
+ bch2_dev_list_add_dev(&p->devs, ptr->dev);
+
+ had_entries = true;
+ }
+
+ if (!had_entries)
+ j->last_empty_seq = cur_seq - 1; /* to match j->seq */
spin_lock(&j->lock);
- set_bit(JOURNAL_STARTED, &j->flags);
+ set_bit(JOURNAL_running, &j->flags);
+ j->last_flush_write = jiffies;
- while (journal_cur_seq(j) < blacklist)
- journal_pin_new_entry(j, 0);
+ j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
+ j->reservations.unwritten_idx++;
- /*
- * journal_buf_switch() only inits the next journal entry when it
- * closes an open journal entry - the very first journal entry gets
- * initialized here:
- */
- journal_pin_new_entry(j, 1);
- bch2_journal_buf_init(j);
+ c->last_bucket_seq_cleanup = journal_cur_seq(j);
+ bch2_journal_space_available(j);
spin_unlock(&j->lock);
- /*
- * Adding entries to the next journal entry before allocating space on
- * disk for the next journal entry - this is ok, because these entries
- * only have to go down with the next journal entry we write:
- */
- bch2_journal_seq_blacklist_write(j);
-
- queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
+ return bch2_journal_reclaim_start(j);
}
/* init/exit: */
void bch2_dev_journal_exit(struct bch_dev *ca)
{
- kfree(ca->journal.bio);
- kfree(ca->journal.buckets);
- kfree(ca->journal.bucket_seq);
+ struct journal_device *ja = &ca->journal;
- ca->journal.bio = NULL;
- ca->journal.buckets = NULL;
- ca->journal.bucket_seq = NULL;
+ for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
+ kfree(ja->bio[i]);
+ ja->bio[i] = NULL;
+ }
+
+ kfree(ja->buckets);
+ kfree(ja->bucket_seq);
+ ja->buckets = NULL;
+ ja->bucket_seq = NULL;
}
int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
{
struct journal_device *ja = &ca->journal;
struct bch_sb_field_journal *journal_buckets =
- bch2_sb_get_journal(sb);
- unsigned i;
+ bch2_sb_field_get(sb, journal);
+ struct bch_sb_field_journal_v2 *journal_buckets_v2 =
+ bch2_sb_field_get(sb, journal_v2);
- ja->nr = bch2_nr_journal_buckets(journal_buckets);
+ ja->nr = 0;
+
+ if (journal_buckets_v2) {
+ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
+
+ for (unsigned i = 0; i < nr; i++)
+ ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
+ } else if (journal_buckets) {
+ ja->nr = bch2_nr_journal_buckets(journal_buckets);
+ }
ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
if (!ja->bucket_seq)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_dev_journal_init;
+
+ unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
- ca->journal.bio = bio_kmalloc(GFP_KERNEL,
- DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE));
- if (!ca->journal.bio)
- return -ENOMEM;
+ for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
+ ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
+ nr_bvecs), GFP_KERNEL);
+ if (!ja->bio[i])
+ return -BCH_ERR_ENOMEM_dev_journal_init;
+
+ ja->bio[i]->ca = ca;
+ ja->bio[i]->buf_idx = i;
+ bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0);
+ }
ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
if (!ja->buckets)
- return -ENOMEM;
-
- for (i = 0; i < ja->nr; i++)
- ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
+ return -BCH_ERR_ENOMEM_dev_journal_init;
+
+ if (journal_buckets_v2) {
+ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
+ unsigned dst = 0;
+
+ for (unsigned i = 0; i < nr; i++)
+ for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
+ ja->buckets[dst++] =
+ le64_to_cpu(journal_buckets_v2->d[i].start) + j;
+ } else if (journal_buckets) {
+ for (unsigned i = 0; i < ja->nr; i++)
+ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
+ }
return 0;
}
void bch2_fs_journal_exit(struct journal *j)
{
- kvpfree(j->buf[1].data, j->buf[1].size);
- kvpfree(j->buf[0].data, j->buf[0].size);
+ if (j->wq)
+ destroy_workqueue(j->wq);
+
+ darray_exit(&j->early_journal_entries);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
+ kvfree(j->buf[i].data);
free_fifo(&j->pin);
}
int bch2_fs_journal_init(struct journal *j)
{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
static struct lock_class_key res_key;
- int ret = 0;
-
- pr_verbose_init(c->opts, "");
+ mutex_init(&j->buf_lock);
spin_lock_init(&j->lock);
spin_lock_init(&j->err_lock);
init_waitqueue_head(&j->wait);
INIT_DELAYED_WORK(&j->write_work, journal_write_work);
- INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work);
- mutex_init(&j->blacklist_lock);
- INIT_LIST_HEAD(&j->seq_blacklist);
+ init_waitqueue_head(&j->reclaim_wait);
+ init_waitqueue_head(&j->pin_flush_wait);
mutex_init(&j->reclaim_lock);
+ mutex_init(&j->discard_lock);
lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
- j->buf[0].size = JOURNAL_ENTRY_SIZE_MIN;
- j->buf[1].size = JOURNAL_ENTRY_SIZE_MIN;
- j->write_delay_ms = 1000;
- j->reclaim_delay_ms = 100;
-
- bkey_extent_init(&j->key);
-
atomic64_set(&j->reservations.counter,
((union journal_res_state)
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
- if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
- !(j->buf[0].data = kvpmalloc(j->buf[0].size, GFP_KERNEL)) ||
- !(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL))) {
- ret = -ENOMEM;
- goto out;
+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)))
+ return -BCH_ERR_ENOMEM_journal_pin_fifo;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) {
+ j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
+ j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL);
+ if (!j->buf[i].data)
+ return -BCH_ERR_ENOMEM_journal_buf;
+ j->buf[i].idx = i;
}
j->pin.front = j->pin.back = 1;
-out:
- pr_verbose_init(c->opts, "ret %i", ret);
- return ret;
+
+ j->wq = alloc_workqueue("bcachefs_journal",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512);
+ if (!j->wq)
+ return -BCH_ERR_ENOMEM_fs_other_alloc;
+ return 0;
}
/* debug: */
-ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
+static const char * const bch2_journal_flags_strs[] = {
+#define x(n) #n,
+ JOURNAL_FLAGS()
+#undef x
+ NULL
+};
+
+void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- union journal_res_state *s = &j->reservations;
- struct bch_dev *ca;
- unsigned iter;
- ssize_t ret = 0;
+ union journal_res_state s;
+ unsigned long now = jiffies;
+ u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes;
+
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 28);
+ out->atomic++;
rcu_read_lock();
- spin_lock(&j->lock);
+ s = READ_ONCE(j->reservations);
+
+ prt_printf(out, "flags:\t");
+ prt_bitflags(out, bch2_journal_flags_strs, j->flags);
+ prt_newline(out);
+ prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size);
+ prt_printf(out, "seq:\t%llu\n", journal_cur_seq(j));
+ prt_printf(out, "seq_ondisk:\t%llu\n", j->seq_ondisk);
+ prt_printf(out, "last_seq:\t%llu\n", journal_last_seq(j));
+ prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
+ prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
+ prt_printf(out, "watermark:\t%s\n", bch2_watermarks[j->watermark]);
+ prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
+ prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
+ prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
+ prt_printf(out, "average write size:\t");
+ prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0);
+ prt_newline(out);
+ prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
+ prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
+ prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked);
+ prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
+ ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
+ prt_printf(out, "blocked:\t%u\n", j->blocked);
+ prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
+ prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
+ prt_printf(out, "current entry:\t");
+
+ switch (s.cur_entry_offset) {
+ case JOURNAL_ENTRY_ERROR_VAL:
+ prt_printf(out, "error\n");
+ break;
+ case JOURNAL_ENTRY_CLOSED_VAL:
+ prt_printf(out, "closed\n");
+ break;
+ case JOURNAL_ENTRY_BLOCKED_VAL:
+ prt_printf(out, "blocked\n");
+ break;
+ default:
+ prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s);
+ break;
+ }
- ret += scnprintf(buf + ret, PAGE_SIZE - ret,
- "active journal entries:\t%llu\n"
- "seq:\t\t\t%llu\n"
- "last_seq:\t\t%llu\n"
- "last_seq_ondisk:\t%llu\n"
- "reservation count:\t%u\n"
- "reservation offset:\t%u\n"
- "current entry u64s:\t%u\n"
- "io in flight:\t\t%i\n"
- "need write:\t\t%i\n"
- "dirty:\t\t\t%i\n"
- "replay done:\t\t%i\n",
- fifo_used(&j->pin),
- journal_cur_seq(j),
- journal_last_seq(j),
- j->last_seq_ondisk,
- journal_state_count(*s, s->idx),
- s->cur_entry_offset,
- j->cur_entry_u64s,
- s->prev_buf_unwritten,
- test_bit(JOURNAL_NEED_WRITE, &j->flags),
- journal_entry_is_open(j),
- test_bit(JOURNAL_REPLAY_DONE, &j->flags));
-
- for_each_member_device_rcu(ca, c, iter,
- &c->rw_devs[BCH_DATA_JOURNAL]) {
+ prt_printf(out, "unwritten entries:\n");
+ bch2_journal_bufs_to_text(out, j);
+
+ prt_printf(out, "space:\n");
+ printbuf_indent_add(out, 2);
+ prt_printf(out, "discarded\t%u:%u\n",
+ j->space[journal_space_discarded].next_entry,
+ j->space[journal_space_discarded].total);
+ prt_printf(out, "clean ondisk\t%u:%u\n",
+ j->space[journal_space_clean_ondisk].next_entry,
+ j->space[journal_space_clean_ondisk].total);
+ prt_printf(out, "clean\t%u:%u\n",
+ j->space[journal_space_clean].next_entry,
+ j->space[journal_space_clean].total);
+ prt_printf(out, "total\t%u:%u\n",
+ j->space[journal_space_total].next_entry,
+ j->space[journal_space_total].total);
+ printbuf_indent_sub(out, 2);
+
+ for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
struct journal_device *ja = &ca->journal;
+ if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d))
+ continue;
+
if (!ja->nr)
continue;
- ret += scnprintf(buf + ret, PAGE_SIZE - ret,
- "dev %u:\n"
- "\tnr\t\t%u\n"
- "\tcur_idx\t\t%u (seq %llu)\n"
- "\tlast_idx\t%u (seq %llu)\n",
- iter, ja->nr,
- ja->cur_idx, ja->bucket_seq[ja->cur_idx],
- ja->last_idx, ja->bucket_seq[ja->last_idx]);
+ prt_printf(out, "dev %u:\n", ca->dev_idx);
+ printbuf_indent_add(out, 2);
+ prt_printf(out, "nr\t%u\n", ja->nr);
+ prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size);
+ prt_printf(out, "available\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
+ prt_printf(out, "discard_idx\t%u\n", ja->discard_idx);
+ prt_printf(out, "dirty_ondisk\t%u (seq %llu)\n",ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]);
+ prt_printf(out, "dirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]);
+ prt_printf(out, "cur_idx\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
+ printbuf_indent_sub(out, 2);
}
- spin_unlock(&j->lock);
rcu_read_unlock();
- return ret;
+ --out->atomic;
+}
+
+void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
+{
+ spin_lock(&j->lock);
+ __bch2_journal_debug_to_text(out, j);
+ spin_unlock(&j->lock);
}
-ssize_t bch2_journal_print_pins(struct journal *j, char *buf)
+bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *pin;
- ssize_t ret = 0;
- u64 i;
spin_lock(&j->lock);
- fifo_for_each_entry_ptr(pin_list, &j->pin, i) {
- ret += scnprintf(buf + ret, PAGE_SIZE - ret,
- "%llu: count %u\n",
- i, atomic_read(&pin_list->count));
-
- list_for_each_entry(pin, &pin_list->list, list)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret,
- "\t%p %pf\n",
- pin, pin->flush);
-
- if (!list_empty(&pin_list->flushed))
- ret += scnprintf(buf + ret, PAGE_SIZE - ret,
- "flushed:\n");
-
- list_for_each_entry(pin, &pin_list->flushed, list)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret,
- "\t%p %pf\n",
- pin, pin->flush);
+ if (!test_bit(JOURNAL_running, &j->flags)) {
+ spin_unlock(&j->lock);
+ return true;
+ }
+
+ *seq = max(*seq, j->pin.front);
+
+ if (*seq >= j->pin.back) {
+ spin_unlock(&j->lock);
+ return true;
}
+
+ out->atomic++;
+
+ pin_list = journal_seq_pin(j, *seq);
+
+ prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
+ printbuf_indent_add(out, 2);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++)
+ list_for_each_entry(pin, &pin_list->list[i], list)
+ prt_printf(out, "\t%px %ps\n", pin, pin->flush);
+
+ if (!list_empty(&pin_list->flushed))
+ prt_printf(out, "flushed:\n");
+
+ list_for_each_entry(pin, &pin_list->flushed, list)
+ prt_printf(out, "\t%px %ps\n", pin, pin->flush);
+
+ printbuf_indent_sub(out, 2);
+
+ --out->atomic;
spin_unlock(&j->lock);
- return ret;
+ return false;
+}
+
+void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
+{
+ u64 seq = 0;
+
+ while (!bch2_journal_seq_pins_to_text(out, j, &seq))
+ seq++;
}
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 6759810b..a6a2e888 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_JOURNAL_H
#define _BCACHEFS_JOURNAL_H
@@ -28,8 +29,8 @@
*
* Synchronous updates are specified by passing a closure (@flush_cl) to
* bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter
- * down to the journalling code. That closure will will wait on the journal
- * write to complete (via closure_wait()).
+ * down to the journalling code. That closure will wait on the journal write to
+ * complete (via closure_wait()).
*
* If the index update wasn't synchronous, the journal entry will be
* written out after 10 ms have elapsed, by default (the delay_ms field
@@ -125,11 +126,6 @@ static inline struct journal_buf *journal_cur_buf(struct journal *j)
return j->buf + j->reservations.idx;
}
-static inline struct journal_buf *journal_prev_buf(struct journal *j)
-{
- return j->buf + !j->reservations.idx;
-}
-
/* Sequence number of oldest dirty journal entry */
static inline u64 journal_last_seq(struct journal *j)
@@ -139,34 +135,31 @@ static inline u64 journal_last_seq(struct journal *j)
static inline u64 journal_cur_seq(struct journal *j)
{
- BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
-
- return j->pin.back - 1;
+ return atomic64_read(&j->seq);
}
-u64 bch2_inode_journal_seq(struct journal *, u64);
+static inline u64 journal_last_unwritten_seq(struct journal *j)
+{
+ return j->seq_ondisk + 1;
+}
static inline int journal_state_count(union journal_res_state s, int idx)
{
- return idx == 0 ? s.buf0_count : s.buf1_count;
+ switch (idx) {
+ case 0: return s.buf0_count;
+ case 1: return s.buf1_count;
+ case 2: return s.buf2_count;
+ case 3: return s.buf3_count;
+ }
+ BUG();
}
static inline void journal_state_inc(union journal_res_state *s)
{
s->buf0_count += s->idx == 0;
s->buf1_count += s->idx == 1;
-}
-
-static inline void bch2_journal_set_has_inode(struct journal *j,
- struct journal_res *res,
- u64 inum)
-{
- struct journal_buf *buf = &j->buf[res->idx];
- unsigned long bit = hash_64(inum, ilog2(sizeof(buf->has_inode) * 8));
-
- /* avoid atomic op if possible */
- if (unlikely(!test_bit(bit, buf->has_inode)))
- set_bit(bit, buf->has_inode);
+ s->buf2_count += s->idx == 2;
+ s->buf3_count += s->idx == 3;
}
/*
@@ -178,6 +171,11 @@ static inline unsigned jset_u64s(unsigned u64s)
return u64s + sizeof(struct jset_entry) / sizeof(u64);
}
+static inline int journal_entry_overhead(struct journal *j)
+{
+ return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved;
+}
+
static inline struct jset_entry *
bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
{
@@ -192,58 +190,103 @@ bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
return entry;
}
-static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res,
- unsigned type, enum btree_id id,
- unsigned level,
+static inline struct jset_entry *
+journal_res_entry(struct journal *j, struct journal_res *res)
+{
+ return vstruct_idx(j->buf[res->idx].data, res->offset);
+}
+
+static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type,
+ enum btree_id id, unsigned level,
+ unsigned u64s)
+{
+ entry->u64s = cpu_to_le16(u64s);
+ entry->btree_id = id;
+ entry->level = level;
+ entry->type = type;
+ entry->pad[0] = 0;
+ entry->pad[1] = 0;
+ entry->pad[2] = 0;
+ return jset_u64s(u64s);
+}
+
+static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type,
+ enum btree_id id, unsigned level,
const void *data, unsigned u64s)
{
- struct journal_buf *buf = &j->buf[res->idx];
- struct jset_entry *entry = vstruct_idx(buf->data, res->offset);
- unsigned actual = jset_u64s(u64s);
+ unsigned ret = journal_entry_init(entry, type, id, level, u64s);
+
+ memcpy_u64s_small(entry->_data, data, u64s);
+ return ret;
+}
+
+static inline struct jset_entry *
+bch2_journal_add_entry(struct journal *j, struct journal_res *res,
+ unsigned type, enum btree_id id,
+ unsigned level, unsigned u64s)
+{
+ struct jset_entry *entry = journal_res_entry(j, res);
+ unsigned actual = journal_entry_init(entry, type, id, level, u64s);
EBUG_ON(!res->ref);
EBUG_ON(actual > res->u64s);
res->offset += actual;
res->u64s -= actual;
-
- memset(entry, 0, sizeof(*entry));
- entry->u64s = cpu_to_le16(u64s);
- entry->type = type;
- entry->btree_id = id;
- entry->level = level;
- memcpy_u64s(entry->_data, data, u64s);
+ return entry;
}
-static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
- enum btree_id id, const struct bkey_i *k)
+static inline bool journal_entry_empty(struct jset *j)
{
- bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys,
- id, 0, k, k->k.u64s);
-}
+ if (j->seq != j->last_seq)
+ return false;
-void bch2_journal_buf_put_slowpath(struct journal *, bool);
+ vstruct_for_each(j, i)
+ if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s)
+ return false;
+ return true;
+}
-static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
- bool need_write_just_set)
+/*
+ * Drop a reference on a buffer index and return the resulting reservation
+ * state, so the caller can check whether the count has hit zero.
+ */
+static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx)
{
union journal_res_state s;
s.v = atomic64_sub_return(((union journal_res_state) {
.buf0_count = idx == 0,
.buf1_count = idx == 1,
+ .buf2_count = idx == 2,
+ .buf3_count = idx == 3,
}).v, &j->reservations.counter);
+ return s;
+}
+
+bool bch2_journal_entry_close(struct journal *);
+void bch2_journal_do_writes(struct journal *);
+void bch2_journal_buf_put_final(struct journal *, u64);
+
+static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+{
+ union journal_res_state s;
+
+ s = journal_state_buf_put(j, idx);
+ if (!journal_state_count(s, idx))
+ bch2_journal_buf_put_final(j, seq);
+}
- EBUG_ON(s.idx != idx && !s.prev_buf_unwritten);
+static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+{
+ union journal_res_state s;
- /*
- * Do not initiate a journal write if the journal is in an error state
- * (previous journal entry write may have failed)
- */
- if (s.idx != idx &&
- !journal_state_count(s, idx) &&
- s.cur_entry_offset != JOURNAL_ENTRY_ERROR_VAL)
- bch2_journal_buf_put_slowpath(j, need_write_just_set);
+ s = journal_state_buf_put(j, idx);
+ if (!journal_state_count(s, idx)) {
+ spin_lock(&j->lock);
+ bch2_journal_buf_put_final(j, seq);
+ spin_unlock(&j->lock);
+ } else if (unlikely(s.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL))
+ wake_up(&j->wait);
}
/*
@@ -256,85 +299,112 @@ static inline void bch2_journal_res_put(struct journal *j,
if (!res->ref)
return;
- lock_release(&j->res_map, 0, _RET_IP_);
+ lock_release(&j->res_map, _THIS_IP_);
while (res->u64s)
bch2_journal_add_entry(j, res,
BCH_JSET_ENTRY_btree_keys,
- 0, 0, NULL, 0);
+ 0, 0, 0);
- bch2_journal_buf_put(j, res->idx, false);
+ bch2_journal_buf_put(j, res->idx, res->seq);
res->ref = 0;
}
int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
- unsigned, unsigned);
+ unsigned);
+
+/* First bits for BCH_WATERMARK: */
+enum journal_res_flags {
+ __JOURNAL_RES_GET_NONBLOCK = BCH_WATERMARK_BITS,
+ __JOURNAL_RES_GET_CHECK,
+};
+
+#define JOURNAL_RES_GET_NONBLOCK (1 << __JOURNAL_RES_GET_NONBLOCK)
+#define JOURNAL_RES_GET_CHECK (1 << __JOURNAL_RES_GET_CHECK)
static inline int journal_res_get_fast(struct journal *j,
struct journal_res *res,
- unsigned u64s_min,
- unsigned u64s_max)
+ unsigned flags)
{
union journal_res_state old, new;
- u64 v = atomic64_read(&j->reservations.counter);
+ old.v = atomic64_read(&j->reservations.counter);
do {
- old.v = new.v = v;
+ new.v = old.v;
/*
* Check if there is still room in the current journal
* entry:
*/
- if (old.cur_entry_offset + u64s_min > j->cur_entry_u64s)
+ if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
return 0;
- res->offset = old.cur_entry_offset;
- res->u64s = min(u64s_max, j->cur_entry_u64s -
- old.cur_entry_offset);
+ EBUG_ON(!journal_state_count(new, new.idx));
+
+ if ((flags & BCH_WATERMARK_MASK) < j->watermark)
+ return 0;
- journal_state_inc(&new);
new.cur_entry_offset += res->u64s;
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
+ journal_state_inc(&new);
- res->ref = true;
- res->idx = new.idx;
- res->seq = le64_to_cpu(j->buf[res->idx].data->seq);
+ /*
+ * If the refcount would overflow, we have to wait:
+ * XXX - tracepoint this:
+ */
+ if (!journal_state_count(new, new.idx))
+ return 0;
+
+ if (flags & JOURNAL_RES_GET_CHECK)
+ return 1;
+ } while (!atomic64_try_cmpxchg(&j->reservations.counter,
+ &old.v, new.v));
+
+ res->ref = true;
+ res->idx = old.idx;
+ res->offset = old.cur_entry_offset;
+ res->seq = le64_to_cpu(j->buf[old.idx].data->seq);
return 1;
}
static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
- unsigned u64s_min, unsigned u64s_max)
+ unsigned u64s, unsigned flags)
{
int ret;
EBUG_ON(res->ref);
- EBUG_ON(u64s_max < u64s_min);
- EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
+ EBUG_ON(!test_bit(JOURNAL_running, &j->flags));
- if (journal_res_get_fast(j, res, u64s_min, u64s_max))
+ res->u64s = u64s;
+
+ if (journal_res_get_fast(j, res, flags))
goto out;
- ret = bch2_journal_res_get_slowpath(j, res, u64s_min, u64s_max);
+ ret = bch2_journal_res_get_slowpath(j, res, flags);
if (ret)
return ret;
out:
- lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_);
- EBUG_ON(!res->ref);
+ if (!(flags & JOURNAL_RES_GET_CHECK)) {
+ lock_acquire_shared(&j->res_map, 0,
+ (flags & JOURNAL_RES_GET_NONBLOCK) != 0,
+ NULL, _THIS_IP_);
+ EBUG_ON(!res->ref);
+ }
return 0;
}
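+/*
+ * Typical usage (sketch; error handling around the reservation is elided, see
+ * __bch2_journal_meta() in journal.c for a minimal real caller): reserve
+ * space, emit entries into the reservation, then drop the reference:
+ *
+ *	struct journal_res res = {};
+ *	int ret = bch2_journal_res_get(j, &res, jset_u64s(u64s), 0);
+ *	if (ret)
+ *		return ret;
+ *	... fill in entries via bch2_journal_add_entry() ...
+ *	bch2_journal_res_put(j, &res);
+ */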
-u64 bch2_journal_last_unwritten_seq(struct journal *);
-int bch2_journal_open_seq_async(struct journal *, u64, struct closure *);
+/* journal_entry_res: */
-void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *);
-void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
+void bch2_journal_entry_res_resize(struct journal *,
+ struct journal_entry_res *,
+ unsigned);
+
+int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
void bch2_journal_flush_async(struct journal *, struct closure *);
-void bch2_journal_meta_async(struct journal *, struct closure *);
-int bch2_journal_flush_seq(struct journal *, u64);
+int bch2_journal_flush_seq(struct journal *, u64, unsigned);
int bch2_journal_flush(struct journal *);
+bool bch2_journal_noflush_seq(struct journal *, u64);
int bch2_journal_meta(struct journal *);
void bch2_journal_halt(struct journal *);
@@ -342,36 +412,36 @@ void bch2_journal_halt(struct journal *);
static inline int bch2_journal_error(struct journal *j)
{
return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL
- ? -EIO : 0;
+ ? -BCH_ERR_journal_shutdown : 0;
}
struct bch_dev;
-static inline bool journal_flushes_device(struct bch_dev *ca)
-{
- return true;
-}
-
-int bch2_journal_mark(struct bch_fs *, struct list_head *);
-void bch2_journal_entries_free(struct list_head *);
-int bch2_journal_replay(struct bch_fs *, struct list_head *);
-
static inline void bch2_journal_set_replay_done(struct journal *j)
{
- BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
- set_bit(JOURNAL_REPLAY_DONE, &j->flags);
+ BUG_ON(!test_bit(JOURNAL_running, &j->flags));
+ set_bit(JOURNAL_replay_done, &j->flags);
}
-ssize_t bch2_journal_print_debug(struct journal *, char *);
-ssize_t bch2_journal_print_pins(struct journal *, char *);
+void bch2_journal_unblock(struct journal *);
+void bch2_journal_block(struct journal *);
+struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *);
+
+void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
+void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
+void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
+bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
unsigned nr);
-int bch2_dev_journal_alloc(struct bch_dev *);
+int bch2_dev_journal_alloc(struct bch_dev *, bool);
+int bch2_fs_journal_alloc(struct bch_fs *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
+
void bch2_fs_journal_stop(struct journal *);
-void bch2_fs_journal_start(struct journal *);
+int bch2_fs_journal_start(struct journal *, u64);
+
void bch2_dev_journal_exit(struct bch_dev *);
int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
void bch2_fs_journal_exit(struct journal *);
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 8a4e7b2a..1627f3e1 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -1,22 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "alloc.h"
-#include "btree_gc.h"
-#include "btree_update.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "btree_io.h"
+#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "checksum.h"
+#include "disk_groups.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"
+#include "sb-clean.h"
+#include "trace.h"
-#include <trace/events/bcachefs.h>
+#include <linux/string_choices.h>
+
+void bch2_journal_pos_from_member_info_set(struct bch_fs *c)
+{
+ lockdep_assert_held(&c->sb_lock);
+
+ for_each_member_device(c, ca) {
+ struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+
+ m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx);
+ m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free);
+ }
+}
+
+void bch2_journal_pos_from_member_info_resume(struct bch_fs *c)
+{
+ mutex_lock(&c->sb_lock);
+ for_each_member_device(c, ca) {
+ struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);
+
+ unsigned idx = le32_to_cpu(m.last_journal_bucket);
+ if (idx < ca->journal.nr)
+ ca->journal.cur_idx = idx;
+ unsigned offset = le32_to_cpu(m.last_journal_bucket_offset);
+ if (offset <= ca->mi.bucket_size)
+ ca->journal.sectors_free = ca->mi.bucket_size - offset;
+ }
+ mutex_unlock(&c->sb_lock);
+}
+
+void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+ struct journal_replay *j)
+{
+ darray_for_each(j->ptrs, i) {
+ if (i != j->ptrs.data)
+ prt_printf(out, " ");
+ prt_printf(out, "%u:%u:%u (sector %llu)",
+ i->dev, i->bucket, i->bucket_offset, i->sector);
+ }
+}
+
+static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
+ struct journal_replay *j)
+{
+ prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
+
+ bch2_journal_ptrs_to_text(out, c, j);
+
+ for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) {
+ struct jset_entry_datetime *datetime =
+ container_of(entry, struct jset_entry_datetime, entry);
+ bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
+ break;
+ }
+}
+
+static struct nonce journal_nonce(const struct jset *jset)
+{
+ return (struct nonce) {{
+ [0] = 0,
+ [1] = ((__le32 *) &jset->seq)[0],
+ [2] = ((__le32 *) &jset->seq)[1],
+ [3] = BCH_NONCE_JOURNAL,
+ }};
+}
+
+static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum)
+{
+ if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) {
+ *csum = (struct bch_csum) {};
+ return false;
+ }
+
+ *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
+ return !bch2_crc_cmp(j->csum, *csum);
+}
+
+static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
+{
+ return (seq - c->journal_entries_base_seq) & (~0U >> 1);
+}
+
+static void __journal_replay_free(struct bch_fs *c,
+ struct journal_replay *i)
+{
+ struct journal_replay **p =
+ genradix_ptr(&c->journal_entries,
+ journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));
+
+ BUG_ON(*p != i);
+ *p = NULL;
+ kvfree(i);
+}
+
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted)
+{
+ if (blacklisted)
+ i->ignore_blacklisted = true;
+ else
+ i->ignore_not_dirty = true;
+
+ if (!c->opts.read_entire_journal)
+ __journal_replay_free(c, i);
+}
struct journal_list {
struct closure cl;
+ u64 last_seq;
struct mutex lock;
- struct list_head *head;
int ret;
};
@@ -28,82 +137,122 @@ struct journal_list {
* be replayed:
*/
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
+ struct journal_ptr entry_ptr,
struct journal_list *jlist, struct jset *j)
{
- struct journal_replay *i, *pos;
- struct list_head *where;
+ struct genradix_iter iter;
+ struct journal_replay **_i, *i, *dup;
size_t bytes = vstruct_bytes(j);
- __le64 last_seq;
- int ret;
+ u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
+ struct printbuf buf = PRINTBUF;
+ int ret = JOURNAL_ENTRY_ADD_OK;
- last_seq = !list_empty(jlist->head)
- ? list_last_entry(jlist->head, struct journal_replay,
- list)->j.last_seq
- : 0;
+ if (!c->journal.oldest_seq_found_ondisk ||
+ le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk)
+ c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq);
/* Is this entry older than the range we need? */
- if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
- ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
- goto out;
- }
+ if (!c->opts.read_entire_journal &&
+ le64_to_cpu(j->seq) < jlist->last_seq)
+ return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+
+ /*
+ * genradixes are indexed by a ulong, not a u64, so we can't index them
+ * by sequence number directly: Assume instead that they will all fall
+ * within the range of +-2 billion of the first one we find.
+ */
+ if (!c->journal_entries_base_seq)
+ c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);
/* Drop entries we don't need anymore */
- list_for_each_entry_safe(i, pos, jlist->head, list) {
- if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
- break;
- list_del(&i->list);
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
- }
+ if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
+ genradix_for_each_from(&c->journal_entries, iter, _i,
+ journal_entry_radix_idx(c, jlist->last_seq)) {
+ i = *_i;
- list_for_each_entry_reverse(i, jlist->head, list) {
- /* Duplicate? */
- if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
- fsck_err_on(bytes != vstruct_bytes(&i->j) ||
- memcmp(j, &i->j, bytes), c,
- "found duplicate but non identical journal entries (seq %llu)",
- le64_to_cpu(j->seq));
- goto found;
- }
+ if (journal_replay_ignore(i))
+ continue;
+
+ if (le64_to_cpu(i->j.seq) >= last_seq)
+ break;
- if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
- where = &i->list;
- goto add;
+ journal_replay_free(c, i, false);
}
}
- where = jlist->head;
-add:
- i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
- if (!i) {
- ret = -ENOMEM;
+ jlist->last_seq = max(jlist->last_seq, last_seq);
+
+ _i = genradix_ptr_alloc(&c->journal_entries,
+ journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
+ GFP_KERNEL);
+ if (!_i)
+ return -BCH_ERR_ENOMEM_journal_entry_add;
+
+ /*
+ * Duplicate journal entries? If so we want the one that didn't have a
+ * checksum error:
+ */
+ dup = *_i;
+ if (dup) {
+ bool identical = bytes == vstruct_bytes(&dup->j) &&
+ !memcmp(j, &dup->j, bytes);
+ bool not_identical = !identical &&
+ entry_ptr.csum_good &&
+ dup->csum_good;
+
+ bool same_device = false;
+ darray_for_each(dup->ptrs, ptr)
+ if (ptr->dev == ca->dev_idx)
+ same_device = true;
+
+ ret = darray_push(&dup->ptrs, entry_ptr);
+ if (ret)
+ goto out;
+
+ bch2_journal_replay_to_text(&buf, c, dup);
+
+ fsck_err_on(same_device,
+ c, journal_entry_dup_same_device,
+ "duplicate journal entry on same device\n %s",
+ buf.buf);
+
+ fsck_err_on(not_identical,
+ c, journal_entry_replicas_data_mismatch,
+ "found duplicate but non-identical journal entries\n %s",
+ buf.buf);
+
+ if (entry_ptr.csum_good && !identical)
+ goto replace;
+
goto out;
}
+replace:
+ i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
+ if (!i)
+ return -BCH_ERR_ENOMEM_journal_entry_add;
+
+ darray_init(&i->ptrs);
+ i->csum_good = entry_ptr.csum_good;
+ i->ignore_blacklisted = false;
+ i->ignore_not_dirty = false;
+ unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
+
+ if (dup) {
+ /* The first ptr should represent the jset we kept: */
+ darray_for_each(dup->ptrs, ptr)
+ darray_push(&i->ptrs, *ptr);
+ __journal_replay_free(c, dup);
+ } else {
+ darray_push(&i->ptrs, entry_ptr);
+ }
- list_add(&i->list, where);
- i->devs.nr = 0;
- memcpy(&i->j, j, bytes);
-found:
- if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
- bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
- else
- fsck_err_on(1, c, "duplicate journal entries on same device");
- ret = JOURNAL_ENTRY_ADD_OK;
+ *_i = i;
out:
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
-static struct nonce journal_nonce(const struct jset *jset)
-{
- return (struct nonce) {{
- [0] = 0,
- [1] = ((__le32 *) &jset->seq)[0],
- [2] = ((__le32 *) &jset->seq)[1],
- [3] = BCH_NONCE_JOURNAL,
- }};
-}
-
/* this fills in a range with empty jset_entries: */
static void journal_entry_null_range(void *start, void *end)
{
@@ -117,110 +266,184 @@ static void journal_entry_null_range(void *start, void *end)
#define JOURNAL_ENTRY_NONE 6
#define JOURNAL_ENTRY_BAD 7
-#define journal_entry_err(c, msg, ...) \
+static void journal_entry_err_msg(struct printbuf *out,
+ u32 version,
+ struct jset *jset,
+ struct jset_entry *entry)
+{
+ prt_str(out, "invalid journal entry, version=");
+ bch2_version_to_text(out, version);
+
+ if (entry) {
+ prt_str(out, " type=");
+ bch2_prt_jset_entry_type(out, entry->type);
+ }
+
+ if (!jset) {
+ prt_printf(out, " in superblock");
+ } else {
+
+ prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq));
+
+ if (entry)
+ prt_printf(out, " offset=%zi/%u",
+ (u64 *) entry - jset->_data,
+ le32_to_cpu(jset->u64s));
+ }
+
+ prt_str(out, ": ");
+}
+
+#define journal_entry_err(c, version, jset, entry, _err, msg, ...) \
({ \
- switch (write) { \
+ struct printbuf _buf = PRINTBUF; \
+ \
+ journal_entry_err_msg(&_buf, version, jset, entry); \
+ prt_printf(&_buf, msg, ##__VA_ARGS__); \
+ \
+ switch (flags & BCH_VALIDATE_write) { \
case READ: \
- mustfix_fsck_err(c, msg, ##__VA_ARGS__); \
+ mustfix_fsck_err(c, _err, "%s", _buf.buf); \
break; \
case WRITE: \
- bch_err(c, "corrupt metadata before write:\n" \
- msg, ##__VA_ARGS__); \
+ bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \
+ bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\
if (bch2_fs_inconsistent(c)) { \
- ret = BCH_FSCK_ERRORS_NOT_FIXED; \
+ ret = -BCH_ERR_fsck_errors_not_fixed; \
goto fsck_err; \
} \
break; \
} \
+ \
+ printbuf_exit(&_buf); \
true; \
})
-#define journal_entry_err_on(cond, c, msg, ...) \
- ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
+#define journal_entry_err_on(cond, ...) \
+ ((cond) ? journal_entry_err(__VA_ARGS__) : false)
+
+#define FSCK_DELETED_KEY 5
-static int journal_validate_key(struct bch_fs *c, struct jset *jset,
+static int journal_validate_key(struct bch_fs *c,
+ struct jset *jset,
struct jset_entry *entry,
- struct bkey_i *k, enum bkey_type key_type,
- const char *type, int write)
+ struct bkey_i *k,
+ struct bkey_validate_context from,
+ unsigned version, int big_endian)
{
+ enum bch_validate_flags flags = from.flags;
+ int write = flags & BCH_VALIDATE_write;
void *next = vstruct_next(entry);
- const char *invalid;
- char buf[160];
int ret = 0;
- if (journal_entry_err_on(!k->k.u64s, c,
- "invalid %s in journal: k->u64s 0", type)) {
+ if (journal_entry_err_on(!k->k.u64s,
+ c, version, jset, entry,
+ journal_entry_bkey_u64s_0,
+ "k->u64s 0")) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
journal_entry_null_range(vstruct_next(entry), next);
- return 0;
+ return FSCK_DELETED_KEY;
}
if (journal_entry_err_on((void *) bkey_next(k) >
- (void *) vstruct_next(entry), c,
- "invalid %s in journal: extends past end of journal entry",
- type)) {
+ (void *) vstruct_next(entry),
+ c, version, jset, entry,
+ journal_entry_bkey_past_end,
+ "extends past end of journal entry")) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
journal_entry_null_range(vstruct_next(entry), next);
- return 0;
+ return FSCK_DELETED_KEY;
}
- if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
- "invalid %s in journal: bad format %u",
- type, k->k.format)) {
- le16_add_cpu(&entry->u64s, -k->k.u64s);
+ if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
+ c, version, jset, entry,
+ journal_entry_bkey_bad_format,
+ "bad format %u", k->k.format)) {
+ le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
journal_entry_null_range(vstruct_next(entry), next);
- return 0;
+ return FSCK_DELETED_KEY;
}
- if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN)
- bch2_bkey_swab(key_type, NULL, bkey_to_packed(k));
+ if (!write)
+ bch2_bkey_compat(from.level, from.btree, version, big_endian,
+ write, NULL, bkey_to_packed(k));
- invalid = bch2_bkey_invalid(c, key_type, bkey_i_to_s_c(k));
- if (invalid) {
- bch2_bkey_val_to_text(c, key_type, buf, sizeof(buf),
- bkey_i_to_s_c(k));
- mustfix_fsck_err(c, "invalid %s in journal: %s\n%s",
- type, invalid, buf);
-
- le16_add_cpu(&entry->u64s, -k->k.u64s);
+ ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from);
+ if (ret == -BCH_ERR_fsck_delete_bkey) {
+ le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
journal_entry_null_range(vstruct_next(entry), next);
- return 0;
+ return FSCK_DELETED_KEY;
}
+ if (ret)
+ goto fsck_err;
+
+ if (write)
+ bch2_bkey_compat(from.level, from.btree, version, big_endian,
+ write, NULL, bkey_to_packed(k));
fsck_err:
return ret;
}
-static int journal_entry_validate_btree_keys(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- int write)
+static int journal_entry_btree_keys_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bch_validate_flags flags)
{
- struct bkey_i *k;
-
- vstruct_for_each(entry, k) {
- int ret = journal_validate_key(c, jset, entry, k,
- bkey_type(entry->level,
- entry->btree_id),
- "key", write);
- if (ret)
+ struct bkey_i *k = entry->start;
+ struct bkey_validate_context from = {
+ .from = BKEY_VALIDATE_journal,
+ .level = entry->level,
+ .btree = entry->btree_id,
+ .flags = flags|BCH_VALIDATE_journal,
+ };
+
+ while (k != vstruct_last(entry)) {
+ int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
+ if (ret == FSCK_DELETED_KEY)
+ continue;
+ else if (ret)
return ret;
+
+ k = bkey_next(k);
}
return 0;
}
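
journal_validate_key() removes a bad key either by truncating the entry or by memmove()ing the remaining keys down over it; either way the next key now sits at the same position, which is why the loop above continues without calling bkey_next() when it sees FSCK_DELETED_KEY. A compact userspace sketch of that in-place deletion, with a plain int array standing in for the packed keys (names here are illustrative, not bcachefs API):

#include <stdio.h>
#include <string.h>

/* Delete arr[i] by sliding the tail down; the caller keeps scanning at i. */
static size_t delete_at(int *arr, size_t nr, size_t i)
{
	memmove(&arr[i], &arr[i + 1], (nr - i - 1) * sizeof(arr[0]));
	return nr - 1;
}

int main(void)
{
	int keys[] = { 1, -1, -2, 4 };	/* negative == "invalid" key */
	size_t nr = 4, i = 0;

	while (i < nr) {
		if (keys[i] < 0) {
			nr = delete_at(keys, nr, i);
			continue;	/* same index now holds the next key */
		}
		i++;
	}

	for (i = 0; i < nr; i++)
		printf("%d ", keys[i]);
	printf("\n");
	return 0;
}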
-static int journal_entry_validate_btree_root(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- int write)
+static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ bool first = true;
+
+ jset_entry_for_each_key(entry, k) {
+ if (!first) {
+ prt_newline(out);
+ bch2_prt_jset_entry_type(out, entry->type);
+ prt_str(out, ": ");
+ }
+ bch2_btree_id_level_to_text(out, entry->btree_id, entry->level);
+ prt_char(out, ' ');
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
+ first = false;
+ }
+}
+
+static int journal_entry_btree_root_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bch_validate_flags flags)
{
struct bkey_i *k = entry->start;
int ret = 0;
if (journal_entry_err_on(!entry->u64s ||
- le16_to_cpu(entry->u64s) != k->k.u64s, c,
+ le16_to_cpu(entry->u64s) != k->k.u64s,
+ c, version, jset, entry,
+ journal_entry_btree_root_bad_size,
"invalid btree root journal entry: wrong number of keys")) {
void *next = vstruct_next(entry);
/*
@@ -233,29 +456,52 @@ static int journal_entry_validate_btree_root(struct bch_fs *c,
return 0;
}
- return journal_validate_key(c, jset, entry, k, BKEY_TYPE_BTREE,
- "btree root", write);
+ struct bkey_validate_context from = {
+ .from = BKEY_VALIDATE_journal,
+ .level = entry->level + 1,
+ .btree = entry->btree_id,
+ .root = true,
+ .flags = flags,
+ };
+ ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
+ if (ret == FSCK_DELETED_KEY)
+ ret = 0;
fsck_err:
return ret;
}
-static int journal_entry_validate_prio_ptrs(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- int write)
+static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ journal_entry_btree_keys_to_text(out, c, entry);
+}
+
+static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bch_validate_flags flags)
{
/* obsolete, don't care: */
return 0;
}
-static int journal_entry_validate_blacklist(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- int write)
+static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+}
+
+static int journal_entry_blacklist_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bch_validate_flags flags)
{
int ret = 0;
- if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c,
+ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
+ c, version, jset, entry,
+ journal_entry_blacklist_bad_size,
"invalid journal seq blacklist entry: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
}
@@ -263,77 +509,369 @@ fsck_err:
return ret;
}
-static int journal_entry_validate_blacklist_v2(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- int write)
+static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_blacklist *bl =
+ container_of(entry, struct jset_entry_blacklist, entry);
+
+ prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
+}
+
+static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bch_validate_flags flags)
{
struct jset_entry_blacklist_v2 *bl_entry;
int ret = 0;
- if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c,
+ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
+ c, version, jset, entry,
+ journal_entry_blacklist_v2_bad_size,
"invalid journal seq blacklist entry: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
+ goto out;
}
bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
- le64_to_cpu(bl_entry->end), c,
+ le64_to_cpu(bl_entry->end),
+ c, version, jset, entry,
+ journal_entry_blacklist_v2_start_past_end,
"invalid journal seq blacklist entry: start > end")) {
journal_entry_null_range(entry, vstruct_next(entry));
}
+out:
+fsck_err:
+ return ret;
+}
+
+static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_blacklist_v2 *bl =
+ container_of(entry, struct jset_entry_blacklist_v2, entry);
+
+ prt_printf(out, "start=%llu end=%llu",
+ le64_to_cpu(bl->start),
+ le64_to_cpu(bl->end));
+}
+
+static int journal_entry_usage_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bch_validate_flags flags)
+{
+ struct jset_entry_usage *u =
+ container_of(entry, struct jset_entry_usage, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ int ret = 0;
+
+ if (journal_entry_err_on(bytes < sizeof(*u),
+ c, version, jset, entry,
+ journal_entry_usage_bad_size,
+ "invalid journal entry usage: bad size")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+fsck_err:
+ return ret;
+}
+
+static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_usage *u =
+ container_of(entry, struct jset_entry_usage, entry);
+
+ prt_str(out, "type=");
+ bch2_prt_fs_usage_type(out, u->entry.btree_id);
+ prt_printf(out, " v=%llu", le64_to_cpu(u->v));
+}
+
+static int journal_entry_data_usage_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bch_validate_flags flags)
+{
+ struct jset_entry_data_usage *u =
+ container_of(entry, struct jset_entry_data_usage, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ struct printbuf err = PRINTBUF;
+ int ret = 0;
+
+ if (journal_entry_err_on(bytes < sizeof(*u) ||
+ bytes < sizeof(*u) + u->r.nr_devs,
+ c, version, jset, entry,
+ journal_entry_data_usage_bad_size,
+ "invalid journal entry usage: bad size")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ goto out;
+ }
+
+ if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err),
+ c, version, jset, entry,
+ journal_entry_data_usage_bad_size,
+ "invalid journal entry usage: %s", err.buf)) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ goto out;
+ }
+out:
+fsck_err:
+ printbuf_exit(&err);
+ return ret;
+}
+
+static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_data_usage *u =
+ container_of(entry, struct jset_entry_data_usage, entry);
+
+ bch2_replicas_entry_to_text(out, &u->r);
+ prt_printf(out, "=%llu", le64_to_cpu(u->v));
+}
+
+static int journal_entry_clock_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bch_validate_flags flags)
+{
+ struct jset_entry_clock *clock =
+ container_of(entry, struct jset_entry_clock, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ int ret = 0;
+
+ if (journal_entry_err_on(bytes != sizeof(*clock),
+ c, version, jset, entry,
+ journal_entry_clock_bad_size,
+ "bad size")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+ if (journal_entry_err_on(clock->rw > 1,
+ c, version, jset, entry,
+ journal_entry_clock_bad_rw,
+ "bad rw")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
fsck_err:
return ret;
}
+static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_clock *clock =
+ container_of(entry, struct jset_entry_clock, entry);
+
+ prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time));
+}
+
+static int journal_entry_dev_usage_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bch_validate_flags flags)
+{
+ struct jset_entry_dev_usage *u =
+ container_of(entry, struct jset_entry_dev_usage, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ unsigned expected = sizeof(*u);
+ int ret = 0;
+
+ if (journal_entry_err_on(bytes < expected,
+ c, version, jset, entry,
+ journal_entry_dev_usage_bad_size,
+ "bad size (%u < %u)",
+ bytes, expected)) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+ if (journal_entry_err_on(u->pad,
+ c, version, jset, entry,
+ journal_entry_dev_usage_bad_pad,
+ "bad pad")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+fsck_err:
+ return ret;
+}
+
+static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_dev_usage *u =
+ container_of(entry, struct jset_entry_dev_usage, entry);
+ unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
+
+ if (vstruct_bytes(entry) < sizeof(*u))
+ return;
+
+ prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
+
+ printbuf_indent_add(out, 2);
+ for (i = 0; i < nr_types; i++) {
+ prt_newline(out);
+ bch2_prt_data_type(out, i);
+ prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
+ le64_to_cpu(u->d[i].buckets),
+ le64_to_cpu(u->d[i].sectors),
+ le64_to_cpu(u->d[i].fragmented));
+ }
+ printbuf_indent_sub(out, 2);
+}
+
+static int journal_entry_log_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bch_validate_flags flags)
+{
+ return 0;
+}
+
+static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
+
+ prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d);
+}
+
+static int journal_entry_overwrite_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bch_validate_flags flags)
+{
+ return journal_entry_btree_keys_validate(c, jset, entry,
+ version, big_endian, READ);
+}
+
+static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ journal_entry_btree_keys_to_text(out, c, entry);
+}
+
+static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bch_validate_flags flags)
+{
+ return journal_entry_btree_keys_validate(c, jset, entry,
+ version, big_endian, READ);
+}
+
+static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ journal_entry_btree_keys_to_text(out, c, entry);
+}
+
+static int journal_entry_datetime_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bch_validate_flags flags)
+{
+ unsigned bytes = vstruct_bytes(entry);
+ unsigned expected = 16;
+ int ret = 0;
+
+ if (journal_entry_err_on(vstruct_bytes(entry) < expected,
+ c, version, jset, entry,
+ journal_entry_dev_usage_bad_size,
+ "bad size (%u < %u)",
+ bytes, expected)) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+fsck_err:
+ return ret;
+}
+
+static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_datetime *datetime =
+ container_of(entry, struct jset_entry_datetime, entry);
+
+ bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
+}
+
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
- struct jset_entry *, int);
+ struct jset_entry *, unsigned, int,
+ enum bch_validate_flags);
+ void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
};
static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#define x(f, nr) \
[BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \
- .validate = journal_entry_validate_##f, \
+ .validate = journal_entry_##f##_validate, \
+ .to_text = journal_entry_##f##_to_text, \
},
BCH_JSET_ENTRY_TYPES()
#undef x
};
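
The ops table above is generated with the usual x-macro trick: BCH_JSET_ENTRY_TYPES() expands x(name, nr) once per entry type, so adding a type automatically wires up both its _validate and _to_text hooks. A self-contained sketch of the same pattern with made-up entry types:

#include <stdio.h>

#define ENTRY_TYPES()	\
	x(alpha, 0)	\
	x(beta,  1)

struct entry_ops {
	void (*to_text)(void);
};

/* One function definition per type: */
#define x(f, nr) static void entry_##f##_to_text(void) { printf(#f "\n"); }
ENTRY_TYPES()
#undef x

/* One table slot per type, indexed by its number: */
static const struct entry_ops ops[] = {
#define x(f, nr) [nr] = { .to_text = entry_##f##_to_text },
	ENTRY_TYPES()
#undef x
};

int main(void)
{
	for (unsigned i = 0; i < sizeof(ops) / sizeof(ops[0]); i++)
		ops[i].to_text();
	return 0;
}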
-static int journal_entry_validate(struct bch_fs *c, struct jset *jset,
- struct jset_entry *entry, int write)
+int bch2_journal_entry_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bch_validate_flags flags)
{
- int ret = 0;
+ return entry->type < BCH_JSET_ENTRY_NR
+ ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
+ version, big_endian, flags)
+ : 0;
+}
- if (entry->type >= BCH_JSET_ENTRY_NR) {
- journal_entry_err(c, "invalid journal entry type %u",
- entry->type);
- journal_entry_null_range(entry, vstruct_next(entry));
- return 0;
- }
+void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ bch2_prt_jset_entry_type(out, entry->type);
- ret = bch2_jset_entry_ops[entry->type].validate(c, jset, entry, write);
-fsck_err:
- return ret;
+ if (entry->type < BCH_JSET_ENTRY_NR) {
+ prt_str(out, ": ");
+ bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
+ }
}
static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
- int write)
+ enum bch_validate_flags flags)
{
- struct jset_entry *entry;
+ unsigned version = le32_to_cpu(jset->version);
int ret = 0;
vstruct_for_each(jset, entry) {
- if (journal_entry_err_on(vstruct_next(entry) >
- vstruct_last(jset), c,
+ if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
+ c, version, jset, entry,
+ journal_entry_past_jset_end,
"journal entry extends past end of jset")) {
jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
break;
}
- ret = journal_entry_validate(c, jset, entry, write);
+ ret = bch2_journal_entry_validate(c, jset, entry,
+ version, JSET_BIG_ENDIAN(jset), flags);
if (ret)
break;
}
@@ -342,56 +880,94 @@ fsck_err:
}
static int jset_validate(struct bch_fs *c,
+ struct bch_dev *ca,
struct jset *jset, u64 sector,
- unsigned bucket_sectors_left,
- unsigned sectors_read,
- int write)
+ enum bch_validate_flags flags)
{
- size_t bytes = vstruct_bytes(jset);
- struct bch_csum csum;
+ unsigned version;
int ret = 0;
if (le64_to_cpu(jset->magic) != jset_magic(c))
return JOURNAL_ENTRY_NONE;
- if (le32_to_cpu(jset->version) != BCACHE_JSET_VERSION) {
- bch_err(c, "unknown journal entry version %u",
- le32_to_cpu(jset->version));
- return BCH_FSCK_UNKNOWN_VERSION;
+ version = le32_to_cpu(jset->version);
+ if (journal_entry_err_on(!bch2_version_compatible(version),
+ c, version, jset, NULL,
+ jset_unsupported_version,
+ "%s sector %llu seq %llu: incompatible journal entry version %u.%u",
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq),
+ BCH_VERSION_MAJOR(version),
+ BCH_VERSION_MINOR(version))) {
+ /* don't try to continue: */
+ return -EINVAL;
}
- if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
- "journal entry too big (%zu bytes), sector %lluu",
- bytes, sector)) {
- /* XXX: note we might have missing journal entries */
+ if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
+ c, version, jset, NULL,
+ jset_unknown_csum,
+ "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq),
+ JSET_CSUM_TYPE(jset)))
+ ret = JOURNAL_ENTRY_BAD;
+
+ /* last_seq is ignored when JSET_NO_FLUSH is true */
+ if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
+ le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
+ c, version, jset, NULL,
+ jset_last_seq_newer_than_seq,
+ "invalid journal entry: last_seq > seq (%llu > %llu)",
+ le64_to_cpu(jset->last_seq),
+ le64_to_cpu(jset->seq))) {
+ jset->last_seq = jset->seq;
return JOURNAL_ENTRY_BAD;
}
- if (bytes > sectors_read << 9)
- return JOURNAL_ENTRY_REREAD;
+ ret = jset_validate_entries(c, jset, flags);
+fsck_err:
+ return ret;
+}
- if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
- "journal entry with unknown csum type %llu sector %lluu",
- JSET_CSUM_TYPE(jset), sector))
- return JOURNAL_ENTRY_BAD;
+static int jset_validate_early(struct bch_fs *c,
+ struct bch_dev *ca,
+ struct jset *jset, u64 sector,
+ unsigned bucket_sectors_left,
+ unsigned sectors_read)
+{
+ size_t bytes = vstruct_bytes(jset);
+ unsigned version;
+ enum bch_validate_flags flags = BCH_VALIDATE_journal;
+ int ret = 0;
- csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
- if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
- "journal checksum bad, sector %llu", sector)) {
- /* XXX: retry IO, when we start retrying checksum errors */
- /* XXX: note we might have missing journal entries */
- return JOURNAL_ENTRY_BAD;
- }
+ if (le64_to_cpu(jset->magic) != jset_magic(c))
+ return JOURNAL_ENTRY_NONE;
- bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
- jset->encrypted_start,
- vstruct_end(jset) - (void *) jset->encrypted_start);
+ version = le32_to_cpu(jset->version);
+ if (journal_entry_err_on(!bch2_version_compatible(version),
+ c, version, jset, NULL,
+ jset_unsupported_version,
+ "%s sector %llu seq %llu: unknown journal entry version %u.%u",
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq),
+ BCH_VERSION_MAJOR(version),
+ BCH_VERSION_MINOR(version))) {
+ /* don't try to continue: */
+ return -EINVAL;
+ }
- if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
- "invalid journal entry: last_seq > seq"))
- jset->last_seq = jset->seq;
+ if (bytes > (sectors_read << 9) &&
+ sectors_read < bucket_sectors_left)
+ return JOURNAL_ENTRY_REREAD;
- return 0;
+ if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
+ c, version, jset, NULL,
+ jset_past_bucket_end,
+ "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq), bytes))
+ le32_add_cpu(&jset->u64s,
+ -((bytes - (bucket_sectors_left << 9)) / 8));
fsck_err:
return ret;
}
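
The truncation at the end of jset_validate_early() converts the overshoot back into 64-bit words: when an entry claims more bytes than the bucket has left, (bytes - (bucket_sectors_left << 9)) / 8 words are subtracted from jset->u64s, clamping the entry to what could actually fit. A quick standalone check of that arithmetic (the values are made up):

#include <stdio.h>

int main(void)
{
	size_t bytes = 6000;			/* claimed size of the entry */
	unsigned bucket_sectors_left = 8;	/* 8 * 512 = 4096 bytes of room */
	size_t avail = (size_t) bucket_sectors_left << 9;

	/* Same adjustment jset_validate_early() applies to jset->u64s: */
	size_t drop_u64s = (bytes - avail) / 8;

	printf("drop %zu u64s -> entry shrinks by %zu bytes to %zu bytes\n",
	       drop_u64s, drop_u64s * 8, bytes - drop_u64s * 8);
	return 0;
}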
@@ -408,14 +984,14 @@ static int journal_read_buf_realloc(struct journal_read_buf *b,
/* the bios are sized for this many pages, max: */
if (new_size > JOURNAL_ENTRY_SIZE_MAX)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
new_size = roundup_pow_of_two(new_size);
- n = kvpmalloc(new_size, GFP_KERNEL);
+ n = kvmalloc(new_size, GFP_KERNEL);
if (!n)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
- kvpfree(b->data, b->size);
+ kvfree(b->data);
b->data = n;
b->size = new_size;
return 0;
@@ -424,68 +1000,89 @@ static int journal_read_buf_realloc(struct journal_read_buf *b,
static int journal_read_bucket(struct bch_dev *ca,
struct journal_read_buf *buf,
struct journal_list *jlist,
- unsigned bucket, u64 *seq, bool *entries_found)
+ unsigned bucket)
{
struct bch_fs *c = ca->fs;
struct journal_device *ja = &ca->journal;
- struct bio *bio = ja->bio;
struct jset *j = NULL;
unsigned sectors, sectors_read = 0;
u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
end = offset + ca->mi.bucket_size;
- bool saw_bad = false;
+ bool saw_bad = false, csum_good;
+ struct printbuf err = PRINTBUF;
int ret = 0;
pr_debug("reading %u", bucket);
while (offset < end) {
if (!sectors_read) {
-reread: sectors_read = min_t(unsigned,
+ struct bio *bio;
+ unsigned nr_bvecs;
+reread:
+ sectors_read = min_t(unsigned,
end - offset, buf->size >> 9);
+ nr_bvecs = buf_pages(buf->data, sectors_read << 9);
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
- bio->bi_iter.bi_sector = offset;
- bio->bi_iter.bi_size = sectors_read << 9;
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
- bch2_bio_map(bio, buf->data);
+ bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
+ if (!bio)
+ return -BCH_ERR_ENOMEM_journal_read_bucket;
+ bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);
+
+ bio->bi_iter.bi_sector = offset;
+ bch2_bio_map(bio, buf->data, sectors_read << 9);
ret = submit_bio_wait(bio);
+ kfree(bio);
- if (bch2_dev_io_err_on(ret, ca,
- "journal read from sector %llu",
+ if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read,
+ "journal read error: sector %llu",
offset) ||
- bch2_meta_read_fault("journal"))
- return -EIO;
+ bch2_meta_read_fault("journal")) {
+ /*
+ * We don't error out of the recovery process
+ * here, since the relevant journal entry may be
+ * found on a different device, and missing or
+ * no journal entries will be handled later
+ */
+ goto out;
+ }
j = buf->data;
}
- ret = jset_validate(c, j, offset,
- end - offset, sectors_read,
- READ);
+ ret = jset_validate_early(c, ca, j, offset,
+ end - offset, sectors_read);
switch (ret) {
- case BCH_FSCK_OK:
+ case 0:
+ sectors = vstruct_sectors(j, c->block_bits);
break;
case JOURNAL_ENTRY_REREAD:
if (vstruct_bytes(j) > buf->size) {
ret = journal_read_buf_realloc(buf,
vstruct_bytes(j));
if (ret)
- return ret;
+ goto err;
}
goto reread;
case JOURNAL_ENTRY_NONE:
if (!saw_bad)
- return 0;
- sectors = c->opts.block_size;
- goto next_block;
- case JOURNAL_ENTRY_BAD:
- saw_bad = true;
- sectors = c->opts.block_size;
+ goto out;
+ /*
+ * On checksum error we don't really trust the size
+ * field of the journal entry we read, so try reading
+ * again at next block boundary:
+ */
+ sectors = block_sectors(c);
goto next_block;
default:
- return ret;
+ goto err;
+ }
+
+ if (le64_to_cpu(j->seq) > ja->highest_seq_found) {
+ ja->highest_seq_found = le64_to_cpu(j->seq);
+ ja->cur_idx = bucket;
+ ja->sectors_free = ca->mi.bucket_size -
+ bucket_remainder(ca, offset) - sectors;
}
/*
@@ -495,28 +1092,46 @@ reread: sectors_read = min_t(unsigned,
* bucket:
*/
if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
- return 0;
+ goto out;
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
+ enum bch_csum_type csum_type = JSET_CSUM_TYPE(j);
+ struct bch_csum csum;
+ csum_good = jset_csum_good(c, j, &csum);
+
+ if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
+ "%s",
+ (printbuf_reset(&err),
+ prt_str(&err, "journal "),
+ bch2_csum_err_msg(&err, csum_type, j->csum, csum),
+ err.buf)))
+ saw_bad = true;
+
+ ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
+ j->encrypted_start,
+ vstruct_end(j) - (void *) j->encrypted_start);
+ bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret));
+
mutex_lock(&jlist->lock);
- ret = journal_entry_add(c, ca, jlist, j);
+ ret = journal_entry_add(c, ca, (struct journal_ptr) {
+ .csum_good = csum_good,
+ .dev = ca->dev_idx,
+ .bucket = bucket,
+ .bucket_offset = offset -
+ bucket_to_sector(ca, ja->buckets[bucket]),
+ .sector = offset,
+ }, jlist, j);
mutex_unlock(&jlist->lock);
switch (ret) {
case JOURNAL_ENTRY_ADD_OK:
- *entries_found = true;
break;
case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
break;
default:
- return ret;
+ goto err;
}
-
- if (le64_to_cpu(j->seq) > *seq)
- *seq = le64_to_cpu(j->seq);
-
- sectors = vstruct_sectors(j, c->block_bits);
next_block:
pr_debug("next");
offset += sectors;
@@ -524,145 +1139,49 @@ next_block:
j = ((void *) j) + (sectors << 9);
}
- return 0;
+out:
+ ret = 0;
+err:
+ printbuf_exit(&err);
+ return ret;
}
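
When jset_validate_early() reports JOURNAL_ENTRY_REREAD, the entry is larger than what was read into the buffer, so journal_read_bucket() grows the buffer to (at least) the entry's size and reads the same offset again. The shape of that grow-and-retry loop in standalone form (entry_bytes() and the sizes are stand-ins for the real device read):

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for parsing the header: pretends the full entry is 5000 bytes. */
static size_t entry_bytes(void) { return 5000; }

int main(void)
{
	size_t buf_size = 4096;
	void *buf = malloc(buf_size);

	for (;;) {
		/* "read" into buf here ... */
		size_t need = entry_bytes();
		if (need <= buf_size)
			break;

		/* JOURNAL_ENTRY_REREAD: grow the buffer and read again */
		void *n = realloc(buf, need);
		if (!n) {
			free(buf);
			return 1;
		}
		buf = n;
		buf_size = need;
	}

	printf("entry fits in %zu byte buffer\n", buf_size);
	free(buf);
	return 0;
}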
-static void bch2_journal_read_device(struct closure *cl)
+static CLOSURE_CALLBACK(bch2_journal_read_device)
{
-#define read_bucket(b) \
- ({ \
- bool entries_found = false; \
- ret = journal_read_bucket(ca, &buf, jlist, b, &seq, \
- &entries_found); \
- if (ret) \
- goto err; \
- __set_bit(b, bitmap); \
- entries_found; \
- })
-
- struct journal_device *ja =
- container_of(cl, struct journal_device, read);
+ closure_type(ja, struct journal_device, read);
struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
+ struct bch_fs *c = ca->fs;
struct journal_list *jlist =
container_of(cl->parent, struct journal_list, cl);
- struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev);
struct journal_read_buf buf = { NULL, 0 };
-
- DECLARE_BITMAP(bitmap, ja->nr);
- unsigned i, l, r;
- u64 seq = 0;
- int ret;
+ unsigned i;
+ int ret = 0;
if (!ja->nr)
goto out;
- bitmap_zero(bitmap, ja->nr);
ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
if (ret)
goto err;
pr_debug("%u journal buckets", ja->nr);
- /*
- * If the device supports discard but not secure discard, we can't do
- * the fancy fibonacci hash/binary search because the live journal
- * entries might not form a contiguous range:
- */
- for (i = 0; i < ja->nr; i++)
- read_bucket(i);
- goto search_done;
-
- if (!blk_queue_nonrot(q))
- goto linear_scan;
-
- /*
- * Read journal buckets ordered by golden ratio hash to quickly
- * find a sequence of buckets with valid journal entries
- */
for (i = 0; i < ja->nr; i++) {
- l = (i * 2654435769U) % ja->nr;
-
- if (test_bit(l, bitmap))
- break;
-
- if (read_bucket(l))
- goto bsearch;
- }
-
- /*
- * If that fails, check all the buckets we haven't checked
- * already
- */
- pr_debug("falling back to linear search");
-linear_scan:
- for (l = find_first_zero_bit(bitmap, ja->nr);
- l < ja->nr;
- l = find_next_zero_bit(bitmap, ja->nr, l + 1))
- if (read_bucket(l))
- goto bsearch;
-
- /* no journal entries on this device? */
- if (l == ja->nr)
- goto out;
-bsearch:
- /* Binary search */
- r = find_next_bit(bitmap, ja->nr, l + 1);
- pr_debug("starting binary search, l %u r %u", l, r);
-
- while (l + 1 < r) {
- unsigned m = (l + r) >> 1;
- u64 cur_seq = seq;
-
- read_bucket(m);
-
- if (cur_seq != seq)
- l = m;
- else
- r = m;
+ ret = journal_read_bucket(ca, &buf, jlist, i);
+ if (ret)
+ goto err;
}
-search_done:
/*
- * Find the journal bucket with the highest sequence number:
- *
- * If there's duplicate journal entries in multiple buckets (which
- * definitely isn't supposed to happen, but...) - make sure to start
- * cur_idx at the last of those buckets, so we don't deadlock trying to
- * allocate
- */
- seq = 0;
-
- for (i = 0; i < ja->nr; i++)
- if (ja->bucket_seq[i] >= seq &&
- ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) {
- /*
- * When journal_next_bucket() goes to allocate for
- * the first time, it'll use the bucket after
- * ja->cur_idx
- */
- ja->cur_idx = i;
- seq = ja->bucket_seq[i];
- }
-
- /*
- * Set last_idx to indicate the entire journal is full and needs to be
+ * Set dirty_idx to indicate the entire journal is full and needs to be
* reclaimed - journal reclaim will immediately reclaim whatever isn't
* pinned when it first runs:
*/
- ja->last_idx = (ja->cur_idx + 1) % ja->nr;
-
- /*
- * Read buckets in reverse order until we stop finding more journal
- * entries:
- */
- for (i = (ja->cur_idx + ja->nr - 1) % ja->nr;
- i != ja->cur_idx;
- i = (i + ja->nr - 1) % ja->nr)
- if (!test_bit(i, bitmap) &&
- !read_bucket(i))
- break;
+ ja->discard_idx = ja->dirty_idx_ondisk =
+ ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
- kvpfree(buf.data, buf.size);
+ bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
+ kvfree(buf.data);
percpu_ref_put(&ca->io_ref);
closure_return(cl);
return;
@@ -671,76 +1190,33 @@ err:
jlist->ret = ret;
mutex_unlock(&jlist->lock);
goto out;
-#undef read_bucket
}
-void bch2_journal_entries_free(struct list_head *list)
+int bch2_journal_read(struct bch_fs *c,
+ u64 *last_seq,
+ u64 *blacklist_seq,
+ u64 *start_seq)
{
-
- while (!list_empty(list)) {
- struct journal_replay *i =
- list_first_entry(list, struct journal_replay, list);
- list_del(&i->list);
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
- }
-}
-
-int bch2_journal_set_seq(struct bch_fs *c, u64 last_seq, u64 end_seq)
-{
- struct journal *j = &c->journal;
- struct journal_entry_pin_list *p;
- u64 seq, nr = end_seq - last_seq + 1;
-
- if (nr > j->pin.size) {
- free_fifo(&j->pin);
- init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
- if (!j->pin.data) {
- bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
- return -ENOMEM;
- }
- }
-
- atomic64_set(&j->seq, end_seq);
- j->last_seq_ondisk = last_seq;
-
- j->pin.front = last_seq;
- j->pin.back = end_seq + 1;
-
- fifo_for_each_entry_ptr(p, &j->pin, seq) {
- INIT_LIST_HEAD(&p->list);
- INIT_LIST_HEAD(&p->flushed);
- atomic_set(&p->count, 0);
- p->devs.nr = 0;
- }
-
- return 0;
-}
-
-int bch2_journal_read(struct bch_fs *c, struct list_head *list)
-{
- struct journal *j = &c->journal;
struct journal_list jlist;
- struct journal_replay *i;
- struct journal_entry_pin_list *p;
- struct bch_dev *ca;
- u64 cur_seq, end_seq;
- unsigned iter;
- size_t keys = 0, entries = 0;
- bool degraded = false;
+ struct journal_replay *i, **_i, *prev = NULL;
+ struct genradix_iter radix_iter;
+ struct printbuf buf = PRINTBUF;
+ bool degraded = false, last_write_torn = false;
+ u64 seq;
int ret = 0;
closure_init_stack(&jlist.cl);
mutex_init(&jlist.lock);
- jlist.head = list;
+ jlist.last_seq = 0;
jlist.ret = 0;
- for_each_member_device(ca, c, iter) {
- if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL)))
+ for_each_member_device(c, ca) {
+ if (!c->opts.fsck &&
+ !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
continue;
- if ((ca->mi.state == BCH_MEMBER_STATE_RW ||
- ca->mi.state == BCH_MEMBER_STATE_RO) &&
+ if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
+ ca->mi.state == BCH_MEMBER_STATE_ro) &&
percpu_ref_tryget(&ca->io_ref))
closure_call(&ca->journal.read,
bch2_journal_read_device,
@@ -755,639 +1231,860 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
if (jlist.ret)
return jlist.ret;
- if (list_empty(list)){
- bch_err(c, "no journal entries found");
- return BCH_FSCK_REPAIR_IMPOSSIBLE;
- }
-
- list_for_each_entry(i, list, list) {
- ret = jset_validate_entries(c, &i->j, READ);
- if (ret)
- goto fsck_err;
+ *last_seq = 0;
+ *start_seq = 0;
+ *blacklist_seq = 0;
- /*
- * If we're mounting in degraded mode - if we didn't read all
- * the devices - this is wrong:
- */
+ /*
+ * Find most recent flush entry, and ignore newer non flush entries -
+ * those entries will be blacklisted:
+ */
+ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
+ enum bch_validate_flags flags = BCH_VALIDATE_journal;
- if (!degraded &&
- (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
- fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL,
- i->devs), c,
- "superblock not marked as containing replicas (type %u)",
- BCH_DATA_JOURNAL))) {
- ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs);
- if (ret)
- return ret;
- }
- }
+ i = *_i;
- i = list_last_entry(list, struct journal_replay, list);
+ if (journal_replay_ignore(i))
+ continue;
- ret = bch2_journal_set_seq(c,
- le64_to_cpu(i->j.last_seq),
- le64_to_cpu(i->j.seq));
- if (ret)
- return ret;
+ if (!*start_seq)
+ *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;
- mutex_lock(&j->blacklist_lock);
+ if (JSET_NO_FLUSH(&i->j)) {
+ i->ignore_blacklisted = true;
+ continue;
+ }
- list_for_each_entry(i, list, list) {
- p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
+ if (!last_write_torn && !i->csum_good) {
+ last_write_torn = true;
+ i->ignore_blacklisted = true;
+ continue;
+ }
- atomic_set(&p->count, 1);
- p->devs = i->devs;
+ if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
+ c, le32_to_cpu(i->j.version), &i->j, NULL,
+ jset_last_seq_newer_than_seq,
+ "invalid journal entry: last_seq > seq (%llu > %llu)",
+ le64_to_cpu(i->j.last_seq),
+ le64_to_cpu(i->j.seq)))
+ i->j.last_seq = i->j.seq;
+
+ *last_seq = le64_to_cpu(i->j.last_seq);
+ *blacklist_seq = le64_to_cpu(i->j.seq) + 1;
+ break;
+ }
- if (bch2_journal_seq_blacklist_read(j, i)) {
- mutex_unlock(&j->blacklist_lock);
- return -ENOMEM;
- }
+ if (!*start_seq) {
+ bch_info(c, "journal read done, but no entries found");
+ return 0;
}
- mutex_unlock(&j->blacklist_lock);
+ if (!*last_seq) {
+ fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes,
+ "journal read done, but no entries found after dropping non-flushes");
+ return 0;
+ }
- cur_seq = journal_last_seq(j);
- end_seq = le64_to_cpu(list_last_entry(list,
- struct journal_replay, list)->j.seq);
+ bch_info(c, "journal read done, replaying entries %llu-%llu",
+ *last_seq, *blacklist_seq - 1);
- list_for_each_entry(i, list, list) {
- struct jset_entry *entry;
- struct bkey_i *k, *_n;
- bool blacklisted;
+ if (*start_seq != *blacklist_seq)
+ bch_info(c, "dropped unflushed entries %llu-%llu",
+ *blacklist_seq, *start_seq - 1);
- mutex_lock(&j->blacklist_lock);
- while (cur_seq < le64_to_cpu(i->j.seq) &&
- bch2_journal_seq_blacklist_find(j, cur_seq))
- cur_seq++;
+ /* Drop blacklisted entries and entries older than last_seq: */
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
+ i = *_i;
- blacklisted = bch2_journal_seq_blacklist_find(j,
- le64_to_cpu(i->j.seq));
- mutex_unlock(&j->blacklist_lock);
+ if (journal_replay_ignore(i))
+ continue;
- fsck_err_on(blacklisted, c,
- "found blacklisted journal entry %llu",
- le64_to_cpu(i->j.seq));
+ seq = le64_to_cpu(i->j.seq);
+ if (seq < *last_seq) {
+ journal_replay_free(c, i, false);
+ continue;
+ }
- fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c,
- "journal entries %llu-%llu missing! (replaying %llu-%llu)",
- cur_seq, le64_to_cpu(i->j.seq) - 1,
- journal_last_seq(j), end_seq);
+ if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
+ fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
+ jset_seq_blacklisted,
+ "found blacklisted journal entry %llu", seq);
+ i->ignore_blacklisted = true;
+ }
+ }
- cur_seq = le64_to_cpu(i->j.seq) + 1;
+ /* Check for missing entries: */
+ seq = *last_seq;
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
+ i = *_i;
- for_each_jset_key(k, _n, entry, &i->j)
- keys++;
- entries++;
- }
+ if (journal_replay_ignore(i))
+ continue;
- bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
- keys, entries, journal_cur_seq(j));
-fsck_err:
- return ret;
-}
+ BUG_ON(seq > le64_to_cpu(i->j.seq));
-/* journal replay: */
+ while (seq < le64_to_cpu(i->j.seq)) {
+ u64 missing_start, missing_end;
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
-int bch2_journal_mark(struct bch_fs *c, struct list_head *list)
-{
- struct bkey_i *k, *n;
- struct jset_entry *j;
- struct journal_replay *r;
- int ret;
+ while (seq < le64_to_cpu(i->j.seq) &&
+ bch2_journal_seq_is_blacklisted(c, seq, false))
+ seq++;
- list_for_each_entry(r, list, list)
- for_each_jset_key(k, n, j, &r->j) {
- enum bkey_type type = bkey_type(j->level, j->btree_id);
- struct bkey_s_c k_s_c = bkey_i_to_s_c(k);
+ if (seq == le64_to_cpu(i->j.seq))
+ break;
- if (btree_type_has_ptrs(type)) {
- ret = bch2_btree_mark_key_initial(c, type, k_s_c);
- if (ret)
- return ret;
- }
- }
+ missing_start = seq;
- return 0;
-}
+ while (seq < le64_to_cpu(i->j.seq) &&
+ !bch2_journal_seq_is_blacklisted(c, seq, false))
+ seq++;
-int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
-{
- struct journal *j = &c->journal;
- struct journal_entry_pin_list *pin_list;
- struct bkey_i *k, *_n;
- struct jset_entry *entry;
- struct journal_replay *i, *n;
- int ret = 0;
+ if (prev) {
+ bch2_journal_ptrs_to_text(&buf1, c, prev);
+ prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
+ } else
+ prt_printf(&buf1, "(none)");
+ bch2_journal_ptrs_to_text(&buf2, c, i);
- list_for_each_entry_safe(i, n, list, list) {
+ missing_end = seq - 1;
+ fsck_err(c, journal_entries_missing,
+ "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
+ " prev at %s\n"
+ " next at %s, continue?",
+ missing_start, missing_end,
+ *last_seq, *blacklist_seq - 1,
+ buf1.buf, buf2.buf);
- j->replay_journal_seq = le64_to_cpu(i->j.seq);
+ printbuf_exit(&buf1);
+ printbuf_exit(&buf2);
+ }
- for_each_jset_key(k, _n, entry, &i->j) {
+ prev = i;
+ seq++;
+ }
- if (entry->btree_id == BTREE_ID_ALLOC) {
- /*
- * allocation code handles replay for
- * BTREE_ID_ALLOC keys:
- */
- ret = bch2_alloc_replay_key(c, k->k.p);
- } else {
- /*
- * We might cause compressed extents to be
- * split, so we need to pass in a
- * disk_reservation:
- */
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(c, 0);
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
+ struct bch_replicas_padded replicas = {
+ .e.data_type = BCH_DATA_journal,
+ .e.nr_devs = 0,
+ .e.nr_required = 1,
+ };
- ret = bch2_btree_insert(c, entry->btree_id, k,
- &disk_res, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_REPLAY);
- }
+ i = *_i;
+ if (journal_replay_ignore(i))
+ continue;
- if (ret) {
- bch_err(c, "journal replay: error %d while replaying key",
- ret);
- goto err;
- }
+ darray_for_each(i->ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- cond_resched();
+ if (!ptr->csum_good)
+ bch_err_dev_offset(ca, ptr->sector,
+ "invalid journal checksum, seq %llu%s",
+ le64_to_cpu(i->j.seq),
+ i->csum_good ? " (had good copy on another device)" : "");
}
- pin_list = journal_seq_pin(j, j->replay_journal_seq);
+ ret = jset_validate(c,
+ bch2_dev_have_ref(c, i->ptrs.data[0].dev),
+ &i->j,
+ i->ptrs.data[0].sector,
+ READ);
+ if (ret)
+ goto err;
- if (atomic_dec_and_test(&pin_list->count))
- journal_wake(j);
- }
+ darray_for_each(i->ptrs, ptr)
+ replicas_entry_add_dev(&replicas.e, ptr->dev);
- j->replay_journal_seq = 0;
+ bch2_replicas_entry_sort(&replicas.e);
- bch2_journal_set_replay_done(j);
- bch2_journal_flush_all_pins(j);
- ret = bch2_journal_error(j);
+ printbuf_reset(&buf);
+ bch2_replicas_entry_to_text(&buf, &replicas.e);
+
+ if (!degraded &&
+ !bch2_replicas_marked(c, &replicas.e) &&
+ (le64_to_cpu(i->j.seq) == *last_seq ||
+ fsck_err(c, journal_entry_replicas_not_marked,
+ "superblock not marked as containing replicas for journal entry %llu\n %s",
+ le64_to_cpu(i->j.seq), buf.buf))) {
+ ret = bch2_mark_replicas(c, &replicas.e);
+ if (ret)
+ goto err;
+ }
+ }
err:
- bch2_journal_entries_free(list);
+fsck_err:
+ printbuf_exit(&buf);
return ret;
}
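
The read path above establishes three sequence numbers: *start_seq is one past the newest entry found, *blacklist_seq is one past the newest flush entry, and *last_seq is that flush entry's last_seq. Entries in [blacklist_seq, start_seq) are the unflushed tail and get ignored/blacklisted; anything below last_seq is freed. A toy walk over an array of (seq, last_seq, no_flush, csum_good) records, scanning newest-to-oldest the same way (purely illustrative, not the kernel types):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct rec { uint64_t seq, last_seq; bool no_flush, csum_good; };

int main(void)
{
	/* Oldest to newest, as they would sit in the radix tree: */
	struct rec r[] = {
		{ 100, 90, false, true },
		{ 101, 92, false, true },
		{ 102, 92, true,  true },	/* non-flush tail entry */
	};
	int nr = 3;
	uint64_t last_seq = 0, start_seq = 0, blacklist_seq = 0;
	bool last_write_torn = false;

	for (int i = nr - 1; i >= 0; i--) {
		if (!start_seq)
			blacklist_seq = start_seq = r[i].seq + 1;
		if (r[i].no_flush)
			continue;
		if (!last_write_torn && !r[i].csum_good) {
			last_write_torn = true;
			continue;
		}
		last_seq = r[i].last_seq;
		blacklist_seq = r[i].seq + 1;
		break;
	}

	printf("replay %llu-%llu, drop unflushed %llu-%llu\n",
	       (unsigned long long) last_seq,
	       (unsigned long long) (blacklist_seq - 1),
	       (unsigned long long) blacklist_seq,
	       (unsigned long long) (start_seq - 1));
	return 0;
}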
/* journal write: */
-static void bch2_journal_add_btree_root(struct journal_buf *buf,
- enum btree_id id, struct bkey_i *k,
- unsigned level)
-{
- struct jset_entry *entry;
-
- entry = bch2_journal_add_entry_noreservation(buf, k->k.u64s);
- entry->type = BCH_JSET_ENTRY_btree_root;
- entry->btree_id = id;
- entry->level = level;
- memcpy_u64s(entry->_data, k, k->k.u64s);
-}
-
-static unsigned journal_dev_buckets_available(struct journal *j,
- struct bch_dev *ca)
-{
- struct journal_device *ja = &ca->journal;
- unsigned next = (ja->cur_idx + 1) % ja->nr;
- unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
-
- /*
- * Hack to avoid a deadlock during journal replay:
- * journal replay might require setting a new btree
- * root, which requires writing another journal entry -
- * thus, if the journal is full (and this happens when
- * replaying the first journal bucket's entries) we're
- * screwed.
- *
- * So don't let the journal fill up unless we're in
- * replay:
- */
- if (test_bit(JOURNAL_REPLAY_DONE, &j->flags))
- available = max((int) available - 2, 0);
-
- /*
- * Don't use the last bucket unless writing the new last_seq
- * will make another bucket available:
- */
- if (ja->bucket_seq[ja->last_idx] >= journal_last_seq(j))
- available = max((int) available - 1, 0);
-
- return available;
-}
-
-/* returns number of sectors available for next journal entry: */
-int bch2_journal_entry_sectors(struct journal *j)
+static void __journal_write_alloc(struct journal *j,
+ struct journal_buf *w,
+ struct dev_alloc_list *devs_sorted,
+ unsigned sectors,
+ unsigned *replicas,
+ unsigned replicas_want)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_device *ja;
struct bch_dev *ca;
- struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
- unsigned sectors_available = UINT_MAX;
- unsigned i, nr_online = 0, nr_devs = 0;
-
- lockdep_assert_held(&j->lock);
+ unsigned i;
- rcu_read_lock();
- for_each_member_device_rcu(ca, c, i,
- &c->rw_devs[BCH_DATA_JOURNAL]) {
- struct journal_device *ja = &ca->journal;
- unsigned buckets_required = 0;
+ if (*replicas >= replicas_want)
+ return;
- if (!ja->nr)
+ for (i = 0; i < devs_sorted->nr; i++) {
+ ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
+ if (!ca)
continue;
- sectors_available = min_t(unsigned, sectors_available,
- ca->mi.bucket_size);
+ ja = &ca->journal;
/*
- * Note that we don't allocate the space for a journal entry
- * until we write it out - thus, if we haven't started the write
- * for the previous entry we have to make sure we have space for
- * it too:
+ * Check that we can use this device, and aren't already using
+ * it:
*/
- if (bch2_extent_has_device(e.c, ca->dev_idx)) {
- if (j->prev_buf_sectors > ja->sectors_free)
- buckets_required++;
-
- if (j->prev_buf_sectors + sectors_available >
- ja->sectors_free)
- buckets_required++;
- } else {
- if (j->prev_buf_sectors + sectors_available >
- ca->mi.bucket_size)
- buckets_required++;
-
- buckets_required++;
- }
+ if (!ca->mi.durability ||
+ ca->mi.state != BCH_MEMBER_STATE_rw ||
+ !ja->nr ||
+ bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
+ sectors > ja->sectors_free)
+ continue;
- if (journal_dev_buckets_available(j, ca) >= buckets_required)
- nr_devs++;
- nr_online++;
- }
- rcu_read_unlock();
+ bch2_dev_stripe_increment(ca, &j->wp.stripe);
- if (nr_online < c->opts.metadata_replicas_required)
- return -EROFS;
+ bch2_bkey_append_ptr(&w->key,
+ (struct bch_extent_ptr) {
+ .offset = bucket_to_sector(ca,
+ ja->buckets[ja->cur_idx]) +
+ ca->mi.bucket_size -
+ ja->sectors_free,
+ .dev = ca->dev_idx,
+ });
- if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas))
- return 0;
+ ja->sectors_free -= sectors;
+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
+
+ *replicas += ca->mi.durability;
- return sectors_available;
+ if (*replicas >= replicas_want)
+ break;
+ }
}
/**
- * journal_next_bucket - move on to the next journal bucket if possible
+ * journal_write_alloc - decide where to write next journal entry
+ *
+ * @j: journal object
+ * @w: journal buf (entry to be written)
+ *
+ * Returns: 0 on success, or -EROFS on failure
*/
-static int journal_write_alloc(struct journal *j, struct journal_buf *w,
- unsigned sectors)
+static int journal_write_alloc(struct journal *j, struct journal_buf *w)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bkey_s_extent e;
- struct bch_extent_ptr *ptr;
+ struct bch_devs_mask devs;
struct journal_device *ja;
struct bch_dev *ca;
struct dev_alloc_list devs_sorted;
- unsigned i, replicas, replicas_want =
+ unsigned sectors = vstruct_sectors(w->data, c->block_bits);
+ unsigned target = c->opts.metadata_target ?:
+ c->opts.foreground_target;
+ unsigned i, replicas = 0, replicas_want =
READ_ONCE(c->opts.metadata_replicas);
+ unsigned replicas_need = min_t(unsigned, replicas_want,
+ READ_ONCE(c->opts.metadata_replicas_required));
- spin_lock(&j->lock);
- e = bkey_i_to_s_extent(&j->key);
-
- /*
- * Drop any pointers to devices that have been removed, are no longer
- * empty, or filled up their current journal bucket:
- *
- * Note that a device may have had a small amount of free space (perhaps
- * one sector) that wasn't enough for the smallest possible journal
- * entry - that's why we drop pointers to devices <= current free space,
- * i.e. whichever device was limiting the current journal entry size.
- */
- extent_for_each_ptr_backwards(e, ptr) {
- ca = bch_dev_bkey_exists(c, ptr->dev);
+ rcu_read_lock();
+retry:
+ devs = target_rw_devs(c, BCH_DATA_journal, target);
- if (ca->mi.state != BCH_MEMBER_STATE_RW ||
- ca->journal.sectors_free <= sectors)
- __bch2_extent_drop_ptr(e, ptr);
- else
- ca->journal.sectors_free -= sectors;
- }
+ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
- replicas = bch2_extent_nr_ptrs(e.c);
+ __journal_write_alloc(j, w, &devs_sorted,
+ sectors, &replicas, replicas_want);
- rcu_read_lock();
- devs_sorted = bch2_wp_alloc_list(c, &j->wp,
- &c->rw_devs[BCH_DATA_JOURNAL]);
+ if (replicas >= replicas_want)
+ goto done;
for (i = 0; i < devs_sorted.nr; i++) {
ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
if (!ca)
continue;
- if (!ca->mi.durability)
- continue;
-
ja = &ca->journal;
- if (!ja->nr)
- continue;
- if (replicas >= replicas_want)
- break;
+ if (sectors > ja->sectors_free &&
+ sectors <= ca->mi.bucket_size &&
+ bch2_journal_dev_buckets_available(j, ja,
+ journal_space_discarded)) {
+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
+ ja->sectors_free = ca->mi.bucket_size;
- /*
- * Check that we can use this device, and aren't already using
- * it:
- */
- if (bch2_extent_has_device(e.c, ca->dev_idx) ||
- !journal_dev_buckets_available(j, ca) ||
- sectors > ca->mi.bucket_size)
- continue;
-
- j->wp.next_alloc[ca->dev_idx] += U32_MAX;
- bch2_wp_rescale(c, ca, &j->wp);
-
- ja->sectors_free = ca->mi.bucket_size - sectors;
- ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
- ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
+ /*
+ * ja->bucket_seq[ja->cur_idx] must always have
+ * something sensible:
+ */
+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
+ }
+ }
- extent_ptr_append(bkey_i_to_extent(&j->key),
- (struct bch_extent_ptr) {
- .offset = bucket_to_sector(ca,
- ja->buckets[ja->cur_idx]),
- .dev = ca->dev_idx,
- });
+ __journal_write_alloc(j, w, &devs_sorted,
+ sectors, &replicas, replicas_want);
- replicas += ca->mi.durability;
+ if (replicas < replicas_want && target) {
+ /* Retry from all devices: */
+ target = 0;
+ goto retry;
}
+done:
rcu_read_unlock();
- j->prev_buf_sectors = 0;
-
- bkey_copy(&w->key, &j->key);
- spin_unlock(&j->lock);
-
- if (replicas < c->opts.metadata_replicas_required)
- return -EROFS;
-
- BUG_ON(!replicas);
-
- return 0;
-}
-
-static void journal_write_compact(struct jset *jset)
-{
- struct jset_entry *i, *next, *prev = NULL;
-
- /*
- * Simple compaction, dropping empty jset_entries (from journal
- * reservations that weren't fully used) and merging jset_entries that
- * can be.
- *
- * If we wanted to be really fancy here, we could sort all the keys in
- * the jset and drop keys that were overwritten - probably not worth it:
- */
- vstruct_for_each_safe(jset, i, next) {
- unsigned u64s = le16_to_cpu(i->u64s);
+ BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
- /* Empty entry: */
- if (!u64s)
- continue;
-
- /* Can we merge with previous entry? */
- if (prev &&
- i->btree_id == prev->btree_id &&
- i->level == prev->level &&
- i->type == prev->type &&
- i->type == BCH_JSET_ENTRY_btree_keys &&
- le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
- memmove_u64s_down(vstruct_next(prev),
- i->_data,
- u64s);
- le16_add_cpu(&prev->u64s, u64s);
- continue;
- }
-
- /* Couldn't merge, move i into new position (after prev): */
- prev = prev ? vstruct_next(prev) : jset->start;
- if (i != prev)
- memmove_u64s_down(prev, i, jset_u64s(u64s));
- }
-
- prev = prev ? vstruct_next(prev) : jset->start;
- jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
+ return replicas >= replicas_need ? 0 : -EROFS;
}
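
Device selection above is a two-pass affair: first try the configured metadata (or foreground) target, advancing a device's current bucket when the entry won't fit in what's left; only if that still doesn't reach replicas_want is the target dropped and the search retried against all rw journal devices. The write succeeds as long as replicas_need copies were placed. A small sketch of that target-then-fallback shape (the data and helpers are invented):

#include <stdbool.h>
#include <stdio.h>

struct dev { bool in_target, used; unsigned free_sectors; };

static void try_alloc(struct dev *devs, unsigned nr, bool target_only,
		      unsigned sectors, unsigned *replicas, unsigned want)
{
	for (unsigned i = 0; i < nr && *replicas < want; i++)
		if (!devs[i].used &&
		    (!target_only || devs[i].in_target) &&
		    devs[i].free_sectors >= sectors) {
			devs[i].used = true;
			devs[i].free_sectors -= sectors;
			(*replicas)++;
		}
}

int main(void)
{
	struct dev devs[] = {
		{ .in_target = true,  .free_sectors = 8 },
		{ .in_target = false, .free_sectors = 64 },
	};
	unsigned replicas = 0, want = 2, need = 1, sectors = 16;

	try_alloc(devs, 2, true, sectors, &replicas, want);
	if (replicas < want)			/* retry from all devices */
		try_alloc(devs, 2, false, sectors, &replicas, want);

	printf("%s\n", replicas >= need ? "ok" : "-EROFS");
	return 0;
}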
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
/* we aren't holding j->lock: */
unsigned new_size = READ_ONCE(j->buf_size_want);
void *new_buf;
- if (buf->size >= new_size)
+ if (buf->buf_size >= new_size)
return;
- new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
+ size_t btree_write_buffer_size = new_size / 64;
+
+ if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
+ return;
+
+ new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
if (!new_buf)
return;
- memcpy(new_buf, buf->data, buf->size);
- kvpfree(buf->data, buf->size);
- buf->data = new_buf;
- buf->size = new_size;
+ memcpy(new_buf, buf->data, buf->buf_size);
+
+ spin_lock(&j->lock);
+ swap(buf->data, new_buf);
+ swap(buf->buf_size, new_size);
+ spin_unlock(&j->lock);
+
+ kvfree(new_buf);
}
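
journal_buf_realloc() avoids holding j->lock across the allocation: the new buffer is allocated and filled outside the lock, only the pointer/size swap happens under it, and the old buffer is freed afterwards. A compact pthread version of the same pattern (the lock, buffer names, and sizes are illustrative):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void *buf;
static size_t buf_size = 512;

static void buf_realloc(size_t new_size)
{
	if (buf_size >= new_size)
		return;

	void *new_buf = malloc(new_size);	/* allocate outside the lock */
	if (!new_buf)
		return;
	memcpy(new_buf, buf, buf_size);

	pthread_mutex_lock(&lock);		/* swap under the lock */
	void *tmp = buf;       buf = new_buf;       new_buf = tmp;
	size_t tsz = buf_size; buf_size = new_size; new_size = tsz;
	pthread_mutex_unlock(&lock);

	free(new_buf);				/* old buffer, freed outside */
}

int main(void)
{
	buf = malloc(buf_size);
	buf_realloc(4096);
	printf("buf_size %zu\n", buf_size);
	free(buf);
	return 0;
}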
-static void journal_write_done(struct closure *cl)
+static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
{
- struct journal *j = container_of(cl, struct journal, io);
+ return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
+}
+
+static CLOSURE_CALLBACK(journal_write_done)
+{
+ closure_type(w, struct journal_buf, io);
+ struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *w = journal_prev_buf(j);
- struct bch_devs_list devs =
- bch2_extent_devs(bkey_i_to_s_c_extent(&w->key));
+ struct bch_replicas_padded replicas;
+ union journal_res_state old, new;
u64 seq = le64_to_cpu(w->data->seq);
+ int err = 0;
+
+ bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
+ ? j->flush_write_time
+ : j->noflush_write_time, j->write_start_time);
- if (!devs.nr) {
+ if (!w->devs_written.nr) {
bch_err(c, "unable to write journal to sufficient devices");
- goto err;
+ err = -EIO;
+ } else {
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
+ w->devs_written);
+ if (bch2_mark_replicas(c, &replicas.e))
+ err = -EIO;
}
- if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs))
- goto err;
-out:
- bch2_time_stats_update(j->write_time, j->write_start_time);
+ if (err)
+ bch2_fatal_error(c);
+
+ closure_debug_destroy(cl);
spin_lock(&j->lock);
- j->last_seq_ondisk = seq;
if (seq >= j->pin.front)
- journal_seq_pin(j, seq)->devs = devs;
+ journal_seq_pin(j, seq)->devs = w->devs_written;
+ if (err && (!j->err_seq || seq < j->err_seq))
+ j->err_seq = seq;
+ w->write_done = true;
+
+ bool completed = false;
+
+ for (seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j);
+ seq++) {
+ w = j->buf + (seq & JOURNAL_BUF_MASK);
+ if (!w->write_done)
+ break;
- /*
- * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
- * more buckets:
- *
- * Must come before signaling write completion, for
- * bch2_fs_journal_stop():
- */
- mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
+ if (!j->err_seq && !JSET_NO_FLUSH(w->data)) {
+ j->flushed_seq_ondisk = seq;
+ j->last_seq_ondisk = w->last_seq;
- /* also must come before signalling write completion: */
- closure_debug_destroy(cl);
+ bch2_do_discards(c);
+ closure_wake_up(&c->freelist_wait);
+ bch2_reset_alloc_cursors(c);
+ }
- BUG_ON(!j->reservations.prev_buf_unwritten);
- atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
- &j->reservations.counter);
+ j->seq_ondisk = seq;
- closure_wake_up(&w->wait);
- journal_wake(j);
+ /*
+ * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
+ * more buckets:
+ *
+ * Must come before signaling write completion, for
+ * bch2_fs_journal_stop():
+ */
+ if (j->watermark != BCH_WATERMARK_stripe)
+ journal_reclaim_kick(&c->journal);
+
+ old.v = atomic64_read(&j->reservations.counter);
+ do {
+ new.v = old.v;
+ BUG_ON(journal_state_count(new, new.unwritten_idx));
+ BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));
+
+ new.unwritten_idx++;
+ } while (!atomic64_try_cmpxchg(&j->reservations.counter,
+ &old.v, new.v));
+
+ closure_wake_up(&w->wait);
+ completed = true;
+ }
+
+ if (completed) {
+ bch2_journal_reclaim_fast(j);
+ bch2_journal_space_available(j);
- if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
- mod_delayed_work(system_freezable_wq, &j->write_work, 0);
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);
+
+ journal_wake(j);
+ }
+
+ if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
+ new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
+ struct journal_buf *buf = journal_cur_buf(j);
+ long delta = buf->expires - jiffies;
+
+ /*
+		 * We don't close a journal entry to write it while there are
+		 * previous entries still in flight - the current journal entry
+ * might want to be written now:
+ */
+ mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
+ }
+
+ /*
+	 * We don't typically trigger journal writes from here - the next journal
+ * write will be triggered immediately after the previous one is
+ * allocated, in bch2_journal_write() - but the journal write error path
+ * is special:
+ */
+ bch2_journal_do_writes(j);
spin_unlock(&j->lock);
- return;
-err:
- bch2_fatal_error(c);
- bch2_journal_halt(j);
- goto out;
}
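
Completion above is forced in order: each buffer marks itself write_done, but the unwritten index only advances (via the try_cmpxchg loop) across the contiguous run of finished buffers starting at the oldest unwritten one. A minimal userspace version of that advance using C11 atomics, with the packed reservation state reduced to a single counter (buffer count and names are made up):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_BUFS	4
#define MASK	(NR_BUFS - 1)

static _Atomic unsigned unwritten_idx;
static bool write_done[NR_BUFS];

/* Advance unwritten_idx over every contiguous completed buffer. */
static void advance(unsigned long long last_unwritten_seq,
		    unsigned long long cur_seq)
{
	for (unsigned long long seq = last_unwritten_seq; seq <= cur_seq; seq++) {
		if (!write_done[seq & MASK])
			break;

		unsigned old = atomic_load(&unwritten_idx);
		unsigned new;
		do {
			new = old + 1;
		} while (!atomic_compare_exchange_weak(&unwritten_idx, &old, new));
	}
}

int main(void)
{
	write_done[1] = write_done[2] = true;	/* seqs 1 and 2 finished */
	atomic_store(&unwritten_idx, 1);
	advance(1, 3);
	printf("unwritten_idx now %u\n", atomic_load(&unwritten_idx));
	return 0;
}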
static void journal_write_endio(struct bio *bio)
{
- struct bch_dev *ca = bio->bi_private;
+ struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
+ struct bch_dev *ca = jbio->ca;
struct journal *j = &ca->fs->journal;
+ struct journal_buf *w = j->buf + jbio->buf_idx;
- if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") ||
+ if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ "error writing journal entry %llu: %s",
+ le64_to_cpu(w->data->seq),
+ bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("journal")) {
- struct journal_buf *w = journal_prev_buf(j);
unsigned long flags;
spin_lock_irqsave(&j->err_lock, flags);
- bch2_extent_drop_device(bkey_i_to_s_extent(&w->key), ca->dev_idx);
+ bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
spin_unlock_irqrestore(&j->err_lock, flags);
}
- closure_put(&j->io);
+ closure_put(&w->io);
percpu_ref_put(&ca->io_ref);
}
-void bch2_journal_write(struct closure *cl)
+static CLOSURE_CALLBACK(journal_write_submit)
{
- struct journal *j = container_of(cl, struct journal, io);
+ closure_type(w, struct journal_buf, io);
+ struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- struct journal_buf *w = journal_prev_buf(j);
- struct jset *jset;
- struct bio *bio;
- struct bch_extent_ptr *ptr;
- unsigned i, sectors, bytes;
+ unsigned sectors = vstruct_sectors(w->data, c->block_bits);
- journal_buf_realloc(j, w);
- jset = w->data;
+ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
+ if (!ca) {
+ /* XXX: fix this */
+ bch_err(c, "missing device for journal write\n");
+ continue;
+ }
- j->write_start_time = local_clock();
- mutex_lock(&c->btree_root_lock);
- for (i = 0; i < BTREE_ID_NR; i++) {
- struct btree_root *r = &c->btree_roots[i];
+ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
+ sectors);
+
+ struct journal_device *ja = &ca->journal;
+ struct bio *bio = &ja->bio[w->idx]->bio;
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
+ bio->bi_iter.bi_sector = ptr->offset;
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+
+ BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
+ ca->prev_journal_sector = bio->bi_iter.bi_sector;
+
+ if (!JSET_NO_FLUSH(w->data))
+ bio->bi_opf |= REQ_FUA;
+ if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
+ bio->bi_opf |= REQ_PREFLUSH;
+
+ bch2_bio_map(bio, w->data, sectors << 9);
+
+ trace_and_count(c, journal_write, bio);
+ closure_bio_submit(bio, cl);
+
+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
+ }
+
+ continue_at(cl, journal_write_done, j->wq);
+}
+
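
As a reference for the cache-flush handling in journal_write_submit() above: noflush entries get neither flag, flush entries always get FUA, and PREFLUSH is piggybacked on the data bio only when a separate flush pass isn't used. A minimal standalone sketch of that decision, using placeholder flag values and a hypothetical helper name rather than the kernel's REQ_* constants:

    /* Placeholder flags; the real code sets REQ_FUA / REQ_PREFLUSH on the bio. */
    #include <stdbool.h>
    #include <stdio.h>

    #define X_FUA      (1u << 0)
    #define X_PREFLUSH (1u << 1)

    static unsigned journal_bio_flags(bool no_flush, bool separate_flush)
    {
            unsigned opf = 0;

            if (!no_flush)
                    opf |= X_FUA;           /* flush writes must be durable */
            if (!no_flush && !separate_flush)
                    opf |= X_PREFLUSH;      /* piggyback the flush on the data bio */
            return opf;
    }

    int main(void)
    {
            printf("noflush:       %u\n", journal_bio_flags(true, false));  /* 0 */
            printf("flush, 1 dev:  %u\n", journal_bio_flags(false, false)); /* FUA|PREFLUSH = 3 */
            printf("flush, N devs: %u\n", journal_bio_flags(false, true));  /* FUA only = 1 */
            return 0;
    }
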
+static CLOSURE_CALLBACK(journal_write_preflush)
+{
+ closure_type(w, struct journal_buf, io);
+ struct journal *j = container_of(w, struct journal, buf[w->idx]);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
+ if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
+ spin_lock(&j->lock);
+ if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
+ closure_wait(&j->async_wait, cl);
+ spin_unlock(&j->lock);
+ continue_at(cl, journal_write_preflush, j->wq);
+ return;
+ }
+ spin_unlock(&j->lock);
+ }
+
+ if (w->separate_flush) {
+ for_each_rw_member(c, ca) {
+ percpu_ref_get(&ca->io_ref);
+
+ struct journal_device *ja = &ca->journal;
+ struct bio *bio = &ja->bio[w->idx]->bio;
+ bio_reset(bio, ca->disk_sb.bdev,
+ REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+ closure_bio_submit(bio, cl);
+ }
- if (r->alive)
- bch2_journal_add_btree_root(w, i, &r->key, r->level);
+ continue_at(cl, journal_write_submit, j->wq);
+ } else {
+ /*
+ * no need to punt to another work item if we're not waiting on
+ * preflushes
+ */
+ journal_write_submit(&cl->work);
+ }
+}
+
+static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct jset_entry *start, *end;
+ struct jset *jset = w->data;
+ struct journal_keys_to_wb wb = { NULL };
+ unsigned sectors, bytes, u64s;
+ unsigned long btree_roots_have = 0;
+ bool validate_before_checksum = false;
+ u64 seq = le64_to_cpu(jset->seq);
+ int ret;
+
+ /*
+ * Simple compaction, dropping empty jset_entries (from journal
+ * reservations that weren't fully used) and merging jset_entries that
+ * can be.
+ *
+ * If we wanted to be really fancy here, we could sort all the keys in
+ * the jset and drop keys that were overwritten - probably not worth it:
+ */
+ vstruct_for_each(jset, i) {
+ unsigned u64s = le16_to_cpu(i->u64s);
+
+ /* Empty entry: */
+ if (!u64s)
+ continue;
+
+ /*
+ * New btree roots are set by journalling them; when the journal
+ * entry gets written we have to propagate them to
+ * c->btree_roots
+ *
+ * But, every journal entry we write has to contain all the
+ * btree roots (at least for now); so after we copy btree roots
+ * to c->btree_roots we have to get any missing btree roots and
+ * add them to this journal entry:
+ */
+ switch (i->type) {
+ case BCH_JSET_ENTRY_btree_root:
+ bch2_journal_entry_to_btree_root(c, i);
+ __set_bit(i->btree_id, &btree_roots_have);
+ break;
+ case BCH_JSET_ENTRY_write_buffer_keys:
+ EBUG_ON(!w->need_flush_to_write_buffer);
+
+ if (!wb.wb)
+ bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
+
+ jset_entry_for_each_key(i, k) {
+ ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
+ if (ret) {
+ bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s",
+ bch2_err_str(ret));
+ bch2_journal_keys_to_write_buffer_end(c, &wb);
+ return ret;
+ }
+ }
+ i->type = BCH_JSET_ENTRY_btree_keys;
+ break;
+ }
+ }
+
+ if (wb.wb) {
+ ret = bch2_journal_keys_to_write_buffer_end(c, &wb);
+ if (ret) {
+ bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s",
+ bch2_err_str(ret));
+ return ret;
+ }
}
- c->btree_roots_dirty = false;
- mutex_unlock(&c->btree_root_lock);
- journal_write_compact(jset);
+ spin_lock(&c->journal.lock);
+ w->need_flush_to_write_buffer = false;
+ spin_unlock(&c->journal.lock);
+
+ start = end = vstruct_last(jset);
+
+ end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
+
+ struct jset_entry_datetime *d =
+ container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry);
+ d->entry.type = BCH_JSET_ENTRY_datetime;
+ d->seconds = cpu_to_le64(ktime_get_real_seconds());
+
+ bch2_journal_super_entries_add_common(c, &end, seq);
+ u64s = (u64 *) end - (u64 *) start;
+
+ WARN_ON(u64s > j->entry_u64s_reserved);
+
+ le32_add_cpu(&jset->u64s, u64s);
+
+ sectors = vstruct_sectors(jset, c->block_bits);
+ bytes = vstruct_bytes(jset);
+
+ if (sectors > w->sectors) {
+ bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
+ vstruct_bytes(jset), w->sectors << 9,
+ u64s, w->u64s_reserved, j->entry_u64s_reserved);
+ return -EINVAL;
+ }
- jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
- jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
jset->magic = cpu_to_le64(jset_magic(c));
- jset->version = cpu_to_le32(BCACHE_JSET_VERSION);
+ jset->version = cpu_to_le32(c->sb.version);
SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
- if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
- jset_validate_entries(c, jset, WRITE))
- goto err;
+ if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
+ j->last_empty_seq = seq;
- bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
+ if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
+ validate_before_checksum = true;
+
+ if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
+ validate_before_checksum = true;
+
+ if (validate_before_checksum &&
+ (ret = jset_validate(c, NULL, jset, 0, WRITE)))
+ return ret;
+
+ ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start);
+ if (bch2_fs_fatal_err_on(ret, c, "encrypting journal entry: %s", bch2_err_str(ret)))
+ return ret;
jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
journal_nonce(jset), jset);
- if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
- jset_validate_entries(c, jset, WRITE))
- goto err;
-
- sectors = vstruct_sectors(jset, c->block_bits);
- BUG_ON(sectors > j->prev_buf_sectors);
+ if (!validate_before_checksum &&
+ (ret = jset_validate(c, NULL, jset, 0, WRITE)))
+ return ret;
- bytes = vstruct_bytes(w->data);
- memset((void *) w->data + bytes, 0, (sectors << 9) - bytes);
+ memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
+ return 0;
+}
- if (journal_write_alloc(j, w, sectors)) {
- bch2_journal_halt(j);
- bch_err(c, "Unable to allocate journal write");
- bch2_fatal_error(c);
- continue_at(cl, journal_write_done, system_highpri_wq);
- return;
- }
+static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ int error = bch2_journal_error(j);
/*
- * XXX: we really should just disable the entire journal in nochanges
- * mode
+ * If the journal is in an error state - we did an emergency shutdown -
+ * we prefer to continue doing journal writes. We just mark them as
+ * noflush so they'll never be used, but they'll still be visible to the
+ * list_journal tool - this helps in debugging.
+ *
+ * There's a caveat: the first journal write after marking the
+ * superblock dirty must always be a flush write, because on startup
+ * from a clean shutdown we didn't necessarily read the journal and the
+ * new journal write might overwrite whatever was in the journal
+ * previously - we can't leave the journal without any flush writes in
+ * it.
+ *
+ * So if we're in an error state, and we're still starting up, we don't
+ * write anything at all.
*/
- if (c->opts.nochanges)
- goto no_io;
+ if (error && test_bit(JOURNAL_need_flush_write, &j->flags))
+ return -EIO;
+
+ if (error ||
+ w->noflush ||
+ (!w->must_flush &&
+ time_before(jiffies, j->last_flush_write +
+ msecs_to_jiffies(c->opts.journal_flush_delay)) &&
+ test_bit(JOURNAL_may_skip_flush, &j->flags))) {
+ w->noflush = true;
+ SET_JSET_NO_FLUSH(w->data, true);
+ w->data->last_seq = 0;
+ w->last_seq = 0;
+
+ j->nr_noflush_writes++;
+ } else {
+ w->must_flush = true;
+ j->last_flush_write = jiffies;
+ j->nr_flush_writes++;
+ clear_bit(JOURNAL_need_flush_write, &j->flags);
+ }
- extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
- ca = bch_dev_bkey_exists(c, ptr->dev);
- if (!percpu_ref_tryget(&ca->io_ref)) {
- /* XXX: fix this */
- bch_err(c, "missing device for journal write\n");
- continue;
- }
+ return 0;
+}
- this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_JOURNAL],
- sectors);
+CLOSURE_CALLBACK(bch2_journal_write)
+{
+ closure_type(w, struct journal_buf, io);
+ struct journal *j = container_of(w, struct journal, buf[w->idx]);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_replicas_padded replicas;
+ unsigned nr_rw_members = 0;
+ int ret;
- bio = ca->journal.bio;
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
- bio->bi_iter.bi_sector = ptr->offset;
- bio->bi_iter.bi_size = sectors << 9;
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- bio_set_op_attrs(bio, REQ_OP_WRITE,
- REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
- bch2_bio_map(bio, jset);
+ for_each_rw_member(c, ca)
+ nr_rw_members++;
- trace_journal_write(bio);
- closure_bio_submit(bio, cl);
+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+ BUG_ON(!w->write_started);
+ BUG_ON(w->write_allocated);
+ BUG_ON(w->write_done);
+
+ j->write_start_time = local_clock();
+
+ spin_lock(&j->lock);
+ if (nr_rw_members > 1)
+ w->separate_flush = true;
+
+ ret = bch2_journal_write_pick_flush(j, w);
+ spin_unlock(&j->lock);
+ if (ret)
+ goto err;
- ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq);
+ mutex_lock(&j->buf_lock);
+ journal_buf_realloc(j, w);
+
+ ret = bch2_journal_write_prep(j, w);
+ mutex_unlock(&j->buf_lock);
+ if (ret)
+ goto err;
+
+ j->entry_bytes_written += vstruct_bytes(w->data);
+
+ while (1) {
+ spin_lock(&j->lock);
+ ret = journal_write_alloc(j, w);
+ if (!ret || !j->can_discard)
+ break;
+
+ spin_unlock(&j->lock);
+ bch2_journal_do_discards(j);
}
- for_each_rw_member(ca, c, i)
- if (journal_flushes_device(ca) &&
- !bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), i)) {
- percpu_ref_get(&ca->io_ref);
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+
+ prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu: %s"),
+ le64_to_cpu(w->data->seq),
+ bch2_err_str(ret));
+ __bch2_journal_debug_to_text(&buf, j);
+ spin_unlock(&j->lock);
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ goto err;
+ }
- bio = ca->journal.bio;
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
- bio->bi_opf = REQ_OP_FLUSH;
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- closure_bio_submit(bio, cl);
- }
+ /*
+ * write is allocated, no longer need to account for it in
+ * bch2_journal_space_available():
+ */
+ w->sectors = 0;
+ w->write_allocated = true;
-no_io:
- extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr)
- ptr->offset += sectors;
+ /*
+ * journal entry has been compacted and allocated, recalculate space
+ * available:
+ */
+ bch2_journal_space_available(j);
+ bch2_journal_do_writes(j);
+ spin_unlock(&j->lock);
+
+ w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
+
+ if (c->opts.nochanges)
+ goto no_io;
- continue_at(cl, journal_write_done, system_highpri_wq);
+ /*
+ * Mark journal replicas before we submit the write to guarantee
+ * recovery will find the journal entries after a crash.
+ */
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
+ w->devs_written);
+ ret = bch2_mark_replicas(c, &replicas.e);
+ if (ret)
+ goto err;
+
+ if (!JSET_NO_FLUSH(w->data))
+ continue_at(cl, journal_write_preflush, j->wq);
+ else
+ continue_at(cl, journal_write_submit, j->wq);
+ return;
+no_io:
+ continue_at(cl, journal_write_done, j->wq);
return;
err:
- bch2_inconsistent_error(c);
- continue_at(cl, journal_write_done, system_highpri_wq);
+ bch2_fatal_error(c);
+ continue_at(cl, journal_write_done, j->wq);
}
diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h
index e303df92..2ca9cde3 100644
--- a/libbcachefs/journal_io.h
+++ b/libbcachefs/journal_io.h
@@ -1,17 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_JOURNAL_IO_H
#define _BCACHEFS_JOURNAL_IO_H
+#include "darray.h"
+
+void bch2_journal_pos_from_member_info_set(struct bch_fs *);
+void bch2_journal_pos_from_member_info_resume(struct bch_fs *);
+
+struct journal_ptr {
+ bool csum_good;
+ u8 dev;
+ u32 bucket;
+ u32 bucket_offset;
+ u64 sector;
+};
+
/*
* Only used for holding the journal entries we read in btree_journal_read()
* during cache_registration
*/
struct journal_replay {
- struct list_head list;
- struct bch_devs_list devs;
+ DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs;
+
+ bool csum_good;
+ bool ignore_blacklisted;
+ bool ignore_not_dirty;
/* must be last: */
struct jset j;
};
+static inline bool journal_replay_ignore(struct journal_replay *i)
+{
+ return !i || i->ignore_blacklisted || i->ignore_not_dirty;
+}
+
static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
struct jset_entry *entry, unsigned type)
{
@@ -26,18 +48,46 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
}
#define for_each_jset_entry_type(entry, jset, type) \
- for (entry = (jset)->start; \
+ for (struct jset_entry *entry = (jset)->start; \
(entry = __jset_entry_type_next(jset, entry, type)); \
entry = vstruct_next(entry))
-#define for_each_jset_key(k, _n, entry, jset) \
- for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \
- vstruct_for_each_safe(entry, k, _n)
+#define jset_entry_for_each_key(_e, _k) \
+ for (struct bkey_i *_k = (_e)->start; \
+ _k < vstruct_last(_e); \
+ _k = bkey_next(_k))
-int bch2_journal_set_seq(struct bch_fs *c, u64, u64);
-int bch2_journal_read(struct bch_fs *, struct list_head *);
+#define for_each_jset_key(k, entry, jset) \
+ for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)\
+ jset_entry_for_each_key(entry, k)
-int bch2_journal_entry_sectors(struct journal *);
-void bch2_journal_write(struct closure *);
+int bch2_journal_entry_validate(struct bch_fs *, struct jset *,
+ struct jset_entry *, unsigned, int,
+ enum bch_validate_flags);
+void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
+ struct jset_entry *);
+
+void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
+ struct journal_replay *);
+
+int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
+
+CLOSURE_CALLBACK(bch2_journal_write);
+
+static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
+{
+ struct jset_entry *entry = *end;
+ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
+
+ memset(entry, 0, u64s * sizeof(u64));
+ /*
+ * The u64s field counts from the start of data, ignoring the shared
+ * fields.
+ */
+ entry->u64s = cpu_to_le16(u64s - 1);
+
+ *end = vstruct_next(*end);
+ return entry;
+}
#endif /* _BCACHEFS_JOURNAL_IO_H */
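
For reference, the sizing arithmetic in jset_entry_init() can be checked in isolation; the sketch below is not kernel code and uses illustrative sizes (an 8-byte shared header plus an 8-byte payload, roughly a datetime entry):

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
            unsigned header  = 8;   /* shared jset_entry header, one u64 */
            unsigned payload = 8;   /* e.g. the le64 seconds field */
            unsigned size    = header + payload;

            unsigned u64s = DIV_ROUND_UP(size, 8);  /* 2: total u64s zeroed */
            unsigned entry_u64s = u64s - 1;         /* 1: stored in entry->u64s;
                                                       the header u64 isn't counted */

            printf("u64s %u, entry->u64s %u\n", u64s, entry_u64s);
            return 0;
    }
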
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index 394b72bb..1aabbbe3 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -1,112 +1,480 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "errcode.h"
+#include "error.h"
#include "journal.h"
+#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
-#include "super.h"
+#include "sb-members.h"
+#include "trace.h"
-/*
- * Journal entry pinning - machinery for holding a reference on a given journal
- * entry, holding it open to ensure it gets replayed during recovery:
- */
+#include <linux/kthread.h>
+#include <linux/sched/mm.h>
+
+/* Free space calculations: */
+
+static unsigned journal_space_from(struct journal_device *ja,
+ enum journal_space_from from)
+{
+ switch (from) {
+ case journal_space_discarded:
+ return ja->discard_idx;
+ case journal_space_clean_ondisk:
+ return ja->dirty_idx_ondisk;
+ case journal_space_clean:
+ return ja->dirty_idx;
+ default:
+ BUG();
+ }
+}
-static inline u64 journal_pin_seq(struct journal *j,
- struct journal_entry_pin_list *pin_list)
+unsigned bch2_journal_dev_buckets_available(struct journal *j,
+ struct journal_device *ja,
+ enum journal_space_from from)
{
- return fifo_entry_idx_abs(&j->pin, pin_list);
+ if (!ja->nr)
+ return 0;
+
+ unsigned available = (journal_space_from(ja, from) -
+ ja->cur_idx - 1 + ja->nr) % ja->nr;
+
+ /*
+ * Don't use the last bucket unless writing the new last_seq
+ * will make another bucket available:
+ */
+ if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
+ --available;
+
+ return available;
+}
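
The modular arithmetic in bch2_journal_dev_buckets_available() is easier to see with concrete numbers; a standalone sketch, with the ja->dirty_idx_ondisk == ja->dirty_idx reservation reduced to a boolean parameter:

    #include <stdbool.h>
    #include <stdio.h>

    static unsigned buckets_available(unsigned nr, unsigned cur_idx,
                                      unsigned from_idx, bool reserve_last)
    {
            unsigned available = (from_idx - cur_idx - 1 + nr) % nr;

            /* mirrors: if (available && ja->dirty_idx_ondisk == ja->dirty_idx) */
            if (available && reserve_last)
                    --available;
            return available;
    }

    int main(void)
    {
            /* 8 buckets, writing into bucket 5, discards done up to bucket 2 */
            printf("%u\n", buckets_available(8, 5, 2, false)); /* 4 */
            printf("%u\n", buckets_available(8, 5, 2, true));  /* 3 */
            return 0;
    }
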
+
+void bch2_journal_set_watermark(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ bool low_on_space = j->space[journal_space_clean].total * 4 <=
+ j->space[journal_space_total].total;
+ bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4;
+ bool low_on_wb = bch2_btree_write_buffer_must_wait(c);
+ unsigned watermark = low_on_space || low_on_pin || low_on_wb
+ ? BCH_WATERMARK_reclaim
+ : BCH_WATERMARK_stripe;
+
+ if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) ||
+ track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) ||
+ track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
+ trace_and_count(c, journal_full, c);
+
+ mod_bit(JOURNAL_space_low, &j->flags, low_on_space || low_on_pin);
+
+ swap(watermark, j->watermark);
+ if (watermark > j->watermark)
+ journal_wake(j);
+}
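
The watermark thresholds above amount to: switch to BCH_WATERMARK_reclaim once clean space falls to 25% of total or less, or less than a quarter of the pin fifo is free (the write-buffer condition is omitted here). A standalone sketch with illustrative numbers:

    #include <stdbool.h>
    #include <stdio.h>

    static bool needs_reclaim(unsigned clean, unsigned total,
                              unsigned pin_free, unsigned pin_size)
    {
            bool low_on_space = clean * 4 <= total;
            bool low_on_pin   = pin_free < pin_size / 4;

            return low_on_space || low_on_pin;
    }

    int main(void)
    {
            printf("%d\n", needs_reclaim(512, 2048, 100, 128)); /* 1: clean == 25% */
            printf("%d\n", needs_reclaim(1024, 2048, 64, 128)); /* 0 */
            return 0;
    }
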
+
+static struct journal_space
+journal_dev_space_available(struct journal *j, struct bch_dev *ca,
+ enum journal_space_from from)
+{
+ struct journal_device *ja = &ca->journal;
+ unsigned sectors, buckets, unwritten;
+ u64 seq;
+
+ if (from == journal_space_total)
+ return (struct journal_space) {
+ .next_entry = ca->mi.bucket_size,
+ .total = ca->mi.bucket_size * ja->nr,
+ };
+
+ buckets = bch2_journal_dev_buckets_available(j, ja, from);
+ sectors = ja->sectors_free;
+
+ /*
+ * Note that we don't allocate the space for a journal entry
+ * until we write it out - thus, account for it here:
+ */
+ for (seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j);
+ seq++) {
+ unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors;
+
+ if (!unwritten)
+ continue;
+
+ /* entry won't fit on this device, skip: */
+ if (unwritten > ca->mi.bucket_size)
+ continue;
+
+ if (unwritten >= sectors) {
+ if (!buckets) {
+ sectors = 0;
+ break;
+ }
+
+ buckets--;
+ sectors = ca->mi.bucket_size;
+ }
+
+ sectors -= unwritten;
+ }
+
+ if (sectors < ca->mi.bucket_size && buckets) {
+ buckets--;
+ sectors = ca->mi.bucket_size;
+ }
+
+ return (struct journal_space) {
+ .next_entry = sectors,
+ .total = sectors + buckets * ca->mi.bucket_size,
+ };
}
-u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin)
+static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want,
+ enum journal_space_from from)
{
- u64 ret = 0;
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ unsigned pos, nr_devs = 0;
+ struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];
+
+ BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
+
+ rcu_read_lock();
+ for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
+ if (!ca->journal.nr)
+ continue;
+
+ space = journal_dev_space_available(j, ca, from);
+ if (!space.next_entry)
+ continue;
+
+ for (pos = 0; pos < nr_devs; pos++)
+ if (space.total > dev_space[pos].total)
+ break;
+
+ array_insert_item(dev_space, nr_devs, pos, space);
+ }
+ rcu_read_unlock();
+
+ if (nr_devs < nr_devs_want)
+ return (struct journal_space) { 0, 0 };
+
+ /*
+ * We sorted largest to smallest, and we want the smallest out of the
+ * @nr_devs_want largest devices:
+ */
+ return dev_space[nr_devs_want - 1];
+}
+
+void bch2_journal_space_available(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ unsigned clean, clean_ondisk, total;
+ unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
+ j->buf[1].buf_size >> 9);
+ unsigned nr_online = 0, nr_devs_want;
+ bool can_discard = false;
+ int ret = 0;
+
+ lockdep_assert_held(&j->lock);
+
+ rcu_read_lock();
+ for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
+ struct journal_device *ja = &ca->journal;
+
+ if (!ja->nr)
+ continue;
+
+ while (ja->dirty_idx != ja->cur_idx &&
+ ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
+ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
+
+ while (ja->dirty_idx_ondisk != ja->dirty_idx &&
+ ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
+ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
+
+ if (ja->discard_idx != ja->dirty_idx_ondisk)
+ can_discard = true;
+
+ max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
+ nr_online++;
+ }
+ rcu_read_unlock();
+
+ j->can_discard = can_discard;
+
+ if (nr_online < metadata_replicas_required(c)) {
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+ prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
+ "rw journal devs:", nr_online, metadata_replicas_required(c));
+
+ rcu_read_lock();
+ for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
+ prt_printf(&buf, " %s", ca->name);
+ rcu_read_unlock();
+
+ bch_err(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ ret = JOURNAL_ERR_insufficient_devices;
+ goto out;
+ }
+
+ nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
+
+ for (unsigned i = 0; i < journal_space_nr; i++)
+ j->space[i] = __journal_space_available(j, nr_devs_want, i);
+
+ clean_ondisk = j->space[journal_space_clean_ondisk].total;
+ clean = j->space[journal_space_clean].total;
+ total = j->space[journal_space_total].total;
+
+ if (!j->space[journal_space_discarded].next_entry)
+ ret = JOURNAL_ERR_journal_full;
+
+ if ((j->space[journal_space_clean_ondisk].next_entry <
+ j->space[journal_space_clean_ondisk].total) &&
+ (clean - clean_ondisk <= total / 8) &&
+ (clean_ondisk * 2 > clean))
+ set_bit(JOURNAL_may_skip_flush, &j->flags);
+ else
+ clear_bit(JOURNAL_may_skip_flush, &j->flags);
+
+ bch2_journal_set_watermark(j);
+out:
+ j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
+ j->cur_entry_error = ret;
+
+ if (!ret)
+ journal_wake(j);
+}
+
+/* Discards - last part of journal reclaim: */
+
+static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+{
+ bool ret;
spin_lock(&j->lock);
- if (journal_pin_active(pin))
- ret = journal_pin_seq(j, pin->pin_list);
+ ret = ja->discard_idx != ja->dirty_idx_ondisk;
spin_unlock(&j->lock);
return ret;
}
-static inline void __journal_pin_add(struct journal *j,
- struct journal_entry_pin_list *pin_list,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
+/*
+ * Advance ja->discard_idx as long as it points to buckets that are no longer
+ * dirty, issuing discards if necessary:
+ */
+void bch2_journal_do_discards(struct journal *j)
{
- BUG_ON(journal_pin_active(pin));
- BUG_ON(!atomic_read(&pin_list->count));
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
- atomic_inc(&pin_list->count);
- pin->pin_list = pin_list;
- pin->flush = flush_fn;
+ mutex_lock(&j->discard_lock);
- if (flush_fn)
- list_add(&pin->list, &pin_list->list);
- else
- INIT_LIST_HEAD(&pin->list);
+ for_each_rw_member(c, ca) {
+ struct journal_device *ja = &ca->journal;
+
+ while (should_discard_bucket(j, ja)) {
+ if (!c->opts.nochanges &&
+ ca->mi.discard &&
+ bdev_max_discard_sectors(ca->disk_sb.bdev))
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ bucket_to_sector(ca,
+ ja->buckets[ja->discard_idx]),
+ ca->mi.bucket_size, GFP_NOFS);
+
+ spin_lock(&j->lock);
+ ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
+
+ bch2_journal_space_available(j);
+ spin_unlock(&j->lock);
+ }
+ }
+
+ mutex_unlock(&j->discard_lock);
+}
+
+/*
+ * Journal entry pinning - machinery for holding a reference on a given journal
+ * entry, holding it open to ensure it gets replayed during recovery:
+ */
+
+void bch2_journal_reclaim_fast(struct journal *j)
+{
+ bool popped = false;
+
+ lockdep_assert_held(&j->lock);
/*
- * If the journal is currently full, we might want to call flush_fn
- * immediately:
+ * Unpin journal entries whose reference counts reached zero, meaning
+ * all btree nodes got written out
*/
- journal_wake(j);
+ while (!fifo_empty(&j->pin) &&
+ j->pin.front <= j->seq_ondisk &&
+ !atomic_read(&fifo_peek_front(&j->pin).count)) {
+ j->pin.front++;
+ popped = true;
+ }
+
+ if (popped)
+ bch2_journal_space_available(j);
}
-void bch2_journal_pin_add(struct journal *j, u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
+bool __bch2_journal_pin_put(struct journal *j, u64 seq)
{
- spin_lock(&j->lock);
- __journal_pin_add(j, journal_seq_pin(j, seq), pin, flush_fn);
- spin_unlock(&j->lock);
+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+ return atomic_dec_and_test(&pin_list->count);
+}
+
+void bch2_journal_pin_put(struct journal *j, u64 seq)
+{
+ if (__bch2_journal_pin_put(j, seq)) {
+ spin_lock(&j->lock);
+ bch2_journal_reclaim_fast(j);
+ spin_unlock(&j->lock);
+ }
}
-static inline void __journal_pin_drop(struct journal *j,
+static inline bool __journal_pin_drop(struct journal *j,
struct journal_entry_pin *pin)
{
- struct journal_entry_pin_list *pin_list = pin->pin_list;
+ struct journal_entry_pin_list *pin_list;
if (!journal_pin_active(pin))
- return;
+ return false;
+
+ if (j->flush_in_progress == pin)
+ j->flush_in_progress_dropped = true;
- pin->pin_list = NULL;
+ pin_list = journal_seq_pin(j, pin->seq);
+ pin->seq = 0;
list_del_init(&pin->list);
/*
- * Unpinning a journal entry make make journal_next_bucket() succeed, if
+ * Unpinning a journal entry may make journal_next_bucket() succeed, if
* writing a new last_seq will now make another bucket available:
*/
- if (atomic_dec_and_test(&pin_list->count) &&
- pin_list == &fifo_peek_front(&j->pin))
- bch2_journal_reclaim_fast(j);
+ return atomic_dec_and_test(&pin_list->count) &&
+ pin_list == &fifo_peek_front(&j->pin);
}
void bch2_journal_pin_drop(struct journal *j,
- struct journal_entry_pin *pin)
+ struct journal_entry_pin *pin)
{
spin_lock(&j->lock);
- __journal_pin_drop(j, pin);
+ if (__journal_pin_drop(j, pin))
+ bch2_journal_reclaim_fast(j);
spin_unlock(&j->lock);
}
-void bch2_journal_pin_add_if_older(struct journal *j,
- struct journal_entry_pin *src_pin,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
+static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
+{
+ if (fn == bch2_btree_node_flush0 ||
+ fn == bch2_btree_node_flush1)
+ return JOURNAL_PIN_btree;
+ else if (fn == bch2_btree_key_cache_journal_flush)
+ return JOURNAL_PIN_key_cache;
+ else
+ return JOURNAL_PIN_other;
+}
+
+static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn,
+ enum journal_pin_type type)
+{
+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+ /*
+ * flush_fn is how we identify journal pins in debugfs, so must always
+ * exist, even if it doesn't do anything:
+ */
+ BUG_ON(!flush_fn);
+
+ atomic_inc(&pin_list->count);
+ pin->seq = seq;
+ pin->flush = flush_fn;
+ list_add(&pin->list, &pin_list->list[type]);
+}
+
+void bch2_journal_pin_copy(struct journal *j,
+ struct journal_entry_pin *dst,
+ struct journal_entry_pin *src,
+ journal_pin_flush_fn flush_fn)
{
spin_lock(&j->lock);
- if (journal_pin_active(src_pin) &&
- (!journal_pin_active(pin) ||
- journal_pin_seq(j, src_pin->pin_list) <
- journal_pin_seq(j, pin->pin_list))) {
- __journal_pin_drop(j, pin);
- __journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
+ u64 seq = READ_ONCE(src->seq);
+
+ if (seq < journal_last_seq(j)) {
+ /*
+ * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
+ * the src pin - with the pin dropped, the entry to pin might no
+ * longer exist, but that means there's no longer anything to
+ * copy and we can bail out here:
+ */
+ spin_unlock(&j->lock);
+ return;
}
+ bool reclaim = __journal_pin_drop(j, dst);
+
+ bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
+
+ if (reclaim)
+ bch2_journal_reclaim_fast(j);
+
+ /*
+ * If the journal is currently full, we might want to call flush_fn
+ * immediately:
+ */
+ if (seq == journal_last_seq(j))
+ journal_wake(j);
+ spin_unlock(&j->lock);
+}
+
+void bch2_journal_pin_set(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ spin_lock(&j->lock);
+
+ BUG_ON(seq < journal_last_seq(j));
+
+ bool reclaim = __journal_pin_drop(j, pin);
+
+ bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
+
+ if (reclaim)
+ bch2_journal_reclaim_fast(j);
+ /*
+ * If the journal is currently full, we might want to call flush_fn
+ * immediately:
+ */
+ if (seq == journal_last_seq(j))
+ journal_wake(j);
+
spin_unlock(&j->lock);
}
+/**
+ * bch2_journal_pin_flush: ensure journal pin callback is no longer running
+ * @j: journal object
+ * @pin: pin to flush
+ */
+void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
+{
+ BUG_ON(journal_pin_active(pin));
+
+ wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
+}
+
/*
* Journal reclaim: flush references to open journal entries to reclaim space in
* the journal
@@ -116,88 +484,147 @@ void bch2_journal_pin_add_if_older(struct journal *j,
* data off of a specific device:
*/
-/**
- * bch2_journal_reclaim_fast - do the fast part of journal reclaim
- *
- * Called from IO submission context, does not block. Cleans up after btree
- * write completions by advancing the journal pin and each cache's last_idx,
- * kicking off discards and background reclaim as necessary.
- */
-void bch2_journal_reclaim_fast(struct journal *j)
+static struct journal_entry_pin *
+journal_get_next_pin(struct journal *j,
+ u64 seq_to_flush,
+ unsigned allowed_below_seq,
+ unsigned allowed_above_seq,
+ u64 *seq)
{
- struct journal_entry_pin_list temp;
- bool popped = false;
+ struct journal_entry_pin_list *pin_list;
+ struct journal_entry_pin *ret = NULL;
+ unsigned i;
- lockdep_assert_held(&j->lock);
+ fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
+ if (*seq > seq_to_flush && !allowed_above_seq)
+ break;
- /*
- * Unpin journal entries whose reference counts reached zero, meaning
- * all btree nodes got written out
- */
- while (!atomic_read(&fifo_peek_front(&j->pin).count)) {
- BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
- BUG_ON(!fifo_pop(&j->pin, temp));
- popped = true;
+ for (i = 0; i < JOURNAL_PIN_NR; i++)
+ if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) ||
+ ((1U << i) & allowed_above_seq)) {
+ ret = list_first_entry_or_null(&pin_list->list[i],
+ struct journal_entry_pin, list);
+ if (ret)
+ return ret;
+ }
}
- if (popped)
- journal_wake(j);
+ return NULL;
}
-static struct journal_entry_pin *
-__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
+/* returns true if we did work */
+static size_t journal_flush_pins(struct journal *j,
+ u64 seq_to_flush,
+ unsigned allowed_below_seq,
+ unsigned allowed_above_seq,
+ unsigned min_any,
+ unsigned min_key_cache)
{
- struct journal_entry_pin_list *pin_list;
- struct journal_entry_pin *ret;
- u64 iter;
+ struct journal_entry_pin *pin;
+ size_t nr_flushed = 0;
+ journal_pin_flush_fn flush_fn;
+ u64 seq;
+ int err;
- /* no need to iterate over empty fifo entries: */
- bch2_journal_reclaim_fast(j);
+ lockdep_assert_held(&j->reclaim_lock);
- fifo_for_each_entry_ptr(pin_list, &j->pin, iter) {
- if (iter > seq_to_flush)
- break;
+ while (1) {
+ unsigned allowed_above = allowed_above_seq;
+ unsigned allowed_below = allowed_below_seq;
- ret = list_first_entry_or_null(&pin_list->list,
- struct journal_entry_pin, list);
- if (ret) {
- /* must be list_del_init(), see bch2_journal_pin_drop() */
- list_move(&ret->list, &pin_list->flushed);
- *seq = iter;
- return ret;
+ if (min_any) {
+ allowed_above |= ~0;
+ allowed_below |= ~0;
}
+
+ if (min_key_cache) {
+ allowed_above |= 1U << JOURNAL_PIN_key_cache;
+ allowed_below |= 1U << JOURNAL_PIN_key_cache;
+ }
+
+ cond_resched();
+
+ j->last_flushed = jiffies;
+
+ spin_lock(&j->lock);
+ pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq);
+ if (pin) {
+ BUG_ON(j->flush_in_progress);
+ j->flush_in_progress = pin;
+ j->flush_in_progress_dropped = false;
+ flush_fn = pin->flush;
+ }
+ spin_unlock(&j->lock);
+
+ if (!pin)
+ break;
+
+ if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush)
+ min_key_cache--;
+
+ if (min_any)
+ min_any--;
+
+ err = flush_fn(j, pin, seq);
+
+ spin_lock(&j->lock);
+ /* Pin might have been dropped or rearmed: */
+ if (likely(!err && !j->flush_in_progress_dropped))
+ list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
+ j->flush_in_progress = NULL;
+ j->flush_in_progress_dropped = false;
+ spin_unlock(&j->lock);
+
+ wake_up(&j->pin_flush_wait);
+
+ if (err)
+ break;
+
+ nr_flushed++;
}
- return NULL;
+ return nr_flushed;
}
-static struct journal_entry_pin *
-journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
+static u64 journal_seq_to_flush(struct journal *j)
{
- struct journal_entry_pin *ret;
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ u64 seq_to_flush = 0;
spin_lock(&j->lock);
- ret = __journal_get_next_pin(j, seq_to_flush, seq);
- spin_unlock(&j->lock);
- return ret;
-}
+ for_each_rw_member(c, ca) {
+ struct journal_device *ja = &ca->journal;
+ unsigned nr_buckets, bucket_to_flush;
-static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
-{
- bool ret;
+ if (!ja->nr)
+ continue;
- spin_lock(&j->lock);
- ret = ja->nr &&
- (ja->last_idx != ja->cur_idx &&
- ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
+ /* Try to keep the journal at most half full: */
+ nr_buckets = ja->nr / 2;
+
+ nr_buckets = min(nr_buckets, ja->nr);
+
+ bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
+ seq_to_flush = max(seq_to_flush,
+ ja->bucket_seq[bucket_to_flush]);
+ }
+
+ /* Also flush if the pin fifo is more than half full */
+ seq_to_flush = max_t(s64, seq_to_flush,
+ (s64) journal_cur_seq(j) -
+ (j->pin.size >> 1));
spin_unlock(&j->lock);
- return ret;
+ return seq_to_flush;
}
/**
- * bch2_journal_reclaim_work - free up journal buckets
+ * __bch2_journal_reclaim - free up journal buckets
+ * @j: journal object
+ * @direct: direct or background reclaim?
+ * @kicked: requested to run since we last ran?
+ * Returns: 0 on success, or -EIO if the journal has been shutdown
*
* Background journal reclaim writes out btree nodes. It should be run
* early enough so that we never completely run out of journal buckets.
@@ -214,152 +641,229 @@ static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
* 512 journal entries or 25% of all journal buckets, then
* journal_next_bucket() should not stall.
*/
-void bch2_journal_reclaim_work(struct work_struct *work)
+static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
{
- struct bch_fs *c = container_of(to_delayed_work(work),
- struct bch_fs, journal.reclaim_work);
- struct journal *j = &c->journal;
- struct bch_dev *ca;
- struct journal_entry_pin *pin;
- u64 seq, seq_to_flush = 0;
- unsigned iter, bucket_to_flush;
- unsigned long next_flush;
- bool reclaim_lock_held = false, need_flush;
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct btree_cache *bc = &c->btree_cache;
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
+ u64 seq_to_flush;
+ size_t min_nr, min_key_cache, nr_flushed;
+ unsigned flags;
+ int ret = 0;
/*
- * Advance last_idx to point to the oldest journal entry containing
- * btree node updates that have not yet been written out
+ * We can't invoke memory reclaim while holding the reclaim_lock -
+ * journal reclaim is required to make progress for memory reclaim
+ * (cleaning the caches), so we can't get stuck in memory reclaim while
+ * we're holding the reclaim lock:
*/
- for_each_rw_member(ca, c, iter) {
- struct journal_device *ja = &ca->journal;
+ lockdep_assert_held(&j->reclaim_lock);
+ flags = memalloc_noreclaim_save();
- if (!ja->nr)
- continue;
-
- while (should_discard_bucket(j, ja)) {
- if (!reclaim_lock_held) {
- /*
- * ugh:
- * might be called from __journal_res_get()
- * under wait_event() - have to go back to
- * TASK_RUNNING before doing something that
- * would block, but only if we're doing work:
- */
- __set_current_state(TASK_RUNNING);
-
- mutex_lock(&j->reclaim_lock);
- reclaim_lock_held = true;
- /* recheck under reclaim_lock: */
- continue;
- }
+ do {
+ if (kthread && kthread_should_stop())
+ break;
- if (ca->mi.discard &&
- blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
- blkdev_issue_discard(ca->disk_sb.bdev,
- bucket_to_sector(ca,
- ja->buckets[ja->last_idx]),
- ca->mi.bucket_size, GFP_NOIO, 0);
+ if (bch2_journal_error(j)) {
+ ret = -EIO;
+ break;
+ }
- spin_lock(&j->lock);
- ja->last_idx = (ja->last_idx + 1) % ja->nr;
- spin_unlock(&j->lock);
+ bch2_journal_do_discards(j);
- journal_wake(j);
- }
+ seq_to_flush = journal_seq_to_flush(j);
+ min_nr = 0;
/*
- * Write out enough btree nodes to free up 50% journal
- * buckets
+ * If it's been longer than c->opts.journal_reclaim_delay since we last flushed,
+ * make sure to flush at least one journal pin:
*/
- spin_lock(&j->lock);
- bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
- seq_to_flush = max_t(u64, seq_to_flush,
- ja->bucket_seq[bucket_to_flush]);
- spin_unlock(&j->lock);
- }
+ if (time_after(jiffies, j->last_flushed +
+ msecs_to_jiffies(c->opts.journal_reclaim_delay)))
+ min_nr = 1;
+
+ if (j->watermark != BCH_WATERMARK_stripe)
+ min_nr = 1;
+
+ size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr;
+ if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live)
+ min_nr = 1;
+
+ min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
+
+ trace_and_count(c, journal_reclaim_start, c,
+ direct, kicked,
+ min_nr, min_key_cache,
+ atomic_long_read(&bc->nr_dirty), btree_cache_live,
+ atomic_long_read(&c->btree_key_cache.nr_dirty),
+ atomic_long_read(&c->btree_key_cache.nr_keys));
- if (reclaim_lock_held)
+ nr_flushed = journal_flush_pins(j, seq_to_flush,
+ ~0, 0,
+ min_nr, min_key_cache);
+
+ if (direct)
+ j->nr_direct_reclaim += nr_flushed;
+ else
+ j->nr_background_reclaim += nr_flushed;
+ trace_and_count(c, journal_reclaim_finish, c, nr_flushed);
+
+ if (nr_flushed)
+ wake_up(&j->reclaim_wait);
+ } while ((min_nr || min_key_cache) && nr_flushed && !direct);
+
+ memalloc_noreclaim_restore(flags);
+
+ return ret;
+}
+
+int bch2_journal_reclaim(struct journal *j)
+{
+ return __bch2_journal_reclaim(j, true, true);
+}
+
+static int bch2_journal_reclaim_thread(void *arg)
+{
+ struct journal *j = arg;
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ unsigned long delay, now;
+ bool journal_empty;
+ int ret = 0;
+
+ set_freezable();
+
+ j->last_flushed = jiffies;
+
+ while (!ret && !kthread_should_stop()) {
+ bool kicked = j->reclaim_kicked;
+
+ j->reclaim_kicked = false;
+
+ mutex_lock(&j->reclaim_lock);
+ ret = __bch2_journal_reclaim(j, false, kicked);
mutex_unlock(&j->reclaim_lock);
- /* Also flush if the pin fifo is more than half full */
- spin_lock(&j->lock);
- seq_to_flush = max_t(s64, seq_to_flush,
- (s64) journal_cur_seq(j) -
- (j->pin.size >> 1));
- spin_unlock(&j->lock);
+ now = jiffies;
+ delay = msecs_to_jiffies(c->opts.journal_reclaim_delay);
+ j->next_reclaim = j->last_flushed + delay;
- /*
- * If it's been longer than j->reclaim_delay_ms since we last flushed,
- * make sure to flush at least one journal pin:
- */
- next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
- need_flush = time_after(jiffies, next_flush);
+ if (!time_in_range(j->next_reclaim, now, now + delay))
+ j->next_reclaim = now + delay;
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+ if (kthread_should_stop())
+ break;
+ if (j->reclaim_kicked)
+ break;
- while ((pin = journal_get_next_pin(j, need_flush
- ? U64_MAX
- : seq_to_flush, &seq))) {
+ spin_lock(&j->lock);
+ journal_empty = fifo_empty(&j->pin);
+ spin_unlock(&j->lock);
+
+ long timeout = j->next_reclaim - jiffies;
+
+ if (journal_empty)
+ schedule();
+ else if (timeout > 0)
+ schedule_timeout(timeout);
+ else
+ break;
+ }
__set_current_state(TASK_RUNNING);
- pin->flush(j, pin, seq);
- need_flush = false;
+ }
- j->last_flushed = jiffies;
+ return 0;
+}
+
+void bch2_journal_reclaim_stop(struct journal *j)
+{
+ struct task_struct *p = j->reclaim_thread;
+
+ j->reclaim_thread = NULL;
+
+ if (p) {
+ kthread_stop(p);
+ put_task_struct(p);
}
+}
+
+int bch2_journal_reclaim_start(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct task_struct *p;
+ int ret;
+
+ if (j->reclaim_thread)
+ return 0;
- if (!test_bit(BCH_FS_RO, &c->flags))
- queue_delayed_work(system_freezable_wq, &j->reclaim_work,
- msecs_to_jiffies(j->reclaim_delay_ms));
+ p = kthread_create(bch2_journal_reclaim_thread, j,
+ "bch-reclaim/%s", c->name);
+ ret = PTR_ERR_OR_ZERO(p);
+ bch_err_msg(c, ret, "creating journal reclaim thread");
+ if (ret)
+ return ret;
+
+ get_task_struct(p);
+ j->reclaim_thread = p;
+ wake_up_process(p);
+ return 0;
}
static int journal_flush_done(struct journal *j, u64 seq_to_flush,
- struct journal_entry_pin **pin,
- u64 *pin_seq)
+ bool *did_work)
{
int ret;
- *pin = NULL;
-
ret = bch2_journal_error(j);
if (ret)
return ret;
+ mutex_lock(&j->reclaim_lock);
+
+ if (journal_flush_pins(j, seq_to_flush,
+ (1U << JOURNAL_PIN_key_cache)|
+ (1U << JOURNAL_PIN_other), 0, 0, 0) ||
+ journal_flush_pins(j, seq_to_flush,
+ (1U << JOURNAL_PIN_btree), 0, 0, 0))
+ *did_work = true;
+
+ if (seq_to_flush > journal_cur_seq(j))
+ bch2_journal_entry_close(j);
+
spin_lock(&j->lock);
/*
* If journal replay hasn't completed, the unreplayed journal entries
* hold refs on their corresponding sequence numbers
*/
- ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL ||
- !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
+ ret = !test_bit(JOURNAL_replay_done, &j->flags) ||
journal_last_seq(j) > seq_to_flush ||
- (fifo_used(&j->pin) == 1 &&
- atomic_read(&fifo_peek_front(&j->pin).count) == 1);
+ !fifo_used(&j->pin);
+
spin_unlock(&j->lock);
+ mutex_unlock(&j->reclaim_lock);
return ret;
}
-void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
+bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
- struct journal_entry_pin *pin;
- u64 pin_seq;
+ /* time_stats this */
+ bool did_work = false;
- if (!test_bit(JOURNAL_STARTED, &j->flags))
- return;
+ if (!test_bit(JOURNAL_running, &j->flags))
+ return false;
- while (1) {
- wait_event(j->wait, journal_flush_done(j, seq_to_flush,
- &pin, &pin_seq));
- if (!pin)
- break;
+ closure_wait_event(&j->async_wait,
+ journal_flush_done(j, seq_to_flush, &did_work));
- pin->flush(j, pin, pin_seq);
- }
+ return did_work;
}
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_entry_pin_list *p;
- struct bch_devs_list devs;
u64 iter, seq = 0;
int ret = 0;
@@ -378,22 +882,39 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
return ret;
mutex_lock(&c->replicas_gc_lock);
- bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
+ bch2_replicas_gc_start(c, 1 << BCH_DATA_journal);
- seq = 0;
+ /*
+ * Now that we've populated replicas_gc, write to the journal to mark
+ * active journal devices. This handles the case where the journal might
+ * be empty. Otherwise we could clear all journal replicas and
+ * temporarily put the fs into an unrecoverable state. Journal recovery
+ * expects to find devices marked for journal data on unclean mount.
+ */
+ ret = bch2_journal_meta(&c->journal);
+ if (ret)
+ goto err;
+ seq = 0;
spin_lock(&j->lock);
- while (!ret && seq < j->pin.back) {
+ while (!ret) {
+ struct bch_replicas_padded replicas;
+
seq = max(seq, journal_last_seq(j));
- devs = journal_seq_pin(j, seq)->devs;
+ if (seq >= j->pin.back)
+ break;
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
+ journal_seq_pin(j, seq)->devs);
seq++;
- spin_unlock(&j->lock);
- ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
- spin_lock(&j->lock);
+ if (replicas.e.nr_devs) {
+ spin_unlock(&j->lock);
+ ret = bch2_mark_replicas(c, &replicas.e);
+ spin_lock(&j->lock);
+ }
}
spin_unlock(&j->lock);
-
+err:
ret = bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h
index eb227902..ec84c334 100644
--- a/libbcachefs/journal_reclaim.h
+++ b/libbcachefs/journal_reclaim.h
@@ -1,39 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_JOURNAL_RECLAIM_H
#define _BCACHEFS_JOURNAL_RECLAIM_H
#define JOURNAL_PIN (32 * 1024)
+static inline void journal_reclaim_kick(struct journal *j)
+{
+ struct task_struct *p = READ_ONCE(j->reclaim_thread);
+
+ j->reclaim_kicked = true;
+ if (p)
+ wake_up_process(p);
+}
+
+unsigned bch2_journal_dev_buckets_available(struct journal *,
+ struct journal_device *,
+ enum journal_space_from);
+void bch2_journal_set_watermark(struct journal *);
+void bch2_journal_space_available(struct journal *);
+
static inline bool journal_pin_active(struct journal_entry_pin *pin)
{
- return pin->pin_list != NULL;
+ return pin->seq != 0;
}
static inline struct journal_entry_pin_list *
journal_seq_pin(struct journal *j, u64 seq)
{
- BUG_ON(seq < j->pin.front || seq >= j->pin.back);
+ EBUG_ON(seq < j->pin.front || seq >= j->pin.back);
return &j->pin.data[seq & j->pin.mask];
}
-u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *);
+void bch2_journal_reclaim_fast(struct journal *);
+bool __bch2_journal_pin_put(struct journal *, u64);
+void bch2_journal_pin_put(struct journal *, u64);
+void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
-void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
+void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *,
journal_pin_flush_fn);
-void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
-void bch2_journal_pin_add_if_older(struct journal *,
- struct journal_entry_pin *,
- struct journal_entry_pin *,
- journal_pin_flush_fn);
-void bch2_journal_reclaim_fast(struct journal *);
-void bch2_journal_reclaim_work(struct work_struct *);
+static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ if (unlikely(!journal_pin_active(pin) || pin->seq > seq))
+ bch2_journal_pin_set(j, seq, pin, flush_fn);
+}
+
+void bch2_journal_pin_copy(struct journal *,
+ struct journal_entry_pin *,
+ struct journal_entry_pin *,
+ journal_pin_flush_fn);
+
+static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ if (unlikely(!journal_pin_active(pin) || pin->seq < seq))
+ bch2_journal_pin_set(j, seq, pin, flush_fn);
+}
+
+void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
+
+void bch2_journal_do_discards(struct journal *);
+int bch2_journal_reclaim(struct journal *);
+
+void bch2_journal_reclaim_stop(struct journal *);
+int bch2_journal_reclaim_start(struct journal *);
-void bch2_journal_flush_pins(struct journal *, u64);
+bool bch2_journal_flush_pins(struct journal *, u64);
-static inline void bch2_journal_flush_all_pins(struct journal *j)
+static inline bool bch2_journal_flush_all_pins(struct journal *j)
{
- bch2_journal_flush_pins(j, U64_MAX);
+ return bch2_journal_flush_pins(j, U64_MAX);
}
int bch2_journal_flush_device_pins(struct journal *, int);
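
bch2_journal_pin_add() and bch2_journal_pin_update() above differ only in the direction of the seq comparison: add only ever re-pins to an older entry, update only to a newer one; both are no-ops otherwise. A standalone sketch of the two guard conditions, with a simplified pin type rather than the kernel's:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct pin { uint64_t seq; };   /* simplified stand-in */

    static bool pin_active(struct pin *p) { return p->seq != 0; }

    /* mirrors bch2_journal_pin_add(): only re-pin to an older entry */
    static bool add_would_repin(struct pin *p, uint64_t seq)
    {
            return !pin_active(p) || p->seq > seq;
    }

    /* mirrors bch2_journal_pin_update(): only re-pin to a newer entry */
    static bool update_would_repin(struct pin *p, uint64_t seq)
    {
            return !pin_active(p) || p->seq < seq;
    }

    int main(void)
    {
            struct pin p = { .seq = 10 };

            printf("add to 8: %d, add to 12: %d\n",
                   add_would_repin(&p, 8), add_would_repin(&p, 12));       /* 1, 0 */
            printf("update to 8: %d, update to 12: %d\n",
                   update_would_repin(&p, 8), update_would_repin(&p, 12)); /* 0, 1 */
            return 0;
    }
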
diff --git a/libbcachefs/journal_sb.c b/libbcachefs/journal_sb.c
new file mode 100644
index 00000000..62b910f2
--- /dev/null
+++ b/libbcachefs/journal_sb.c
@@ -0,0 +1,232 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "journal_sb.h"
+#include "darray.h"
+
+#include <linux/sort.h>
+
+/* BCH_SB_FIELD_journal: */
+
+static int u64_cmp(const void *_l, const void *_r)
+{
+ const u64 *l = _l;
+ const u64 *r = _r;
+
+ return cmp_int(*l, *r);
+}
+
+static int bch2_sb_journal_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
+{
+ struct bch_sb_field_journal *journal = field_to_type(f, journal);
+ struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
+ int ret = -BCH_ERR_invalid_sb_journal;
+ unsigned nr;
+ unsigned i;
+ u64 *b;
+
+ nr = bch2_nr_journal_buckets(journal);
+ if (!nr)
+ return 0;
+
+ b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL);
+ if (!b)
+ return -BCH_ERR_ENOMEM_sb_journal_validate;
+
+ for (i = 0; i < nr; i++)
+ b[i] = le64_to_cpu(journal->buckets[i]);
+
+ sort(b, nr, sizeof(u64), u64_cmp, NULL);
+
+ if (!b[0]) {
+ prt_printf(err, "journal bucket at sector 0");
+ goto err;
+ }
+
+ if (b[0] < le16_to_cpu(m.first_bucket)) {
+ prt_printf(err, "journal bucket %llu before first bucket %u",
+ b[0], le16_to_cpu(m.first_bucket));
+ goto err;
+ }
+
+ if (b[nr - 1] >= le64_to_cpu(m.nbuckets)) {
+ prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
+ b[nr - 1], le64_to_cpu(m.nbuckets));
+ goto err;
+ }
+
+ for (i = 0; i + 1 < nr; i++)
+ if (b[i] == b[i + 1]) {
+ prt_printf(err, "duplicate journal buckets %llu", b[i]);
+ goto err;
+ }
+
+ ret = 0;
+err:
+ kfree(b);
+ return ret;
+}
+
+static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_journal *journal = field_to_type(f, journal);
+ unsigned i, nr = bch2_nr_journal_buckets(journal);
+
+ prt_printf(out, "Buckets: ");
+ for (i = 0; i < nr; i++)
+ prt_printf(out, " %llu", le64_to_cpu(journal->buckets[i]));
+ prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_journal = {
+ .validate = bch2_sb_journal_validate,
+ .to_text = bch2_sb_journal_to_text,
+};
+
+struct u64_range {
+ u64 start;
+ u64 end;
+};
+
+static int u64_range_cmp(const void *_l, const void *_r)
+{
+ const struct u64_range *l = _l;
+ const struct u64_range *r = _r;
+
+ return cmp_int(l->start, r->start);
+}
+
+static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
+{
+ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
+ struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
+ int ret = -BCH_ERR_invalid_sb_journal;
+ u64 sum = 0;
+ unsigned nr;
+ unsigned i;
+ struct u64_range *b;
+
+ nr = bch2_sb_field_journal_v2_nr_entries(journal);
+ if (!nr)
+ return 0;
+
+ b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL);
+ if (!b)
+ return -BCH_ERR_ENOMEM_sb_journal_v2_validate;
+
+ for (i = 0; i < nr; i++) {
+ b[i].start = le64_to_cpu(journal->d[i].start);
+ b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr);
+
+ if (b[i].end <= b[i].start) {
+ prt_printf(err, "journal buckets entry with bad nr: %llu+%llu",
+ le64_to_cpu(journal->d[i].start),
+ le64_to_cpu(journal->d[i].nr));
+ goto err;
+ }
+
+ sum += le64_to_cpu(journal->d[i].nr);
+ }
+
+ sort(b, nr, sizeof(*b), u64_range_cmp, NULL);
+
+ if (!b[0].start) {
+ prt_printf(err, "journal bucket at sector 0");
+ goto err;
+ }
+
+ if (b[0].start < le16_to_cpu(m.first_bucket)) {
+ prt_printf(err, "journal bucket %llu before first bucket %u",
+ b[0].start, le16_to_cpu(m.first_bucket));
+ goto err;
+ }
+
+ if (b[nr - 1].end > le64_to_cpu(m.nbuckets)) {
+ prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
+ b[nr - 1].end - 1, le64_to_cpu(m.nbuckets));
+ goto err;
+ }
+
+ for (i = 0; i + 1 < nr; i++) {
+ if (b[i].end > b[i + 1].start) {
+ prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu",
+ b[i].start, b[i].end, b[i + 1].start, b[i + 1].end);
+ goto err;
+ }
+ }
+
+ if (sum > UINT_MAX) {
+ prt_printf(err, "too many journal buckets: %llu > %u", sum, UINT_MAX);
+ goto err;
+ }
+
+ ret = 0;
+err:
+ kfree(b);
+ return ret;
+}
+
+static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
+ unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal);
+
+ prt_printf(out, "Buckets: ");
+ for (i = 0; i < nr; i++)
+ prt_printf(out, " %llu-%llu",
+ le64_to_cpu(journal->d[i].start),
+ le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr));
+ prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = {
+ .validate = bch2_sb_journal_v2_validate,
+ .to_text = bch2_sb_journal_v2_to_text,
+};
+
+int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca,
+ u64 *buckets, unsigned nr)
+{
+ struct bch_sb_field_journal_v2 *j;
+ unsigned i, dst = 0, nr_compacted = 1;
+
+ if (c)
+ lockdep_assert_held(&c->sb_lock);
+
+ if (!nr) {
+ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
+ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2);
+ return 0;
+ }
+
+ for (i = 0; i + 1 < nr; i++)
+ if (buckets[i] + 1 != buckets[i + 1])
+ nr_compacted++;
+
+ j = bch2_sb_field_resize(&ca->disk_sb, journal_v2,
+ (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64));
+ if (!j)
+ return -BCH_ERR_ENOSPC_sb_journal;
+
+ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
+
+ j->d[dst].start = cpu_to_le64(buckets[0]);
+ j->d[dst].nr = cpu_to_le64(1);
+
+ for (i = 1; i < nr; i++) {
+ if (buckets[i] == buckets[i - 1] + 1) {
+ le64_add_cpu(&j->d[dst].nr, 1);
+ } else {
+ dst++;
+ j->d[dst].start = cpu_to_le64(buckets[i]);
+ j->d[dst].nr = cpu_to_le64(1);
+ }
+ }
+
+ BUG_ON(dst + 1 != nr_compacted);
+ return 0;
+}
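
The compaction loop in bch2_journal_buckets_to_sb() collapses runs of consecutive bucket numbers into (start, nr) ranges; a standalone sketch of the same loop on sample data, not the kernel code:

    #include <stdint.h>
    #include <stdio.h>

    struct range { uint64_t start, nr; };

    static unsigned compact(const uint64_t *buckets, unsigned nr, struct range *out)
    {
            unsigned dst = 0;

            out[0].start = buckets[0];
            out[0].nr = 1;

            for (unsigned i = 1; i < nr; i++) {
                    if (buckets[i] == buckets[i - 1] + 1) {
                            out[dst].nr++;          /* extend the current run */
                    } else {
                            dst++;                  /* start a new range */
                            out[dst].start = buckets[i];
                            out[dst].nr = 1;
                    }
            }
            return dst + 1;
    }

    int main(void)
    {
            uint64_t buckets[] = { 10, 11, 12, 20, 21 };
            struct range r[5];
            unsigned n = compact(buckets, 5, r);

            for (unsigned i = 0; i < n; i++)
                    printf("%llu+%llu\n",
                           (unsigned long long) r[i].start,
                           (unsigned long long) r[i].nr);
            /* prints: 10+3 and 20+2 */
            return 0;
    }
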
diff --git a/libbcachefs/journal_sb.h b/libbcachefs/journal_sb.h
new file mode 100644
index 00000000..ba40a7e8
--- /dev/null
+++ b/libbcachefs/journal_sb.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include "super-io.h"
+#include "vstructs.h"
+
+static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
+{
+ return j
+ ? (__le64 *) vstruct_end(&j->field) - j->buckets
+ : 0;
+}
+
+static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j)
+{
+ if (!j)
+ return 0;
+
+ return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0];
+}
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal;
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2;
+
+int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *, u64 *, unsigned);
diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c
index dd0e8d2f..1f25c111 100644
--- a/libbcachefs/journal_seq_blacklist.c
+++ b/libbcachefs/journal_seq_blacklist.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "error.h"
+#include "eytzinger.h"
#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
+#include "super-io.h"
/*
* journal_seq_blacklist machinery:
@@ -36,324 +34,222 @@
* record that it was blacklisted so that a) on recovery we don't think we have
* missing journal entries and b) so that the btree code continues to ignore
* that bset, until that btree node is rewritten.
- *
- * Blacklisted journal sequence numbers are themselves recorded as entries in
- * the journal.
*/
-/*
- * Called when journal needs to evict a blacklist entry to reclaim space: find
- * any btree nodes that refer to the blacklist journal sequence numbers, and
- * rewrite them:
- */
-static void journal_seq_blacklist_flush(struct journal *j,
- struct journal_entry_pin *pin, u64 seq)
+static unsigned sb_blacklist_u64s(unsigned nr)
{
- struct bch_fs *c =
- container_of(j, struct bch_fs, journal);
- struct journal_seq_blacklist *bl =
- container_of(pin, struct journal_seq_blacklist, pin);
- struct blacklisted_node n;
- struct closure cl;
- unsigned i;
- int ret;
+ struct bch_sb_field_journal_seq_blacklist *bl;
- closure_init_stack(&cl);
-
- for (i = 0;; i++) {
- struct btree_iter iter;
- struct btree *b;
+ return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64);
+}
- mutex_lock(&j->blacklist_lock);
- if (i >= bl->nr_entries) {
- mutex_unlock(&j->blacklist_lock);
- break;
- }
- n = bl->entries[i];
- mutex_unlock(&j->blacklist_lock);
-
- __bch2_btree_iter_init(&iter, c, n.btree_id, n.pos,
- 0, 0, BTREE_ITER_NODES);
-
- b = bch2_btree_iter_peek_node(&iter);
-
- /* The node might have already been rewritten: */
-
- if (b->data->keys.seq == n.seq) {
- ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0);
- if (ret) {
- bch2_btree_iter_unlock(&iter);
- bch2_fs_fatal_error(c,
- "error %i rewriting btree node with blacklisted journal seq",
- ret);
- bch2_journal_halt(j);
- return;
- }
- }
+int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
+{
+ struct bch_sb_field_journal_seq_blacklist *bl;
+ unsigned i = 0, nr;
+ int ret = 0;
- bch2_btree_iter_unlock(&iter);
- }
+ mutex_lock(&c->sb_lock);
+ bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
+ nr = blacklist_nr_entries(bl);
- for (i = 0;; i++) {
- struct btree_update *as;
- struct pending_btree_node_free *d;
+ while (i < nr) {
+ struct journal_seq_blacklist_entry *e =
+ bl->start + i;
- mutex_lock(&j->blacklist_lock);
- if (i >= bl->nr_entries) {
- mutex_unlock(&j->blacklist_lock);
+ if (end < le64_to_cpu(e->start))
break;
+
+ if (start > le64_to_cpu(e->end)) {
+ i++;
+ continue;
}
- n = bl->entries[i];
- mutex_unlock(&j->blacklist_lock);
-redo_wait:
- mutex_lock(&c->btree_interior_update_lock);
/*
- * Is the node on the list of pending interior node updates -
- * being freed? If so, wait for that to finish:
+ * Entry is contiguous or overlapping with new entry: merge it
+ * with new entry, and delete:
*/
- for_each_pending_btree_node_free(c, as, d)
- if (n.seq == d->seq &&
- n.btree_id == d->btree_id &&
- !d->level &&
- !bkey_cmp(n.pos, d->key.k.p)) {
- closure_wait(&as->wait, &cl);
- mutex_unlock(&c->btree_interior_update_lock);
- closure_sync(&cl);
- goto redo_wait;
- }
-
- mutex_unlock(&c->btree_interior_update_lock);
+
+ start = min(start, le64_to_cpu(e->start));
+ end = max(end, le64_to_cpu(e->end));
+ array_remove_item(bl->start, nr, i);
}
- mutex_lock(&j->blacklist_lock);
+ bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
+ sb_blacklist_u64s(nr + 1));
+ if (!bl) {
+ ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist;
+ goto out;
+ }
- bch2_journal_pin_drop(j, &bl->pin);
- list_del(&bl->list);
- kfree(bl->entries);
- kfree(bl);
+ array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) {
+ .start = cpu_to_le64(start),
+ .end = cpu_to_le64(end),
+ }));
+ c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3);
- mutex_unlock(&j->blacklist_lock);
+ ret = bch2_write_super(c);
+out:
+ mutex_unlock(&c->sb_lock);
+
+ return ret ?: bch2_blacklist_table_initialize(c);
}
-/*
- * Determine if a particular sequence number is blacklisted - if so, return
- * blacklist entry:
- */
-struct journal_seq_blacklist *
-bch2_journal_seq_blacklist_find(struct journal *j, u64 seq)
+static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
{
- struct journal_seq_blacklist *bl;
-
- lockdep_assert_held(&j->blacklist_lock);
+ const struct journal_seq_blacklist_table_entry *l = _l;
+ const struct journal_seq_blacklist_table_entry *r = _r;
- list_for_each_entry(bl, &j->seq_blacklist, list)
- if (seq >= bl->start && seq <= bl->end)
- return bl;
-
- return NULL;
+ return cmp_int(l->start, r->start);
}
-/*
- * Allocate a new, in memory blacklist entry:
- */
-static struct journal_seq_blacklist *
-bch2_journal_seq_blacklisted_new(struct journal *j, u64 start, u64 end)
+bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq,
+ bool dirty)
{
- struct journal_seq_blacklist *bl;
+ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
+ struct journal_seq_blacklist_table_entry search = { .start = seq };
+ int idx;
- lockdep_assert_held(&j->blacklist_lock);
+ if (!t)
+ return false;
- /*
- * When we start the journal, bch2_journal_start() will skip over @seq:
- */
+ idx = eytzinger0_find_le(t->entries, t->nr,
+ sizeof(t->entries[0]),
+ journal_seq_blacklist_table_cmp,
+ &search);
+ if (idx < 0)
+ return false;
- bl = kzalloc(sizeof(*bl), GFP_KERNEL);
- if (!bl)
- return NULL;
+ BUG_ON(t->entries[idx].start > seq);
- bl->start = start;
- bl->end = end;
+ if (seq >= t->entries[idx].end)
+ return false;
- list_add_tail(&bl->list, &j->seq_blacklist);
- return bl;
+ if (dirty)
+ t->entries[idx].dirty = true;
+ return true;
}
-/*
- * Returns true if @seq is newer than the most recent journal entry that got
- * written, and data corresponding to @seq should be ignored - also marks @seq
- * as blacklisted so that on future restarts the corresponding data will still
- * be ignored:
- */
-int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
+int bch2_blacklist_table_initialize(struct bch_fs *c)
{
- struct journal *j = &c->journal;
- struct journal_seq_blacklist *bl = NULL;
- struct blacklisted_node *n;
- u64 journal_seq;
- int ret = 0;
+ struct bch_sb_field_journal_seq_blacklist *bl =
+ bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
+ struct journal_seq_blacklist_table *t;
+ unsigned i, nr = blacklist_nr_entries(bl);
- if (!seq)
+ if (!bl)
return 0;
- spin_lock(&j->lock);
- journal_seq = journal_cur_seq(j);
- spin_unlock(&j->lock);
+ t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL);
+ if (!t)
+ return -BCH_ERR_ENOMEM_blacklist_table_init;
- /* Interier updates aren't journalled: */
- BUG_ON(b->level);
- BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags));
-
- /*
- * Decrease this back to j->seq + 2 when we next rev the on disk format:
- * increasing it temporarily to work around bug in old kernels
- */
- fsck_err_on(seq > journal_seq + 4, c,
- "bset journal seq too far in the future: %llu > %llu",
- seq, journal_seq);
-
- if (seq <= journal_seq &&
- list_empty_careful(&j->seq_blacklist))
- return 0;
+ t->nr = nr;
- mutex_lock(&j->blacklist_lock);
-
- if (seq <= journal_seq) {
- bl = bch2_journal_seq_blacklist_find(j, seq);
- if (!bl)
- goto out;
- } else {
- bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting",
- b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq);
-
- if (!j->new_blacklist) {
- j->new_blacklist = bch2_journal_seq_blacklisted_new(j,
- journal_seq + 1,
- journal_seq + 1);
- if (!j->new_blacklist) {
- ret = -ENOMEM;
- goto out;
- }
- }
- bl = j->new_blacklist;
- bl->end = max(bl->end, seq);
+ for (i = 0; i < nr; i++) {
+ t->entries[i].start = le64_to_cpu(bl->start[i].start);
+ t->entries[i].end = le64_to_cpu(bl->start[i].end);
}
- for (n = bl->entries; n < bl->entries + bl->nr_entries; n++)
- if (b->data->keys.seq == n->seq &&
- b->btree_id == n->btree_id &&
- !bkey_cmp(b->key.k.p, n->pos))
- goto found_entry;
-
- if (!bl->nr_entries ||
- is_power_of_2(bl->nr_entries)) {
- n = krealloc(bl->entries,
- max_t(size_t, bl->nr_entries * 2, 8) * sizeof(*n),
- GFP_KERNEL);
- if (!n) {
- ret = -ENOMEM;
- goto out;
- }
- bl->entries = n;
- }
+ eytzinger0_sort(t->entries,
+ t->nr,
+ sizeof(t->entries[0]),
+ journal_seq_blacklist_table_cmp,
+ NULL);
- bl->entries[bl->nr_entries++] = (struct blacklisted_node) {
- .seq = b->data->keys.seq,
- .btree_id = b->btree_id,
- .pos = b->key.k.p,
- };
-found_entry:
- ret = 1;
-out:
-fsck_err:
- mutex_unlock(&j->blacklist_lock);
- return ret;
+ kfree(c->journal_seq_blacklist_table);
+ c->journal_seq_blacklist_table = t;
+ return 0;
}
-static int __bch2_journal_seq_blacklist_read(struct journal *j,
- struct journal_replay *i,
- u64 start, u64 end)
+static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_seq_blacklist *bl;
-
- bch_verbose(c, "blacklisting existing journal seq %llu-%llu",
- start, end);
+ struct bch_sb_field_journal_seq_blacklist *bl =
+ field_to_type(f, journal_seq_blacklist);
+ unsigned i, nr = blacklist_nr_entries(bl);
+
+ for (i = 0; i < nr; i++) {
+ struct journal_seq_blacklist_entry *e = bl->start + i;
+
+ if (le64_to_cpu(e->start) >=
+ le64_to_cpu(e->end)) {
+ prt_printf(err, "entry %u start >= end (%llu >= %llu)",
+ i, le64_to_cpu(e->start), le64_to_cpu(e->end));
+ return -BCH_ERR_invalid_sb_journal_seq_blacklist;
+ }
- bl = bch2_journal_seq_blacklisted_new(j, start, end);
- if (!bl)
- return -ENOMEM;
+ if (i + 1 < nr &&
+ le64_to_cpu(e[0].end) >
+ le64_to_cpu(e[1].start)) {
+ prt_printf(err, "entry %u out of order with next entry (%llu > %llu)",
+ i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start));
+ return -BCH_ERR_invalid_sb_journal_seq_blacklist;
+ }
+ }
- bch2_journal_pin_add(j, le64_to_cpu(i->j.seq), &bl->pin,
- journal_seq_blacklist_flush);
return 0;
}
-/*
- * After reading the journal, find existing journal seq blacklist entries and
- * read them into memory:
- */
-int bch2_journal_seq_blacklist_read(struct journal *j,
- struct journal_replay *i)
+static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out,
+ struct bch_sb *sb,
+ struct bch_sb_field *f)
{
- struct jset_entry *entry;
- int ret = 0;
-
- vstruct_for_each(&i->j, entry) {
- switch (entry->type) {
- case BCH_JSET_ENTRY_blacklist: {
- struct jset_entry_blacklist *bl_entry =
- container_of(entry, struct jset_entry_blacklist, entry);
-
- ret = __bch2_journal_seq_blacklist_read(j, i,
- le64_to_cpu(bl_entry->seq),
- le64_to_cpu(bl_entry->seq));
- break;
- }
- case BCH_JSET_ENTRY_blacklist_v2: {
- struct jset_entry_blacklist_v2 *bl_entry =
- container_of(entry, struct jset_entry_blacklist_v2, entry);
-
- ret = __bch2_journal_seq_blacklist_read(j, i,
- le64_to_cpu(bl_entry->start),
- le64_to_cpu(bl_entry->end));
- break;
- }
- }
-
- if (ret)
- break;
+ struct bch_sb_field_journal_seq_blacklist *bl =
+ field_to_type(f, journal_seq_blacklist);
+ struct journal_seq_blacklist_entry *i;
+ unsigned nr = blacklist_nr_entries(bl);
+
+ for (i = bl->start; i < bl->start + nr; i++) {
+ if (i != bl->start)
+ prt_printf(out, " ");
+
+ prt_printf(out, "%llu-%llu",
+ le64_to_cpu(i->start),
+ le64_to_cpu(i->end));
}
-
- return ret;
+ prt_newline(out);
}
-/*
- * After reading the journal and walking the btree, we might have new journal
- * sequence numbers to blacklist - add entries to the next journal entry to be
- * written:
- */
-void bch2_journal_seq_blacklist_write(struct journal *j)
+const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = {
+ .validate = bch2_sb_journal_seq_blacklist_validate,
+ .to_text = bch2_sb_journal_seq_blacklist_to_text
+};
+
+bool bch2_blacklist_entries_gc(struct bch_fs *c)
{
- struct journal_seq_blacklist *bl = j->new_blacklist;
- struct jset_entry_blacklist_v2 *bl_entry;
- struct jset_entry *entry;
+ struct journal_seq_blacklist_entry *src, *dst;
+ struct bch_sb_field_journal_seq_blacklist *bl =
+ bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
if (!bl)
- return;
+ return false;
+
+ unsigned nr = blacklist_nr_entries(bl);
+ dst = bl->start;
- entry = bch2_journal_add_entry_noreservation(journal_cur_buf(j),
- (sizeof(*bl_entry) - sizeof(*entry)) / sizeof(u64));
+ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
+ BUG_ON(nr != t->nr);
+
+ unsigned i;
+ for (src = bl->start, i = t->nr == 0 ? 0 : eytzinger0_first(t->nr);
+ src < bl->start + nr;
+ src++, i = eytzinger0_next(i, nr)) {
+ BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
+ BUG_ON(t->entries[i].end != le64_to_cpu(src->end));
+
+ if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk)
+ *dst++ = *src;
+ }
- bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
- bl_entry->entry.type = BCH_JSET_ENTRY_blacklist_v2;
- bl_entry->start = cpu_to_le64(bl->start);
- bl_entry->end = cpu_to_le64(bl->end);
+ unsigned new_nr = dst - bl->start;
+ if (new_nr == nr)
+ return false;
- bch2_journal_pin_add(j,
- journal_cur_seq(j),
- &bl->pin,
- journal_seq_blacklist_flush);
+ bch_verbose(c, "nr blacklist entries was %u, now %u", nr, new_nr);
- j->new_blacklist = NULL;
+ bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
+ new_nr ? sb_blacklist_u64s(new_nr) : 0);
+ BUG_ON(new_nr && !bl);
+ return true;
}
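
As a standalone sketch of the interval merge performed by
bch2_journal_seq_blacklist_add() above (illustrative only: plain arrays and
memmove() stand in for the superblock field and the array_remove_item()/
array_insert_item() helpers; all names here are made up):

	#include <string.h>

	struct range { unsigned long long start, end; };

	/*
	 * Absorb every existing entry that overlaps or touches [start, end],
	 * then insert the merged range at its sorted position; returns the new
	 * entry count.  Caller must ensure r[] has room for one more entry.
	 */
	static unsigned blacklist_add_sketch(struct range *r, unsigned nr,
					     unsigned long long start,
					     unsigned long long end)
	{
		unsigned i = 0;

		while (i < nr) {
			if (end < r[i].start)		/* new range ends before entry i */
				break;
			if (start > r[i].end) {		/* entry i ends before new range */
				i++;
				continue;
			}
			/* contiguous or overlapping: merge, then drop entry i */
			if (r[i].start < start) start = r[i].start;
			if (r[i].end   > end)   end   = r[i].end;
			memmove(&r[i], &r[i + 1], (nr - i - 1) * sizeof(*r));
			nr--;
		}

		memmove(&r[i + 1], &r[i], (nr - i) * sizeof(*r));
		r[i] = (struct range) { start, end };
		return nr + 1;
	}

The real function additionally sets BCH_FEATURE_journal_seq_blacklist_v3,
rewrites the superblock via bch2_write_super(), and then rebuilds the in-memory
eytzinger lookup table with bch2_blacklist_table_initialize().
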
diff --git a/libbcachefs/journal_seq_blacklist.h b/libbcachefs/journal_seq_blacklist.h
index 95ea6e90..d47636f9 100644
--- a/libbcachefs/journal_seq_blacklist.h
+++ b/libbcachefs/journal_seq_blacklist.h
@@ -1,13 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
-struct journal_replay;
+static inline unsigned
+blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl)
+{
+ return bl
+ ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) /
+ sizeof(struct journal_seq_blacklist_entry))
+ : 0;
+}
-struct journal_seq_blacklist *
-bch2_journal_seq_blacklist_find(struct journal *, u64);
-int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *);
-int bch2_journal_seq_blacklist_read(struct journal *,
- struct journal_replay *);
-void bch2_journal_seq_blacklist_write(struct journal *);
+bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool);
+int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64);
+int bch2_blacklist_table_initialize(struct bch_fs *);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist;
+
+bool bch2_blacklist_entries_gc(struct bch_fs *);
#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
diff --git a/libbcachefs/journal_seq_blacklist_format.h b/libbcachefs/journal_seq_blacklist_format.h
new file mode 100644
index 00000000..2566b12d
--- /dev/null
+++ b/libbcachefs/journal_seq_blacklist_format.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H
+#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H
+
+struct journal_seq_blacklist_entry {
+ __le64 start;
+ __le64 end;
+};
+
+struct bch_sb_field_journal_seq_blacklist {
+ struct bch_sb_field field;
+ struct journal_seq_blacklist_entry start[];
+};
+
+#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H */
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index effbeece..e9bd716f 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_JOURNAL_TYPES_H
#define _BCACHEFS_JOURNAL_TYPES_H
@@ -8,23 +9,42 @@
#include "super_types.h"
#include "fifo.h"
-struct journal_res;
+/* btree write buffer steals 8 bits for its own purposes: */
+#define JOURNAL_SEQ_MAX ((1ULL << 56) - 1)
+
+#define JOURNAL_BUF_BITS 2
+#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS)
+#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1)
/*
- * We put two of these in struct journal; we used them for writes to the
- * journal that are being staged or in flight.
+ * We put JOURNAL_BUF_NR of these in struct journal; we use them for writes to
+ * the journal that are being staged or in flight.
*/
struct journal_buf {
+ struct closure io;
struct jset *data;
- BKEY_PADDED(key);
+ __BKEY_PADDED(key, BCH_REPLICAS_MAX);
+ struct bch_devs_list devs_written;
struct closure_waitlist wait;
-
- unsigned size;
- unsigned disk_sectors;
- /* bloom filter: */
- unsigned long has_inode[1024 / sizeof(unsigned long)];
+ u64 last_seq; /* copy of data->last_seq */
+ long expires;
+ u64 flush_time;
+
+ unsigned buf_size; /* size in bytes of @data */
+ unsigned sectors; /* maximum size for current entry */
+ unsigned disk_sectors; /* maximum size entry could have been, if
+ buf_size was bigger */
+ unsigned u64s_reserved;
+ bool noflush:1; /* write has already been kicked off, and was noflush */
+ bool must_flush:1; /* something wants a flush */
+ bool separate_flush:1;
+ bool need_flush_to_write_buffer:1;
+ bool write_started:1;
+ bool write_allocated:1;
+ bool write_done:1;
+ u8 idx;
};
/*
@@ -32,8 +52,15 @@ struct journal_buf {
* flushed:
*/
+enum journal_pin_type {
+ JOURNAL_PIN_btree,
+ JOURNAL_PIN_key_cache,
+ JOURNAL_PIN_other,
+ JOURNAL_PIN_NR,
+};
+
struct journal_entry_pin_list {
- struct list_head list;
+ struct list_head list[JOURNAL_PIN_NR];
struct list_head flushed;
atomic_t count;
struct bch_devs_list devs;
@@ -41,31 +68,13 @@ struct journal_entry_pin_list {
struct journal;
struct journal_entry_pin;
-typedef void (*journal_pin_flush_fn)(struct journal *j,
+typedef int (*journal_pin_flush_fn)(struct journal *j,
struct journal_entry_pin *, u64);
struct journal_entry_pin {
struct list_head list;
journal_pin_flush_fn flush;
- struct journal_entry_pin_list *pin_list;
-};
-
-/* corresponds to a btree node with a blacklisted bset: */
-struct blacklisted_node {
- __le64 seq;
- enum btree_id btree_id;
- struct bpos pos;
-};
-
-struct journal_seq_blacklist {
- struct list_head list;
- u64 start;
- u64 end;
-
- struct journal_entry_pin pin;
-
- struct blacklisted_node *entries;
- size_t nr_entries;
+ u64 seq;
};
struct journal_res {
@@ -87,10 +96,12 @@ union journal_res_state {
struct {
u64 cur_entry_offset:20,
- idx:1,
- prev_buf_unwritten:1,
- buf0_count:21,
- buf1_count:21;
+ idx:2,
+ unwritten_idx:2,
+ buf0_count:10,
+ buf1_count:10,
+ buf2_count:10,
+ buf3_count:10;
};
};
@@ -104,54 +115,131 @@ union journal_res_state {
*/
#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1)
+#define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2)
#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1)
#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX)
-/*
- * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP,
- * either because something's waiting on the write to complete or because it's
- * been dirty too long and the timer's expired.
- */
+struct journal_space {
+	/* Units of 512 byte sectors: */
+ unsigned next_entry; /* How big the next journal entry can be */
+ unsigned total;
+};
+
+enum journal_space_from {
+ journal_space_discarded,
+ journal_space_clean_ondisk,
+ journal_space_clean,
+ journal_space_total,
+ journal_space_nr,
+};
+
+#define JOURNAL_FLAGS() \
+ x(replay_done) \
+ x(running) \
+ x(may_skip_flush) \
+ x(need_flush_write) \
+ x(space_low)
+
+enum journal_flags {
+#define x(n) JOURNAL_##n,
+ JOURNAL_FLAGS()
+#undef x
+};
-enum {
- JOURNAL_REPLAY_DONE,
- JOURNAL_STARTED,
- JOURNAL_NEED_WRITE,
- JOURNAL_NOT_EMPTY,
+/* Reasons we may fail to get a journal reservation: */
+#define JOURNAL_ERRORS() \
+ x(ok) \
+ x(retry) \
+ x(blocked) \
+ x(max_in_flight) \
+ x(journal_full) \
+ x(journal_pin_full) \
+ x(journal_stuck) \
+ x(insufficient_devices)
+
+enum journal_errors {
+#define x(n) JOURNAL_ERR_##n,
+ JOURNAL_ERRORS()
+#undef x
+};
+
+typedef DARRAY(u64) darray_u64;
+
+struct journal_bio {
+ struct bch_dev *ca;
+ unsigned buf_idx;
+
+ struct bio bio;
};
/* Embedded in struct bch_fs */
struct journal {
/* Fastpath stuff up front: */
+ struct {
+
+ union journal_res_state reservations;
+ enum bch_watermark watermark;
+
+ } __aligned(SMP_CACHE_BYTES);
unsigned long flags;
- union journal_res_state reservations;
+ /* Max size of current journal entry */
unsigned cur_entry_u64s;
- unsigned prev_buf_sectors;
- unsigned cur_buf_sectors;
+ unsigned cur_entry_sectors;
+
+ /* Reserved space in journal entry to be used just prior to write */
+ unsigned entry_u64s_reserved;
+
+
+ /*
+ * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
+ * insufficient devices:
+ */
+ enum journal_errors cur_entry_error;
+ unsigned cur_entry_offset_if_blocked;
+
unsigned buf_size_want;
+ /*
+ * We may queue up some things to be journalled (log messages) before
+ * the journal has actually started - stash them here:
+ */
+ darray_u64 early_journal_entries;
/*
+	 * Protects journal_buf->data when accessing without a journal
+ * reservation: for synchronization between the btree write buffer code
+ * and the journal write path:
+ */
+ struct mutex buf_lock;
+ /*
* Two journal entries -- one is currently open for new entries, the
* other is possibly being written out.
*/
- struct journal_buf buf[2];
+ struct journal_buf buf[JOURNAL_BUF_NR];
spinlock_t lock;
+ /* if nonzero, we may not open a new journal entry: */
+ unsigned blocked;
+
/* Used when waiting because the journal was full */
wait_queue_head_t wait;
struct closure_waitlist async_wait;
- struct closure io;
struct delayed_work write_work;
+ struct workqueue_struct *wq;
/* Sequence number of most recent journal entry (last entry in @pin) */
atomic64_t seq;
- /* last_seq from the most recent journal entry written */
+ /* seq, last_seq from the most recent journal entry successfully written */
+ u64 seq_ondisk;
+ u64 flushed_seq_ondisk;
u64 last_seq_ondisk;
+ u64 err_seq;
+ u64 last_empty_seq;
+ u64 oldest_seq_found_ondisk;
/*
* FIFO of journal entries whose btree updates have not yet been
@@ -173,37 +261,52 @@ struct journal {
u64 front, back, size, mask;
struct journal_entry_pin_list *data;
} pin;
- u64 replay_journal_seq;
- struct mutex blacklist_lock;
- struct list_head seq_blacklist;
- struct journal_seq_blacklist *new_blacklist;
+ struct journal_space space[journal_space_nr];
+
+ u64 replay_journal_seq;
+ u64 replay_journal_seq_end;
- BKEY_PADDED(key);
struct write_point wp;
spinlock_t err_lock;
- struct delayed_work reclaim_work;
+ struct mutex reclaim_lock;
+ /*
+ * Used for waiting until journal reclaim has freed up space in the
+ * journal:
+ */
+ wait_queue_head_t reclaim_wait;
+ struct task_struct *reclaim_thread;
+ bool reclaim_kicked;
+ unsigned long next_reclaim;
+ u64 nr_direct_reclaim;
+ u64 nr_background_reclaim;
+
unsigned long last_flushed;
+ struct journal_entry_pin *flush_in_progress;
+ bool flush_in_progress_dropped;
+ wait_queue_head_t pin_flush_wait;
- /* protects advancing ja->last_idx: */
- struct mutex reclaim_lock;
- unsigned write_delay_ms;
- unsigned reclaim_delay_ms;
+ /* protects advancing ja->discard_idx: */
+ struct mutex discard_lock;
+ bool can_discard;
+
+ unsigned long last_flush_write;
- u64 res_get_blocked_start;
- u64 need_write_time;
u64 write_start_time;
- struct time_stats *write_time;
- struct time_stats *delay_time;
- struct time_stats *blocked_time;
- struct time_stats *flush_seq_time;
+ u64 nr_flush_writes;
+ u64 nr_noflush_writes;
+ u64 entry_bytes_written;
+
+ struct bch2_time_stats *flush_write_time;
+ struct bch2_time_stats *noflush_write_time;
+ struct bch2_time_stats *flush_seq_time;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map res_map;
#endif
-};
+} __aligned(SMP_CACHE_BYTES);
/*
* Embedded in struct bch_dev. First three fields refer to the array of journal
@@ -218,24 +321,30 @@ struct journal_device {
unsigned sectors_free;
- /* Journal bucket we're currently writing to */
- unsigned cur_idx;
-
- /* Last journal bucket that still contains an open journal entry */
-
/*
- * j->lock and j->reclaim_lock must both be held to modify, j->lock
- * sufficient to read:
+ * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx:
*/
- unsigned last_idx;
+ unsigned discard_idx; /* Next bucket to discard */
+ unsigned dirty_idx_ondisk;
+ unsigned dirty_idx;
+ unsigned cur_idx; /* Journal bucket we're currently writing to */
unsigned nr;
+
u64 *buckets;
/* Bio for journal reads/writes to this device */
- struct bio *bio;
+ struct journal_bio *bio[JOURNAL_BUF_NR];
/* for bch_journal_read_device */
struct closure read;
+ u64 highest_seq_found;
+};
+
+/*
+ * journal_entry_res - reserve space in every journal entry:
+ */
+struct journal_entry_res {
+ unsigned u64s;
};
#endif /* _BCACHEFS_JOURNAL_TYPES_H */
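
A small illustrative helper (not part of this patch; the name is made up) that
shows how the per-buffer refcounts packed into union journal_res_state above
would be read back for a given buffer index:

	static inline unsigned journal_state_count_sketch(union journal_res_state s,
							  unsigned idx)
	{
		switch (idx & JOURNAL_BUF_MASK) {
		case 0:  return s.buf0_count;
		case 1:  return s.buf1_count;
		case 2:  return s.buf2_count;
		default: return s.buf3_count;
		}
	}

All of the fields fit in a single u64 (20 + 2 + 2 + 4 * 10 = 64 bits), so the
whole reservation state can be read and updated as one word.
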
diff --git a/libbcachefs/keylist.c b/libbcachefs/keylist.c
index bc724e77..1b828bdd 100644
--- a/libbcachefs/keylist.c
+++ b/libbcachefs/keylist.c
@@ -1,11 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bkey.h"
#include "keylist.h"
int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
size_t nr_inline_u64s, size_t new_u64s)
{
- size_t oldsize = bch_keylist_u64s(l);
+ size_t oldsize = bch2_keylist_u64s(l);
size_t newsize = oldsize + new_u64s;
u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p;
u64 *new_keys;
@@ -16,7 +18,7 @@ int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
(old_buf && roundup_pow_of_two(oldsize) == newsize))
return 0;
- new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO);
+ new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOFS);
if (!new_keys)
return -ENOMEM;
@@ -29,38 +31,20 @@ int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
return 0;
}
-void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert)
-{
- struct bkey_i *where;
-
- for_each_keylist_key(l, where)
- if (bkey_cmp(insert->k.p, where->k.p) < 0)
- break;
-
- memmove_u64s_up((u64 *) where + insert->k.u64s,
- where,
- ((u64 *) l->top) - ((u64 *) where));
-
- l->top_p += insert->k.u64s;
- bkey_copy(where, insert);
-}
-
void bch2_keylist_pop_front(struct keylist *l)
{
l->top_p -= bch2_keylist_front(l)->k.u64s;
memmove_u64s_down(l->keys,
bkey_next(l->keys),
- bch_keylist_u64s(l));
+ bch2_keylist_u64s(l));
}
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_verify_keylist_sorted(struct keylist *l)
{
- struct bkey_i *k;
-
for_each_keylist_key(l, k)
BUG_ON(bkey_next(k) != l->top &&
- bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0);
+ bpos_ge(k->k.p, bkey_next(k)->k.p));
}
#endif
diff --git a/libbcachefs/keylist.h b/libbcachefs/keylist.h
index 3106759e..e687e0e9 100644
--- a/libbcachefs/keylist.h
+++ b/libbcachefs/keylist.h
@@ -1,10 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_KEYLIST_H
#define _BCACHEFS_KEYLIST_H
#include "keylist_types.h"
int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
-void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *);
void bch2_keylist_pop_front(struct keylist *);
static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys)
@@ -16,7 +16,6 @@ static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys)
{
if (l->keys_p != inline_keys)
kfree(l->keys_p);
- bch2_keylist_init(l, inline_keys);
}
static inline void bch2_keylist_push(struct keylist *l)
@@ -35,14 +34,14 @@ static inline bool bch2_keylist_empty(struct keylist *l)
return l->top == l->keys;
}
-static inline size_t bch_keylist_u64s(struct keylist *l)
+static inline size_t bch2_keylist_u64s(struct keylist *l)
{
return l->top_p - l->keys_p;
}
static inline size_t bch2_keylist_bytes(struct keylist *l)
{
- return bch_keylist_u64s(l) * sizeof(u64);
+ return bch2_keylist_u64s(l) * sizeof(u64);
}
static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
@@ -51,18 +50,16 @@ static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
}
#define for_each_keylist_key(_keylist, _k) \
- for (_k = (_keylist)->keys; \
+ for (struct bkey_i *_k = (_keylist)->keys; \
_k != (_keylist)->top; \
_k = bkey_next(_k))
static inline u64 keylist_sectors(struct keylist *keys)
{
- struct bkey_i *k;
u64 ret = 0;
for_each_keylist_key(keys, k)
ret += k->k.size;
-
return ret;
}
diff --git a/libbcachefs/keylist_types.h b/libbcachefs/keylist_types.h
index 48a17d7a..4b3ff7d8 100644
--- a/libbcachefs/keylist_types.h
+++ b/libbcachefs/keylist_types.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_KEYLIST_TYPES_H
#define _BCACHEFS_KEYLIST_TYPES_H
diff --git a/libbcachefs/logged_ops.c b/libbcachefs/logged_ops.c
new file mode 100644
index 00000000..60e00702
--- /dev/null
+++ b/libbcachefs/logged_ops.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "error.h"
+#include "io_misc.h"
+#include "logged_ops.h"
+#include "super.h"
+
+struct bch_logged_op_fn {
+ u8 type;
+ int (*resume)(struct btree_trans *, struct bkey_i *);
+};
+
+static const struct bch_logged_op_fn logged_op_fns[] = {
+#define x(n) { \
+ .type = KEY_TYPE_logged_op_##n, \
+ .resume = bch2_resume_logged_op_##n, \
+},
+ BCH_LOGGED_OPS()
+#undef x
+};
+
+static const struct bch_logged_op_fn *logged_op_fn(enum bch_bkey_type type)
+{
+ for (unsigned i = 0; i < ARRAY_SIZE(logged_op_fns); i++)
+ if (logged_op_fns[i].type == type)
+ return logged_op_fns + i;
+ return NULL;
+}
+
+static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ u32 restart_count = trans->restart_count;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ fsck_err_on(test_bit(BCH_FS_clean_recovery, &c->flags),
+ trans, logged_op_but_clean,
+ "filesystem marked as clean but have logged op\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf));
+
+ struct bkey_buf sk;
+ bch2_bkey_buf_init(&sk);
+ bch2_bkey_buf_reassemble(&sk, c, k);
+
+ const struct bch_logged_op_fn *fn = logged_op_fn(sk.k->k.type);
+ if (fn)
+ fn->resume(trans, sk.k);
+
+ ret = bch2_logged_op_finish(trans, sk.k);
+
+ bch2_bkey_buf_exit(&sk, c);
+fsck_err:
+ printbuf_exit(&buf);
+ return ret ?: trans_was_restarted(trans, restart_count);
+}
+
+int bch2_resume_logged_ops(struct bch_fs *c)
+{
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter,
+ BTREE_ID_logged_ops, POS_MIN,
+ BTREE_ITER_prefetch, k,
+ resume_logged_op(trans, &iter, k)));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
+{
+ struct btree_iter iter;
+ int ret;
+
+ ret = bch2_bkey_get_empty_slot(trans, &iter, BTREE_ID_logged_ops, POS_MAX);
+ if (ret)
+ return ret;
+
+ k->k.p = iter.pos;
+
+ ret = bch2_trans_update(trans, &iter, k, 0);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
+{
+ return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ __bch2_logged_op_start(trans, k));
+}
+
+int bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
+{
+ int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0));
+ /*
+ * This needs to be a fatal error because we've left an unfinished
+ * operation in the logged ops btree.
+ *
+ * We should only ever see an error here if the filesystem has already
+ * been shut down, but make sure of that here:
+ */
+ if (ret) {
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+ bch2_fs_fatal_error(c, "deleting logged operation %s: %s",
+ buf.buf, bch2_err_str(ret));
+ printbuf_exit(&buf);
+ }
+
+ return ret;
+}
diff --git a/libbcachefs/logged_ops.h b/libbcachefs/logged_ops.h
new file mode 100644
index 00000000..30ae9ef7
--- /dev/null
+++ b/libbcachefs/logged_ops.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LOGGED_OPS_H
+#define _BCACHEFS_LOGGED_OPS_H
+
+#include "bkey.h"
+
+#define BCH_LOGGED_OPS() \
+ x(truncate) \
+ x(finsert)
+
+static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op)
+{
+ return bch2_btree_insert_nonextent(trans, BTREE_ID_logged_ops, op, 0);
+}
+
+int bch2_resume_logged_ops(struct bch_fs *);
+int bch2_logged_op_start(struct btree_trans *, struct bkey_i *);
+int bch2_logged_op_finish(struct btree_trans *, struct bkey_i *);
+
+#endif /* _BCACHEFS_LOGGED_OPS_H */
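
For reference, with the two entries above the BCH_LOGGED_OPS() x-macro makes
the dispatch table in logged_ops.c expand to roughly:

	static const struct bch_logged_op_fn logged_op_fns[] = {
		{ .type = KEY_TYPE_logged_op_truncate, .resume = bch2_resume_logged_op_truncate, },
		{ .type = KEY_TYPE_logged_op_finsert,  .resume = bch2_resume_logged_op_finsert,  },
	};

so adding a new logged operation only needs a new x() line here plus the
matching key type and resume function.
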
diff --git a/libbcachefs/logged_ops_format.h b/libbcachefs/logged_ops_format.h
new file mode 100644
index 00000000..6a4bf712
--- /dev/null
+++ b/libbcachefs/logged_ops_format.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H
+#define _BCACHEFS_LOGGED_OPS_FORMAT_H
+
+struct bch_logged_op_truncate {
+ struct bch_val v;
+ __le32 subvol;
+ __le32 pad;
+ __le64 inum;
+ __le64 new_i_size;
+};
+
+enum logged_op_finsert_state {
+ LOGGED_OP_FINSERT_start,
+ LOGGED_OP_FINSERT_shift_extents,
+ LOGGED_OP_FINSERT_finish,
+};
+
+struct bch_logged_op_finsert {
+ struct bch_val v;
+ __u8 state;
+ __u8 pad[3];
+ __le32 subvol;
+ __le64 inum;
+ __le64 dst_offset;
+ __le64 src_offset;
+ __le64 pos;
+};
+
+#endif /* _BCACHEFS_LOGGED_OPS_FORMAT_H */
diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c
new file mode 100644
index 00000000..ce794d55
--- /dev/null
+++ b/libbcachefs/lru.c
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "bkey_buf.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
+#include "error.h"
+#include "lru.h"
+#include "recovery.h"
+
+/* KEY_TYPE_lru is obsolete: */
+int bch2_lru_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(!lru_pos_time(k.k->p),
+ c, lru_entry_at_time_0,
+ "lru entry at time=0");
+fsck_err:
+ return ret;
+}
+
+void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ const struct bch_lru *lru = bkey_s_c_to_lru(k).v;
+
+ prt_printf(out, "idx %llu", le64_to_cpu(lru->idx));
+}
+
+void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru)
+{
+ prt_printf(out, "%llu:%llu -> %llu:%llu",
+ lru_pos_id(lru),
+ lru_pos_time(lru),
+ u64_to_bucket(lru.offset).inode,
+ u64_to_bucket(lru.offset).offset);
+}
+
+static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
+ u64 dev_bucket, u64 time, bool set)
+{
+ return time
+ ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru,
+ lru_pos(lru_id, dev_bucket, time), set)
+ : 0;
+}
+
+int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
+{
+ return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted);
+}
+
+int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
+{
+ return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set);
+}
+
+int bch2_lru_change(struct btree_trans *trans,
+ u16 lru_id, u64 dev_bucket,
+ u64 old_time, u64 new_time)
+{
+ if (old_time == new_time)
+ return 0;
+
+ return bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?:
+ bch2_lru_set(trans, lru_id, dev_bucket, new_time);
+}
+
+static const char * const bch2_lru_types[] = {
+#define x(n) #n,
+ BCH_LRU_TYPES()
+#undef x
+ NULL
+};
+
+int bch2_lru_check_set(struct btree_trans *trans,
+ u16 lru_id, u64 time,
+ struct bkey_s_c referring_k,
+ struct bkey_buf *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ struct btree_iter lru_iter;
+ struct bkey_s_c lru_k =
+ bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
+ lru_pos(lru_id,
+ bucket_to_u64(referring_k.k->p),
+ time), 0);
+ int ret = bkey_err(lru_k);
+ if (ret)
+ return ret;
+
+ if (lru_k.k->type != KEY_TYPE_set) {
+ ret = bch2_btree_write_buffer_maybe_flush(trans, referring_k, last_flushed);
+ if (ret)
+ goto err;
+
+ if (fsck_err(trans, alloc_key_to_missing_lru_entry,
+ "missing %s lru entry\n"
+ " %s",
+ bch2_lru_types[lru_type(lru_k)],
+ (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) {
+ ret = bch2_lru_set(trans, lru_id, bucket_to_u64(referring_k.k->p), time);
+ if (ret)
+ goto err;
+ }
+ }
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &lru_iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int bch2_check_lru_key(struct btree_trans *trans,
+ struct btree_iter *lru_iter,
+ struct bkey_s_c lru_k,
+ struct bkey_buf *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+ enum bch_lru_type type = lru_type(lru_k);
+ struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset);
+ u64 idx;
+ int ret;
+
+ struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_pos);
+
+ if (fsck_err_on(!ca,
+ trans, lru_entry_to_invalid_bucket,
+ "lru key points to nonexistent device:bucket %llu:%llu",
+ alloc_pos.inode, alloc_pos.offset))
+ return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ a = bch2_alloc_to_v4(k, &a_convert);
+
+ switch (type) {
+ case BCH_LRU_read:
+ idx = alloc_lru_idx_read(*a);
+ break;
+ case BCH_LRU_fragmentation:
+ idx = alloc_lru_idx_fragmentation(*a, ca);
+ break;
+ }
+
+ if (lru_k.k->type != KEY_TYPE_set ||
+ lru_pos_time(lru_k.k->p) != idx) {
+ ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed);
+ if (ret)
+ goto err;
+
+ if (fsck_err(trans, lru_entry_bad,
+ "incorrect lru entry: lru %s time %llu\n"
+ " %s\n"
+ " for %s",
+ bch2_lru_types[type],
+ lru_pos_time(lru_k.k->p),
+ (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
+ (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf)))
+ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
+ }
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_dev_put(ca);
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ return ret;
+}
+
+int bch2_check_lrus(struct bch_fs *c)
+{
+ struct bkey_buf last_flushed;
+
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_check_lru_key(trans, &iter, k, &last_flushed)));
+
+ bch2_bkey_buf_exit(&last_flushed, c);
+ bch_err_fn(c, ret);
+	return ret;
+}
diff --git a/libbcachefs/lru.h b/libbcachefs/lru.h
new file mode 100644
index 00000000..f31a6cf1
--- /dev/null
+++ b/libbcachefs/lru.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LRU_H
+#define _BCACHEFS_LRU_H
+
+static inline u64 lru_pos_id(struct bpos pos)
+{
+ return pos.inode >> LRU_TIME_BITS;
+}
+
+static inline u64 lru_pos_time(struct bpos pos)
+{
+ return pos.inode & ~(~0ULL << LRU_TIME_BITS);
+}
+
+static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time)
+{
+ struct bpos pos = POS(((u64) lru_id << LRU_TIME_BITS)|time, dev_bucket);
+
+ EBUG_ON(time > LRU_TIME_MAX);
+ EBUG_ON(lru_pos_id(pos) != lru_id);
+ EBUG_ON(lru_pos_time(pos) != time);
+ EBUG_ON(pos.offset != dev_bucket);
+
+ return pos;
+}
+
+static inline enum bch_lru_type lru_type(struct bkey_s_c l)
+{
+ u16 lru_id = l.k->p.inode >> 48;
+
+ if (lru_id == BCH_LRU_FRAGMENTATION_START)
+ return BCH_LRU_fragmentation;
+ return BCH_LRU_read;
+}
+
+int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context);
+void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+void bch2_lru_pos_to_text(struct printbuf *, struct bpos);
+
+#define bch2_bkey_ops_lru ((struct bkey_ops) { \
+ .key_validate = bch2_lru_validate, \
+ .val_to_text = bch2_lru_to_text, \
+ .min_val_size = 8, \
+})
+
+int bch2_lru_del(struct btree_trans *, u16, u64, u64);
+int bch2_lru_set(struct btree_trans *, u16, u64, u64);
+int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
+
+struct bkey_buf;
+int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *);
+
+int bch2_check_lrus(struct bch_fs *);
+
+#endif /* _BCACHEFS_LRU_H */
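
A worked example of the key packing above (values are made up): with
LRU_TIME_BITS == 48,

	struct bpos pos = lru_pos(1, bucket_to_u64(POS(2, 5)), 1000);

	/* pos.inode  == (1ULL << 48) | 1000: lru_pos_id(pos) == 1, lru_pos_time(pos) == 1000 */
	/* pos.offset == bucket_to_u64(POS(2, 5)): u64_to_bucket() recovers device 2, bucket 5 */
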
diff --git a/libbcachefs/lru_format.h b/libbcachefs/lru_format.h
new file mode 100644
index 00000000..f372cb3b
--- /dev/null
+++ b/libbcachefs/lru_format.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LRU_FORMAT_H
+#define _BCACHEFS_LRU_FORMAT_H
+
+struct bch_lru {
+ struct bch_val v;
+ __le64 idx;
+} __packed __aligned(8);
+
+#define BCH_LRU_TYPES() \
+ x(read) \
+ x(fragmentation)
+
+enum bch_lru_type {
+#define x(n) BCH_LRU_##n,
+ BCH_LRU_TYPES()
+#undef x
+};
+
+#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1)
+
+#define LRU_TIME_BITS 48
+#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
+
+#endif /* _BCACHEFS_LRU_FORMAT_H */
diff --git a/libbcachefs/lz4.h b/libbcachefs/lz4.h
deleted file mode 100644
index 22e7859c..00000000
--- a/libbcachefs/lz4.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef __BCH_LZ4_H__
-#define __BCH_LZ4_H__
-
-int bch2_lz4_decompress(const unsigned char *src, size_t *src_len,
- unsigned char *dest, size_t actual_dest_len);
-
-#endif
diff --git a/libbcachefs/lz4_decompress.c b/libbcachefs/lz4_decompress.c
deleted file mode 100644
index 9e809f97..00000000
--- a/libbcachefs/lz4_decompress.c
+++ /dev/null
@@ -1,277 +0,0 @@
-/*
- * LZ4 Decompressor for Linux kernel
- *
- * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com>
- *
- * Based on LZ4 implementation by Yann Collet.
- *
- * LZ4 - Fast LZ compression algorithm
- * Copyright (C) 2011-2012, Yann Collet.
- * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following disclaimer
- * in the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * You can contact the author at :
- * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
- * - LZ4 source repository : http://code.google.com/p/lz4/
- */
-
-#ifndef STATIC
-#include <linux/module.h>
-#include <linux/kernel.h>
-#endif
-
-#include "lz4.h"
-
-/*
- * Detects 64 bits mode
- */
-#if defined(CONFIG_64BIT)
-#define LZ4_ARCH64 1
-#else
-#define LZ4_ARCH64 0
-#endif
-
-#include <asm/unaligned.h>
-#include <linux/log2.h>
-#include <linux/string.h>
-
-#define A32(_p) get_unaligned((u32 *) (_p))
-#define A16(_p) get_unaligned((u16 *) (_p))
-
-#define GET_LE16_ADVANCE(_src) \
-({ \
- u16 _r = get_unaligned_le16(_src); \
- (_src) += 2; \
- _r; \
-})
-
-#define PUT_LE16_ADVANCE(_dst, _v) \
-do { \
- put_unaligned_le16((_v), (_dst)); \
- (_dst) += 2; \
-} while (0)
-
-#define LENGTH_LONG 15
-#define COPYLENGTH 8
-#define ML_BITS 4
-#define ML_MASK ((1U << ML_BITS) - 1)
-#define RUN_BITS (8 - ML_BITS)
-#define RUN_MASK ((1U << RUN_BITS) - 1)
-#define MEMORY_USAGE 14
-#define MINMATCH 4
-#define SKIPSTRENGTH 6
-#define LASTLITERALS 5
-#define MFLIMIT (COPYLENGTH + MINMATCH)
-#define MINLENGTH (MFLIMIT + 1)
-#define MAXD_LOG 16
-#define MAXD (1 << MAXD_LOG)
-#define MAXD_MASK (u32)(MAXD - 1)
-#define MAX_DISTANCE (MAXD - 1)
-#define HASH_LOG (MAXD_LOG - 1)
-#define HASHTABLESIZE (1 << HASH_LOG)
-#define MAX_NB_ATTEMPTS 256
-#define OPTIMAL_ML (int)((ML_MASK-1)+MINMATCH)
-#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT - 1))
-
-#define __HASH_VALUE(p, bits) \
- (((A32(p)) * 2654435761U) >> (32 - (bits)))
-
-#define HASH_VALUE(p) __HASH_VALUE(p, HASH_LOG)
-
-#define MEMCPY_ADVANCE(_dst, _src, length) \
-do { \
- typeof(length) _length = (length); \
- memcpy(_dst, _src, _length); \
- _src += _length; \
- _dst += _length; \
-} while (0)
-
-#define MEMCPY_ADVANCE_BYTES(_dst, _src, _length) \
-do { \
- const u8 *_end = (_src) + (_length); \
- while ((_src) < _end) \
- *_dst++ = *_src++; \
-} while (0)
-
-#define STEPSIZE __SIZEOF_LONG__
-
-#define LZ4_COPYPACKET(_src, _dst) \
-do { \
- MEMCPY_ADVANCE(_dst, _src, STEPSIZE); \
- MEMCPY_ADVANCE(_dst, _src, COPYLENGTH - STEPSIZE);\
-} while (0)
-
-/*
- * Equivalent to MEMCPY_ADVANCE - except may overrun @_dst and @_src by
- * COPYLENGTH:
- *
- * Note: src and dst may overlap (with src < dst) - we must do the copy in
- * STEPSIZE chunks for correctness
- *
- * Note also: length may be negative - we must not call memcpy if length is
- * negative, but still adjust dst and src by length
- */
-#define MEMCPY_ADVANCE_CHUNKED(_dst, _src, _length) \
-do { \
- u8 *_end = (_dst) + (_length); \
- while ((_dst) < _end) \
- LZ4_COPYPACKET(_src, _dst); \
- _src -= (_dst) - _end; \
- _dst = _end; \
-} while (0)
-
-#define MEMCPY_ADVANCE_CHUNKED_NOFIXUP(_dst, _src, _end)\
-do { \
- while ((_dst) < (_end)) \
- LZ4_COPYPACKET((_src), (_dst)); \
-} while (0)
-
-static const int dec32table[8] = {0, 3, 2, 3, 0, 0, 0, 0};
-#if LZ4_ARCH64
-static const int dec64table[8] = {0, 0, 0, -1, 0, 1, 2, 3};
-#else
-static const int dec64table[8] = {0, 0, 0, 0, 0, 0, 0, 0};
-#endif
-
-static inline size_t get_length(const u8 **ip, size_t length)
-{
- if (length == LENGTH_LONG) {
- size_t len;
-
- do {
- length += (len = *(*ip)++);
- } while (len == 255);
- }
-
- return length;
-}
-
-static int lz4_uncompress(const u8 *source, u8 *dest, int osize)
-{
- const u8 *ip = source;
- const u8 *ref;
- u8 *op = dest;
- u8 * const oend = op + osize;
- u8 *cpy;
- unsigned token, offset;
- ssize_t length;
-
- while (1) {
- /* get runlength */
- token = *ip++;
- length = get_length(&ip, token >> ML_BITS);
-
- /* copy literals */
- if (unlikely(op + length > oend - COPYLENGTH)) {
- /*
- * Error: not enough place for another match
- * (min 4) + 5 literals
- */
- if (op + length != oend)
- goto _output_error;
-
- MEMCPY_ADVANCE(op, ip, length);
- break; /* EOF */
- }
- MEMCPY_ADVANCE_CHUNKED(op, ip, length);
-
- /* get match offset */
- offset = GET_LE16_ADVANCE(ip);
- ref = op - offset;
-
- /* Error: offset create reference outside destination buffer */
- if (unlikely(ref < (u8 *const) dest))
- goto _output_error;
-
- /* get match length */
- length = get_length(&ip, token & ML_MASK);
- length += MINMATCH;
-
- /* copy first STEPSIZE bytes of match: */
- if (unlikely(offset < STEPSIZE)) {
- MEMCPY_ADVANCE_BYTES(op, ref, 4);
- ref -= dec32table[offset];
-
- memcpy(op, ref, 4);
- op += STEPSIZE - 4;
- ref -= dec64table[offset];
- } else {
- MEMCPY_ADVANCE(op, ref, STEPSIZE);
- }
- length -= STEPSIZE;
- /*
- * Note - length could have been < STEPSIZE; that's ok, length
- * will now be negative and we'll just end up rewinding op:
- */
-
- /* copy rest of match: */
- cpy = op + length;
- if (cpy > oend - COPYLENGTH) {
- /* Error: request to write beyond destination buffer */
- if (cpy > oend ||
- ref + COPYLENGTH > oend)
- goto _output_error;
-#if !LZ4_ARCH64
- if (op + COPYLENGTH > oend)
- goto _output_error;
-#endif
- MEMCPY_ADVANCE_CHUNKED_NOFIXUP(op, ref, oend - COPYLENGTH);
- /* op could be > cpy here */
- while (op < cpy)
- *op++ = *ref++;
- op = cpy;
- /*
- * Check EOF (should never happen, since last 5 bytes
- * are supposed to be literals)
- */
- if (op == oend)
- goto _output_error;
- } else {
- MEMCPY_ADVANCE_CHUNKED(op, ref, length);
- }
- }
- /* end of decoding */
- return ip - source;
-
- /* write overflow error detected */
-_output_error:
- return -1;
-}
-
-int bch2_lz4_decompress(const unsigned char *src, size_t *src_len,
- unsigned char *dest, size_t actual_dest_len)
-{
- int ret = -1;
- int input_len = 0;
-
- input_len = lz4_uncompress(src, dest, actual_dest_len);
- if (input_len < 0)
- goto exit_0;
- *src_len = input_len;
-
- return 0;
-exit_0:
- return ret;
-}
diff --git a/libbcachefs/mean_and_variance.c b/libbcachefs/mean_and_variance.c
new file mode 100644
index 00000000..0ea9f308
--- /dev/null
+++ b/libbcachefs/mean_and_variance.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Functions for incremental mean and variance.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * Copyright © 2022 Daniel B. Hill
+ *
+ * Author: Daniel B. Hill <daniel@gluo.nz>
+ *
+ * Description:
+ *
+ * This includes some incremental algorithms for mean and variance calculation.
+ *
+ * Derived from the paper: https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf
+ *
+ * Create a struct; for the weighted variant, the ewma weight (weight = 2^k) is passed to each call.
+ *
+ * Use mean_and_variance[_weighted]_update() on the struct to update its state.
+ *
+ * Use the mean_and_variance[_weighted]_get_* functions to calculate the mean and
+ * variance; some computation is deferred to these functions for performance reasons.
+ *
+ * see lib/math/mean_and_variance_test.c for examples of usage.
+ *
+ * DO NOT access the mean and variance fields of the weighted variants directly.
+ * DO NOT change the weight after calling update.
+ */
+
+#include <linux/bug.h>
+#include <linux/compiler.h>
+#include <linux/export.h>
+#include <linux/limits.h>
+#include <linux/math.h>
+#include <linux/math64.h>
+#include <linux/module.h>
+
+#include "mean_and_variance.h"
+
+u128_u u128_div(u128_u n, u64 d)
+{
+ u128_u r;
+ u64 rem;
+ u64 hi = u128_hi(n);
+ u64 lo = u128_lo(n);
+ u64 h = hi & ((u64) U32_MAX << 32);
+ u64 l = (hi & (u64) U32_MAX) << 32;
+
+ r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64);
+ r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32));
+ r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem)));
+ return r;
+}
+EXPORT_SYMBOL_GPL(u128_div);
+
+/**
+ * mean_and_variance_get_mean() - get mean from @s
+ * @s: mean and variance number of samples and their sums
+ */
+s64 mean_and_variance_get_mean(struct mean_and_variance s)
+{
+ return s.n ? div64_u64(s.sum, s.n) : 0;
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_mean);
+
+/**
+ * mean_and_variance_get_variance() - get variance from @s1
+ * @s1: mean and variance number of samples and sums
+ *
+ * see linked pdf equation 12.
+ */
+u64 mean_and_variance_get_variance(struct mean_and_variance s1)
+{
+ if (s1.n) {
+ u128_u s2 = u128_div(s1.sum_squares, s1.n);
+ u64 s3 = abs(mean_and_variance_get_mean(s1));
+
+ return u128_lo(u128_sub(s2, u128_square(s3)));
+ } else {
+ return 0;
+ }
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_variance);
+
+/**
+ * mean_and_variance_get_stddev() - get standard deviation from @s
+ * @s: mean and variance number of samples and their sums
+ */
+u32 mean_and_variance_get_stddev(struct mean_and_variance s)
+{
+ return int_sqrt64(mean_and_variance_get_variance(s));
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
+
+/**
+ * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update()
+ * @s: mean and variance number of samples and their sums
+ * @x: new value to include in the &mean_and_variance_weighted
+ * @initted: caller must track whether this is the first use or not
+ * @weight: ewma weight
+ *
+ * see linked pdf: function derived from equations 140-143 where alpha = 2^w.
+ * values are stored bitshifted for performance and added precision.
+ */
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
+ s64 x, bool initted, u8 weight)
+{
+ // previous weighted variance.
+ u8 w = weight;
+ u64 var_w0 = s->variance;
+ // new value weighted.
+ s64 x_w = x << w;
+ s64 diff_w = x_w - s->mean;
+ s64 diff = fast_divpow2(diff_w, w);
+ // new mean weighted.
+ s64 u_w1 = s->mean + diff;
+
+ if (!initted) {
+ s->mean = x_w;
+ s->variance = 0;
+ } else {
+ s->mean = u_w1;
+ s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
+ }
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
+
+/**
+ * mean_and_variance_weighted_get_mean() - get mean from @s
+ * @s: mean and variance number of samples and their sums
+ * @weight: ewma weight
+ */
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
+ u8 weight)
+{
+ return fast_divpow2(s.mean, weight);
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
+
+/**
+ * mean_and_variance_weighted_get_variance() - get variance from @s
+ * @s: mean and variance number of samples and their sums
+ * @weight: ewma weight
+ */
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
+ u8 weight)
+{
+	// always positive, no need for fast_divpow2()
+ return s.variance >> weight;
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
+
+/**
+ * mean_and_variance_weighted_get_stddev() - get standard deviation from @s
+ * @s: mean and variance number of samples and their sums
+ * @weight: ewma weight
+ */
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
+ u8 weight)
+{
+ return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight));
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev);
+
+MODULE_AUTHOR("Daniel B. Hill");
+MODULE_LICENSE("GPL");
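
A minimal usage sketch of the unweighted API described in the header comment
above (sample values are made up; mean_and_variance_update() itself is an
inline declared in mean_and_variance.h, which follows):

	struct mean_and_variance stats = {};

	mean_and_variance_update(&stats, 10);
	mean_and_variance_update(&stats, 20);
	mean_and_variance_update(&stats, 30);

	s64 mean   = mean_and_variance_get_mean(stats);     /* 60 / 3 == 20 */
	u64 var    = mean_and_variance_get_variance(stats); /* 1400 / 3 - 20 * 20 == 66 */
	u32 stddev = mean_and_variance_get_stddev(stats);   /* int_sqrt64(66) == 8 */
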
diff --git a/libbcachefs/mean_and_variance.h b/libbcachefs/mean_and_variance.h
new file mode 100644
index 00000000..47e4a3c3
--- /dev/null
+++ b/libbcachefs/mean_and_variance.h
@@ -0,0 +1,203 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef MEAN_AND_VARIANCE_H_
+#define MEAN_AND_VARIANCE_H_
+
+#include <linux/types.h>
+#include <linux/limits.h>
+#include <linux/math.h>
+#include <linux/math64.h>
+
+#define SQRT_U64_MAX 4294967295ULL
+
+/*
+ * u128_u: u128 user mode, because not all architectures support a real int128
+ * type
+ *
+ * We don't use this version in userspace, because in userspace we link with
+ * Rust and rustc has issues with u128.
+ */
+
+#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) && !defined(CONFIG_PARISC)
+
+typedef struct {
+ unsigned __int128 v;
+} __aligned(16) u128_u;
+
+static inline u128_u u64_to_u128(u64 a)
+{
+ return (u128_u) { .v = a };
+}
+
+static inline u64 u128_lo(u128_u a)
+{
+ return a.v;
+}
+
+static inline u64 u128_hi(u128_u a)
+{
+ return a.v >> 64;
+}
+
+static inline u128_u u128_add(u128_u a, u128_u b)
+{
+ a.v += b.v;
+ return a;
+}
+
+static inline u128_u u128_sub(u128_u a, u128_u b)
+{
+ a.v -= b.v;
+ return a;
+}
+
+static inline u128_u u128_shl(u128_u a, s8 shift)
+{
+ a.v <<= shift;
+ return a;
+}
+
+static inline u128_u u128_square(u64 a)
+{
+ u128_u b = u64_to_u128(a);
+
+ b.v *= b.v;
+ return b;
+}
+
+#else
+
+typedef struct {
+ u64 hi, lo;
+} __aligned(16) u128_u;
+
+/* conversions */
+
+static inline u128_u u64_to_u128(u64 a)
+{
+ return (u128_u) { .lo = a };
+}
+
+static inline u64 u128_lo(u128_u a)
+{
+ return a.lo;
+}
+
+static inline u64 u128_hi(u128_u a)
+{
+ return a.hi;
+}
+
+/* arithmetic */
+
+static inline u128_u u128_add(u128_u a, u128_u b)
+{
+ u128_u c;
+
+ c.lo = a.lo + b.lo;
+ c.hi = a.hi + b.hi + (c.lo < a.lo);
+ return c;
+}
+
+static inline u128_u u128_sub(u128_u a, u128_u b)
+{
+ u128_u c;
+
+ c.lo = a.lo - b.lo;
+ c.hi = a.hi - b.hi - (c.lo > a.lo);
+ return c;
+}
+
+static inline u128_u u128_shl(u128_u i, s8 shift)
+{
+ u128_u r;
+
+ r.lo = i.lo << (shift & 63);
+ if (shift < 64)
+ r.hi = (i.hi << (shift & 63)) | (i.lo >> (-shift & 63));
+ else {
+ r.hi = i.lo << (-shift & 63);
+ r.lo = 0;
+ }
+ return r;
+}
+
+static inline u128_u u128_square(u64 i)
+{
+ u128_u r;
+ u64 h = i >> 32, l = i & U32_MAX;
+
+ r = u128_shl(u64_to_u128(h*h), 64);
+ r = u128_add(r, u128_shl(u64_to_u128(h*l), 32));
+ r = u128_add(r, u128_shl(u64_to_u128(l*h), 32));
+ r = u128_add(r, u64_to_u128(l*l));
+ return r;
+}
+
+#endif
+
+static inline u128_u u64s_to_u128(u64 hi, u64 lo)
+{
+ u128_u c = u64_to_u128(hi);
+
+ c = u128_shl(c, 64);
+ c = u128_add(c, u64_to_u128(lo));
+ return c;
+}
+
+u128_u u128_div(u128_u n, u64 d);
+
+struct mean_and_variance {
+ s64 n;
+ s64 sum;
+ u128_u sum_squares;
+};
+
+/* exponentially weighted variant */
+struct mean_and_variance_weighted {
+ s64 mean;
+ u64 variance;
+};
+
+/**
+ * fast_divpow2() - fast approximation for n / (1 << d)
+ * @n: numerator
+ * @d: the power of 2 denominator.
+ *
+ * note: this rounds towards 0.
+ */
+static inline s64 fast_divpow2(s64 n, u8 d)
+{
+ return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
+}
+
+/**
+ * mean_and_variance_update() - update a mean_and_variance struct @s with a new sample @v
+ * @s: the mean_and_variance to update.
+ * @v: the new sample.
+ *
+ * See linked pdf equation 12.
+ */
+static inline void
+mean_and_variance_update(struct mean_and_variance *s, s64 v)
+{
+ s->n++;
+ s->sum += v;
+ s->sum_squares = u128_add(s->sum_squares, u128_square(abs(v)));
+}
+
+s64 mean_and_variance_get_mean(struct mean_and_variance s);
+u64 mean_and_variance_get_variance(struct mean_and_variance s1);
+u32 mean_and_variance_get_stddev(struct mean_and_variance s);
+
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
+ s64 v, bool initted, u8 weight);
+
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
+ u8 weight);
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
+ u8 weight);
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
+ u8 weight);
+
+#endif // MEAN_AND_VARIANCE_H_
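For reference, struct mean_and_variance above only accumulates n, sum and sum_squares, so the getters reduce to the usual identities mean = sum / n and variance = sum_squares / n - mean^2, while the weighted variant keeps a running EWMA and uses shift-by-weight in place of a division. A stand-alone sketch of that update/read-out flow (not part of the patch: it uses a plain __int128 accumulator and invented names struct mv / mv_update instead of the u128_u helpers):

#include <stdio.h>
#include <stdint.h>

struct mv { int64_t n, sum; unsigned __int128 sum_squares; };

static void mv_update(struct mv *s, int64_t v)
{
	uint64_t a = v < 0 ? (uint64_t)-v : (uint64_t)v;

	s->n++;
	s->sum += v;
	s->sum_squares += (unsigned __int128)a * a;	/* like u128_square(abs(v)) */
}

int main(void)
{
	struct mv s = { 0 };
	int64_t samples[] = { 10, 12, 9, 14, 11 };

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		mv_update(&s, samples[i]);

	int64_t mean = s.sum / s.n;			/* E[X] */
	uint64_t variance = (uint64_t)(s.sum_squares / (unsigned __int128)s.n)
			  - (uint64_t)(mean * mean);	/* E[X^2] - E[X]^2, integer-truncated */

	printf("n=%lld mean=%lld variance=%llu\n",
	       (long long)s.n, (long long)mean, (unsigned long long)variance);
	return 0;
}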
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 215c5aa5..ddc187fb 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -1,12 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Code for moving data off a device.
*/
#include "bcachefs.h"
+#include "bkey_buf.h"
#include "btree_update.h"
+#include "btree_update_interior.h"
#include "buckets.h"
+#include "errcode.h"
#include "extents.h"
-#include "io.h"
+#include "io_write.h"
#include "journal.h"
#include "keylist.h"
#include "migrate.h"
@@ -14,7 +18,7 @@
#include "replicas.h"
#include "super-io.h"
-static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
+static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
unsigned dev_idx, int flags, bool metadata)
{
unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
@@ -22,152 +26,145 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
unsigned nr_good;
- bch2_extent_drop_device(e, dev_idx);
+ bch2_bkey_drop_device(k, dev_idx);
- nr_good = bch2_extent_durability(c, e.c);
+ nr_good = bch2_bkey_durability(c, k.s_c);
if ((!nr_good && !(flags & lost)) ||
(nr_good < replicas && !(flags & degraded)))
- return -EINVAL;
+ return -BCH_ERR_remove_would_lose_data;
return 0;
}
-static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ unsigned dev_idx,
+ int flags)
{
- struct bkey_s_c k;
- struct bkey_s_extent e;
- BKEY_PADDED(key) tmp;
- struct btree_iter iter;
- int ret = 0;
+ struct bch_fs *c = trans->c;
+ struct bkey_i *n;
+ int ret;
- mutex_lock(&c->replicas_gc_lock);
- bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED));
+ if (!bch2_bkey_has_device_c(k, dev_idx))
+ return 0;
+
+ n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_internal_snapshot_node);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false);
+ if (ret)
+ return ret;
+
+ /*
+ * If the new extent no longer has any pointers, bch2_extent_normalize()
+ * will do the appropriate thing with it (turning it into a
+ * KEY_TYPE_error key, or just a discard if it was a cached extent)
+ */
+ bch2_extent_normalize(c, bkey_i_to_s(n));
+
+ /*
+ * Since we're not inserting through an extent iterator
+ * (BTREE_ITER_all_snapshots iterators aren't extent iterators),
+ * we aren't using the extent overwrite path to delete, we're
+ * just using the normal key deletion path:
+ */
+ if (bkey_deleted(&n->k))
+ n->k.size = 0;
+ return 0;
+}
- bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
- POS_MIN, BTREE_ITER_PREFETCH);
+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ enum btree_id id;
+ int ret = 0;
- while ((k = bch2_btree_iter_peek(&iter)).k &&
- !(ret = btree_iter_err(k))) {
- if (!bkey_extent_is_data(k.k) ||
- !bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) {
- ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k);
- if (ret)
- break;
- bch2_btree_iter_next(&iter);
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ if (!btree_type_has_ptrs(id))
continue;
- }
-
- bkey_reassemble(&tmp.key, k);
- e = bkey_i_to_s_extent(&tmp.key);
-
- ret = drop_dev_ptrs(c, e, dev_idx, flags, false);
- if (ret)
- break;
-
- /*
- * If the new extent no longer has any pointers, bch2_extent_normalize()
- * will do the appropriate thing with it (turning it into a
- * KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
- */
- bch2_extent_normalize(c, e.s);
- ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER,
- bkey_i_to_s_c(&tmp.key));
- if (ret)
- break;
-
- iter.pos = bkey_start_pos(&tmp.key.k);
-
- ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOFAIL,
- BTREE_INSERT_ENTRY(&iter, &tmp.key));
-
- /*
- * don't want to leave ret == -EINTR, since if we raced and
- * something else overwrote the key we could spuriously return
- * -EINTR below:
- */
- if (ret == -EINTR)
- ret = 0;
+ ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
if (ret)
break;
}
- bch2_btree_iter_unlock(&iter);
-
- bch2_replicas_gc_end(c, ret);
- mutex_unlock(&c->replicas_gc_lock);
+ bch2_trans_put(trans);
return ret;
}
static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
+ struct btree_trans *trans;
struct btree_iter iter;
struct closure cl;
struct btree *b;
+ struct bkey_buf k;
unsigned id;
int ret;
/* don't handle this yet: */
if (flags & BCH_FORCE_IF_METADATA_LOST)
- return -EINVAL;
+ return -BCH_ERR_remove_with_metadata_missing_unimplemented;
+ trans = bch2_trans_get(c);
+ bch2_bkey_buf_init(&k);
closure_init_stack(&cl);
- mutex_lock(&c->replicas_gc_lock);
- bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
-
for (id = 0; id < BTREE_ID_NR; id++) {
- for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
- __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
- struct bkey_i_extent *new_key;
+ bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
+ BTREE_ITER_prefetch);
retry:
- if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key),
- dev_idx)) {
- /*
- * we might have found a btree node key we
- * needed to update, and then tried to update it
- * but got -EINTR after upgrading the iter, but
- * then raced and the node is now gone:
- */
- bch2_btree_iter_downgrade(&iter);
-
- ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
- bkey_i_to_s_c(&b->key));
- if (ret)
- goto err;
- } else {
- bkey_copy(&tmp.k, &b->key);
- new_key = bkey_i_to_extent(&tmp.k);
-
- ret = drop_dev_ptrs(c, extent_i_to_s(new_key),
- dev_idx, flags, true);
- if (ret)
- goto err;
-
- ret = bch2_btree_node_update_key(c, &iter, b, new_key);
- if (ret == -EINTR) {
- b = bch2_btree_iter_peek_node(&iter);
- goto retry;
- }
- if (ret)
- goto err;
+ ret = 0;
+ while (bch2_trans_begin(trans),
+ (b = bch2_btree_iter_peek_node(&iter)) &&
+ !(ret = PTR_ERR_OR_ZERO(b))) {
+ if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
+ goto next;
+
+ bch2_bkey_buf_copy(&k, c, &b->key);
+
+ ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
+ dev_idx, flags, true);
+ if (ret)
+ break;
+
+ ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ ret = 0;
+ continue;
}
+
+ bch_err_msg(c, ret, "updating btree node key");
+ if (ret)
+ break;
+next:
+ bch2_btree_iter_next_node(&iter);
}
- bch2_btree_iter_unlock(&iter);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ goto err;
}
+ bch2_btree_interior_updates_flush(c);
ret = 0;
-out:
- ret = bch2_replicas_gc_end(c, ret);
- mutex_unlock(&c->replicas_gc_lock);
+err:
+ bch2_bkey_buf_exit(&k, c);
+ bch2_trans_put(trans);
+
+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
return ret;
-err:
- bch2_btree_iter_unlock(&iter);
- goto out;
}
int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
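The rewrite above moves bch2_dev_usrdata_drop()'s per-key work into bch2_dev_usrdata_drop_key() and runs it under for_each_btree_key_commit(), and both drop paths now re-run the whole unit of work whenever bch2_err_matches(ret, BCH_ERR_transaction_restart) instead of special-casing -EINTR. A stand-alone toy loop showing that retry-on-restart shape (not bcachefs code; ERR_RESTART and do_one_key() are invented stand-ins):

#include <stdio.h>

#define ERR_RESTART 1	/* stand-in for BCH_ERR_transaction_restart */

static int attempts;

static int do_one_key(void)
{
	/* pretend the first two attempts race with a concurrent update */
	return ++attempts < 3 ? ERR_RESTART : 0;
}

int main(void)
{
	int ret;

	do {
		/* the bch2_trans_begin() equivalent: redo the whole unit of work */
		ret = do_one_key();
	} while (ret == ERR_RESTART);

	printf("done after %d attempts, ret=%d\n", attempts, ret);
	return ret;
}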
diff --git a/libbcachefs/migrate.h b/libbcachefs/migrate.h
index de2faab2..027efaa0 100644
--- a/libbcachefs/migrate.h
+++ b/libbcachefs/migrate.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_MIGRATE_H
#define _BCACHEFS_MIGRATE_H
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 3e52b7a2..46017546 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -1,327 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "backpointers.h"
+#include "bkey_buf.h"
#include "btree_gc.h"
+#include "btree_io.h"
#include "btree_update.h"
-#include "buckets.h"
+#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
+#include "compress.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "errcode.h"
+#include "error.h"
#include "inode.h"
-#include "io.h"
+#include "io_read.h"
+#include "io_write.h"
#include "journal_reclaim.h"
+#include "keylist.h"
#include "move.h"
+#include "rebalance.h"
#include "replicas.h"
+#include "snapshot.h"
#include "super-io.h"
-#include "keylist.h"
+#include "trace.h"
#include <linux/ioprio.h>
#include <linux/kthread.h>
-#include <trace/events/bcachefs.h>
-
-#define SECTORS_IN_FLIGHT_PER_DEVICE 2048
-
-struct moving_io {
- struct list_head list;
- struct closure cl;
- bool read_completed;
-
- unsigned read_sectors;
- unsigned write_sectors;
-
- struct bch_read_bio rbio;
-
- struct migrate_write write;
- /* Must be last since it is variable size */
- struct bio_vec bi_inline_vecs[0];
+const char * const bch2_data_ops_strs[] = {
+#define x(t, n, ...) [n] = #t,
+ BCH_DATA_OPS()
+#undef x
+ NULL
};
-struct moving_context {
- /* Closure for waiting on all reads and writes to complete */
- struct closure cl;
-
- struct bch_move_stats *stats;
-
- struct list_head reads;
-
- /* in flight sectors: */
- atomic_t read_sectors;
- atomic_t write_sectors;
-
- wait_queue_head_t wait;
-};
-
-static int bch2_migrate_index_update(struct bch_write_op *op)
+static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
- struct bch_fs *c = op->c;
- struct migrate_write *m =
- container_of(op, struct migrate_write, op);
- struct keylist *keys = &op->insert_keys;
- struct btree_iter iter;
- int ret = 0;
-
- bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
- bkey_start_pos(&bch2_keylist_front(keys)->k),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
- while (1) {
- struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
- struct bkey_i_extent *insert, *new =
- bkey_i_to_extent(bch2_keylist_front(keys));
- BKEY_PADDED(k) _new, _insert;
- struct bch_extent_ptr *ptr;
- struct bch_extent_crc_unpacked crc;
- bool did_work = false;
- int nr;
-
- if (btree_iter_err(k)) {
- ret = bch2_btree_iter_unlock(&iter);
- break;
- }
-
- if (bversion_cmp(k.k->version, new->k.version) ||
- !bkey_extent_is_data(k.k) ||
- !bch2_extent_matches_ptr(c, bkey_s_c_to_extent(k),
- m->ptr, m->offset))
- goto nomatch;
-
- if (m->data_cmd == DATA_REWRITE &&
- !bch2_extent_has_device(bkey_s_c_to_extent(k),
- m->data_opts.rewrite_dev))
- goto nomatch;
-
- bkey_reassemble(&_insert.k, k);
- insert = bkey_i_to_extent(&_insert.k);
-
- bkey_copy(&_new.k, bch2_keylist_front(keys));
- new = bkey_i_to_extent(&_new.k);
-
- bch2_cut_front(iter.pos, &insert->k_i);
- bch2_cut_back(new->k.p, &insert->k);
- bch2_cut_back(insert->k.p, &new->k);
-
- if (m->data_cmd == DATA_REWRITE) {
- ptr = (struct bch_extent_ptr *)
- bch2_extent_has_device(extent_i_to_s_c(insert),
- m->data_opts.rewrite_dev);
- bch2_extent_drop_ptr(extent_i_to_s(insert), ptr);
- }
-
- extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) {
- if (bch2_extent_has_device(extent_i_to_s_c(insert), ptr->dev)) {
- /*
- * raced with another move op? extent already
- * has a pointer to the device we just wrote
- * data to
- */
- continue;
- }
-
- bch2_extent_crc_append(insert, crc);
- extent_ptr_append(insert, *ptr);
- did_work = true;
- }
-
- if (!did_work)
- goto nomatch;
-
- bch2_extent_narrow_crcs(insert,
- (struct bch_extent_crc_unpacked) { 0 });
- bch2_extent_normalize(c, extent_i_to_s(insert).s);
- bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert),
- op->opts.background_target,
- op->opts.data_replicas);
-
- /*
- * It's possible we race, and for whatever reason the extent now
- * has fewer replicas than when we last looked at it - meaning
- * we need to get a disk reservation here:
- */
- nr = bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) -
- (bch2_extent_nr_dirty_ptrs(k) + m->nr_ptrs_reserved);
- if (nr > 0) {
- /*
- * can't call bch2_disk_reservation_add() with btree
- * locks held, at least not without a song and dance
- */
- bch2_btree_iter_unlock(&iter);
-
- ret = bch2_disk_reservation_add(c, &op->res,
- keylist_sectors(keys) * nr, 0);
- if (ret)
- goto out;
-
- m->nr_ptrs_reserved += nr;
- goto next;
- }
-
- ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER,
- extent_i_to_s_c(insert).s_c);
- if (ret)
- break;
-
- ret = bch2_btree_insert_at(c, &op->res,
- NULL, op_journal_seq(op),
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- m->data_opts.btree_insert_flags,
- BTREE_INSERT_ENTRY(&iter, &insert->k_i));
- if (!ret)
- atomic_long_inc(&c->extent_migrate_done);
- if (ret == -EINTR)
- ret = 0;
- if (ret)
- break;
-next:
- while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) {
- bch2_keylist_pop_front(keys);
- if (bch2_keylist_empty(keys))
- goto out;
- }
-
- bch2_cut_front(iter.pos, bch2_keylist_front(keys));
- continue;
-nomatch:
- if (m->ctxt)
- atomic64_add(k.k->p.offset - iter.pos.offset,
- &m->ctxt->stats->sectors_raced);
- atomic_long_inc(&c->extent_migrate_raced);
- trace_move_race(&new->k);
- bch2_btree_iter_next_slot(&iter);
- goto next;
+ if (trace_move_extent_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+ bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
+ trace_move_extent(c, buf.buf);
+ printbuf_exit(&buf);
}
-out:
- bch2_btree_iter_unlock(&iter);
- return ret;
}
-void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
+static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
{
- /* write bio must own pages: */
- BUG_ON(!m->op.wbio.bio.bi_vcnt);
+ if (trace_move_extent_read_enabled()) {
+ struct printbuf buf = PRINTBUF;
- m->ptr = rbio->pick.ptr;
- m->offset = rbio->pos.offset - rbio->pick.crc.offset;
- m->op.devs_have = rbio->devs_have;
- m->op.pos = rbio->pos;
- m->op.version = rbio->version;
- m->op.crc = rbio->pick.crc;
- m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
-
- if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) {
- m->op.nonce = m->op.crc.nonce + m->op.crc.offset;
- m->op.csum_type = m->op.crc.csum_type;
+ bch2_bkey_val_to_text(&buf, c, k);
+ trace_move_extent_read(c, buf.buf);
+ printbuf_exit(&buf);
}
-
- if (m->data_cmd == DATA_REWRITE)
- bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev);
}
-int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
- struct write_point_specifier wp,
- struct bch_io_opts io_opts,
- enum data_cmd data_cmd,
- struct data_opts data_opts,
- struct bkey_s_c k)
-{
- int ret;
-
- m->data_cmd = data_cmd;
- m->data_opts = data_opts;
- m->nr_ptrs_reserved = 0;
-
- bch2_write_op_init(&m->op, c, io_opts);
- m->op.compression_type =
- bch2_compression_opt_to_type[io_opts.background_compression ?:
- io_opts.compression];
- m->op.target = data_opts.target,
- m->op.write_point = wp;
-
- if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE)
- m->op.alloc_reserve = RESERVE_MOVINGGC;
-
- m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS|
- BCH_WRITE_PAGES_STABLE|
- BCH_WRITE_PAGES_OWNED|
- BCH_WRITE_DATA_ENCODED|
- BCH_WRITE_NOMARK_REPLICAS;
-
- m->op.nr_replicas = 1;
- m->op.nr_replicas_required = 1;
- m->op.index_update_fn = bch2_migrate_index_update;
+struct moving_io {
+ struct list_head read_list;
+ struct list_head io_list;
+ struct move_bucket_in_flight *b;
+ struct closure cl;
+ bool read_completed;
- switch (data_cmd) {
- case DATA_ADD_REPLICAS: {
- int nr = (int) io_opts.data_replicas -
- bch2_extent_nr_dirty_ptrs(k);
+ unsigned read_sectors;
+ unsigned write_sectors;
- if (nr > 0) {
- m->op.nr_replicas = m->nr_ptrs_reserved = nr;
+ struct bch_read_bio rbio;
- ret = bch2_disk_reservation_get(c, &m->op.res,
- k.k->size, m->op.nr_replicas, 0);
- if (ret)
- return ret;
- }
- break;
- }
- case DATA_REWRITE:
- break;
- case DATA_PROMOTE:
- m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
- m->op.flags |= BCH_WRITE_CACHED;
- break;
- default:
- BUG();
- }
-
- return 0;
-}
+ struct data_update write;
+ /* Must be last since it is variable size */
+ struct bio_vec bi_inline_vecs[];
+};
-static void move_free(struct closure *cl)
+static void move_free(struct moving_io *io)
{
- struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->write.ctxt;
- struct bio_vec *bv;
- int i;
- bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
+ if (io->b)
+ atomic_dec(&io->b->count);
- bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
- if (bv->bv_page)
- __free_page(bv->bv_page);
+ bch2_data_update_exit(&io->write);
+ mutex_lock(&ctxt->lock);
+ list_del(&io->io_list);
wake_up(&ctxt->wait);
+ mutex_unlock(&ctxt->lock);
kfree(io);
}
-static void move_write_done(struct closure *cl)
+static void move_write_done(struct bch_write_op *op)
{
- struct moving_io *io = container_of(cl, struct moving_io, cl);
+ struct moving_io *io = container_of(op, struct moving_io, write.op);
+ struct moving_context *ctxt = io->write.ctxt;
+
+ if (io->write.op.error)
+ ctxt->write_error = true;
atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
- closure_return_with_destructor(cl, move_free);
+ atomic_dec(&io->write.ctxt->write_ios);
+ move_free(io);
+ closure_put(&ctxt->cl);
}
-static void move_write(struct closure *cl)
+static void move_write(struct moving_io *io)
{
- struct moving_io *io = container_of(cl, struct moving_io, cl);
-
if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
- closure_return_with_destructor(cl, move_free);
+ move_free(io);
return;
}
- bch2_migrate_read_done(&io->write, &io->rbio);
+ if (trace_move_extent_write_enabled()) {
+ struct bch_fs *c = io->write.op.c;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
+ trace_move_extent_write(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+ closure_get(&io->write.ctxt->cl);
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
- closure_call(&io->write.op.cl, bch2_write, NULL, cl);
- continue_at(cl, move_write_done, NULL);
+ atomic_inc(&io->write.ctxt->write_ios);
+
+ bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
}
-static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
+struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
{
struct moving_io *io =
- list_first_entry_or_null(&ctxt->reads, struct moving_io, list);
+ list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);
return io && io->read_completed ? io : NULL;
}
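bch2_moving_ctxt_next_pending_write() only returns the head of ctxt->reads, and only once that read has completed, so the pending-write path that follows issues writes strictly in the order the reads were queued. A stand-alone sketch of that completed-prefix behaviour (not bcachefs code; struct io and the local do_pending_writes() are invented):

#include <stdbool.h>
#include <stdio.h>

struct io { bool read_completed; };

/* issue writes only for the contiguous completed prefix, preserving order */
static void do_pending_writes(struct io *ios, unsigned nr, unsigned *next_write)
{
	while (*next_write < nr && ios[*next_write].read_completed) {
		printf("write io %u\n", *next_write);
		(*next_write)++;
	}
}

int main(void)
{
	struct io ios[3] = { { 0 } };
	unsigned next_write = 0;

	ios[1].read_completed = true;		/* io 1 finished out of order */
	do_pending_writes(ios, 3, &next_write);	/* nothing yet: io 0 still pending */

	ios[0].read_completed = true;
	do_pending_writes(ios, 3, &next_write);	/* now writes io 0, then io 1 */

	return 0;
}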
@@ -332,35 +148,25 @@ static void move_read_endio(struct bio *bio)
struct moving_context *ctxt = io->write.ctxt;
atomic_sub(io->read_sectors, &ctxt->read_sectors);
+ atomic_dec(&ctxt->read_ios);
io->read_completed = true;
- if (next_pending_write(ctxt))
- wake_up(&ctxt->wait);
-
+ wake_up(&ctxt->wait);
closure_put(&ctxt->cl);
}
-static void do_pending_writes(struct moving_context *ctxt)
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
{
struct moving_io *io;
- while ((io = next_pending_write(ctxt))) {
- list_del(&io->list);
- closure_call(&io->cl, move_write, NULL, &ctxt->cl);
+ while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
+ bch2_trans_unlock_long(ctxt->trans);
+ list_del(&io->read_list);
+ move_write(io);
}
}
-#define move_ctxt_wait_event(_ctxt, _cond) \
-do { \
- do_pending_writes(_ctxt); \
- \
- if (_cond) \
- break; \
- __wait_event((_ctxt)->wait, \
- next_pending_write(_ctxt) || (_cond)); \
-} while (1)
-
-static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
+void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
@@ -369,31 +175,118 @@ static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
atomic_read(&ctxt->write_sectors) != sectors_pending);
}
-static int bch2_move_extent(struct bch_fs *c,
- struct moving_context *ctxt,
- struct write_point_specifier wp,
- struct bch_io_opts io_opts,
- struct bkey_s_c_extent e,
- enum data_cmd data_cmd,
- struct data_opts data_opts)
+void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
+{
+ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
+ bch2_trans_unlock_long(ctxt->trans);
+ closure_sync(&ctxt->cl);
+}
+
+void bch2_moving_ctxt_exit(struct moving_context *ctxt)
+{
+ struct bch_fs *c = ctxt->trans->c;
+
+ bch2_moving_ctxt_flush_all(ctxt);
+
+ EBUG_ON(atomic_read(&ctxt->write_sectors));
+ EBUG_ON(atomic_read(&ctxt->write_ios));
+ EBUG_ON(atomic_read(&ctxt->read_sectors));
+ EBUG_ON(atomic_read(&ctxt->read_ios));
+
+ mutex_lock(&c->moving_context_lock);
+ list_del(&ctxt->list);
+ mutex_unlock(&c->moving_context_lock);
+
+ /*
+ * Generally, releasing a transaction within a transaction restart means
+ * an unhandled transaction restart: but this can happen legitimately
+ * within the move code, e.g. when bch2_move_ratelimit() tells us to
+ * exit before we've retried
+ */
+ bch2_trans_begin(ctxt->trans);
+ bch2_trans_put(ctxt->trans);
+ memset(ctxt, 0, sizeof(*ctxt));
+}
+
+void bch2_moving_ctxt_init(struct moving_context *ctxt,
+ struct bch_fs *c,
+ struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
+ struct write_point_specifier wp,
+ bool wait_on_copygc)
{
+ memset(ctxt, 0, sizeof(*ctxt));
+
+ ctxt->trans = bch2_trans_get(c);
+ ctxt->fn = (void *) _RET_IP_;
+ ctxt->rate = rate;
+ ctxt->stats = stats;
+ ctxt->wp = wp;
+ ctxt->wait_on_copygc = wait_on_copygc;
+
+ closure_init_stack(&ctxt->cl);
+
+ mutex_init(&ctxt->lock);
+ INIT_LIST_HEAD(&ctxt->reads);
+ INIT_LIST_HEAD(&ctxt->ios);
+ init_waitqueue_head(&ctxt->wait);
+
+ mutex_lock(&c->moving_context_lock);
+ list_add(&ctxt->list, &c->moving_context_list);
+ mutex_unlock(&c->moving_context_lock);
+}
+
+void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
+{
+ trace_move_data(c, stats);
+}
+
+void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
+{
+ memset(stats, 0, sizeof(*stats));
+ stats->data_type = BCH_DATA_user;
+ scnprintf(stats->name, sizeof(stats->name), "%s", name);
+}
+
+int bch2_move_extent(struct moving_context *ctxt,
+ struct move_bucket_in_flight *bucket_in_flight,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bch_io_opts io_opts,
+ struct data_update_opts data_opts)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct moving_io *io;
- const struct bch_extent_ptr *ptr;
- struct bch_extent_crc_unpacked crc;
- unsigned sectors = e.k->size, pages;
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;
- move_ctxt_wait_event(ctxt,
- atomic_read(&ctxt->write_sectors) <
- SECTORS_IN_FLIGHT_PER_DEVICE);
+ trace_move_extent2(c, k, &io_opts, &data_opts);
- move_ctxt_wait_event(ctxt,
- atomic_read(&ctxt->read_sectors) <
- SECTORS_IN_FLIGHT_PER_DEVICE);
+ if (ctxt->stats)
+ ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
+
+ bch2_data_update_opts_normalize(k, &data_opts);
+
+ if (!data_opts.rewrite_ptrs &&
+ !data_opts.extra_replicas) {
+ if (data_opts.kill_ptrs)
+ return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
+ return 0;
+ }
+
+ /*
+ * Before memory allocations & taking nocow locks in
+ * bch2_data_update_init():
+ */
+ bch2_trans_unlock(trans);
/* write path might have to decompress data: */
- extent_for_each_ptr_crc(e, ptr, crc)
- sectors = max_t(unsigned, sectors, crc.uncompressed_size);
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
io = kzalloc(sizeof(struct moving_io) +
@@ -401,48 +294,70 @@ static int bch2_move_extent(struct bch_fs *c,
if (!io)
goto err;
+ INIT_LIST_HEAD(&io->io_list);
io->write.ctxt = ctxt;
- io->read_sectors = e.k->size;
- io->write_sectors = e.k->size;
+ io->read_sectors = k.k->size;
+ io->write_sectors = k.k->size;
- bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
+ bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
bio_set_prio(&io->write.op.wbio.bio,
IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
- io->write.op.wbio.bio.bi_iter.bi_size = sectors << 9;
- bch2_bio_map(&io->write.op.wbio.bio, NULL);
- if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL))
+ if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
+ GFP_KERNEL))
goto err_free;
- io->rbio.opts = io_opts;
- bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
+ io->rbio.c = c;
+ io->rbio.opts = io_opts;
+ bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
io->rbio.bio.bi_vcnt = pages;
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
io->rbio.bio.bi_iter.bi_size = sectors << 9;
- bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
- io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(e.k);
+ io->rbio.bio.bi_opf = REQ_OP_READ;
+ io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
io->rbio.bio.bi_end_io = move_read_endio;
- ret = bch2_migrate_write_init(c, &io->write, wp, io_opts,
- data_cmd, data_opts, e.s_c);
+ ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
+ io_opts, data_opts, iter->btree_id, k);
if (ret)
goto err_free_pages;
- atomic64_inc(&ctxt->stats->keys_moved);
- atomic64_add(e.k->size, &ctxt->stats->sectors_moved);
+ io->write.op.end_io = move_write_done;
+
+ if (ctxt->rate)
+ bch2_ratelimit_increment(ctxt->rate, k.k->size);
+
+ if (ctxt->stats) {
+ atomic64_inc(&ctxt->stats->keys_moved);
+ atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
+ }
+
+ if (bucket_in_flight) {
+ io->b = bucket_in_flight;
+ atomic_inc(&io->b->count);
+ }
- trace_move_extent(e.k);
+ this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
+ this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
+ trace_move_extent_read2(c, k);
+ mutex_lock(&ctxt->lock);
atomic_add(io->read_sectors, &ctxt->read_sectors);
- list_add_tail(&io->list, &ctxt->reads);
+ atomic_inc(&ctxt->read_ios);
+
+ list_add_tail(&io->read_list, &ctxt->reads);
+ list_add_tail(&io->io_list, &ctxt->ios);
+ mutex_unlock(&ctxt->lock);
/*
* dropped by move_read_endio() - guards against use after free of
* ctxt when doing wakeup
*/
closure_get(&ctxt->cl);
- bch2_read_extent(c, &io->rbio, e.s_c,
+ bch2_read_extent(trans, &io->rbio,
+ bkey_start_pos(k.k),
+ iter->btree_id, k, 0,
BCH_READ_NODECODE|
BCH_READ_LAST_FRAGMENT);
return 0;
@@ -451,312 +366,831 @@ err_free_pages:
err_free:
kfree(io);
err:
- trace_move_alloc_fail(e.k);
+ if (ret == -BCH_ERR_data_update_done)
+ return 0;
+
+ if (bch2_err_matches(ret, EROFS) ||
+ bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ret;
+
+ count_event(c, move_extent_start_fail);
+
+ if (trace_move_extent_start_fail_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, ": ");
+ prt_str(&buf, bch2_err_str(ret));
+ trace_move_extent_start_fail(c, buf.buf);
+ printbuf_exit(&buf);
+ }
return ret;
}
-int bch2_move_data(struct bch_fs *c,
- struct bch_ratelimit *rate,
- struct write_point_specifier wp,
- struct bpos start,
- struct bpos end,
- move_pred_fn pred, void *arg,
- struct bch_move_stats *stats)
+static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
+ struct per_snapshot_io_opts *io_opts,
+ struct btree_iter *extent_iter,
+ struct bkey_s_c extent_k)
{
- bool kthread = (current->flags & PF_KTHREAD) != 0;
- struct moving_context ctxt = { .stats = stats };
- struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
- BKEY_PADDED(k) tmp;
+ struct bch_fs *c = trans->c;
+ u32 restart_count = trans->restart_count;
+ struct bch_io_opts *opts_ret = &io_opts->fs_io_opts;
+ int ret = 0;
+
+ if (extent_k.k->type == KEY_TYPE_reflink_v)
+ goto out;
+
+ if (io_opts->cur_inum != extent_k.k->p.inode) {
+ io_opts->d.nr = 0;
+
+ ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
+ BTREE_ITER_all_snapshots, k, ({
+ if (k.k->p.offset != extent_k.k->p.inode)
+ break;
+
+ if (!bkey_is_inode(k.k))
+ continue;
+
+ struct bch_inode_unpacked inode;
+ BUG_ON(bch2_inode_unpack(k, &inode));
+
+ struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
+ bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
+
+ darray_push(&io_opts->d, e);
+ }));
+ io_opts->cur_inum = extent_k.k->p.inode;
+ }
+
+ ret = ret ?: trans_was_restarted(trans, restart_count);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (extent_k.k->p.snapshot)
+ darray_for_each(io_opts->d, i)
+ if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) {
+ opts_ret = &i->io_opts;
+ break;
+ }
+out:
+ ret = bch2_get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k);
+ if (ret)
+ return ERR_PTR(ret);
+ return opts_ret;
+}
+
+int bch2_move_get_io_opts_one(struct btree_trans *trans,
+ struct bch_io_opts *io_opts,
+ struct btree_iter *extent_iter,
+ struct bkey_s_c extent_k)
+{
+ struct bch_fs *c = trans->c;
+
+ *io_opts = bch2_opts_to_inode_opts(c->opts);
+
+ /* reflink btree? */
+ if (!extent_k.k->p.inode)
+ goto out;
+
+ struct btree_iter inode_iter;
+ struct bkey_s_c inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
+ SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
+ BTREE_ITER_cached);
+ int ret = bkey_err(inode_k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ret;
+
+ if (!ret && bkey_is_inode(inode_k.k)) {
+ struct bch_inode_unpacked inode;
+ bch2_inode_unpack(inode_k, &inode);
+ bch2_inode_opts_get(io_opts, c, &inode);
+ }
+ bch2_trans_iter_exit(trans, &inode_iter);
+out:
+ return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k);
+}
+
+int bch2_move_ratelimit(struct moving_context *ctxt)
+{
+ struct bch_fs *c = ctxt->trans->c;
+ bool is_kthread = current->flags & PF_KTHREAD;
+ u64 delay;
+
+ if (ctxt->wait_on_copygc && c->copygc_running) {
+ bch2_moving_ctxt_flush_all(ctxt);
+ wait_event_killable(c->copygc_running_wq,
+ !c->copygc_running ||
+ (is_kthread && kthread_should_stop()));
+ }
+
+ do {
+ delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
+
+ if (is_kthread && kthread_should_stop())
+ return 1;
+
+ if (delay)
+ move_ctxt_wait_event_timeout(ctxt,
+ freezing(current) ||
+ (is_kthread && kthread_should_stop()),
+ delay);
+
+ if (unlikely(freezing(current))) {
+ bch2_moving_ctxt_flush_all(ctxt);
+ try_to_freeze();
+ }
+ } while (delay);
+
+ /*
+ * XXX: these limits really ought to be per device, SSDs and hard drives
+ * will want different limits
+ */
+ move_ctxt_wait_event(ctxt,
+ atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
+ atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
+ atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
+ atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);
+
+ return 0;
+}
+
+static int bch2_move_data_btree(struct moving_context *ctxt,
+ struct bpos start,
+ struct bpos end,
+ move_pred_fn pred, void *arg,
+ enum btree_id btree_id)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct per_snapshot_io_opts snapshot_io_opts;
+ struct bch_io_opts *io_opts;
+ struct bkey_buf sk;
+ struct btree_iter iter;
struct bkey_s_c k;
- struct bkey_s_c_extent e;
- struct data_opts data_opts;
- enum data_cmd data_cmd;
- u64 cur_inum = U64_MAX;
+ struct data_update_opts data_opts;
int ret = 0, ret2;
- closure_init_stack(&ctxt.cl);
- INIT_LIST_HEAD(&ctxt.reads);
- init_waitqueue_head(&ctxt.wait);
+ per_snapshot_io_opts_init(&snapshot_io_opts, c);
+ bch2_bkey_buf_init(&sk);
- stats->data_type = BCH_DATA_USER;
- bch2_btree_iter_init(&stats->iter, c, BTREE_ID_EXTENTS, start,
- BTREE_ITER_PREFETCH);
+ if (ctxt->stats) {
+ ctxt->stats->data_type = BCH_DATA_user;
+ ctxt->stats->pos = BBPOS(btree_id, start);
+ }
- if (rate)
- bch2_ratelimit_reset(rate);
+ bch2_trans_begin(trans);
+ bch2_trans_iter_init(trans, &iter, btree_id, start,
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots);
- while (!kthread || !(ret = kthread_should_stop())) {
- if (rate &&
- bch2_ratelimit_delay(rate) &&
- (bch2_btree_iter_unlock(&stats->iter),
- (ret = bch2_ratelimit_wait_freezable_stoppable(rate))))
- break;
-peek:
- k = bch2_btree_iter_peek(&stats->iter);
+ if (ctxt->rate)
+ bch2_ratelimit_reset(ctxt->rate);
+
+ while (!bch2_move_ratelimit(ctxt)) {
+ bch2_trans_begin(trans);
+
+ k = bch2_btree_iter_peek(&iter);
if (!k.k)
break;
- ret = btree_iter_err(k);
+
+ ret = bkey_err(k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
if (ret)
break;
- if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
- break;
-
- if (!bkey_extent_is_data(k.k))
- goto next_nondata;
- e = bkey_s_c_to_extent(k);
+ if (bkey_ge(bkey_start_pos(k.k), end))
+ break;
- if (cur_inum != k.k->p.inode) {
- struct bch_inode_unpacked inode;
+ if (ctxt->stats)
+ ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
- /* don't hold btree locks while looking up inode: */
- bch2_btree_iter_unlock(&stats->iter);
+ if (!bkey_extent_is_direct_data(k.k))
+ goto next_nondata;
- io_opts = bch2_opts_to_inode_opts(c->opts);
- if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
- bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode));
- cur_inum = k.k->p.inode;
- goto peek;
- }
+ io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, &iter, k);
+ ret = PTR_ERR_OR_ZERO(io_opts);
+ if (ret)
+ continue;
- switch ((data_cmd = pred(c, arg, BKEY_TYPE_EXTENTS, e,
- &io_opts, &data_opts))) {
- case DATA_SKIP:
+ memset(&data_opts, 0, sizeof(data_opts));
+ if (!pred(c, arg, k, io_opts, &data_opts))
goto next;
- case DATA_SCRUB:
- BUG();
- case DATA_ADD_REPLICAS:
- case DATA_REWRITE:
- case DATA_PROMOTE:
- break;
- default:
- BUG();
- }
- /* unlock before doing IO: */
- bkey_reassemble(&tmp.k, k);
- k = bkey_i_to_s_c(&tmp.k);
- bch2_btree_iter_unlock(&stats->iter);
+ /*
+ * The iterator gets unlocked by __bch2_read_extent - need to
+ * save a copy of @k elsewhere:
+ */
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
- ret2 = bch2_move_extent(c, &ctxt, wp, io_opts,
- bkey_s_c_to_extent(k),
- data_cmd, data_opts);
+ ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
if (ret2) {
+ if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
+ continue;
+
if (ret2 == -ENOMEM) {
/* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(&ctxt);
+ bch2_move_ctxt_wait_for_io(ctxt);
continue;
}
/* XXX signal failure */
goto next;
}
-
- if (rate)
- bch2_ratelimit_increment(rate, k.k->size);
next:
- atomic64_add(k.k->size * bch2_extent_nr_dirty_ptrs(k),
- &stats->sectors_seen);
+ if (ctxt->stats)
+ atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
next_nondata:
- bch2_btree_iter_next(&stats->iter);
- bch2_btree_iter_cond_resched(&stats->iter);
+ bch2_btree_iter_advance(&iter);
}
- bch2_btree_iter_unlock(&stats->iter);
-
- move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
- closure_sync(&ctxt.cl);
-
- EBUG_ON(atomic_read(&ctxt.write_sectors));
-
- trace_move_data(c,
- atomic64_read(&stats->sectors_moved),
- atomic64_read(&stats->keys_moved));
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_bkey_buf_exit(&sk, c);
+ per_snapshot_io_opts_exit(&snapshot_io_opts);
return ret;
}
-static int bch2_gc_data_replicas(struct bch_fs *c)
+int __bch2_move_data(struct moving_context *ctxt,
+ struct bbpos start,
+ struct bbpos end,
+ move_pred_fn pred, void *arg)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
+ struct bch_fs *c = ctxt->trans->c;
+ enum btree_id id;
+ int ret = 0;
+
+ for (id = start.btree;
+ id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
+ id++) {
+ ctxt->stats->pos = BBPOS(id, POS_MIN);
- mutex_lock(&c->replicas_gc_lock);
- bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED));
+ if (!btree_type_has_ptrs(id) ||
+ !bch2_btree_id_root(c, id)->b)
+ continue;
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
- BTREE_ITER_PREFETCH, k) {
- ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k);
+ ret = bch2_move_data_btree(ctxt,
+ id == start.btree ? start.pos : POS_MIN,
+ id == end.btree ? end.pos : POS_MAX,
+ pred, arg, id);
if (ret)
break;
}
- ret = bch2_btree_iter_unlock(&iter) ?: ret;
- bch2_replicas_gc_end(c, ret);
- mutex_unlock(&c->replicas_gc_lock);
+ return ret;
+}
+
+int bch2_move_data(struct bch_fs *c,
+ struct bbpos start,
+ struct bbpos end,
+ struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
+ struct write_point_specifier wp,
+ bool wait_on_copygc,
+ move_pred_fn pred, void *arg)
+{
+
+ struct moving_context ctxt;
+ int ret;
+
+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+ ret = __bch2_move_data(&ctxt, start, end, pred, arg);
+ bch2_moving_ctxt_exit(&ctxt);
return ret;
}
-static int bch2_gc_btree_replicas(struct bch_fs *c)
+int bch2_evacuate_bucket(struct moving_context *ctxt,
+ struct move_bucket_in_flight *bucket_in_flight,
+ struct bpos bucket, int gen,
+ struct data_update_opts _data_opts)
{
- struct btree_iter iter;
- struct btree *b;
- unsigned id;
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ bool is_kthread = current->flags & PF_KTHREAD;
+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
+ struct btree_iter iter = {}, bp_iter = {};
+ struct bkey_buf sk;
+ struct bkey_s_c k;
+ struct data_update_opts data_opts;
+ unsigned sectors_moved = 0;
int ret = 0;
- mutex_lock(&c->replicas_gc_lock);
- bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
+ struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode);
+ if (!ca)
+ return 0;
- for (id = 0; id < BTREE_ID_NR; id++) {
- for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
- ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
- bkey_i_to_s_c(&b->key));
+ trace_bucket_evacuate(c, &bucket);
- bch2_btree_iter_cond_resched(&iter);
- }
+ bch2_bkey_buf_init(&sk);
- ret = bch2_btree_iter_unlock(&iter) ?: ret;
- }
+ /*
+ * We're not run in a context that handles transaction restarts:
+ */
+ bch2_trans_begin(trans);
- bch2_replicas_gc_end(c, ret);
- mutex_unlock(&c->replicas_gc_lock);
+ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp_start(ca, bucket), 0);
+ bch_err_msg(c, ret, "looking up alloc key");
+ if (ret)
+ goto err;
+
+ ret = bch2_btree_write_buffer_tryflush(trans);
+ bch_err_msg(c, ret, "flushing btree write buffer");
+ if (ret)
+ goto err;
+
+ while (!(ret = bch2_move_ratelimit(ctxt))) {
+ if (is_kthread && kthread_should_stop())
+ break;
+
+ bch2_trans_begin(trans);
+
+ k = bch2_btree_iter_peek(&bp_iter);
+ ret = bkey_err(k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+
+ if (!k.k || bkey_gt(k.k->p, bucket_pos_to_bp_end(ca, bucket)))
+ break;
+
+ if (k.k->type != KEY_TYPE_backpointer)
+ goto next;
+
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+
+ if (!bp.v->level) {
+ k = bch2_backpointer_get_key(trans, bp, &iter, 0);
+ ret = bkey_err(k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+ if (!k.k)
+ goto next;
+
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
+
+ ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k);
+ if (ret) {
+ bch2_trans_iter_exit(trans, &iter);
+ continue;
+ }
+
+ data_opts = _data_opts;
+ data_opts.target = io_opts.background_target;
+ data_opts.rewrite_ptrs = 0;
+
+ unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */
+ unsigned i = 0;
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
+ if (p.ptr.dev == bucket.inode) {
+ if (p.ptr.cached) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto next;
+ }
+ data_opts.rewrite_ptrs |= 1U << i;
+ break;
+ }
+ i++;
+ }
+
+ ret = bch2_move_extent(ctxt, bucket_in_flight,
+ &iter, k, io_opts, data_opts);
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret == -ENOMEM) {
+ /* memory allocation failure, wait for some IO to finish */
+ bch2_move_ctxt_wait_for_io(ctxt);
+ continue;
+ }
+ if (ret)
+ goto err;
+
+ if (ctxt->stats)
+ atomic64_add(sectors, &ctxt->stats->sectors_seen);
+ sectors_moved += sectors;
+ } else {
+ struct btree *b;
+
+ b = bch2_backpointer_get_node(trans, bp, &iter);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+ continue;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+ if (!b)
+ goto next;
+
+ unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
+
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+
+ if (ctxt->rate)
+ bch2_ratelimit_increment(ctxt->rate, sectors);
+ if (ctxt->stats) {
+ atomic64_add(sectors, &ctxt->stats->sectors_seen);
+ atomic64_add(sectors, &ctxt->stats->sectors_moved);
+ }
+ sectors_moved += btree_sectors(c);
+ }
+next:
+ bch2_btree_iter_advance(&bp_iter);
+ }
+
+ trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret);
+err:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ bch2_dev_put(ca);
+ bch2_bkey_buf_exit(&sk, c);
return ret;
}
+typedef bool (*move_btree_pred)(struct bch_fs *, void *,
+ struct btree *, struct bch_io_opts *,
+ struct data_update_opts *);
+
static int bch2_move_btree(struct bch_fs *c,
- move_pred_fn pred,
- void *arg,
+ struct bbpos start,
+ struct bbpos end,
+ move_btree_pred pred, void *arg,
struct bch_move_stats *stats)
{
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
+ struct moving_context ctxt;
+ struct btree_trans *trans;
+ struct btree_iter iter;
struct btree *b;
- unsigned id;
- struct data_opts data_opts;
- enum data_cmd cmd;
+ enum btree_id btree;
+ struct data_update_opts data_opts;
int ret = 0;
- stats->data_type = BCH_DATA_BTREE;
+ bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
+ writepoint_ptr(&c->btree_write_point),
+ true);
+ trans = ctxt.trans;
+
+ stats->data_type = BCH_DATA_btree;
+
+ for (btree = start.btree;
+ btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
+ btree ++) {
+ stats->pos = BBPOS(btree, POS_MIN);
+
+ if (!bch2_btree_id_root(c, btree)->b)
+ continue;
+
+ bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
+ BTREE_ITER_prefetch);
+retry:
+ ret = 0;
+ while (bch2_trans_begin(trans),
+ (b = bch2_btree_iter_peek_node(&iter)) &&
+ !(ret = PTR_ERR_OR_ZERO(b))) {
+ if (kthread && kthread_should_stop())
+ break;
- for (id = 0; id < BTREE_ID_NR; id++) {
- for_each_btree_node(&stats->iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
- switch ((cmd = pred(c, arg, BKEY_TYPE_BTREE,
- bkey_i_to_s_c_extent(&b->key),
- &io_opts,
- &data_opts))) {
- case DATA_SKIP:
- goto next;
- case DATA_SCRUB:
- BUG();
- case DATA_ADD_REPLICAS:
- case DATA_REWRITE:
+ if ((cmp_int(btree, end.btree) ?:
+ bpos_cmp(b->key.k.p, end.pos)) > 0)
break;
- default:
- BUG();
- }
- ret = bch2_btree_node_rewrite(c, &stats->iter,
- b->data->keys.seq, 0) ?: ret;
+ stats->pos = BBPOS(iter.btree_id, iter.pos);
+
+ if (!pred(c, arg, b, &io_opts, &data_opts))
+ goto next;
+
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
next:
- bch2_btree_iter_cond_resched(&stats->iter);
+ bch2_btree_iter_next_node(&iter);
}
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
- ret = bch2_btree_iter_unlock(&stats->iter) ?: ret;
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (kthread && kthread_should_stop())
+ break;
}
- return ret;
-}
+ bch_err_fn(c, ret);
+ bch2_moving_ctxt_exit(&ctxt);
+ bch2_btree_interior_updates_flush(c);
-#if 0
-static enum data_cmd scrub_pred(struct bch_fs *c, void *arg,
- enum bkey_type type,
- struct bkey_s_c_extent e,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
-{
- return DATA_SCRUB;
+ return ret;
}
-#endif
-static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
- enum bkey_type type,
- struct bkey_s_c_extent e,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
+static bool rereplicate_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
- unsigned nr_good = bch2_extent_durability(c, e);
- unsigned replicas = type == BKEY_TYPE_BTREE
+ unsigned nr_good = bch2_bkey_durability(c, k);
+ unsigned replicas = bkey_is_btree_ptr(k.k)
? c->opts.metadata_replicas
: io_opts->data_replicas;
- if (!nr_good || nr_good >= replicas)
- return DATA_SKIP;
+ rcu_read_lock();
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned i = 0;
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ptr->cached &&
+ (!ca || !ca->mi.durability))
+ data_opts->kill_ptrs |= BIT(i);
+ i++;
+ }
+ rcu_read_unlock();
+
+ if (!data_opts->kill_ptrs &&
+ (!nr_good || nr_good >= replicas))
+ return false;
data_opts->target = 0;
- data_opts->btree_insert_flags = 0;
- return DATA_ADD_REPLICAS;
+ data_opts->extra_replicas = replicas - nr_good;
+ data_opts->btree_insert_flags = 0;
+ return true;
}
-static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
- enum bkey_type type,
- struct bkey_s_c_extent e,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
+static bool migrate_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct bch_ioctl_data *op = arg;
+ unsigned i = 0;
- if (!bch2_extent_has_device(e, op->migrate.dev))
- return DATA_SKIP;
-
+ data_opts->rewrite_ptrs = 0;
data_opts->target = 0;
+ data_opts->extra_replicas = 0;
data_opts->btree_insert_flags = 0;
- data_opts->rewrite_dev = op->migrate.dev;
- return DATA_REWRITE;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (ptr->dev == op->migrate.dev)
+ data_opts->rewrite_ptrs |= 1U << i;
+ i++;
+ }
+
+ return data_opts->rewrite_ptrs != 0;
+}
+
+static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
+}
+
+static bool migrate_btree_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
+}
+
+/*
+ * Ancient versions of bcachefs produced packed formats which could represent
+ * keys that the in memory format cannot represent; this checks for those
+ * formats so we can get rid of them.
+ */
+static bool bformat_needs_redo(struct bkey_format *f)
+{
+ for (unsigned i = 0; i < f->nr_fields; i++)
+ if (bch2_bkey_format_field_overflows(f, i))
+ return true;
+
+ return false;
+}
+
+static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ if (b->version_ondisk != c->sb.version ||
+ btree_node_need_rewrite(b) ||
+ bformat_needs_redo(&b->format)) {
+ data_opts->target = 0;
+ data_opts->extra_replicas = 0;
+ data_opts->btree_insert_flags = 0;
+ return true;
+ }
+
+ return false;
+}
+
+int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
+{
+ int ret;
+
+ ret = bch2_move_btree(c,
+ BBPOS_MIN,
+ BBPOS_MAX,
+ rewrite_old_nodes_pred, c, stats);
+ if (!ret) {
+ mutex_lock(&c->sb_lock);
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
+ c->disk_sb.sb->version_min = c->disk_sb.sb->version;
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ unsigned durability = bch2_bkey_durability(c, k);
+ unsigned replicas = bkey_is_btree_ptr(k.k)
+ ? c->opts.metadata_replicas
+ : io_opts->data_replicas;
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned i = 0;
+
+ rcu_read_lock();
+ bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
+ unsigned d = bch2_extent_ptr_durability(c, &p);
+
+ if (d && durability - d >= replicas) {
+ data_opts->kill_ptrs |= BIT(i);
+ durability -= d;
+ }
+
+ i++;
+ }
+ rcu_read_unlock();
+
+ return data_opts->kill_ptrs != 0;
+}
+
+static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}
int bch2_data_job(struct bch_fs *c,
struct bch_move_stats *stats,
struct bch_ioctl_data op)
{
+ struct bbpos start = BBPOS(op.start_btree, op.start_pos);
+ struct bbpos end = BBPOS(op.end_btree, op.end_pos);
int ret = 0;
- switch (op.op) {
- case BCH_DATA_OP_REREPLICATE:
- stats->data_type = BCH_DATA_JOURNAL;
- ret = bch2_journal_flush_device_pins(&c->journal, -1);
+ if (op.op >= BCH_DATA_OP_NR)
+ return -EINVAL;
- ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
- ret = bch2_gc_btree_replicas(c) ?: ret;
+ bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);
- ret = bch2_move_data(c, NULL,
+ switch (op.op) {
+ case BCH_DATA_OP_rereplicate:
+ stats->data_type = BCH_DATA_journal;
+ ret = bch2_journal_flush_device_pins(&c->journal, -1);
+ ret = bch2_move_btree(c, start, end,
+ rereplicate_btree_pred, c, stats) ?: ret;
+ ret = bch2_move_data(c, start, end,
+ NULL,
+ stats,
writepoint_hashed((unsigned long) current),
- op.start,
- op.end,
- rereplicate_pred, c, stats) ?: ret;
- ret = bch2_gc_data_replicas(c) ?: ret;
+ true,
+ rereplicate_pred, c) ?: ret;
+ ret = bch2_replicas_gc2(c) ?: ret;
break;
- case BCH_DATA_OP_MIGRATE:
+ case BCH_DATA_OP_migrate:
if (op.migrate.dev >= c->sb.nr_devices)
return -EINVAL;
- stats->data_type = BCH_DATA_JOURNAL;
+ stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
-
- ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret;
- ret = bch2_gc_btree_replicas(c) ?: ret;
-
- ret = bch2_move_data(c, NULL,
+ ret = bch2_move_btree(c, start, end,
+ migrate_btree_pred, &op, stats) ?: ret;
+ ret = bch2_move_data(c, start, end,
+ NULL,
+ stats,
writepoint_hashed((unsigned long) current),
- op.start,
- op.end,
- migrate_pred, &op, stats) ?: ret;
- ret = bch2_gc_data_replicas(c) ?: ret;
+ true,
+ migrate_pred, &op) ?: ret;
+ ret = bch2_replicas_gc2(c) ?: ret;
+ break;
+ case BCH_DATA_OP_rewrite_old_nodes:
+ ret = bch2_scan_old_btree_nodes(c, stats);
+ break;
+ case BCH_DATA_OP_drop_extra_replicas:
+ ret = bch2_move_btree(c, start, end,
+ drop_extra_replicas_btree_pred, c, stats) ?: ret;
+ ret = bch2_move_data(c, start, end, NULL, stats,
+ writepoint_hashed((unsigned long) current),
+ true,
+ drop_extra_replicas_pred, c) ?: ret;
+ ret = bch2_replicas_gc2(c) ?: ret;
break;
default:
ret = -EINVAL;
}
+ bch2_move_stats_exit(stats, c);
return ret;
}
+
+void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
+{
+ prt_printf(out, "%s: data type=", stats->name);
+ bch2_prt_data_type(out, stats->data_type);
+ prt_str(out, " pos=");
+ bch2_bbpos_to_text(out, stats->pos);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved));
+ prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced));
+ prt_printf(out, "bytes seen: ");
+ prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
+ prt_newline(out);
+
+ prt_printf(out, "bytes moved: ");
+ prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
+ prt_newline(out);
+
+ prt_printf(out, "bytes raced: ");
+ prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+}
+
+static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
+{
+ struct moving_io *io;
+
+ bch2_move_stats_to_text(out, ctxt->stats);
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "reads: ios %u/%u sectors %u/%u\n",
+ atomic_read(&ctxt->read_ios),
+ c->opts.move_ios_in_flight,
+ atomic_read(&ctxt->read_sectors),
+ c->opts.move_bytes_in_flight >> 9);
+
+ prt_printf(out, "writes: ios %u/%u sectors %u/%u\n",
+ atomic_read(&ctxt->write_ios),
+ c->opts.move_ios_in_flight,
+ atomic_read(&ctxt->write_sectors),
+ c->opts.move_bytes_in_flight >> 9);
+
+ printbuf_indent_add(out, 2);
+
+ mutex_lock(&ctxt->lock);
+ list_for_each_entry(io, &ctxt->ios, io_list)
+ bch2_write_op_to_text(out, &io->write.op);
+ mutex_unlock(&ctxt->lock);
+
+ printbuf_indent_sub(out, 4);
+}
+
+void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct moving_context *ctxt;
+
+ mutex_lock(&c->moving_context_lock);
+ list_for_each_entry(ctxt, &c->moving_context_list, list)
+ bch2_moving_ctxt_to_text(out, c, ctxt);
+ mutex_unlock(&c->moving_context_lock);
+}
+
+void bch2_fs_move_init(struct bch_fs *c)
+{
+ INIT_LIST_HEAD(&c->moving_context_list);
+ mutex_init(&c->moving_context_lock);
+}
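The new predicates above (rereplicate_pred(), migrate_pred(), drop_extra_replicas_pred()) all share one shape: walk the key's pointers, set one bit per pointer to act on in data_update_opts, and return whether anything was selected. A stand-alone sketch of that bitmask-building pattern (not bcachefs code; struct fake_ptr and pick_ptrs_on_dev() are invented):

#include <stdbool.h>
#include <stdio.h>

struct fake_ptr { unsigned dev; };

static bool pick_ptrs_on_dev(const struct fake_ptr *ptrs, unsigned nr,
			     unsigned dev, unsigned *rewrite_ptrs)
{
	*rewrite_ptrs = 0;

	/* one bit per pointer, like data_update_opts.rewrite_ptrs */
	for (unsigned i = 0; i < nr; i++)
		if (ptrs[i].dev == dev)
			*rewrite_ptrs |= 1U << i;

	return *rewrite_ptrs != 0;
}

int main(void)
{
	struct fake_ptr ptrs[] = { { .dev = 0 }, { .dev = 2 }, { .dev = 0 } };
	unsigned rewrite_ptrs;

	if (pick_ptrs_on_dev(ptrs, 3, 0, &rewrite_ptrs))
		printf("rewrite_ptrs = 0x%x\n", rewrite_ptrs);	/* prints 0x5 */

	return 0;
}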
diff --git a/libbcachefs/move.h b/libbcachefs/move.h
index bc87e067..51e0505a 100644
--- a/libbcachefs/move.h
+++ b/libbcachefs/move.h
@@ -1,62 +1,154 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_MOVE_H
#define _BCACHEFS_MOVE_H
+#include "bbpos.h"
+#include "bcachefs_ioctl.h"
#include "btree_iter.h"
#include "buckets.h"
-#include "io_types.h"
+#include "data_update.h"
#include "move_types.h"
struct bch_read_bio;
-struct moving_context;
-
-enum data_cmd {
- DATA_SKIP,
- DATA_SCRUB,
- DATA_ADD_REPLICAS,
- DATA_REWRITE,
- DATA_PROMOTE,
-};
-struct data_opts {
- u16 target;
- unsigned rewrite_dev;
- int btree_insert_flags;
+struct moving_context {
+ struct btree_trans *trans;
+ struct list_head list;
+ void *fn;
+
+ struct bch_ratelimit *rate;
+ struct bch_move_stats *stats;
+ struct write_point_specifier wp;
+ bool wait_on_copygc;
+ bool write_error;
+
+ /* For waiting on outstanding reads and writes: */
+ struct closure cl;
+
+ struct mutex lock;
+ struct list_head reads;
+ struct list_head ios;
+
+ /* in flight sectors: */
+ atomic_t read_sectors;
+ atomic_t write_sectors;
+ atomic_t read_ios;
+ atomic_t write_ios;
+
+ wait_queue_head_t wait;
};
-struct migrate_write {
- enum data_cmd data_cmd;
- struct data_opts data_opts;
+#define move_ctxt_wait_event_timeout(_ctxt, _cond, _timeout) \
+({ \
+ int _ret = 0; \
+ while (true) { \
+ bool cond_finished = false; \
+ bch2_moving_ctxt_do_pending_writes(_ctxt); \
+ \
+ if (_cond) \
+ break; \
+ bch2_trans_unlock_long((_ctxt)->trans); \
+ _ret = __wait_event_timeout((_ctxt)->wait, \
+ bch2_moving_ctxt_next_pending_write(_ctxt) || \
+ (cond_finished = (_cond)), _timeout); \
+ if (_ret || (cond_finished)) \
+ break; \
+ } \
+ _ret; \
+})
- unsigned nr_ptrs_reserved;
+#define move_ctxt_wait_event(_ctxt, _cond) \
+do { \
+ bool cond_finished = false; \
+ bch2_moving_ctxt_do_pending_writes(_ctxt); \
+ \
+ if (_cond) \
+ break; \
+ bch2_trans_unlock_long((_ctxt)->trans); \
+ __wait_event((_ctxt)->wait, \
+ bch2_moving_ctxt_next_pending_write(_ctxt) || \
+ (cond_finished = (_cond))); \
+ if (cond_finished) \
+ break; \
+} while (1)
- struct moving_context *ctxt;
+typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
+ struct bch_io_opts *, struct data_update_opts *);
- /* what we read: */
- struct bch_extent_ptr ptr;
- u64 offset;
+extern const char * const bch2_data_ops_strs[];
- struct bch_write_op op;
+void bch2_moving_ctxt_exit(struct moving_context *);
+void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
+ struct bch_ratelimit *, struct bch_move_stats *,
+ struct write_point_specifier, bool);
+struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *);
+void bch2_moving_ctxt_flush_all(struct moving_context *);
+void bch2_move_ctxt_wait_for_io(struct moving_context *);
+int bch2_move_ratelimit(struct moving_context *);
+
+/* Inodes in different snapshots may have different IO options: */
+struct snapshot_io_opts_entry {
+ u32 snapshot;
+ struct bch_io_opts io_opts;
};
-void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *);
-int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
- struct write_point_specifier,
- struct bch_io_opts,
- enum data_cmd, struct data_opts,
- struct bkey_s_c);
+struct per_snapshot_io_opts {
+ u64 cur_inum;
+ struct bch_io_opts fs_io_opts;
+ DARRAY(struct snapshot_io_opts_entry) d;
+};
-typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
- enum bkey_type, struct bkey_s_c_extent,
- struct bch_io_opts *, struct data_opts *);
+static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c)
+{
+ memset(io_opts, 0, sizeof(*io_opts));
+ io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts);
+}
-int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
+static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts)
+{
+ darray_exit(&io_opts->d);
+}
+
+int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *,
+ struct btree_iter *, struct bkey_s_c);
+
+int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
+
+int bch2_move_extent(struct moving_context *,
+ struct move_bucket_in_flight *,
+ struct btree_iter *,
+ struct bkey_s_c,
+ struct bch_io_opts,
+ struct data_update_opts);
+
+int __bch2_move_data(struct moving_context *,
+ struct bbpos,
+ struct bbpos,
+ move_pred_fn, void *);
+int bch2_move_data(struct bch_fs *,
+ struct bbpos start,
+ struct bbpos end,
+ struct bch_ratelimit *,
+ struct bch_move_stats *,
struct write_point_specifier,
- struct bpos, struct bpos,
- move_pred_fn, void *,
- struct bch_move_stats *);
+ bool,
+ move_pred_fn, void *);
+int bch2_evacuate_bucket(struct moving_context *,
+ struct move_bucket_in_flight *,
+ struct bpos, int,
+ struct data_update_opts);
int bch2_data_job(struct bch_fs *,
struct bch_move_stats *,
struct bch_ioctl_data);
+void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
+void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
+void bch2_move_stats_init(struct bch_move_stats *, const char *);
+
+void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
+
+void bch2_fs_move_init(struct bch_fs *);
+
#endif /* _BCACHEFS_MOVE_H */
diff --git a/libbcachefs/move_types.h b/libbcachefs/move_types.h
index 832542a8..e22841ef 100644
--- a/libbcachefs/move_types.h
+++ b/libbcachefs/move_types.h
@@ -1,14 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_MOVE_TYPES_H
#define _BCACHEFS_MOVE_TYPES_H
+#include "bbpos_types.h"
+
struct bch_move_stats {
enum bch_data_type data_type;
- struct btree_iter iter;
+ struct bbpos pos;
+ char name[32];
atomic64_t keys_moved;
- atomic64_t sectors_moved;
+ atomic64_t keys_raced;
atomic64_t sectors_seen;
+ atomic64_t sectors_moved;
atomic64_t sectors_raced;
};
+struct move_bucket_key {
+ struct bpos bucket;
+ u8 gen;
+};
+
+struct move_bucket {
+ struct move_bucket_key k;
+ unsigned sectors;
+};
+
+struct move_bucket_in_flight {
+ struct move_bucket_in_flight *next;
+ struct rhash_head hash;
+ struct move_bucket bucket;
+ atomic_t count;
+};
+
#endif /* _BCACHEFS_MOVE_TYPES_H */
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 7bef4561..85c361e7 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Moving/copying garbage collector
*
@@ -5,282 +6,421 @@
*/
#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
#include "btree_iter.h"
#include "btree_update.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "clock.h"
-#include "disk_groups.h"
-#include "extents.h"
-#include "eytzinger.h"
-#include "io.h"
-#include "keylist.h"
+#include "errcode.h"
+#include "error.h"
+#include "lru.h"
#include "move.h"
#include "movinggc.h"
-#include "super-io.h"
+#include "trace.h"
-#include <trace/events/bcachefs.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/sched/task.h>
-#include <linux/sort.h>
#include <linux/wait.h>
-/*
- * We can't use the entire copygc reserve in one iteration of copygc: we may
- * need the buckets we're freeing up to go back into the copygc reserve to make
- * forward progress, but if the copygc reserve is full they'll be available for
- * any allocation - and it's possible that in a given iteration, we free up most
- * of the buckets we're going to free before we allocate most of the buckets
- * we're going to allocate.
- *
- * If we only use half of the reserve per iteration, then in steady state we'll
- * always have room in the reserve for the buckets we're going to need in the
- * next iteration:
- */
-#define COPYGC_BUCKETS_PER_ITER(ca) \
- ((ca)->free[RESERVE_MOVINGGC].size / 2)
+struct buckets_in_flight {
+ struct rhashtable table;
+ struct move_bucket_in_flight *first;
+ struct move_bucket_in_flight *last;
+ size_t nr;
+ size_t sectors;
+};
+
+static const struct rhashtable_params bch_move_bucket_params = {
+ .head_offset = offsetof(struct move_bucket_in_flight, hash),
+ .key_offset = offsetof(struct move_bucket_in_flight, bucket.k),
+ .key_len = sizeof(struct move_bucket_key),
+ .automatic_shrinking = true,
+};
+
+static struct move_bucket_in_flight *
+move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b)
+{
+ struct move_bucket_in_flight *new = kzalloc(sizeof(*new), GFP_KERNEL);
+ int ret;
-/*
- * Max sectors to move per iteration: Have to take into account internal
- * fragmentation from the multiple write points for each generation:
- */
-#define COPYGC_SECTORS_PER_ITER(ca) \
- ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
+ if (!new)
+ return ERR_PTR(-ENOMEM);
-static inline int sectors_used_cmp(copygc_heap *heap,
- struct copygc_heap_entry l,
- struct copygc_heap_entry r)
-{
- return (l.sectors > r.sectors) - (l.sectors < r.sectors);
+ new->bucket = b;
+
+ ret = rhashtable_lookup_insert_fast(&list->table, &new->hash,
+ bch_move_bucket_params);
+ if (ret) {
+ kfree(new);
+ return ERR_PTR(ret);
+ }
+
+ if (!list->first)
+ list->first = new;
+ else
+ list->last->next = new;
+
+ list->last = new;
+ list->nr++;
+ list->sectors += b.sectors;
+ return new;
}
-static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
+static int bch2_bucket_is_movable(struct btree_trans *trans,
+ struct move_bucket *b, u64 time)
{
- const struct copygc_heap_entry *l = _l;
- const struct copygc_heap_entry *r = _r;
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_alloc_v4 _a;
+ const struct bch_alloc_v4 *a;
+ int ret;
+
+ if (bch2_bucket_is_open(trans->c,
+ b->k.bucket.inode,
+ b->k.bucket.offset))
+ return 0;
- return (l->offset > r->offset) - (l->offset < r->offset);
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
+ b->k.bucket, BTREE_ITER_cached);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ struct bch_dev *ca = bch2_dev_tryget(c, k.k->p.inode);
+ if (!ca)
+ goto out;
+
+ a = bch2_alloc_to_v4(k, &_a);
+ b->k.gen = a->gen;
+ b->sectors = bch2_bucket_sectors_dirty(*a);
+ u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
+
+ ret = lru_idx && lru_idx <= time;
+
+ bch2_dev_put(ca);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
-static bool __copygc_pred(struct bch_dev *ca,
- struct bkey_s_c_extent e)
+static void move_buckets_wait(struct moving_context *ctxt,
+ struct buckets_in_flight *list,
+ bool flush)
{
- copygc_heap *h = &ca->copygc_heap;
- const struct bch_extent_ptr *ptr =
- bch2_extent_has_device(e, ca->dev_idx);
+ struct move_bucket_in_flight *i;
+ int ret;
+
+ while ((i = list->first)) {
+ if (flush)
+ move_ctxt_wait_event(ctxt, !atomic_read(&i->count));
- if (ptr) {
- struct copygc_heap_entry search = { .offset = ptr->offset };
+ if (atomic_read(&i->count))
+ break;
- ssize_t i = eytzinger0_find_le(h->data, h->used,
- sizeof(h->data[0]),
- bucket_offset_cmp, &search);
+ list->first = i->next;
+ if (!list->first)
+ list->last = NULL;
- return (i >= 0 &&
- ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
- ptr->gen == h->data[i].gen);
+ list->nr--;
+ list->sectors -= i->bucket.sectors;
+
+ ret = rhashtable_remove_fast(&list->table, &i->hash,
+ bch_move_bucket_params);
+ BUG_ON(ret);
+ kfree(i);
}
- return false;
+ bch2_trans_unlock_long(ctxt->trans);
}
-static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
- enum bkey_type type,
- struct bkey_s_c_extent e,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
+static bool bucket_in_flight(struct buckets_in_flight *list,
+ struct move_bucket_key k)
{
- struct bch_dev *ca = arg;
-
- if (!__copygc_pred(ca, e))
- return DATA_SKIP;
-
- data_opts->target = dev_to_target(ca->dev_idx);
- data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE;
- data_opts->rewrite_dev = ca->dev_idx;
- return DATA_REWRITE;
+ return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params);
}
-static bool have_copygc_reserve(struct bch_dev *ca)
+typedef DARRAY(struct move_bucket) move_buckets;
+
+static int bch2_copygc_get_buckets(struct moving_context *ctxt,
+ struct buckets_in_flight *buckets_in_flight,
+ move_buckets *buckets)
{
- bool ret;
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
+ size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
+ int ret;
- spin_lock(&ca->freelist_lock);
- ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) ||
- ca->allocator_blocked;
- spin_unlock(&ca->freelist_lock);
+ move_buckets_wait(ctxt, buckets_in_flight, false);
- return ret;
+ ret = bch2_btree_write_buffer_tryflush(trans);
+ if (bch2_err_matches(ret, EROFS))
+ return ret;
+
+ if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
+ return ret;
+
+ bch2_trans_begin(trans);
+
+ ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
+ lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0),
+ lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
+ 0, k, ({
+ struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
+ int ret2 = 0;
+
+ saw++;
+
+ ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p));
+ if (ret2 < 0)
+ goto err;
+
+ if (!ret2)
+ not_movable++;
+ else if (bucket_in_flight(buckets_in_flight, b.k))
+ in_flight++;
+ else {
+ ret2 = darray_push(buckets, b);
+ if (ret2)
+ goto err;
+ sectors += b.sectors;
+ }
+
+ ret2 = buckets->nr >= nr_to_get;
+err:
+ ret2;
+ }));
+
+ pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
+ buckets_in_flight->nr, buckets_in_flight->sectors,
+ saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret);
+
+ return ret < 0 ? ret : 0;
}
-static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
+noinline
+static int bch2_copygc(struct moving_context *ctxt,
+ struct buckets_in_flight *buckets_in_flight,
+ bool *did_work)
{
- copygc_heap *h = &ca->copygc_heap;
- struct copygc_heap_entry e, *i;
- struct bucket_array *buckets;
- struct bch_move_stats move_stats;
- u64 sectors_to_move = 0, sectors_not_moved = 0;
- u64 buckets_to_move, buckets_not_moved = 0;
- size_t b;
- int ret;
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct data_update_opts data_opts = {
+ .btree_insert_flags = BCH_WATERMARK_copygc,
+ };
+ move_buckets buckets = { 0 };
+ struct move_bucket_in_flight *f;
+ u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
+ int ret = 0;
+
+ ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets);
+ if (ret)
+ goto err;
+
+ darray_for_each(buckets, i) {
+ if (kthread_should_stop() || freezing(current))
+ break;
- memset(&move_stats, 0, sizeof(move_stats));
- closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
-
- /*
- * Find buckets with lowest sector counts, skipping completely
- * empty buckets, by building a maxheap sorted by sector count,
- * and repeatedly replacing the maximum element until all
- * buckets have been visited.
- */
- h->used = 0;
-
- /*
- * We need bucket marks to be up to date - gc can't be recalculating
- * them:
- */
- down_read(&c->gc_lock);
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
- struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
- struct copygc_heap_entry e;
-
- if (m.owned_by_allocator ||
- m.data_type != BCH_DATA_USER ||
- !bucket_sectors_used(m) ||
- bucket_sectors_used(m) >= ca->mi.bucket_size)
+ f = move_bucket_in_flight_add(buckets_in_flight, *i);
+ ret = PTR_ERR_OR_ZERO(f);
+ if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */
+ ret = 0;
continue;
+ }
+ if (ret == -ENOMEM) { /* flush IO, continue later */
+ ret = 0;
+ break;
+ }
+
+ ret = bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket,
+ f->bucket.k.gen, data_opts);
+ if (ret)
+ goto err;
- e = (struct copygc_heap_entry) {
- .gen = m.gen,
- .sectors = bucket_sectors_used(m),
- .offset = bucket_to_sector(ca, b),
- };
- heap_add_or_replace(h, e, -sectors_used_cmp);
+ *did_work = true;
}
- up_read(&ca->bucket_lock);
- up_read(&c->gc_lock);
+err:
+ darray_exit(&buckets);
- for (i = h->data; i < h->data + h->used; i++)
- sectors_to_move += i->sectors;
+ /* no entries in LRU btree found, or got to end: */
+ if (bch2_err_matches(ret, ENOENT))
+ ret = 0;
- while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
- BUG_ON(!heap_pop(h, e, -sectors_used_cmp));
- sectors_to_move -= e.sectors;
- }
+ if (ret < 0 && !bch2_err_matches(ret, EROFS))
+ bch_err_msg(c, ret, "from bch2_move_data()");
- buckets_to_move = h->used;
+ moved = atomic64_read(&ctxt->stats->sectors_moved) - moved;
+ trace_and_count(c, copygc, c, moved, 0, 0, 0);
+ return ret;
+}
- if (!buckets_to_move)
- return;
+/*
+ * Copygc runs when the amount of fragmented data is above some arbitrary
+ * threshold:
+ *
+ * The threshold at the limit - when the device is full - is the amount of space
+ * we reserved in bch2_recalc_capacity; we can't have more than that amount of
+ * disk space stranded due to fragmentation and store everything we have
+ * promised to store.
+ *
+ * But we don't want to be running copygc unnecessarily when the device still
+ * has plenty of free space - rather, we want copygc to smoothly run every so
+ * often and continually reduce the amount of fragmented space as the device
+ * fills up. So, we increase the threshold by half the current free space.
+ */
+unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
+{
+ s64 wait = S64_MAX, fragmented_allowed, fragmented;
- eytzinger0_sort(h->data, h->used,
- sizeof(h->data[0]),
- bucket_offset_cmp, NULL);
+ for_each_rw_member(c, ca) {
+ struct bch_dev_usage usage = bch2_dev_usage_read(ca);
- ret = bch2_move_data(c, &ca->copygc_pd.rate,
- writepoint_ptr(&ca->copygc_write_point),
- POS_MIN, POS_MAX,
- copygc_pred, ca,
- &move_stats);
+ fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) *
+ ca->mi.bucket_size) >> 1);
+ fragmented = 0;
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
- for (i = h->data; i < h->data + h->used; i++) {
- size_t b = sector_to_bucket(ca, i->offset);
- struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
+ for (unsigned i = 0; i < BCH_DATA_NR; i++)
+ if (data_type_movable(i))
+ fragmented += usage.d[i].fragmented;
- if (i->gen == m.gen && bucket_sectors_used(m)) {
- sectors_not_moved += bucket_sectors_used(m);
- buckets_not_moved++;
- }
+ wait = min(wait, max(0LL, fragmented_allowed - fragmented));
}
- up_read(&ca->bucket_lock);
- if (sectors_not_moved && !ret)
- bch_warn(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved",
- sectors_not_moved, sectors_to_move,
- buckets_not_moved, buckets_to_move);
+ return wait;
+}
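A worked example of the threshold above may help; the numbers are illustrative, not taken from this patch:

/*
 * Illustrative only, single rw device: 1,048,576 free buckets of
 * 1024 sectors at the stripe watermark gives
 *
 *	fragmented_allowed = (1048576 * 1024) >> 1 = 536870912 sectors
 *
 * If the movable data types currently account for 400,000,000
 * fragmented sectors, then
 *
 *	wait = max(0, 536870912 - 400000000) = 136870912 sectors
 *
 * The copygc thread (below) compares this against clock->max_slop;
 * while wait is larger it sleeps on the write io_clock until roughly
 * that many more sectors have been written, and once fragmented
 * catches up with fragmented_allowed, wait clamps toward 0 and copygc
 * runs on the next loop iteration.
 */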
- trace_copygc(ca,
- atomic64_read(&move_stats.sectors_moved), sectors_not_moved,
- buckets_to_move, buckets_not_moved);
+void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ printbuf_tabstop_push(out, 32);
+ prt_printf(out, "running:\t%u\n", c->copygc_running);
+ prt_printf(out, "copygc_wait:\t%llu\n", c->copygc_wait);
+ prt_printf(out, "copygc_wait_at:\t%llu\n", c->copygc_wait_at);
+
+ prt_printf(out, "Currently waiting for:\t");
+ prt_human_readable_u64(out, max(0LL, c->copygc_wait -
+ atomic64_read(&c->io_clock[WRITE].now)) << 9);
+ prt_newline(out);
+
+ prt_printf(out, "Currently waiting since:\t");
+ prt_human_readable_u64(out, max(0LL,
+ atomic64_read(&c->io_clock[WRITE].now) -
+ c->copygc_wait_at) << 9);
+ prt_newline(out);
+
+ prt_printf(out, "Currently calculated wait:\t");
+ prt_human_readable_u64(out, bch2_copygc_wait_amount(c));
+ prt_newline(out);
}
static int bch2_copygc_thread(void *arg)
{
- struct bch_dev *ca = arg;
- struct bch_fs *c = ca->fs;
+ struct bch_fs *c = arg;
+ struct moving_context ctxt;
+ struct bch_move_stats move_stats;
struct io_clock *clock = &c->io_clock[WRITE];
- struct bch_dev_usage usage;
- unsigned long last;
- u64 available, fragmented, reserve, next;
+ struct buckets_in_flight *buckets;
+ u64 last, wait;
+ int ret = 0;
+
+ buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL);
+ if (!buckets)
+ return -ENOMEM;
+ ret = rhashtable_init(&buckets->table, &bch_move_bucket_params);
+ bch_err_msg(c, ret, "allocating copygc buckets in flight");
+ if (ret) {
+ kfree(buckets);
+ return ret;
+ }
set_freezable();
- while (!kthread_should_stop()) {
- if (kthread_wait_freezable(c->copy_gc_enabled))
- break;
+ bch2_move_stats_init(&move_stats, "copygc");
+ bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
+ writepoint_ptr(&c->copygc_write_point),
+ false);
- last = atomic_long_read(&clock->now);
+ while (!ret && !kthread_should_stop()) {
+ bool did_work = false;
- reserve = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
- ca->mi.bucket_size *
- c->opts.gc_reserve_percent, 200);
+ bch2_trans_unlock_long(ctxt.trans);
+ cond_resched();
- usage = bch2_dev_usage_read(c, ca);
+ if (!c->opts.copygc_enabled) {
+ move_buckets_wait(&ctxt, buckets, true);
+ kthread_wait_freezable(c->opts.copygc_enabled ||
+ kthread_should_stop());
+ }
- /*
- * don't start copygc until less than half the gc reserve is
- * available:
- */
- available = __dev_buckets_available(ca, usage) *
- ca->mi.bucket_size;
- if (available > reserve) {
- next = last + available - reserve;
- bch2_kthread_io_clock_wait(clock, next,
- MAX_SCHEDULE_TIMEOUT);
+ if (unlikely(freezing(current))) {
+ move_buckets_wait(&ctxt, buckets, true);
+ __refrigerator(false);
continue;
}
- /*
- * don't start copygc until there's more than half the copygc
- * reserve of fragmented space:
- */
- fragmented = usage.sectors_fragmented;
- if (fragmented < reserve) {
- next = last + reserve - fragmented;
- bch2_kthread_io_clock_wait(clock, next,
+ last = atomic64_read(&clock->now);
+ wait = bch2_copygc_wait_amount(c);
+
+ if (wait > clock->max_slop) {
+ c->copygc_wait_at = last;
+ c->copygc_wait = last + wait;
+ move_buckets_wait(&ctxt, buckets, true);
+ trace_and_count(c, copygc_wait, c, wait, last + wait);
+ bch2_kthread_io_clock_wait(clock, last + wait,
MAX_SCHEDULE_TIMEOUT);
continue;
}
- bch2_copygc(c, ca);
+ c->copygc_wait = 0;
+
+ c->copygc_running = true;
+ ret = bch2_copygc(&ctxt, buckets, &did_work);
+ c->copygc_running = false;
+
+ wake_up(&c->copygc_running_wq);
+
+ if (!wait && !did_work) {
+ u64 min_member_capacity = bch2_min_rw_member_capacity(c);
+
+ if (min_member_capacity == U64_MAX)
+ min_member_capacity = 128 * 2048;
+
+ move_buckets_wait(&ctxt, buckets, true);
+ bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
+ MAX_SCHEDULE_TIMEOUT);
+ }
}
+ move_buckets_wait(&ctxt, buckets, true);
+
+ rhashtable_destroy(&buckets->table);
+ kfree(buckets);
+ bch2_moving_ctxt_exit(&ctxt);
+ bch2_move_stats_exit(&move_stats, c);
+
return 0;
}
-void bch2_copygc_stop(struct bch_dev *ca)
+void bch2_copygc_stop(struct bch_fs *c)
{
- ca->copygc_pd.rate.rate = UINT_MAX;
- bch2_ratelimit_reset(&ca->copygc_pd.rate);
-
- if (ca->copygc_thread) {
- kthread_stop(ca->copygc_thread);
- put_task_struct(ca->copygc_thread);
+ if (c->copygc_thread) {
+ kthread_stop(c->copygc_thread);
+ put_task_struct(c->copygc_thread);
}
- ca->copygc_thread = NULL;
+ c->copygc_thread = NULL;
}
-int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca)
+int bch2_copygc_start(struct bch_fs *c)
{
struct task_struct *t;
+ int ret;
- BUG_ON(ca->copygc_thread);
+ if (c->copygc_thread)
+ return 0;
if (c->opts.nochanges)
return 0;
@@ -288,21 +428,22 @@ int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca)
if (bch2_fs_init_fault("copygc_start"))
return -ENOMEM;
- t = kthread_create(bch2_copygc_thread, ca,
- "bch_copygc[%s]", ca->name);
- if (IS_ERR(t))
- return PTR_ERR(t);
+ t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
+ ret = PTR_ERR_OR_ZERO(t);
+ bch_err_msg(c, ret, "creating copygc thread");
+ if (ret)
+ return ret;
get_task_struct(t);
- ca->copygc_thread = t;
- wake_up_process(ca->copygc_thread);
+ c->copygc_thread = t;
+ wake_up_process(c->copygc_thread);
return 0;
}
-void bch2_dev_copygc_init(struct bch_dev *ca)
+void bch2_fs_copygc_init(struct bch_fs *c)
{
- bch2_pd_controller_init(&ca->copygc_pd);
- ca->copygc_pd.d_term = 0;
+ init_waitqueue_head(&c->copygc_running_wq);
+ c->copygc_running = false;
}
diff --git a/libbcachefs/movinggc.h b/libbcachefs/movinggc.h
index c46fa1f1..ea181fef 100644
--- a/libbcachefs/movinggc.h
+++ b/libbcachefs/movinggc.h
@@ -1,8 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_MOVINGGC_H
#define _BCACHEFS_MOVINGGC_H
-void bch2_copygc_stop(struct bch_dev *);
-int bch2_copygc_start(struct bch_fs *, struct bch_dev *);
-void bch2_dev_copygc_init(struct bch_dev *);
+unsigned long bch2_copygc_wait_amount(struct bch_fs *);
+void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *);
+
+void bch2_copygc_stop(struct bch_fs *);
+int bch2_copygc_start(struct bch_fs *);
+void bch2_fs_copygc_init(struct bch_fs *);
#endif /* _BCACHEFS_MOVINGGC_H */
diff --git a/libbcachefs/nocow_locking.c b/libbcachefs/nocow_locking.c
new file mode 100644
index 00000000..3c21981a
--- /dev/null
+++ b/libbcachefs/nocow_locking.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "nocow_locking.h"
+#include "util.h"
+
+#include <linux/closure.h>
+
+bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket)
+{
+ u64 dev_bucket = bucket_to_u64(bucket);
+ struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(l->b); i++)
+ if (l->b[i] == dev_bucket && atomic_read(&l->l[i]))
+ return true;
+ return false;
+}
+
+#define sign(v) (v < 0 ? -1 : v > 0 ? 1 : 0)
+
+void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags)
+{
+ u64 dev_bucket = bucket_to_u64(bucket);
+ struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+ int lock_val = flags ? 1 : -1;
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(l->b); i++)
+ if (l->b[i] == dev_bucket) {
+ int v = atomic_sub_return(lock_val, &l->l[i]);
+
+ BUG_ON(v && sign(v) != lock_val);
+ if (!v)
+ closure_wake_up(&l->wait);
+ return;
+ }
+
+ BUG();
+}
+
+bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l,
+ u64 dev_bucket, int flags)
+{
+ int v, lock_val = flags ? 1 : -1;
+ unsigned i;
+
+ spin_lock(&l->lock);
+
+ for (i = 0; i < ARRAY_SIZE(l->b); i++)
+ if (l->b[i] == dev_bucket)
+ goto got_entry;
+
+ for (i = 0; i < ARRAY_SIZE(l->b); i++)
+ if (!atomic_read(&l->l[i])) {
+ l->b[i] = dev_bucket;
+ goto take_lock;
+ }
+fail:
+ spin_unlock(&l->lock);
+ return false;
+got_entry:
+ v = atomic_read(&l->l[i]);
+ if (lock_val > 0 ? v < 0 : v > 0)
+ goto fail;
+take_lock:
+ v = atomic_read(&l->l[i]);
+ /* Overflow? */
+ if (v && sign(v + lock_val) != sign(v))
+ goto fail;
+
+ atomic_add(lock_val, &l->l[i]);
+ spin_unlock(&l->lock);
+ return true;
+}
+
+void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
+ struct nocow_lock_bucket *l,
+ u64 dev_bucket, int flags)
+{
+ if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) {
+ struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks);
+ u64 start_time = local_clock();
+
+ __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags));
+ bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
+ }
+}
+
+void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t)
+{
+ unsigned i, nr_zero = 0;
+ struct nocow_lock_bucket *l;
+
+ for (l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) {
+ unsigned v = 0;
+
+ for (i = 0; i < ARRAY_SIZE(l->l); i++)
+ v |= atomic_read(&l->l[i]);
+
+ if (!v) {
+ nr_zero++;
+ continue;
+ }
+
+ if (nr_zero)
+ prt_printf(out, "(%u empty entries)\n", nr_zero);
+ nr_zero = 0;
+
+ for (i = 0; i < ARRAY_SIZE(l->l); i++) {
+ int v = atomic_read(&l->l[i]);
+ if (v) {
+ bch2_bpos_to_text(out, u64_to_bucket(l->b[i]));
+ prt_printf(out, ": %s %u ", v < 0 ? "copy" : "update", abs(v));
+ }
+ }
+ prt_newline(out);
+ }
+
+ if (nr_zero)
+ prt_printf(out, "(%u empty entries)\n", nr_zero);
+}
+
+void bch2_fs_nocow_locking_exit(struct bch_fs *c)
+{
+ struct bucket_nocow_lock_table *t = &c->nocow_locks;
+
+ for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
+ for (unsigned j = 0; j < ARRAY_SIZE(l->l); j++)
+ BUG_ON(atomic_read(&l->l[j]));
+}
+
+int bch2_fs_nocow_locking_init(struct bch_fs *c)
+{
+ struct bucket_nocow_lock_table *t = &c->nocow_locks;
+
+ for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
+ spin_lock_init(&l->lock);
+
+ return 0;
+}
diff --git a/libbcachefs/nocow_locking.h b/libbcachefs/nocow_locking.h
new file mode 100644
index 00000000..f9d6a426
--- /dev/null
+++ b/libbcachefs/nocow_locking.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_NOCOW_LOCKING_H
+#define _BCACHEFS_NOCOW_LOCKING_H
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "nocow_locking_types.h"
+
+#include <linux/hash.h>
+
+static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lock_table *t,
+ u64 dev_bucket)
+{
+ unsigned h = hash_64(dev_bucket, BUCKET_NOCOW_LOCKS_BITS);
+
+ return t->l + (h & (BUCKET_NOCOW_LOCKS - 1));
+}
+
+#define BUCKET_NOCOW_LOCK_UPDATE (1 << 0)
+
+bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos);
+void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, struct bpos, int);
+bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int);
+void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *,
+ struct nocow_lock_bucket *, u64, int);
+
+static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
+ struct bpos bucket, int flags)
+{
+ u64 dev_bucket = bucket_to_u64(bucket);
+ struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+
+ __bch2_bucket_nocow_lock(t, l, dev_bucket, flags);
+}
+
+static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t,
+ struct bpos bucket, int flags)
+{
+ u64 dev_bucket = bucket_to_u64(bucket);
+ struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+
+ return __bch2_bucket_nocow_trylock(l, dev_bucket, flags);
+}
+
+void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *);
+
+void bch2_fs_nocow_locking_exit(struct bch_fs *);
+int bch2_fs_nocow_locking_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_NOCOW_LOCKING_H */
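A usage sketch for the API above; the caller is hypothetical, and the update/copy naming is inferred from the strings printed by bch2_nocow_locks_to_text():

/*
 * Each bucket hashes to one of BUCKET_NOCOW_LOCKS (1024) cacheline-sized
 * entries with four slots; a slot's counter goes positive for "update"
 * holders (flags set) and negative for "copy" holders (flags == 0), so
 * the two sides exclude each other while holders of the same side nest.
 */
static void example_nocow_update(struct bch_fs *c, struct bpos bucket)
{
	/* hypothetical writer doing an in-place (nocow) overwrite: */
	bch2_bucket_nocow_lock(&c->nocow_locks, bucket, BUCKET_NOCOW_LOCK_UPDATE);
	/* ... overwrite extents ... */
	bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, BUCKET_NOCOW_LOCK_UPDATE);
}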
diff --git a/libbcachefs/nocow_locking_types.h b/libbcachefs/nocow_locking_types.h
new file mode 100644
index 00000000..bd12bf67
--- /dev/null
+++ b/libbcachefs/nocow_locking_types.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_NOCOW_LOCKING_TYPES_H
+#define _BCACHEFS_NOCOW_LOCKING_TYPES_H
+
+#define BUCKET_NOCOW_LOCKS_BITS 10
+#define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS)
+
+struct nocow_lock_bucket {
+ struct closure_waitlist wait;
+ spinlock_t lock;
+ u64 b[4];
+ atomic_t l[4];
+} __aligned(SMP_CACHE_BYTES);
+
+struct bucket_nocow_lock_table {
+ struct nocow_lock_bucket l[BUCKET_NOCOW_LOCKS];
+};
+
+#endif /* _BCACHEFS_NOCOW_LOCKING_TYPES_H */
+
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index 8e655bc1..6772faf3 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -1,94 +1,196 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
+#include <linux/fs_parser.h>
#include "bcachefs.h"
+#include "compress.h"
#include "disk_groups.h"
+#include "error.h"
#include "opts.h"
+#include "recovery_passes.h"
#include "super-io.h"
#include "util.h"
+#define x(t, n, ...) [n] = #t,
+
const char * const bch2_error_actions[] = {
- "continue",
- "remount-ro",
- "panic",
+ BCH_ERROR_ACTIONS()
+ NULL
+};
+
+const char * const bch2_fsck_fix_opts[] = {
+ BCH_FIX_ERRORS_OPTS()
+ NULL
+};
+
+const char * const bch2_version_upgrade_opts[] = {
+ BCH_VERSION_UPGRADE_OPTS()
+ NULL
+};
+
+const char * const bch2_sb_features[] = {
+ BCH_SB_FEATURES()
+ NULL
+};
+
+const char * const bch2_sb_compat[] = {
+ BCH_SB_COMPAT()
+ NULL
+};
+
+const char * const __bch2_btree_ids[] = {
+ BCH_BTREE_IDS()
+ NULL
+};
+
+static const char * const __bch2_csum_types[] = {
+ BCH_CSUM_TYPES()
+ NULL
+};
+
+const char * const __bch2_csum_opts[] = {
+ BCH_CSUM_OPTS()
+ NULL
+};
+
+const char * const __bch2_compression_types[] = {
+ BCH_COMPRESSION_TYPES()
NULL
};
-const char * const bch2_csum_types[] = {
- "none",
- "crc32c",
- "crc64",
+const char * const bch2_compression_opts[] = {
+ BCH_COMPRESSION_OPTS()
NULL
};
-const char * const bch2_compression_types[] = {
- "none",
- "lz4",
- "gzip",
- "zstd",
+const char * const __bch2_str_hash_types[] = {
+ BCH_STR_HASH_TYPES()
NULL
};
-const char * const bch2_str_hash_types[] = {
- "crc32c",
- "crc64",
- "siphash",
+const char * const bch2_str_hash_opts[] = {
+ BCH_STR_HASH_OPTS()
NULL
};
-const char * const bch2_data_types[] = {
- "none",
- "sb",
- "journal",
- "btree",
- "data",
- "cached",
+const char * const __bch2_data_types[] = {
+ BCH_DATA_TYPES()
NULL
};
-const char * const bch2_cache_replacement_policies[] = {
- "lru",
- "fifo",
- "random",
+const char * const bch2_member_states[] = {
+ BCH_MEMBER_STATES()
NULL
};
-/* Default is -1; we skip past it for struct cached_dev's cache mode */
-const char * const bch2_cache_modes[] = {
- "default",
- "writethrough",
- "writeback",
- "writearound",
- "none",
+static const char * const __bch2_jset_entry_types[] = {
+ BCH_JSET_ENTRY_TYPES()
NULL
};
-const char * const bch2_dev_state[] = {
- "readwrite",
- "readonly",
- "failed",
- "spare",
+static const char * const __bch2_fs_usage_types[] = {
+ BCH_FS_USAGE_TYPES()
NULL
};
+#undef x
+
+static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[],
+ unsigned nr, const char *type, unsigned idx)
+{
+ if (idx < nr)
+ prt_str(out, opts[idx]);
+ else
+ prt_printf(out, "(unknown %s %u)", type, idx);
+}
+
+#define PRT_STR_OPT_BOUNDSCHECKED(name, type) \
+void bch2_prt_##name(struct printbuf *out, type t) \
+{ \
+ prt_str_opt_boundscheck(out, __bch2_##name##s, ARRAY_SIZE(__bch2_##name##s) - 1, #name, t);\
+}
+
+PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type);
+PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type);
+PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type);
+PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt);
+PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type);
+PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type);
+PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type);
+
+static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
+ struct printbuf *err)
+{
+ if (!val) {
+ *res = FSCK_FIX_yes;
+ } else {
+ int ret = match_string(bch2_fsck_fix_opts, -1, val);
+
+ if (ret < 0 && err)
+ prt_str(err, "fix_errors: invalid selection");
+ if (ret < 0)
+ return ret;
+ *res = ret;
+ }
+
+ return 0;
+}
+
+static void bch2_opt_fix_errors_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct bch_sb *sb,
+ u64 v)
+{
+ prt_str(out, bch2_fsck_fix_opts[v]);
+}
+
+#define bch2_opt_fix_errors (struct bch_opt_fn) { \
+ .parse = bch2_opt_fix_errors_parse, \
+ .to_text = bch2_opt_fix_errors_to_text, \
+}
+
+const char * const bch2_d_types[BCH_DT_MAX] = {
+ [DT_UNKNOWN] = "unknown",
+ [DT_FIFO] = "fifo",
+ [DT_CHR] = "chr",
+ [DT_DIR] = "dir",
+ [DT_BLK] = "blk",
+ [DT_REG] = "reg",
+ [DT_LNK] = "lnk",
+ [DT_SOCK] = "sock",
+ [DT_WHT] = "whiteout",
+ [DT_SUBVOL] = "subvol",
+};
+
+u64 BCH2_NO_SB_OPT(const struct bch_sb *sb)
+{
+ BUG();
+}
+
+void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v)
+{
+ BUG();
+}
+
void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
{
-#define BCH_OPT(_name, ...) \
+#define x(_name, ...) \
if (opt_defined(src, _name)) \
opt_set(*dst, _name, src._name);
BCH_OPTS()
-#undef BCH_OPT
+#undef x
}
bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
{
switch (id) {
-#define BCH_OPT(_name, ...) \
+#define x(_name, ...) \
case Opt_##_name: \
return opt_defined(*opts, _name);
BCH_OPTS()
-#undef BCH_OPT
+#undef x
default:
BUG();
}
@@ -97,11 +199,11 @@ bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
{
switch (id) {
-#define BCH_OPT(_name, ...) \
+#define x(_name, ...) \
case Opt_##_name: \
return opts->_name;
BCH_OPTS()
-#undef BCH_OPT
+#undef x
default:
BUG();
}
@@ -110,55 +212,47 @@ u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
{
switch (id) {
-#define BCH_OPT(_name, ...) \
+#define x(_name, ...) \
case Opt_##_name: \
opt_set(*opts, _name, v); \
break;
BCH_OPTS()
-#undef BCH_OPT
+#undef x
default:
BUG();
}
}
-/*
- * Initial options from superblock - here we don't want any options undefined,
- * any options the superblock doesn't specify are set to 0:
- */
-struct bch_opts bch2_opts_from_sb(struct bch_sb *sb)
-{
- struct bch_opts opts = bch2_opts_empty();
-
-#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \
- if (_sb_opt != NO_SB_OPT) \
- opt_set(opts, _name, _sb_opt(sb));
- BCH_OPTS()
-#undef BCH_OPT
-
- return opts;
-}
-
const struct bch_option bch2_opt_table[] = {
-#define OPT_BOOL() .type = BCH_OPT_BOOL
-#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max
-#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices
-#define OPT_FN(_fn) .type = BCH_OPT_FN, \
- .parse = _fn##_parse, \
- .print = _fn##_print
-
-#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \
+#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2
+#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \
+ .min = _min, .max = _max
+#define OPT_STR(_choices) .type = BCH_OPT_STR, \
+ .min = 0, .max = ARRAY_SIZE(_choices) - 1, \
+ .choices = _choices
+#define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \
+ .min = 0, .max = U64_MAX, \
+ .choices = _choices
+#define OPT_BITFIELD(_choices) .type = BCH_OPT_BITFIELD, \
+ .choices = _choices
+#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn
+
+#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \
[Opt_##_name] = { \
.attr = { \
.name = #_name, \
- .mode = _mode == OPT_RUNTIME ? 0644 : 0444, \
+ .mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \
}, \
- .mode = _mode, \
+ .flags = _flags, \
+ .hint = _hint, \
+ .help = _help, \
+ .get_sb = _sb_opt, \
.set_sb = SET_##_sb_opt, \
_type \
},
BCH_OPTS()
-#undef BCH_OPT
+#undef x
};
int bch2_opt_lookup(const char *name)
@@ -195,180 +289,446 @@ static int bch2_mount_opt_lookup(const char *name)
return bch2_opt_lookup(name);
}
-int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt,
- const char *val, u64 *res)
+int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
+{
+ if (v < opt->min) {
+ if (err)
+ prt_printf(err, "%s: too small (min %llu)",
+ opt->attr.name, opt->min);
+ return -BCH_ERR_ERANGE_option_too_small;
+ }
+
+ if (opt->max && v >= opt->max) {
+ if (err)
+ prt_printf(err, "%s: too big (max %llu)",
+ opt->attr.name, opt->max);
+ return -BCH_ERR_ERANGE_option_too_big;
+ }
+
+ if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) {
+ if (err)
+ prt_printf(err, "%s: not a multiple of 512",
+ opt->attr.name);
+ return -BCH_ERR_opt_parse_error;
+ }
+
+ if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) {
+ if (err)
+ prt_printf(err, "%s: must be a power of two",
+ opt->attr.name);
+ return -BCH_ERR_opt_parse_error;
+ }
+
+ if (opt->fn.validate)
+ return opt->fn.validate(v, err);
+
+ return 0;
+}
+
+int bch2_opt_parse(struct bch_fs *c,
+ const struct bch_option *opt,
+ const char *val, u64 *res,
+ struct printbuf *err)
{
ssize_t ret;
switch (opt->type) {
case BCH_OPT_BOOL:
- ret = kstrtou64(val, 10, res);
- if (ret < 0)
- return ret;
+ if (val) {
+ ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool);
+ if (ret != -BCH_ERR_option_not_bool) {
+ *res = ret;
+ } else {
+ if (err)
+ prt_printf(err, "%s: must be bool", opt->attr.name);
+ return ret;
+ }
+ } else {
+ *res = 1;
+ }
- if (*res > 1)
- return -ERANGE;
break;
case BCH_OPT_UINT:
- ret = kstrtou64(val, 10, res);
- if (ret < 0)
- return ret;
+ if (!val) {
+ prt_printf(err, "%s: required value",
+ opt->attr.name);
+ return -EINVAL;
+ }
- if (*res < opt->min || *res >= opt->max)
- return -ERANGE;
+ ret = opt->flags & OPT_HUMAN_READABLE
+ ? bch2_strtou64_h(val, res)
+ : kstrtou64(val, 10, res);
+ if (ret < 0) {
+ if (err)
+ prt_printf(err, "%s: must be a number",
+ opt->attr.name);
+ return ret;
+ }
break;
case BCH_OPT_STR:
+ if (!val) {
+ prt_printf(err, "%s: required value",
+ opt->attr.name);
+ return -EINVAL;
+ }
+
ret = match_string(opt->choices, -1, val);
- if (ret < 0)
+ if (ret < 0) {
+ if (err)
+ prt_printf(err, "%s: invalid selection",
+ opt->attr.name);
return ret;
+ }
*res = ret;
break;
+ case BCH_OPT_BITFIELD: {
+ s64 v = bch2_read_flag_list(val, opt->choices);
+ if (v < 0)
+ return v;
+ *res = v;
+ break;
+ }
case BCH_OPT_FN:
- if (!c)
- return -EINVAL;
+ ret = opt->fn.parse(c, val, res, err);
+
+ if (ret == -BCH_ERR_option_needs_open_fs)
+ return ret;
- return opt->parse(c, val, res);
+ if (ret < 0) {
+ if (err)
+ prt_printf(err, "%s: parse error",
+ opt->attr.name);
+ return ret;
+ }
}
- return 0;
+ return bch2_opt_validate(opt, *res, err);
}
-int bch2_opt_to_text(struct bch_fs *c, char *buf, size_t len,
- const struct bch_option *opt, u64 v,
- unsigned flags)
+void bch2_opt_to_text(struct printbuf *out,
+ struct bch_fs *c, struct bch_sb *sb,
+ const struct bch_option *opt, u64 v,
+ unsigned flags)
{
- char *out = buf, *end = buf + len;
-
if (flags & OPT_SHOW_MOUNT_STYLE) {
- if (opt->type == BCH_OPT_BOOL)
- return scnprintf(out, end - out, "%s%s",
- v ? "" : "no",
- opt->attr.name);
+ if (opt->type == BCH_OPT_BOOL) {
+ prt_printf(out, "%s%s",
+ v ? "" : "no",
+ opt->attr.name);
+ return;
+ }
- out += scnprintf(out, end - out, "%s=", opt->attr.name);
+ prt_printf(out, "%s=", opt->attr.name);
}
switch (opt->type) {
case BCH_OPT_BOOL:
case BCH_OPT_UINT:
- out += scnprintf(out, end - out, "%lli", v);
+ if (opt->flags & OPT_HUMAN_READABLE)
+ prt_human_readable_u64(out, v);
+ else
+ prt_printf(out, "%lli", v);
break;
case BCH_OPT_STR:
- out += (flags & OPT_SHOW_FULL_LIST)
- ? bch2_scnprint_string_list(out, end - out, opt->choices, v)
- : scnprintf(out, end - out, opt->choices[v]);
+ if (v < opt->min || v >= opt->max)
+ prt_printf(out, "(invalid option %lli)", v);
+ else if (flags & OPT_SHOW_FULL_LIST)
+ prt_string_option(out, opt->choices, v);
+ else
+ prt_str(out, opt->choices[v]);
+ break;
+ case BCH_OPT_BITFIELD:
+ prt_bitflags(out, opt->choices, v);
break;
case BCH_OPT_FN:
- return opt->print(c, out, end - out, v);
+ opt->fn.to_text(out, c, sb, v);
+ break;
default:
BUG();
}
+}
+
+void bch2_opts_to_text(struct printbuf *out,
+ struct bch_opts opts,
+ struct bch_fs *c, struct bch_sb *sb,
+ unsigned show_mask, unsigned hide_mask,
+ unsigned flags)
+{
+ bool first = true;
+
+ for (enum bch_opt_id i = 0; i < bch2_opts_nr; i++) {
+ const struct bch_option *opt = &bch2_opt_table[i];
+
+ if ((opt->flags & hide_mask) || !(opt->flags & show_mask))
+ continue;
+
+ u64 v = bch2_opt_get_by_id(&opts, i);
+ if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
+ continue;
+
+ if (!first)
+ prt_char(out, ',');
+ first = false;
+
+ bch2_opt_to_text(out, c, sb, opt, v, flags);
+ }
+}
+
+int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
+{
+ int ret = 0;
+
+ switch (id) {
+ case Opt_compression:
+ case Opt_background_compression:
+ ret = bch2_check_set_has_compressed_data(c, v);
+ break;
+ case Opt_erasure_code:
+ if (v)
+ bch2_check_set_feature(c, BCH_FEATURE_ec);
+ break;
+ }
- return out - buf;
+ return ret;
}
-int bch2_parse_mount_opts(struct bch_opts *opts, char *options)
+int bch2_opts_check_may_set(struct bch_fs *c)
{
- char *opt, *name, *val;
- int ret, id;
+ unsigned i;
+ int ret;
+
+ for (i = 0; i < bch2_opts_nr; i++) {
+ ret = bch2_opt_check_may_set(c, i,
+ bch2_opt_get_by_id(&c->opts, i));
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
+ struct printbuf *parse_later,
+ const char *name, const char *val)
+{
+ struct printbuf err = PRINTBUF;
u64 v;
+ int ret, id;
- while ((opt = strsep(&options, ",")) != NULL) {
- name = strsep(&opt, "=");
- val = opt;
+ id = bch2_mount_opt_lookup(name);
- if (val) {
- id = bch2_mount_opt_lookup(name);
- if (id < 0)
- goto bad_opt;
+ /* Check for the form "noopt", negation of a boolean opt: */
+ if (id < 0 &&
+ !val &&
+ !strncmp("no", name, 2)) {
+ id = bch2_mount_opt_lookup(name + 2);
+ val = "0";
+ }
- ret = bch2_opt_parse(NULL, &bch2_opt_table[id], val, &v);
- if (ret < 0)
- goto bad_val;
- } else {
- id = bch2_mount_opt_lookup(name);
- v = 1;
+ /* Unknown options are ignored: */
+ if (id < 0)
+ return 0;
- if (id < 0 &&
- !strncmp("no", name, 2)) {
- id = bch2_mount_opt_lookup(name + 2);
- v = 0;
- }
+ if (!(bch2_opt_table[id].flags & OPT_MOUNT))
+ goto bad_opt;
- if (id < 0)
- goto bad_opt;
+ if (id == Opt_acl &&
+ !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
+ goto bad_opt;
- if (bch2_opt_table[id].type != BCH_OPT_BOOL)
- goto no_val;
- }
+ if ((id == Opt_usrquota ||
+ id == Opt_grpquota) &&
+ !IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
+ goto bad_opt;
- if (bch2_opt_table[id].mode < OPT_MOUNT)
- goto bad_opt;
+ ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err);
+ if (ret == -BCH_ERR_option_needs_open_fs && parse_later) {
+ prt_printf(parse_later, "%s=%s,", name, val);
+ if (parse_later->allocation_failure) {
+ ret = -ENOMEM;
+ goto out;
+ }
- if (id == Opt_acl &&
- !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
- goto bad_opt;
+ ret = 0;
+ goto out;
+ }
- if ((id == Opt_usrquota ||
- id == Opt_grpquota) &&
- !IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
- goto bad_opt;
+ if (ret < 0)
+ goto bad_val;
+ if (opts)
bch2_opt_set_by_id(opts, id, v);
- }
- return 0;
+ ret = 0;
+ goto out;
+
bad_opt:
pr_err("Bad mount option %s", name);
- return -1;
+ ret = -BCH_ERR_option_name;
+ goto out;
+
bad_val:
- pr_err("Invalid value %s for mount option %s", val, name);
- return -1;
-no_val:
- pr_err("Mount option %s requires a value", name);
- return -1;
-}
+ pr_err("Invalid mount option %s", err.buf);
+ ret = -BCH_ERR_option_value;
-/* io opts: */
+out:
+ printbuf_exit(&err);
+ return ret;
+}
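A few hypothetical inputs show how the helper above behaves:

/*
 * Hypothetical inputs to bch2_parse_one_mount_opt():
 *
 *   name = "metadata_replicas", val = "2"
 *	-> found in bch2_opt_table, parsed and range-checked by
 *	   bch2_opt_parse(), then stored via bch2_opt_set_by_id()
 *
 *   name = "noacl", val = NULL
 *	-> no option named "noacl"; the "no" prefix is stripped,
 *	   giving Opt_acl with val "0" (acl disabled)
 *
 *   name = "someunknownopt"
 *	-> lookup fails and the option is silently ignored
 *	   (return 0) rather than failing the mount
 */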
-struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
+int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
+ struct printbuf *parse_later, char *options)
{
- struct bch_io_opts ret = { 0 };
-#define BCH_INODE_OPT(_name, _bits) \
- if (opt_defined(src, _name)) \
- opt_set(ret, _name, src._name);
- BCH_INODE_OPTS()
-#undef BCH_INODE_OPT
+ char *copied_opts, *copied_opts_start;
+ char *opt, *name, *val;
+ int ret;
+
+ if (!options)
+ return 0;
+
+ /*
+ * sys_fsconfig() is now occasionally providing us with option lists
+ * starting with a comma - weird.
+ */
+ if (*options == ',')
+ options++;
+
+ copied_opts = kstrdup(options, GFP_KERNEL);
+ if (!copied_opts)
+ return -ENOMEM;
+ copied_opts_start = copied_opts;
+
+ while ((opt = strsep(&copied_opts, ",")) != NULL) {
+ if (!*opt)
+ continue;
+
+ name = strsep(&opt, "=");
+ val = opt;
+
+ ret = bch2_parse_one_mount_opt(c, opts, parse_later, name, val);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = 0;
+ goto out;
+
+out:
+ kfree(copied_opts_start);
return ret;
}
-struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src)
+u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id)
{
- struct bch_opts ret = { 0 };
-#define BCH_INODE_OPT(_name, _bits) \
- if (opt_defined(src, _name)) \
- opt_set(ret, _name, src._name);
- BCH_INODE_OPTS()
-#undef BCH_INODE_OPT
- return ret;
+ const struct bch_option *opt = bch2_opt_table + id;
+ u64 v;
+
+ v = opt->get_sb(sb);
+
+ if (opt->flags & OPT_SB_FIELD_ILOG2)
+ v = 1ULL << v;
+
+ if (opt->flags & OPT_SB_FIELD_SECTORS)
+ v <<= 9;
+
+ return v;
}
-void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src)
+/*
+ * Initial options from superblock - here we don't want any options undefined,
+ * any options the superblock doesn't specify are set to 0:
+ */
+int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
{
-#define BCH_INODE_OPT(_name, _bits) \
- if (opt_defined(src, _name)) \
- opt_set(*dst, _name, src._name);
+ unsigned id;
+
+ for (id = 0; id < bch2_opts_nr; id++) {
+ const struct bch_option *opt = bch2_opt_table + id;
+
+ if (opt->get_sb == BCH2_NO_SB_OPT)
+ continue;
+
+ bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id));
+ }
+
+ return 0;
+}
+
+struct bch_dev_sb_opt_set {
+ void (*set_sb)(struct bch_member *, u64);
+};
+
+static const struct bch_dev_sb_opt_set bch2_dev_sb_opt_setters[] = {
+#define x(n, set) [Opt_##n] = { .set_sb = SET_##set },
+ BCH_DEV_OPT_SETTERS()
+#undef x
+};
+
+void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
+ const struct bch_option *opt, u64 v)
+{
+ enum bch_opt_id id = opt - bch2_opt_table;
+
+ if (opt->flags & OPT_SB_FIELD_SECTORS)
+ v >>= 9;
+
+ if (opt->flags & OPT_SB_FIELD_ILOG2)
+ v = ilog2(v);
+
+ if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
+ v++;
+
+ if (opt->flags & OPT_FS) {
+ if (opt->set_sb != SET_BCH2_NO_SB_OPT)
+ opt->set_sb(sb, v);
+ }
+
+ if ((opt->flags & OPT_DEVICE) && dev_idx >= 0) {
+ if (WARN(!bch2_member_exists(sb, dev_idx),
+ "tried to set device option %s on nonexistent device %i",
+ opt->attr.name, dev_idx))
+ return;
+
+ struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx);
+
+ const struct bch_dev_sb_opt_set *set = bch2_dev_sb_opt_setters + id;
+ if (set->set_sb)
+ set->set_sb(m, v);
+ else
+ pr_err("option %s cannot be set via opt_set_sb()", opt->attr.name);
+ }
+}
+
+void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca,
+ const struct bch_option *opt, u64 v)
+{
+ mutex_lock(&c->sb_lock);
+ __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+}
+
+/* io opts: */
+
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
+{
+ struct bch_io_opts opts = {
+#define x(_name, _bits) ._name = src._name,
BCH_INODE_OPTS()
-#undef BCH_INODE_OPT
+#undef x
+ };
+
+ bch2_io_opts_fixups(&opts);
+ return opts;
}
bool bch2_opt_is_inode_opt(enum bch_opt_id id)
{
static const enum bch_opt_id inode_opt_list[] = {
-#define BCH_INODE_OPT(_name, _bits) Opt_##_name,
+#define x(_name, _bits) Opt_##_name,
BCH_INODE_OPTS()
-#undef BCH_INODE_OPT
+#undef x
};
unsigned i;
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index f476033e..ea69099e 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_OPTS_H
#define _BCACHEFS_OPTS_H
@@ -7,14 +8,35 @@
#include <linux/sysfs.h>
#include "bcachefs_format.h"
+struct bch_fs;
+
extern const char * const bch2_error_actions[];
-extern const char * const bch2_csum_types[];
-extern const char * const bch2_compression_types[];
-extern const char * const bch2_str_hash_types[];
-extern const char * const bch2_data_types[];
-extern const char * const bch2_cache_replacement_policies[];
-extern const char * const bch2_cache_modes[];
-extern const char * const bch2_dev_state[];
+extern const char * const bch2_fsck_fix_opts[];
+extern const char * const bch2_version_upgrade_opts[];
+extern const char * const bch2_sb_features[];
+extern const char * const bch2_sb_compat[];
+extern const char * const __bch2_btree_ids[];
+extern const char * const __bch2_csum_opts[];
+extern const char * const __bch2_compression_types[];
+extern const char * const bch2_compression_opts[];
+extern const char * const __bch2_str_hash_types[];
+extern const char * const bch2_str_hash_opts[];
+extern const char * const __bch2_data_types[];
+extern const char * const bch2_member_states[];
+extern const char * const bch2_d_types[];
+
+void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type);
+void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type);
+void bch2_prt_data_type(struct printbuf *, enum bch_data_type);
+void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt);
+void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type);
+void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type);
+void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type);
+
+static inline const char *bch2_d_type_str(unsigned d_type)
+{
+ return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)";
+}
/*
* Mount options; we also store defaults in the superblock.
@@ -29,24 +51,41 @@ extern const char * const bch2_dev_state[];
*/
/* dummy option, for options that aren't stored in the superblock */
-LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0);
-
-enum opt_mode {
- OPT_INTERNAL,
- OPT_FORMAT,
- OPT_MOUNT,
- OPT_RUNTIME,
+u64 BCH2_NO_SB_OPT(const struct bch_sb *);
+void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64);
+
+/* When can be set: */
+enum opt_flags {
+ OPT_FS = BIT(0), /* Filesystem option */
+ OPT_DEVICE = BIT(1), /* Device option */
+ OPT_INODE = BIT(2), /* Inode option */
+ OPT_FORMAT = BIT(3), /* May be specified at format time */
+ OPT_MOUNT = BIT(4), /* May be specified at mount time */
+ OPT_RUNTIME = BIT(5), /* May be specified at runtime */
+ OPT_HUMAN_READABLE = BIT(6),
+ OPT_MUST_BE_POW_2 = BIT(7), /* Must be power of 2 */
+ OPT_SB_FIELD_SECTORS = BIT(8), /* Superblock field is >> 9 of actual value */
+ OPT_SB_FIELD_ILOG2 = BIT(9), /* Superblock field is ilog2 of actual value */
+ OPT_SB_FIELD_ONE_BIAS = BIT(10), /* 0 means default value */
+ OPT_HIDDEN = BIT(11),
};
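To illustrate the two superblock-field flags, here is how a value carrying both OPT_SB_FIELD_SECTORS and OPT_SB_FIELD_ILOG2 round-trips through __bch2_opt_set_sb()/bch2_opt_from_sb() above; encoded_extent_max, declared with both flags further down, is used as the example:

/*
 * encoded_extent_max = 64 KiB = 65536 bytes:
 *
 *   storing (__bch2_opt_set_sb):  65536 >> 9 = 128 sectors,
 *                                 ilog2(128) = 7 goes in the sb field
 *
 *   loading (bch2_opt_from_sb):   1ULL << 7  = 128 sectors,
 *                                 128 << 9   = 65536 bytes again
 */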
enum opt_type {
BCH_OPT_BOOL,
BCH_OPT_UINT,
BCH_OPT_STR,
+ BCH_OPT_BITFIELD,
BCH_OPT_FN,
};
+struct bch_opt_fn {
+ int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *);
+ void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
+ int (*validate)(u64, struct printbuf *);
+};
+
/**
- * BCH_OPT(name, type, in mem type, mode, sb_opt)
+ * x(name, shortopt, type, in mem type, mode, sb_opt)
*
* @name - name of mount option, sysfs attribute, and struct bch_opts
* member
@@ -64,136 +103,453 @@ enum opt_type {
* - helptext
*/
+#ifdef __KERNEL__
+#define RATELIMIT_ERRORS_DEFAULT true
+#else
+#define RATELIMIT_ERRORS_DEFAULT false
+#endif
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define BCACHEFS_VERBOSE_DEFAULT true
+#else
+#define BCACHEFS_VERBOSE_DEFAULT false
+#endif
+
+#define BCH_FIX_ERRORS_OPTS() \
+ x(exit, 0) \
+ x(yes, 1) \
+ x(no, 2) \
+ x(ask, 3)
+
+enum fsck_err_opts {
+#define x(t, n) FSCK_FIX_##t,
+ BCH_FIX_ERRORS_OPTS()
+#undef x
+};
+
#define BCH_OPTS() \
- BCH_OPT(block_size, u16, OPT_FORMAT, \
- OPT_UINT(1, 128), \
- BCH_SB_BLOCK_SIZE, 8) \
- BCH_OPT(btree_node_size, u16, OPT_FORMAT, \
- OPT_UINT(1, 128), \
- BCH_SB_BTREE_NODE_SIZE, 512) \
- BCH_OPT(errors, u8, OPT_RUNTIME, \
- OPT_STR(bch2_error_actions), \
- BCH_SB_ERROR_ACTION, BCH_ON_ERROR_RO) \
- BCH_OPT(metadata_replicas, u8, OPT_RUNTIME, \
- OPT_UINT(1, BCH_REPLICAS_MAX), \
- BCH_SB_META_REPLICAS_WANT, 1) \
- BCH_OPT(data_replicas, u8, OPT_RUNTIME, \
- OPT_UINT(1, BCH_REPLICAS_MAX), \
- BCH_SB_DATA_REPLICAS_WANT, 1) \
- BCH_OPT(metadata_replicas_required, u8, OPT_MOUNT, \
- OPT_UINT(1, BCH_REPLICAS_MAX), \
- BCH_SB_META_REPLICAS_REQ, 1) \
- BCH_OPT(data_replicas_required, u8, OPT_MOUNT, \
- OPT_UINT(1, BCH_REPLICAS_MAX), \
- BCH_SB_DATA_REPLICAS_REQ, 1) \
- BCH_OPT(metadata_checksum, u8, OPT_RUNTIME, \
- OPT_STR(bch2_csum_types), \
- BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C) \
- BCH_OPT(data_checksum, u8, OPT_RUNTIME, \
- OPT_STR(bch2_csum_types), \
- BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C) \
- BCH_OPT(compression, u8, OPT_RUNTIME, \
- OPT_STR(bch2_compression_types), \
- BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_NONE)\
- BCH_OPT(background_compression, u8, OPT_RUNTIME, \
- OPT_STR(bch2_compression_types), \
- BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_NONE)\
- BCH_OPT(str_hash, u8, OPT_RUNTIME, \
- OPT_STR(bch2_str_hash_types), \
- BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_SIPHASH) \
- BCH_OPT(foreground_target, u16, OPT_RUNTIME, \
- OPT_FN(bch2_opt_target), \
- BCH_SB_FOREGROUND_TARGET, 0) \
- BCH_OPT(background_target, u16, OPT_RUNTIME, \
- OPT_FN(bch2_opt_target), \
- BCH_SB_BACKGROUND_TARGET, 0) \
- BCH_OPT(promote_target, u16, OPT_RUNTIME, \
- OPT_FN(bch2_opt_target), \
- BCH_SB_PROMOTE_TARGET, 0) \
- BCH_OPT(inodes_32bit, u8, OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_INODE_32BIT, false) \
- BCH_OPT(gc_reserve_percent, u8, OPT_MOUNT, \
- OPT_UINT(5, 21), \
- BCH_SB_GC_RESERVE, 8) \
- BCH_OPT(root_reserve_percent, u8, OPT_MOUNT, \
- OPT_UINT(0, 100), \
- BCH_SB_ROOT_RESERVE, 0) \
- BCH_OPT(wide_macs, u8, OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_128_BIT_MACS, false) \
- BCH_OPT(acl, u8, OPT_MOUNT, \
- OPT_BOOL(), \
- BCH_SB_POSIX_ACL, true) \
- BCH_OPT(usrquota, u8, OPT_MOUNT, \
- OPT_BOOL(), \
- BCH_SB_USRQUOTA, false) \
- BCH_OPT(grpquota, u8, OPT_MOUNT, \
- OPT_BOOL(), \
- BCH_SB_GRPQUOTA, false) \
- BCH_OPT(prjquota, u8, OPT_MOUNT, \
- OPT_BOOL(), \
- BCH_SB_PRJQUOTA, false) \
- BCH_OPT(degraded, u8, OPT_MOUNT, \
- OPT_BOOL(), \
- NO_SB_OPT, false) \
- BCH_OPT(discard, u8, OPT_MOUNT, \
- OPT_BOOL(), \
- NO_SB_OPT, false) \
- BCH_OPT(verbose_recovery, u8, OPT_MOUNT, \
- OPT_BOOL(), \
- NO_SB_OPT, false) \
- BCH_OPT(verbose_init, u8, OPT_MOUNT, \
- OPT_BOOL(), \
- NO_SB_OPT, false) \
- BCH_OPT(journal_flush_disabled, u8, OPT_RUNTIME, \
- OPT_BOOL(), \
- NO_SB_OPT, false) \
- BCH_OPT(nofsck, u8, OPT_MOUNT, \
- OPT_BOOL(), \
- NO_SB_OPT, false) \
- BCH_OPT(fix_errors, u8, OPT_MOUNT, \
- OPT_BOOL(), \
- NO_SB_OPT, false) \
- BCH_OPT(nochanges, u8, OPT_MOUNT, \
- OPT_BOOL(), \
- NO_SB_OPT, false) \
- BCH_OPT(noreplay, u8, OPT_MOUNT, \
- OPT_BOOL(), \
- NO_SB_OPT, false) \
- BCH_OPT(norecovery, u8, OPT_MOUNT, \
- OPT_BOOL(), \
- NO_SB_OPT, false) \
- BCH_OPT(noexcl, u8, OPT_MOUNT, \
- OPT_BOOL(), \
- NO_SB_OPT, false) \
- BCH_OPT(sb, u64, OPT_MOUNT, \
- OPT_UINT(0, S64_MAX), \
- NO_SB_OPT, BCH_SB_SECTOR) \
- BCH_OPT(read_only, u8, OPT_INTERNAL, \
- OPT_BOOL(), \
- NO_SB_OPT, false) \
- BCH_OPT(nostart, u8, OPT_INTERNAL, \
- OPT_BOOL(), \
- NO_SB_OPT, false)
+ x(block_size, u16, \
+ OPT_FS|OPT_FORMAT| \
+ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \
+ OPT_UINT(512, 1U << 16), \
+ BCH_SB_BLOCK_SIZE, 8, \
+ "size", NULL) \
+ x(btree_node_size, u32, \
+ OPT_FS|OPT_FORMAT| \
+ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \
+ OPT_UINT(512, 1U << 20), \
+ BCH_SB_BTREE_NODE_SIZE, 512, \
+ "size", "Btree node size, default 256k") \
+ x(errors, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_STR(bch2_error_actions), \
+ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \
+ NULL, "Action to take on filesystem error") \
+ x(metadata_replicas, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1, BCH_REPLICAS_MAX), \
+ BCH_SB_META_REPLICAS_WANT, 1, \
+ "#", "Number of metadata replicas") \
+ x(data_replicas, u8, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1, BCH_REPLICAS_MAX), \
+ BCH_SB_DATA_REPLICAS_WANT, 1, \
+ "#", "Number of data replicas") \
+ x(metadata_replicas_required, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_UINT(1, BCH_REPLICAS_MAX), \
+ BCH_SB_META_REPLICAS_REQ, 1, \
+ "#", NULL) \
+ x(data_replicas_required, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_UINT(1, BCH_REPLICAS_MAX), \
+ BCH_SB_DATA_REPLICAS_REQ, 1, \
+ "#", NULL) \
+ x(encoded_extent_max, u32, \
+ OPT_FS|OPT_FORMAT| \
+ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\
+ OPT_UINT(4096, 2U << 20), \
+ BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \
+ "size", "Maximum size of checksummed/compressed extents")\
+ x(metadata_checksum, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_STR(__bch2_csum_opts), \
+ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
+ NULL, NULL) \
+ x(data_checksum, u8, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_STR(__bch2_csum_opts), \
+ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
+ NULL, NULL) \
+ x(compression, u8, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FN(bch2_opt_compression), \
+ BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \
+ NULL, NULL) \
+ x(background_compression, u8, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FN(bch2_opt_compression), \
+ BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \
+ NULL, NULL) \
+ x(str_hash, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_STR(bch2_str_hash_opts), \
+ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \
+ NULL, "Hash function for directory entries and xattrs")\
+ x(metadata_target, u16, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FN(bch2_opt_target), \
+ BCH_SB_METADATA_TARGET, 0, \
+ "(target)", "Device or label for metadata writes") \
+ x(foreground_target, u16, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FN(bch2_opt_target), \
+ BCH_SB_FOREGROUND_TARGET, 0, \
+ "(target)", "Device or label for foreground writes") \
+ x(background_target, u16, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FN(bch2_opt_target), \
+ BCH_SB_BACKGROUND_TARGET, 0, \
+ "(target)", "Device or label to move data to in the background")\
+ x(promote_target, u16, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FN(bch2_opt_target), \
+ BCH_SB_PROMOTE_TARGET, 0, \
+ "(target)", "Device or label to promote data to on read") \
+ x(erasure_code, u16, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_ERASURE_CODE, false, \
+ NULL, "Enable erasure coding (DO NOT USE YET)") \
+ x(inodes_32bit, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_INODE_32BIT, true, \
+ NULL, "Constrain inode numbers to 32 bits") \
+ x(shard_inode_numbers, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_SHARD_INUMS, true, \
+ NULL, "Shard new inode numbers by CPU id") \
+ x(inodes_use_key_cache, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH_SB_INODES_USE_KEY_CACHE, true, \
+ NULL, "Use the btree key cache for the inodes btree") \
+ x(btree_node_mem_ptr_optimization, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Stash pointer to in memory btree node in btree ptr")\
+ x(gc_reserve_percent, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(5, 21), \
+ BCH_SB_GC_RESERVE, 8, \
+ "%", "Percentage of disk space to reserve for copygc")\
+ x(gc_reserve_bytes, u64, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME| \
+ OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \
+ OPT_UINT(0, U64_MAX), \
+ BCH_SB_GC_RESERVE_BYTES, 0, \
+ "%", "Amount of disk space to reserve for copygc\n" \
+ "Takes precedence over gc_reserve_percent if set")\
+ x(root_reserve_percent, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_UINT(0, 100), \
+ BCH_SB_ROOT_RESERVE, 0, \
+ "%", "Percentage of disk space to reserve for superuser")\
+ x(wide_macs, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_128_BIT_MACS, false, \
+ NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\
+ x(inline_data, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Enable inline data extents") \
+ x(promote_whole_extents, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_PROMOTE_WHOLE_EXTENTS, true, \
+	  NULL, "Promote whole extents, instead of just the part being read")\
+ x(acl, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH_SB_POSIX_ACL, true, \
+ NULL, "Enable POSIX acls") \
+ x(usrquota, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH_SB_USRQUOTA, false, \
+ NULL, "Enable user quotas") \
+ x(grpquota, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH_SB_GRPQUOTA, false, \
+ NULL, "Enable group quotas") \
+ x(prjquota, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH_SB_PRJQUOTA, false, \
+ NULL, "Enable project quotas") \
+ x(degraded, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Allow mounting in degraded mode") \
+ x(very_degraded, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+	  NULL,		"Allow mounting when data will be missing") \
+ x(no_splitbrain_check, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Don't kick drives out when splitbrain detected")\
+ x(discard, u8, \
+ OPT_FS|OPT_MOUNT|OPT_DEVICE, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Enable discard/TRIM support") \
+ x(verbose, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, BCACHEFS_VERBOSE_DEFAULT, \
+ NULL, "Extra debugging information during mount/recovery")\
+ x(journal_flush_delay, u32, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1, U32_MAX), \
+ BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \
+ NULL, "Delay in milliseconds before automatic journal commits")\
+ x(journal_flush_disabled, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_JOURNAL_FLUSH_DISABLED,false, \
+ NULL, "Disable journal flush on sync/fsync\n" \
+ "If enabled, writes can be lost, but only since the\n"\
+ "last journal write (default 1 second)") \
+ x(journal_reclaim_delay, u32, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(0, U32_MAX), \
+ BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \
+ NULL, "Delay in milliseconds before automatic journal reclaim")\
+ x(move_bytes_in_flight, u32, \
+ OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1024, U32_MAX), \
+ BCH2_NO_SB_OPT, 1U << 20, \
+	  NULL, "Maximum amount of IO to keep in flight by the move path")\
+ x(move_ios_in_flight, u32, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1, 1024), \
+ BCH2_NO_SB_OPT, 32, \
+ NULL, "Maximum number of IOs to keep in flight by the move path")\
+ x(fsck, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Run fsck on mount") \
+ x(fsck_memory_usage_percent, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_UINT(20, 70), \
+ BCH2_NO_SB_OPT, 50, \
+	  NULL, "Maximum percentage of system RAM fsck is allowed to pin")\
+ x(fix_errors, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_FN(bch2_opt_fix_errors), \
+ BCH2_NO_SB_OPT, FSCK_FIX_exit, \
+ NULL, "Fix errors during fsck without asking") \
+ x(ratelimit_errors, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \
+ NULL, "Ratelimit error messages during fsck") \
+ x(nochanges, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Super read only mode - no writes at all will be issued,\n"\
+ "even if we have to replay the journal") \
+ x(norecovery, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Exit recovery immediately prior to journal replay")\
+ x(recovery_passes, u64, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BITFIELD(bch2_recovery_passes), \
+ BCH2_NO_SB_OPT, 0, \
+ NULL, "Recovery passes to run explicitly") \
+ x(recovery_passes_exclude, u64, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BITFIELD(bch2_recovery_passes), \
+ BCH2_NO_SB_OPT, 0, \
+ NULL, "Recovery passes to exclude") \
+ x(recovery_pass_last, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_STR_NOLIMIT(bch2_recovery_passes), \
+ BCH2_NO_SB_OPT, 0, \
+ NULL, "Exit recovery after specified pass") \
+ x(retain_recovery_info, u8, \
+ 0, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Don't free journal entries/keys, scanned btree nodes after startup")\
+ x(read_entire_journal, u8, \
+ 0, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Read all journal entries, not just dirty ones")\
+ x(read_journal_only, u8, \
+ 0, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Only read the journal, skip the rest of recovery")\
+ x(journal_transaction_names, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \
+ NULL, "Log transaction function names in journal") \
+ x(allocator_stuck_timeout, u16, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(0, U16_MAX), \
+ BCH_SB_ALLOCATOR_STUCK_TIMEOUT, 30, \
+ NULL, "Default timeout in seconds for stuck allocator messages")\
+ x(noexcl, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Don't open device in exclusive mode") \
+ x(direct_io, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Use O_DIRECT (userspace only)") \
+ x(sb, u64, \
+ OPT_MOUNT, \
+ OPT_UINT(0, S64_MAX), \
+ BCH2_NO_SB_OPT, BCH_SB_SECTOR, \
+ "offset", "Sector offset of superblock") \
+ x(read_only, u8, \
+ OPT_FS|OPT_MOUNT|OPT_HIDDEN, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, NULL) \
+ x(nostart, u8, \
+ 0, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Don\'t start filesystem, only open devices") \
+ x(reconstruct_alloc, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Reconstruct alloc btree") \
+ x(version_upgrade, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_STR(bch2_version_upgrade_opts), \
+ BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \
+ NULL, "Set superblock to latest version,\n" \
+ "allowing any new features to be used") \
+ x(stdio, u64, \
+ 0, \
+ OPT_UINT(0, S64_MAX), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Pointer to a struct stdio_redirect") \
+ x(project, u8, \
+ OPT_INODE, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, NULL) \
+ x(nocow, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
+ OPT_BOOL(), \
+ BCH_SB_NOCOW, false, \
+ NULL, "Nocow mode: Writes will be done in place when possible.\n"\
+		"Snapshots and reflink will still cause writes to be COW\n"\
+ "Implicitly disables data checksumming, compression and encryption")\
+ x(nocow_enabled, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Enable nocow mode: enables runtime locking in\n"\
+		"the data move path, needed if nocow will ever be in use\n")\
+ x(copygc_enabled, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Enable copygc: disable for debugging, or to\n"\
+ "quiet the system when doing performance testing\n")\
+ x(rebalance_enabled, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Enable rebalance: disable for debugging, or to\n"\
+ "quiet the system when doing performance testing\n")\
+ x(no_data_io, u8, \
+ OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Skip submit_bio() for data reads and writes, " \
+ "for performance testing purposes") \
+ x(fs_size, u64, \
+ OPT_DEVICE, \
+ OPT_UINT(0, S64_MAX), \
+ BCH2_NO_SB_OPT, 0, \
+ "size", "Size of filesystem on device") \
+ x(bucket, u32, \
+ OPT_DEVICE, \
+ OPT_UINT(0, S64_MAX), \
+ BCH2_NO_SB_OPT, 0, \
+ "size", "Specifies the bucket size; must be greater than the btree node size")\
+ x(durability, u8, \
+ OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \
+ OPT_UINT(0, BCH_REPLICAS_MAX), \
+ BCH2_NO_SB_OPT, 1, \
+ "n", "Data written to this device will be considered\n"\
+ "to have already been replicated n times") \
+ x(data_allowed, u8, \
+ OPT_DEVICE, \
+ OPT_BITFIELD(__bch2_data_types), \
+ BCH2_NO_SB_OPT, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\
+ "types", "Allowed data types for this device: journal, btree, and/or user")\
+ x(btree_node_prefetch, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+	  NULL, "BTREE_ITER_prefetch causes btree nodes to be\n"\
+ " prefetched sequentially")
+
+#define BCH_DEV_OPT_SETTERS() \
+ x(discard, BCH_MEMBER_DISCARD) \
+ x(durability, BCH_MEMBER_DURABILITY) \
+ x(data_allowed, BCH_MEMBER_DATA_ALLOWED)
struct bch_opts {
-#define BCH_OPT(_name, _bits, ...) unsigned _name##_defined:1;
+#define x(_name, _bits, ...) unsigned _name##_defined:1;
BCH_OPTS()
-#undef BCH_OPT
+#undef x
-#define BCH_OPT(_name, _bits, ...) _bits _name;
+#define x(_name, _bits, ...) _bits _name;
BCH_OPTS()
-#undef BCH_OPT
+#undef x
+};
+
+struct bch2_opts_parse {
+ struct bch_opts opts;
+
+ /* to save opts that can't be parsed before the FS is opened: */
+ struct printbuf parse_later;
};
-static const struct bch_opts bch2_opts_default = {
-#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \
+static const __maybe_unused struct bch_opts bch2_opts_default = {
+#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \
._name##_defined = true, \
._name = _default, \
BCH_OPTS()
-#undef BCH_OPT
+#undef x
};
#define opt_defined(_opts, _name) ((_opts)._name##_defined)
@@ -215,32 +571,29 @@ static inline struct bch_opts bch2_opts_empty(void)
void bch2_opts_apply(struct bch_opts *, struct bch_opts);
enum bch_opt_id {
-#define BCH_OPT(_name, ...) Opt_##_name,
+#define x(_name, ...) Opt_##_name,
BCH_OPTS()
-#undef BCH_OPT
+#undef x
bch2_opts_nr
};
struct bch_fs;
+struct printbuf;
struct bch_option {
struct attribute attr;
+ u64 (*get_sb)(const struct bch_sb *);
void (*set_sb)(struct bch_sb *, u64);
- enum opt_mode mode;
enum opt_type type;
+ enum opt_flags flags;
+ u64 min, max;
- union {
- struct {
- u64 min, max;
- };
- struct {
- const char * const *choices;
- };
- struct {
- int (*parse)(struct bch_fs *, const char *, u64 *);
- int (*print)(struct bch_fs *, char *, size_t, u64);
- };
- };
+ const char * const *choices;
+
+ struct bch_opt_fn fn;
+
+ const char *hint;
+ const char *help;
};
@@ -250,43 +603,74 @@ bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
-struct bch_opts bch2_opts_from_sb(struct bch_sb *);
+u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id);
+int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
+void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64);
+
+struct bch_dev;
+void bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64);
int bch2_opt_lookup(const char *);
-int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *);
+int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *);
+int bch2_opt_parse(struct bch_fs *, const struct bch_option *,
+ const char *, u64 *, struct printbuf *);
#define OPT_SHOW_FULL_LIST (1 << 0)
#define OPT_SHOW_MOUNT_STYLE (1 << 1)
-int bch2_opt_to_text(struct bch_fs *, char *, size_t,
- const struct bch_option *, u64, unsigned);
+void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *,
+ const struct bch_option *, u64, unsigned);
+void bch2_opts_to_text(struct printbuf *,
+ struct bch_opts,
+ struct bch_fs *, struct bch_sb *,
+ unsigned, unsigned, unsigned);
-int bch2_parse_mount_opts(struct bch_opts *, char *);
+int bch2_opt_check_may_set(struct bch_fs *, int, u64);
+int bch2_opts_check_may_set(struct bch_fs *);
+int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *,
+ struct printbuf *, const char *, const char *);
+int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *,
+ char *);
/* inode opts: */
-#define BCH_INODE_OPTS() \
- BCH_INODE_OPT(data_checksum, 8) \
- BCH_INODE_OPT(compression, 8) \
- BCH_INODE_OPT(background_compression, 8) \
- BCH_INODE_OPT(data_replicas, 8) \
- BCH_INODE_OPT(promote_target, 16) \
- BCH_INODE_OPT(foreground_target, 16) \
- BCH_INODE_OPT(background_target, 16)
-
struct bch_io_opts {
-#define BCH_INODE_OPT(_name, _bits) unsigned _name##_defined:1;
+#define x(_name, _bits) u##_bits _name;
BCH_INODE_OPTS()
-#undef BCH_INODE_OPT
-
-#define BCH_INODE_OPT(_name, _bits) u##_bits _name;
+#undef x
+#define x(_name, _bits) u64 _name##_from_inode:1;
BCH_INODE_OPTS()
-#undef BCH_INODE_OPT
+#undef x
};
+static inline void bch2_io_opts_fixups(struct bch_io_opts *opts)
+{
+ if (!opts->background_target)
+ opts->background_target = opts->foreground_target;
+ if (!opts->background_compression)
+ opts->background_compression = opts->compression;
+ if (opts->nocow) {
+ opts->compression = opts->background_compression = 0;
+ opts->data_checksum = 0;
+ opts->erasure_code = 0;
+ }
+}
+
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
-struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts);
-void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts);
bool bch2_opt_is_inode_opt(enum bch_opt_id);
+/* rebalance opts: */
+
+static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_io_opts *opts)
+{
+ return (struct bch_extent_rebalance) {
+ .type = BIT(BCH_EXTENT_ENTRY_rebalance),
+#define x(_name) \
+ ._name = opts->_name, \
+ ._name##_from_inode = opts->_name##_from_inode,
+ BCH_REBALANCE_OPTS()
+#undef x
+ };
+};
+
#endif /* _BCACHEFS_OPTS_H */
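The x() entries above form an x-macro table: the single BCH_OPTS() list is expanded several times with different definitions of x() to generate the option struct fields, the Opt_* enum and the defaults initializer. Below is a minimal sketch of the idiom with a hypothetical two-entry EXAMPLE_OPTS() table (illustrative names only, not the real option list):

/* Hypothetical x-macro table; u8/u16 as in <linux/types.h>. */
#define EXAMPLE_OPTS()				\
	x(block_size,	u16,	8)		\
	x(verbose,	u8,	0)

struct example_opts {
#define x(_name, _bits, _default)	_bits	_name;
	EXAMPLE_OPTS()
#undef x
};

enum example_opt_id {
#define x(_name, ...)			Opt_##_name,
	EXAMPLE_OPTS()
#undef x
	example_opts_nr
};

static const struct example_opts example_opts_default = {
#define x(_name, _bits, _default)	._name = _default,
	EXAMPLE_OPTS()
#undef x
};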
diff --git a/libbcachefs/printbuf.c b/libbcachefs/printbuf.c
new file mode 100644
index 00000000..4cf5a2af
--- /dev/null
+++ b/libbcachefs/printbuf.c
@@ -0,0 +1,509 @@
+// SPDX-License-Identifier: LGPL-2.1+
+/* Copyright (C) 2022 Kent Overstreet */
+
+#include <linux/bitmap.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string_helpers.h>
+
+#include "printbuf.h"
+
+static inline unsigned __printbuf_linelen(struct printbuf *buf, unsigned pos)
+{
+ return pos - buf->last_newline;
+}
+
+static inline unsigned printbuf_linelen(struct printbuf *buf)
+{
+ return __printbuf_linelen(buf, buf->pos);
+}
+
+/*
+ * Returns spaces from start of line, if set, or 0 if unset:
+ */
+static inline unsigned cur_tabstop(struct printbuf *buf)
+{
+ return buf->cur_tabstop < buf->nr_tabstops
+ ? buf->_tabstops[buf->cur_tabstop]
+ : 0;
+}
+
+int bch2_printbuf_make_room(struct printbuf *out, unsigned extra)
+{
+ /* Reserved space for terminating nul: */
+ extra += 1;
+
+ if (out->pos + extra <= out->size)
+ return 0;
+
+ if (!out->heap_allocated) {
+ out->overflow = true;
+ return 0;
+ }
+
+ unsigned new_size = roundup_pow_of_two(out->size + extra);
+
+ /* Sanity check... */
+ if (new_size > PAGE_SIZE << MAX_PAGE_ORDER) {
+ out->allocation_failure = true;
+ out->overflow = true;
+ return -ENOMEM;
+ }
+
+ /*
+ * Note: output buffer must be freeable with kfree(), it's not required
+ * that the user use printbuf_exit().
+ */
+ char *buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT);
+
+ if (!buf) {
+ out->allocation_failure = true;
+ out->overflow = true;
+ return -ENOMEM;
+ }
+
+ out->buf = buf;
+ out->size = new_size;
+ return 0;
+}
+
+static void printbuf_advance_pos(struct printbuf *out, unsigned len)
+{
+ out->pos += min(len, printbuf_remaining(out));
+}
+
+static void printbuf_insert_spaces(struct printbuf *out, unsigned pos, unsigned nr)
+{
+ unsigned move = out->pos - pos;
+
+ bch2_printbuf_make_room(out, nr);
+
+ if (pos + nr < out->size)
+ memmove(out->buf + pos + nr,
+ out->buf + pos,
+ min(move, out->size - 1 - pos - nr));
+
+ if (pos < out->size)
+ memset(out->buf + pos, ' ', min(nr, out->size - pos));
+
+ printbuf_advance_pos(out, nr);
+ printbuf_nul_terminate_reserved(out);
+}
+
+static void __printbuf_do_indent(struct printbuf *out, unsigned pos)
+{
+ while (true) {
+ int pad;
+ unsigned len = out->pos - pos;
+ char *p = out->buf + pos;
+ char *n = memscan(p, '\n', len);
+ if (cur_tabstop(out)) {
+ n = min(n, (char *) memscan(p, '\r', len));
+ n = min(n, (char *) memscan(p, '\t', len));
+ }
+
+ pos = n - out->buf;
+ if (pos == out->pos)
+ break;
+
+ switch (*n) {
+ case '\n':
+ pos++;
+ out->last_newline = pos;
+
+ printbuf_insert_spaces(out, pos, out->indent);
+
+ pos = min(pos + out->indent, out->pos);
+ out->last_field = pos;
+ out->cur_tabstop = 0;
+ break;
+ case '\r':
+ memmove(n, n + 1, out->pos - pos);
+ --out->pos;
+ pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos);
+ if (pad > 0) {
+ printbuf_insert_spaces(out, out->last_field, pad);
+ pos += pad;
+ }
+
+ out->last_field = pos;
+ out->cur_tabstop++;
+ break;
+ case '\t':
+ pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos) - 1;
+ if (pad > 0) {
+ *n = ' ';
+ printbuf_insert_spaces(out, pos, pad - 1);
+ pos += pad;
+ } else {
+ memmove(n, n + 1, out->pos - pos);
+ --out->pos;
+ }
+
+ out->last_field = pos;
+ out->cur_tabstop++;
+ break;
+ }
+ }
+}
+
+static inline void printbuf_do_indent(struct printbuf *out, unsigned pos)
+{
+ if (out->has_indent_or_tabstops && !out->suppress_indent_tabstop_handling)
+ __printbuf_do_indent(out, pos);
+}
+
+void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
+{
+ int len;
+
+ do {
+ va_list args2;
+
+ va_copy(args2, args);
+ len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args2);
+ va_end(args2);
+ } while (len > printbuf_remaining(out) &&
+ !bch2_printbuf_make_room(out, len));
+
+ unsigned indent_pos = out->pos;
+ printbuf_advance_pos(out, len);
+ printbuf_do_indent(out, indent_pos);
+}
+
+void bch2_prt_printf(struct printbuf *out, const char *fmt, ...)
+{
+ va_list args;
+ int len;
+
+ do {
+ va_start(args, fmt);
+ len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args);
+ va_end(args);
+ } while (len > printbuf_remaining(out) &&
+ !bch2_printbuf_make_room(out, len));
+
+ unsigned indent_pos = out->pos;
+ printbuf_advance_pos(out, len);
+ printbuf_do_indent(out, indent_pos);
+}
+
+/**
+ * bch2_printbuf_str() - returns printbuf's buf as a C string, guaranteed to be
+ * null terminated
+ * @buf: printbuf to terminate
+ * Returns: Printbuf contents, as a nul terminated C string
+ */
+const char *bch2_printbuf_str(const struct printbuf *buf)
+{
+ /*
+ * If we've written to a printbuf then it's guaranteed to be a null
+ * terminated string - but if we haven't, then we might not have
+ * allocated a buffer at all:
+ */
+ return buf->pos
+ ? buf->buf
+ : "";
+}
+
+/**
+ * bch2_printbuf_exit() - exit a printbuf, freeing memory it owns and poisoning it
+ * against accidental use.
+ * @buf: printbuf to exit
+ */
+void bch2_printbuf_exit(struct printbuf *buf)
+{
+ if (buf->heap_allocated) {
+ kfree(buf->buf);
+ buf->buf = ERR_PTR(-EINTR); /* poison value */
+ }
+}
+
+void bch2_printbuf_tabstops_reset(struct printbuf *buf)
+{
+ buf->nr_tabstops = 0;
+}
+
+void bch2_printbuf_tabstop_pop(struct printbuf *buf)
+{
+ if (buf->nr_tabstops)
+ --buf->nr_tabstops;
+}
+
+/*
+ * bch2_printbuf_tabstop_push() - add a tabstop, n spaces from the previous tabstop
+ *
+ * @buf: printbuf to control
+ * @spaces: number of spaces from previous tabstop
+ *
+ * In the future this function may allocate memory if setting more than
+ * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start
+ * of line.
+ */
+int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces)
+{
+ unsigned prev_tabstop = buf->nr_tabstops
+ ? buf->_tabstops[buf->nr_tabstops - 1]
+ : 0;
+
+ if (WARN_ON(buf->nr_tabstops >= ARRAY_SIZE(buf->_tabstops)))
+ return -EINVAL;
+
+ buf->_tabstops[buf->nr_tabstops++] = prev_tabstop + spaces;
+ buf->has_indent_or_tabstops = true;
+ return 0;
+}
+
+/**
+ * bch2_printbuf_indent_add() - add to the current indent level
+ *
+ * @buf: printbuf to control
+ * @spaces: number of spaces to add to the current indent level
+ *
+ * Subsequent lines, and the current line if the output position is at the start
+ * of the current line, will be indented by @spaces more spaces.
+ */
+void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces)
+{
+ if (WARN_ON_ONCE(buf->indent + spaces < buf->indent))
+ spaces = 0;
+
+ buf->indent += spaces;
+ prt_chars(buf, ' ', spaces);
+
+ buf->has_indent_or_tabstops = true;
+}
+
+/**
+ * bch2_printbuf_indent_sub() - subtract from the current indent level
+ *
+ * @buf: printbuf to control
+ * @spaces: number of spaces to subtract from the current indent level
+ *
+ * Subsequent lines, and the current line if the output position is at the start
+ * of the current line, will be indented by @spaces less spaces.
+ */
+void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces)
+{
+ if (WARN_ON_ONCE(spaces > buf->indent))
+ spaces = buf->indent;
+
+ if (buf->last_newline + buf->indent == buf->pos) {
+ buf->pos -= spaces;
+ printbuf_nul_terminate(buf);
+ }
+ buf->indent -= spaces;
+
+ if (!buf->indent && !buf->nr_tabstops)
+ buf->has_indent_or_tabstops = false;
+}
+
+void bch2_prt_newline(struct printbuf *buf)
+{
+ bch2_printbuf_make_room(buf, 1 + buf->indent);
+
+ __prt_char_reserved(buf, '\n');
+
+ buf->last_newline = buf->pos;
+
+ __prt_chars_reserved(buf, ' ', buf->indent);
+
+ printbuf_nul_terminate_reserved(buf);
+
+ buf->last_field = buf->pos;
+ buf->cur_tabstop = 0;
+}
+
+void bch2_printbuf_strip_trailing_newline(struct printbuf *out)
+{
+ for (int p = out->pos - 1; p >= 0; --p) {
+ if (out->buf[p] == '\n') {
+ out->pos = p;
+ break;
+ }
+ if (out->buf[p] != ' ')
+ break;
+ }
+
+ printbuf_nul_terminate_reserved(out);
+}
+
+static void __prt_tab(struct printbuf *out)
+{
+ int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out));
+
+ prt_chars(out, ' ', spaces);
+
+ out->last_field = out->pos;
+ out->cur_tabstop++;
+}
+
+/**
+ * bch2_prt_tab() - Advance printbuf to the next tabstop
+ * @out: printbuf to control
+ *
+ * Advance output to the next tabstop by printing spaces.
+ */
+void bch2_prt_tab(struct printbuf *out)
+{
+ if (WARN_ON(!cur_tabstop(out)))
+ return;
+
+ __prt_tab(out);
+}
+
+static void __prt_tab_rjust(struct printbuf *buf)
+{
+ int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf);
+ if (pad > 0)
+ printbuf_insert_spaces(buf, buf->last_field, pad);
+
+ buf->last_field = buf->pos;
+ buf->cur_tabstop++;
+}
+
+/**
+ * bch2_prt_tab_rjust - Advance printbuf to the next tabstop, right justifying
+ * previous output
+ *
+ * @buf: printbuf to control
+ *
+ * Advance output to the next tabstop by inserting spaces immediately after the
+ * previous tabstop, right justifying previously outputted text.
+ */
+void bch2_prt_tab_rjust(struct printbuf *buf)
+{
+ if (WARN_ON(!cur_tabstop(buf)))
+ return;
+
+ __prt_tab_rjust(buf);
+}
+
+/**
+ * bch2_prt_bytes_indented() - Print an array of chars, handling embedded control characters
+ *
+ * @out: output printbuf
+ * @str: string to print
+ * @count: number of bytes to print
+ *
+ * The following control characters are handled as follows:
+ * \n: prt_newline newline that obeys current indent level
+ * \t: prt_tab advance to next tabstop
+ * \r: prt_tab_rjust advance to next tabstop, with right justification
+ */
+void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count)
+{
+ unsigned indent_pos = out->pos;
+ prt_bytes(out, str, count);
+ printbuf_do_indent(out, indent_pos);
+}
+
+/**
+ * bch2_prt_human_readable_u64() - Print out a u64 in human readable units
+ * @out: output printbuf
+ * @v: integer to print
+ *
+ * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
+ */
+void bch2_prt_human_readable_u64(struct printbuf *out, u64 v)
+{
+ bch2_printbuf_make_room(out, 10);
+ unsigned len = string_get_size(v, 1, !out->si_units,
+ out->buf + out->pos,
+ printbuf_remaining_size(out));
+ printbuf_advance_pos(out, len);
+}
+
+/**
+ * bch2_prt_human_readable_s64() - Print out a s64 in human readable units
+ * @out: output printbuf
+ * @v: integer to print
+ *
+ * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
+ */
+void bch2_prt_human_readable_s64(struct printbuf *out, s64 v)
+{
+ if (v < 0)
+ prt_char(out, '-');
+ bch2_prt_human_readable_u64(out, abs(v));
+}
+
+/**
+ * bch2_prt_units_u64() - Print out a u64 according to printbuf unit options
+ * @out: output printbuf
+ * @v: integer to print
+ *
+ * Units are either raw (default), or human readable units (controlled via
+ * @buf->human_readable_units)
+ */
+void bch2_prt_units_u64(struct printbuf *out, u64 v)
+{
+ if (out->human_readable_units)
+ bch2_prt_human_readable_u64(out, v);
+ else
+ bch2_prt_printf(out, "%llu", v);
+}
+
+/**
+ * bch2_prt_units_s64() - Print out a s64 according to printbuf unit options
+ * @out: output printbuf
+ * @v: integer to print
+ *
+ * Units are either raw (default), or human readable units (controlled via
+ * @buf->human_readable_units)
+ */
+void bch2_prt_units_s64(struct printbuf *out, s64 v)
+{
+ if (v < 0)
+ prt_char(out, '-');
+ bch2_prt_units_u64(out, abs(v));
+}
+
+void bch2_prt_string_option(struct printbuf *out,
+ const char * const list[],
+ size_t selected)
+{
+ for (size_t i = 0; list[i]; i++)
+ bch2_prt_printf(out, i == selected ? "[%s] " : "%s ", list[i]);
+}
+
+void bch2_prt_bitflags(struct printbuf *out,
+ const char * const list[], u64 flags)
+{
+ unsigned bit, nr = 0;
+ bool first = true;
+
+ while (list[nr])
+ nr++;
+
+ while (flags && (bit = __ffs64(flags)) < nr) {
+ if (!first)
+ bch2_prt_printf(out, ",");
+ first = false;
+ bch2_prt_printf(out, "%s", list[bit]);
+ flags ^= BIT_ULL(bit);
+ }
+}
+
+void bch2_prt_bitflags_vector(struct printbuf *out,
+ const char * const list[],
+ unsigned long *v, unsigned nr)
+{
+ bool first = true;
+ unsigned i;
+
+ for (i = 0; i < nr; i++)
+ if (!list[i]) {
+ nr = i - 1;
+ break;
+ }
+
+ for_each_set_bit(i, v, nr) {
+ if (!first)
+ bch2_prt_printf(out, ",");
+ first = false;
+ bch2_prt_printf(out, "%s", list[i]);
+ }
+}
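A rough usage sketch of the tabstop handling implemented by __printbuf_do_indent() and __prt_tab_rjust() above (the PRINTBUF initializer comes from the header added below, and the bch2_* names are called directly here; in-tree callers usually go through shorter prt_* wrapper macros):

	struct printbuf buf = PRINTBUF;

	/* one tabstop, 16 spaces from the start of the line */
	bch2_printbuf_tabstop_push(&buf, 16);

	/* '\t' pads with spaces out to the tabstop before printing "lz4" */
	bch2_prt_printf(&buf, "compression:\t%s\n", "lz4");

	/* '\r' right justifies the text written since the previous tabstop */
	bch2_prt_printf(&buf, "replicas:\r%u\n", 2);

	pr_info("%s", buf.buf);
	bch2_printbuf_exit(&buf);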
diff --git a/libbcachefs/printbuf.h b/libbcachefs/printbuf.h
new file mode 100644
index 00000000..1d570387
--- /dev/null
+++ b/libbcachefs/printbuf.h
@@ -0,0 +1,282 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+/* Copyright (C) 2022 Kent Overstreet */
+
+#ifndef _BCACHEFS_PRINTBUF_H
+#define _BCACHEFS_PRINTBUF_H
+
+/*
+ * Printbufs: Simple strings for printing to, with optional heap allocation
+ *
+ * This code has provisions for use in userspace, to aid in making other code
+ * portable between kernelspace and userspace.
+ *
+ * Basic example:
+ * struct printbuf buf = PRINTBUF;
+ *
+ * prt_printf(&buf, "foo=");
+ * foo_to_text(&buf, foo);
+ * printk("%s", buf.buf);
+ * printbuf_exit(&buf);
+ *
+ * Or
+ * struct printbuf buf = PRINTBUF_EXTERN(char_buf, char_buf_size)
+ *
+ * We can now write pretty printers instead of writing code that dumps
+ * everything to the kernel log buffer, and then those pretty-printers can be
+ * used by other code that outputs to kernel log, sysfs, debugfs, etc.
+ *
+ * Memory allocation: Outputting to a printbuf may allocate memory. This
+ * allocation is done with GFP_KERNEL, by default: use the newer
+ * memalloc_*_(save|restore) functions as needed.
+ *
+ * Since no equivalent yet exists for GFP_ATOMIC/GFP_NOWAIT, memory allocations
+ * will be done with GFP_NOWAIT if printbuf->atomic is nonzero.
+ *
+ * It's allowed to grab the output buffer and free it later with kfree() instead
+ * of using printbuf_exit(), if the user just needs a heap allocated string at
+ * the end.
+ *
+ * Memory allocation failures: We don't return errors directly, because on
+ * memory allocation failure we usually don't want to bail out and unwind - we
+ * want to print what we've got, on a best-effort basis. But code that does want
+ * to return -ENOMEM may check printbuf.allocation_failure.
+ *
+ * Indenting, tabstops:
+ *
+ * To aid is writing multi-line pretty printers spread across multiple
+ * functions, printbufs track the current indent level.
+ *
+ * printbuf_indent_add() and printbuf_indent_sub() increase and decrease the current indent
+ * level, respectively.
+ *
+ * To use tabstops, add them with printbuf_tabstop_push(); they are in units of spaces, from
+ * start of line. Once set, prt_tab() will output spaces up to the next tabstop.
+ * prt_tab_rjust() will also advance the current line of text up to the next
+ * tabstop, but it does so by shifting text since the previous tabstop up to the
+ * next tabstop - right justifying it.
+ *
+ * Make sure you use prt_newline() instead of \n in the format string for indent
+ * level and tabstops to work correctly.
+ *
+ * Output units: printbuf->units exists to tell pretty-printers how to output
+ * numbers: a raw value (e.g. directly from a superblock field), as bytes, or as
+ * human readable bytes. prt_units() obeys it.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+enum printbuf_si {
+ PRINTBUF_UNITS_2, /* use binary powers of 2^10 */
+ PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */
+};
+
+#define PRINTBUF_INLINE_TABSTOPS 6
+
+struct printbuf {
+ char *buf;
+ unsigned size;
+ unsigned pos;
+ unsigned last_newline;
+ unsigned last_field;
+ unsigned indent;
+ /*
+	 * If nonzero, allocations will be done with GFP_NOWAIT:
+ */
+ u8 atomic;
+ bool allocation_failure:1;
+ bool heap_allocated:1;
+ bool overflow:1;
+ enum printbuf_si si_units:1;
+ bool human_readable_units:1;
+ bool has_indent_or_tabstops:1;
+ bool suppress_indent_tabstop_handling:1;
+ u8 nr_tabstops;
+
+ /*
+	 * Do not modify directly: use printbuf_tabstop_push(),
+	 * printbuf_tabstop_pop()
+ */
+ u8 cur_tabstop;
+ u8 _tabstops[PRINTBUF_INLINE_TABSTOPS];
+};
+
+int bch2_printbuf_make_room(struct printbuf *, unsigned);
+__printf(2, 3) void bch2_prt_printf(struct printbuf *out, const char *fmt, ...);
+__printf(2, 0) void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list);
+const char *bch2_printbuf_str(const struct printbuf *);
+void bch2_printbuf_exit(struct printbuf *);
+
+void bch2_printbuf_tabstops_reset(struct printbuf *);
+void bch2_printbuf_tabstop_pop(struct printbuf *);
+int bch2_printbuf_tabstop_push(struct printbuf *, unsigned);
+
+void bch2_printbuf_indent_add(struct printbuf *, unsigned);
+void bch2_printbuf_indent_sub(struct printbuf *, unsigned);
+
+void bch2_prt_newline(struct printbuf *);
+void bch2_printbuf_strip_trailing_newline(struct printbuf *);
+void bch2_prt_tab(struct printbuf *);
+void bch2_prt_tab_rjust(struct printbuf *);
+
+void bch2_prt_bytes_indented(struct printbuf *, const char *, unsigned);
+void bch2_prt_human_readable_u64(struct printbuf *, u64);
+void bch2_prt_human_readable_s64(struct printbuf *, s64);
+void bch2_prt_units_u64(struct printbuf *, u64);
+void bch2_prt_units_s64(struct printbuf *, s64);
+void bch2_prt_string_option(struct printbuf *, const char * const[], size_t);
+void bch2_prt_bitflags(struct printbuf *, const char * const[], u64);
+void bch2_prt_bitflags_vector(struct printbuf *, const char * const[],
+ unsigned long *, unsigned);
+
+/* Initializer for a heap allocated printbuf: */
+#define PRINTBUF ((struct printbuf) { .heap_allocated = true })
+
+/* Initializer a printbuf that points to an external buffer: */
+#define PRINTBUF_EXTERN(_buf, _size) \
+((struct printbuf) { \
+ .buf = _buf, \
+ .size = _size, \
+})
+
+/*
+ * Returns size remaining of output buffer:
+ */
+static inline unsigned printbuf_remaining_size(struct printbuf *out)
+{
+ if (WARN_ON(out->size && out->pos >= out->size))
+ out->pos = out->size - 1;
+ return out->size - out->pos;
+}
+
+/*
+ * Returns number of characters we can print to the output buffer - i.e.
+ * excluding the terminating nul:
+ */
+static inline unsigned printbuf_remaining(struct printbuf *out)
+{
+ return out->size ? printbuf_remaining_size(out) - 1 : 0;
+}
+
+static inline unsigned printbuf_written(struct printbuf *out)
+{
+ return out->size ? min(out->pos, out->size - 1) : 0;
+}
+
+static inline void printbuf_nul_terminate_reserved(struct printbuf *out)
+{
+ if (WARN_ON(out->size && out->pos >= out->size))
+ out->pos = out->size - 1;
+ if (out->size)
+ out->buf[out->pos] = 0;
+}
+
+static inline void printbuf_nul_terminate(struct printbuf *out)
+{
+ bch2_printbuf_make_room(out, 1);
+ printbuf_nul_terminate_reserved(out);
+}
+
+/* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */
+static inline void __prt_char_reserved(struct printbuf *out, char c)
+{
+ if (printbuf_remaining(out))
+ out->buf[out->pos++] = c;
+}
+
+/* Doesn't nul terminate: */
+static inline void __prt_char(struct printbuf *out, char c)
+{
+ bch2_printbuf_make_room(out, 1);
+ __prt_char_reserved(out, c);
+}
+
+static inline void prt_char(struct printbuf *out, char c)
+{
+ bch2_printbuf_make_room(out, 2);
+ __prt_char_reserved(out, c);
+ printbuf_nul_terminate_reserved(out);
+}
+
+static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n)
+{
+ unsigned can_print = min(n, printbuf_remaining(out));
+
+ for (unsigned i = 0; i < can_print; i++)
+ out->buf[out->pos++] = c;
+}
+
+static inline void prt_chars(struct printbuf *out, char c, unsigned n)
+{
+ bch2_printbuf_make_room(out, n);
+ __prt_chars_reserved(out, c, n);
+ printbuf_nul_terminate_reserved(out);
+}
+
+static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n)
+{
+ bch2_printbuf_make_room(out, n);
+
+ unsigned can_print = min(n, printbuf_remaining(out));
+
+ for (unsigned i = 0; i < can_print; i++)
+ out->buf[out->pos++] = ((char *) b)[i];
+
+ printbuf_nul_terminate(out);
+}
+
+static inline void prt_str(struct printbuf *out, const char *str)
+{
+ prt_bytes(out, str, strlen(str));
+}
+
+static inline void prt_str_indented(struct printbuf *out, const char *str)
+{
+ bch2_prt_bytes_indented(out, str, strlen(str));
+}
+
+static inline void prt_hex_byte(struct printbuf *out, u8 byte)
+{
+ bch2_printbuf_make_room(out, 3);
+ __prt_char_reserved(out, hex_asc_hi(byte));
+ __prt_char_reserved(out, hex_asc_lo(byte));
+ printbuf_nul_terminate_reserved(out);
+}
+
+static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte)
+{
+ bch2_printbuf_make_room(out, 3);
+ __prt_char_reserved(out, hex_asc_upper_hi(byte));
+ __prt_char_reserved(out, hex_asc_upper_lo(byte));
+ printbuf_nul_terminate_reserved(out);
+}
+
+/**
+ * printbuf_reset - re-use a printbuf without freeing and re-initializing it:
+ */
+static inline void printbuf_reset(struct printbuf *buf)
+{
+ buf->pos = 0;
+ buf->allocation_failure = 0;
+ buf->indent = 0;
+ buf->nr_tabstops = 0;
+ buf->cur_tabstop = 0;
+}
+
+/**
+ * printbuf_atomic_inc - mark as entering an atomic section
+ */
+static inline void printbuf_atomic_inc(struct printbuf *buf)
+{
+ buf->atomic++;
+}
+
+/**
+ * printbuf_atomic_dec - mark as leaving an atomic section
+ */
+static inline void printbuf_atomic_dec(struct printbuf *buf)
+{
+ buf->atomic--;
+}
+
+#endif /* _BCACHEFS_PRINTBUF_H */
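As the header comment notes, output is best effort on allocation failure, so callers that must report -ENOMEM check the allocation_failure flag after printing. A small sketch of that pattern together with the indent helpers (the function and the strings printed are illustrative only):

static int example_print_state(void)
{
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	/* indent the current and subsequent lines by two spaces */
	bch2_printbuf_indent_add(&buf, 2);
	bch2_prt_printf(&buf, "state:");
	bch2_prt_newline(&buf);
	bch2_prt_printf(&buf, "clean");
	bch2_printbuf_indent_sub(&buf, 2);

	if (buf.allocation_failure)
		ret = -ENOMEM;
	else
		pr_info("%s\n", bch2_printbuf_str(&buf));

	bch2_printbuf_exit(&buf);
	return ret;
}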
diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c
index bb03d83a..8b857fc3 100644
--- a/libbcachefs/quota.c
+++ b/libbcachefs/quota.c
@@ -1,68 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "btree_update.h"
+#include "errcode.h"
+#include "error.h"
#include "inode.h"
#include "quota.h"
+#include "snapshot.h"
#include "super-io.h"
-static const char *bch2_sb_validate_quota(struct bch_sb *sb,
- struct bch_sb_field *f)
+static const char * const bch2_quota_types[] = {
+ "user",
+ "group",
+ "project",
+};
+
+static const char * const bch2_quota_counters[] = {
+ "space",
+ "inodes",
+};
+
+static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
struct bch_sb_field_quota *q = field_to_type(f, quota);
- if (vstruct_bytes(&q->field) != sizeof(*q))
- return "invalid field quota: wrong size";
+ if (vstruct_bytes(&q->field) < sizeof(*q)) {
+ prt_printf(err, "wrong size (got %zu should be %zu)",
+ vstruct_bytes(&q->field), sizeof(*q));
+ return -BCH_ERR_invalid_sb_quota;
+ }
- return NULL;
+ return 0;
}
-const struct bch_sb_field_ops bch_sb_field_ops_quota = {
- .validate = bch2_sb_validate_quota,
-};
-
-const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k)
+static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
{
- struct bkey_s_c_quota dq;
-
- if (k.k->p.inode >= QTYP_NR)
- return "invalid quota type";
+ struct bch_sb_field_quota *q = field_to_type(f, quota);
+ unsigned qtyp, counter;
- switch (k.k->type) {
- case BCH_QUOTA: {
- dq = bkey_s_c_to_quota(k);
+ for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) {
+ prt_printf(out, "%s: flags %llx",
+ bch2_quota_types[qtyp],
+ le64_to_cpu(q->q[qtyp].flags));
- if (bkey_val_bytes(k.k) != sizeof(struct bch_quota))
- return "incorrect value size";
+ for (counter = 0; counter < Q_COUNTERS; counter++)
+ prt_printf(out, " %s timelimit %u warnlimit %u",
+ bch2_quota_counters[counter],
+ le32_to_cpu(q->q[qtyp].c[counter].timelimit),
+ le32_to_cpu(q->q[qtyp].c[counter].warnlimit));
- return NULL;
- }
- default:
- return "invalid type";
+ prt_newline(out);
}
}
-static const char * const bch2_quota_counters[] = {
- "space",
- "inodes",
+const struct bch_sb_field_ops bch_sb_field_ops_quota = {
+ .validate = bch2_sb_quota_validate,
+ .to_text = bch2_sb_quota_to_text,
};
-void bch2_quota_to_text(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c k)
+int bch2_quota_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
- char *out = buf, *end= buf + size;
- struct bkey_s_c_quota dq;
- unsigned i;
+ int ret = 0;
- switch (k.k->type) {
- case BCH_QUOTA:
- dq = bkey_s_c_to_quota(k);
+ bkey_fsck_err_on(k.k->p.inode >= QTYP_NR,
+ c, quota_type_invalid,
+ "invalid quota type (%llu >= %u)",
+ k.k->p.inode, QTYP_NR);
+fsck_err:
+ return ret;
+}
- for (i = 0; i < Q_COUNTERS; i++)
- out += scnprintf(out, end - out, "%s hardlimit %llu softlimit %llu",
- bch2_quota_counters[i],
- le64_to_cpu(dq.v->c[i].hardlimit),
- le64_to_cpu(dq.v->c[i].softlimit));
- break;
- }
+void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_quota dq = bkey_s_c_to_quota(k);
+ unsigned i;
+
+ for (i = 0; i < Q_COUNTERS; i++)
+ prt_printf(out, "%s hardlimit %llu softlimit %llu",
+ bch2_quota_counters[i],
+ le64_to_cpu(dq.v->c[i].hardlimit),
+ le64_to_cpu(dq.v->c[i].softlimit));
}
#ifdef CONFIG_BCACHEFS_QUOTA
@@ -71,6 +91,39 @@ void bch2_quota_to_text(struct bch_fs *c, char *buf,
#include <linux/fs.h>
#include <linux/quota.h>
+static void qc_info_to_text(struct printbuf *out, struct qc_info *i)
+{
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 20);
+
+ prt_printf(out, "i_fieldmask\t%x\n", i->i_fieldmask);
+ prt_printf(out, "i_flags\t%u\n", i->i_flags);
+ prt_printf(out, "i_spc_timelimit\t%u\n", i->i_spc_timelimit);
+ prt_printf(out, "i_ino_timelimit\t%u\n", i->i_ino_timelimit);
+ prt_printf(out, "i_rt_spc_timelimit\t%u\n", i->i_rt_spc_timelimit);
+ prt_printf(out, "i_spc_warnlimit\t%u\n", i->i_spc_warnlimit);
+ prt_printf(out, "i_ino_warnlimit\t%u\n", i->i_ino_warnlimit);
+ prt_printf(out, "i_rt_spc_warnlimit\t%u\n", i->i_rt_spc_warnlimit);
+}
+
+static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q)
+{
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 20);
+
+ prt_printf(out, "d_fieldmask\t%x\n", q->d_fieldmask);
+ prt_printf(out, "d_spc_hardlimit\t%llu\n", q->d_spc_hardlimit);
+ prt_printf(out, "d_spc_softlimit\t%llu\n", q->d_spc_softlimit);
+	prt_printf(out, "d_ino_hardlimit\t%llu\n", q->d_ino_hardlimit);
+ prt_printf(out, "d_ino_softlimit\t%llu\n", q->d_ino_softlimit);
+ prt_printf(out, "d_space\t%llu\n", q->d_space);
+ prt_printf(out, "d_ino_count\t%llu\n", q->d_ino_count);
+ prt_printf(out, "d_ino_timer\t%llu\n", q->d_ino_timer);
+ prt_printf(out, "d_spc_timer\t%llu\n", q->d_spc_timer);
+ prt_printf(out, "d_ino_warns\t%i\n", q->d_ino_warns);
+ prt_printf(out, "d_spc_warns\t%i\n", q->d_spc_warns);
+}
+
static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
{
qtypes >>= i;
@@ -178,7 +231,7 @@ static int bch2_quota_check_limit(struct bch_fs *c,
BUG_ON((s64) n < 0);
- if (mode == BCH_QUOTA_NOCHECK)
+ if (mode == KEY_TYPE_QUOTA_NOCHECK)
return 0;
if (v <= 0) {
@@ -201,34 +254,20 @@ static int bch2_quota_check_limit(struct bch_fs *c,
if (qc->hardlimit &&
qc->hardlimit < n &&
!ignore_hardlimit(q)) {
- if (mode == BCH_QUOTA_PREALLOC)
- return -EDQUOT;
-
prepare_warning(qc, qtype, counter, msgs, HARDWARN);
+ return -EDQUOT;
}
if (qc->softlimit &&
- qc->softlimit < n &&
- qc->timer &&
- ktime_get_real_seconds() >= qc->timer &&
- !ignore_hardlimit(q)) {
- if (mode == BCH_QUOTA_PREALLOC)
- return -EDQUOT;
-
- prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
- }
-
- if (qc->softlimit &&
- qc->softlimit < n &&
- qc->timer == 0) {
- if (mode == BCH_QUOTA_PREALLOC)
+ qc->softlimit < n) {
+ if (qc->timer == 0) {
+ qc->timer = ktime_get_real_seconds() + q->limits[counter].timelimit;
+ prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
+ } else if (ktime_get_real_seconds() >= qc->timer &&
+ !ignore_hardlimit(q)) {
+ prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
return -EDQUOT;
-
- prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
-
- /* XXX is this the right one? */
- qc->timer = ktime_get_real_seconds() +
- q->limits[counter].warnlimit;
+ }
}
return 0;
@@ -247,16 +286,16 @@ int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
memset(&msgs, 0, sizeof(msgs));
+ for_each_set_qtype(c, i, q, qtypes) {
+ mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_KERNEL);
+ if (!mq[i])
+ return -ENOMEM;
+ }
+
for_each_set_qtype(c, i, q, qtypes)
mutex_lock_nested(&q->lock, i);
for_each_set_qtype(c, i, q, qtypes) {
- mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS);
- if (!mq[i]) {
- ret = -ENOMEM;
- goto err;
- }
-
ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode);
if (ret)
goto err;
@@ -286,7 +325,8 @@ static void __bch2_quota_transfer(struct bch_memquota *src_q,
int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
struct bch_qid dst,
- struct bch_qid src, u64 space)
+ struct bch_qid src, u64 space,
+ enum quota_acct_mode mode)
{
struct bch_memquota_type *q;
struct bch_memquota *src_q[3], *dst_q[3];
@@ -298,27 +338,26 @@ int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
memset(&msgs, 0, sizeof(msgs));
+ for_each_set_qtype(c, i, q, qtypes) {
+ src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_KERNEL);
+ dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_KERNEL);
+ if (!src_q[i] || !dst_q[i])
+ return -ENOMEM;
+ }
+
for_each_set_qtype(c, i, q, qtypes)
mutex_lock_nested(&q->lock, i);
for_each_set_qtype(c, i, q, qtypes) {
- src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS);
- dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_NOFS);
-
- if (!src_q[i] || !dst_q[i]) {
- ret = -ENOMEM;
- goto err;
- }
-
ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC,
dst_q[i]->c[Q_SPC].v + space,
- BCH_QUOTA_PREALLOC);
+ mode);
if (ret)
goto err;
ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO,
dst_q[i]->c[Q_INO].v + 1,
- BCH_QUOTA_PREALLOC);
+ mode);
if (ret)
goto err;
}
@@ -337,7 +376,8 @@ err:
return ret;
}
-static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k)
+static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k,
+ struct qc_dqblk *qdq)
{
struct bkey_s_c_quota dq;
struct bch_memquota_type *q;
@@ -346,8 +386,11 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k)
BUG_ON(k.k->p.inode >= QTYP_NR);
+ if (!((1U << k.k->p.inode) & enabled_qtypes(c)))
+ return 0;
+
switch (k.k->type) {
- case BCH_QUOTA:
+ case KEY_TYPE_quota:
dq = bkey_s_c_to_quota(k);
q = &c->quotas[k.k->p.inode];
@@ -363,31 +406,21 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k)
mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit);
}
+ if (qdq && qdq->d_fieldmask & QC_SPC_TIMER)
+ mq->c[Q_SPC].timer = qdq->d_spc_timer;
+ if (qdq && qdq->d_fieldmask & QC_SPC_WARNS)
+ mq->c[Q_SPC].warns = qdq->d_spc_warns;
+ if (qdq && qdq->d_fieldmask & QC_INO_TIMER)
+ mq->c[Q_INO].timer = qdq->d_ino_timer;
+ if (qdq && qdq->d_fieldmask & QC_INO_WARNS)
+ mq->c[Q_INO].warns = qdq->d_ino_warns;
+
mutex_unlock(&q->lock);
}
return 0;
}
-static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- for_each_btree_key(&iter, c, BTREE_ID_QUOTAS, POS(type, 0),
- BTREE_ITER_PREFETCH, k) {
- if (k.k->p.inode != type)
- break;
-
- ret = __bch2_quota_set(c, k);
- if (ret)
- break;
- }
-
- return bch2_btree_iter_unlock(&iter) ?: ret;
-}
-
void bch2_fs_quota_exit(struct bch_fs *c)
{
unsigned i;
@@ -404,12 +437,32 @@ void bch2_fs_quota_init(struct bch_fs *c)
mutex_init(&c->quotas[i].lock);
}
+static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb)
+{
+ struct bch_sb_field_quota *sb_quota = bch2_sb_field_get(sb->sb, quota);
+
+ if (sb_quota)
+ return sb_quota;
+
+ sb_quota = bch2_sb_field_resize(sb, quota, sizeof(*sb_quota) / sizeof(u64));
+ if (sb_quota) {
+ unsigned qtype, qc;
+
+ for (qtype = 0; qtype < QTYP_NR; qtype++)
+ for (qc = 0; qc < Q_COUNTERS; qc++)
+ sb_quota->q[qtype].c[qc].timelimit =
+ cpu_to_le32(7 * 24 * 60 * 60);
+ }
+
+ return sb_quota;
+}
+
static void bch2_sb_quota_read(struct bch_fs *c)
{
struct bch_sb_field_quota *sb_quota;
unsigned i, j;
- sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
+ sb_quota = bch2_sb_field_get(c->disk_sb.sb, quota);
if (!sb_quota)
return;
@@ -425,40 +478,70 @@ static void bch2_sb_quota_read(struct bch_fs *c)
}
}
-int bch2_fs_quota_read(struct bch_fs *c)
+static int bch2_fs_quota_read_inode(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
{
- unsigned i, qtypes = enabled_qtypes(c);
- struct bch_memquota_type *q;
- struct btree_iter iter;
+ struct bch_fs *c = trans->c;
struct bch_inode_unpacked u;
- struct bkey_s_c k;
- int ret;
+ struct bch_snapshot_tree s_t;
+ u32 tree = bch2_snapshot_tree(c, k.k->p.snapshot);
+
+ int ret = bch2_snapshot_tree_lookup(trans, tree, &s_t);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+ "%s: snapshot tree %u not found", __func__, tree);
+ if (ret)
+ return ret;
+
+ if (!s_t.master_subvol)
+ goto advance;
+
+ ret = bch2_inode_find_by_inum_nowarn_trans(trans,
+ (subvol_inum) {
+ le32_to_cpu(s_t.master_subvol),
+ k.k->p.offset,
+ }, &u);
+ /*
+ * Inode might be deleted in this snapshot - the easiest way to handle
+ * that is to just skip it here:
+ */
+ if (bch2_err_matches(ret, ENOENT))
+ goto advance;
+
+ if (ret)
+ return ret;
+
+ bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
+ KEY_TYPE_QUOTA_NOCHECK);
+ bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
+ KEY_TYPE_QUOTA_NOCHECK);
+advance:
+ bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
+ return 0;
+}
+
+int bch2_fs_quota_read(struct bch_fs *c)
+{
mutex_lock(&c->sb_lock);
+ struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
+ if (!sb_quota) {
+ mutex_unlock(&c->sb_lock);
+ return -BCH_ERR_ENOSPC_sb_quota;
+ }
+
bch2_sb_quota_read(c);
mutex_unlock(&c->sb_lock);
- for_each_set_qtype(c, i, q, qtypes) {
- ret = bch2_quota_init_type(c, i);
- if (ret)
- return ret;
- }
-
- for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN,
- BTREE_ITER_PREFETCH, k) {
- switch (k.k->type) {
- case BCH_INODE_FS:
- ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
- if (ret)
- return ret;
-
- bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
- BCH_QUOTA_NOCHECK);
- bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
- BCH_QUOTA_NOCHECK);
- }
- }
- return bch2_btree_iter_unlock(&iter) ?: ret;
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN,
+ BTREE_ITER_prefetch, k,
+ __bch2_quota_set(c, k, NULL)) ?:
+ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
+ bch2_fs_quota_read_inode(trans, &iter, k)));
+ bch_err_fn(c, ret);
+ return ret;
}
/* Enable/disable/delete quotas for an entire filesystem: */
@@ -466,8 +549,10 @@ int bch2_fs_quota_read(struct bch_fs *c)
static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
{
struct bch_fs *c = sb->s_fs_info;
+ struct bch_sb_field_quota *sb_quota;
+ int ret = 0;
- if (sb->s_flags & MS_RDONLY)
+ if (sb->s_flags & SB_RDONLY)
return -EROFS;
/* Accounting must be enabled at mount time: */
@@ -485,6 +570,12 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
return -EINVAL;
mutex_lock(&c->sb_lock);
+ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
+ if (!sb_quota) {
+ ret = -BCH_ERR_ENOSPC_sb_quota;
+ goto unlock;
+ }
+
if (uflags & FS_QUOTA_UDQ_ENFD)
SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);
@@ -495,16 +586,17 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);
bch2_write_super(c);
+unlock:
mutex_unlock(&c->sb_lock);
- return 0;
+ return bch2_err_class(ret);
}
static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
{
struct bch_fs *c = sb->s_fs_info;
- if (sb->s_flags & MS_RDONLY)
+ if (sb->s_flags & SB_RDONLY)
return -EROFS;
mutex_lock(&c->sb_lock);
@@ -528,17 +620,17 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
struct bch_fs *c = sb->s_fs_info;
int ret;
- if (sb->s_flags & MS_RDONLY)
+ if (sb->s_flags & SB_RDONLY)
return -EROFS;
if (uflags & FS_USER_QUOTA) {
if (c->opts.usrquota)
return -EINVAL;
- ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
+ ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
POS(QTYP_USR, 0),
- POS(QTYP_USR + 1, 0),
- ZERO_VERSION, NULL, NULL, NULL);
+ POS(QTYP_USR, U64_MAX),
+ 0, NULL);
if (ret)
return ret;
}
@@ -547,10 +639,10 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
if (c->opts.grpquota)
return -EINVAL;
- ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
+ ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
POS(QTYP_GRP, 0),
- POS(QTYP_GRP + 1, 0),
- ZERO_VERSION, NULL, NULL, NULL);
+ POS(QTYP_GRP, U64_MAX),
+ 0, NULL);
if (ret)
return ret;
}
@@ -559,10 +651,10 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
if (c->opts.prjquota)
return -EINVAL;
- ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
+ ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
POS(QTYP_PRJ, 0),
- POS(QTYP_PRJ + 1, 0),
- ZERO_VERSION, NULL, NULL, NULL);
+ POS(QTYP_PRJ, U64_MAX),
+ 0, NULL);
if (ret)
return ret;
}
@@ -608,9 +700,17 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
{
struct bch_fs *c = sb->s_fs_info;
struct bch_sb_field_quota *sb_quota;
- struct bch_memquota_type *q;
+ int ret = 0;
+
+ if (0) {
+ struct printbuf buf = PRINTBUF;
- if (sb->s_flags & MS_RDONLY)
+ qc_info_to_text(&buf, info);
+ pr_info("setting:\n%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ if (sb->s_flags & SB_RDONLY)
return -EROFS;
if (type >= QTYP_NR)
@@ -623,15 +723,11 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS))
return -EINVAL;
- q = &c->quotas[type];
-
mutex_lock(&c->sb_lock);
- sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
+ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
if (!sb_quota) {
- sb_quota = bch2_sb_resize_quota(&c->disk_sb,
- sizeof(*sb_quota) / sizeof(u64));
- if (!sb_quota)
- return -ENOSPC;
+ ret = -BCH_ERR_ENOSPC_sb_quota;
+ goto unlock;
}
if (info->i_fieldmask & QC_SPC_TIMER)
@@ -653,9 +749,10 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
bch2_sb_quota_read(c);
bch2_write_super(c);
+unlock:
mutex_unlock(&c->sb_lock);
- return 0;
+ return bch2_err_class(ret);
}
/* Get/set individual quotas: */
@@ -700,77 +797,83 @@ static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid,
struct bch_fs *c = sb->s_fs_info;
struct bch_memquota_type *q = &c->quotas[kqid->type];
qid_t qid = from_kqid(&init_user_ns, *kqid);
- struct genradix_iter iter = genradix_iter_init(&q->table, qid);
+ struct genradix_iter iter;
struct bch_memquota *mq;
int ret = 0;
mutex_lock(&q->lock);
- while ((mq = genradix_iter_peek(&iter, &q->table))) {
+ genradix_for_each_from(&q->table, iter, mq, qid)
if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) {
__bch2_quota_get(qdq, mq);
*kqid = make_kqid(current_user_ns(), kqid->type, iter.pos);
goto found;
}
- genradix_iter_advance(&iter, &q->table);
- }
-
ret = -ENOENT;
found:
mutex_unlock(&q->lock);
- return ret;
+ return bch2_err_class(ret);
}
-static int bch2_set_quota(struct super_block *sb, struct kqid qid,
- struct qc_dqblk *qdq)
+static int bch2_set_quota_trans(struct btree_trans *trans,
+ struct bkey_i_quota *new_quota,
+ struct qc_dqblk *qdq)
{
- struct bch_fs *c = sb->s_fs_info;
struct btree_iter iter;
struct bkey_s_c k;
- struct bkey_i_quota new_quota;
int ret;
- if (sb->s_flags & MS_RDONLY)
- return -EROFS;
-
- bkey_quota_init(&new_quota.k_i);
- new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
-
- bch2_btree_iter_init(&iter, c, BTREE_ID_QUOTAS, new_quota.k.p,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_slot(&iter);
-
- ret = btree_iter_err(k);
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p,
+ BTREE_ITER_slots|BTREE_ITER_intent);
+ ret = bkey_err(k);
if (unlikely(ret))
return ret;
- switch (k.k->type) {
- case BCH_QUOTA:
- new_quota.v = *bkey_s_c_to_quota(k).v;
- break;
- }
+ if (k.k->type == KEY_TYPE_quota)
+ new_quota->v = *bkey_s_c_to_quota(k).v;
if (qdq->d_fieldmask & QC_SPC_SOFT)
- new_quota.v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9);
+ new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9);
if (qdq->d_fieldmask & QC_SPC_HARD)
- new_quota.v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9);
+ new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9);
if (qdq->d_fieldmask & QC_INO_SOFT)
- new_quota.v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit);
+ new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit);
if (qdq->d_fieldmask & QC_INO_HARD)
- new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
+ new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
- ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
- BTREE_INSERT_ENTRY(&iter, &new_quota.k_i));
- bch2_btree_iter_unlock(&iter);
+ ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
- if (ret)
- return ret;
+static int bch2_set_quota(struct super_block *sb, struct kqid qid,
+ struct qc_dqblk *qdq)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct bkey_i_quota new_quota;
+ int ret;
- ret = __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i));
+ if (0) {
+ struct printbuf buf = PRINTBUF;
- return ret;
+ qc_dqblk_to_text(&buf, qdq);
+ pr_info("setting:\n%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ if (sb->s_flags & SB_RDONLY)
+ return -EROFS;
+
+ bkey_quota_init(&new_quota.k_i);
+ new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
+
+ ret = bch2_trans_commit_do(c, NULL, NULL, 0,
+ bch2_set_quota_trans(trans, &new_quota, qdq)) ?:
+ __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq);
+
+ return bch2_err_class(ret);
}
const struct quotactl_ops bch2_quotactl_operations = {
diff --git a/libbcachefs/quota.h b/libbcachefs/quota.h
index 0b24f22c..1551800f 100644
--- a/libbcachefs/quota.h
+++ b/libbcachefs/quota.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_QUOTA_H
#define _BCACHEFS_QUOTA_H
@@ -6,26 +7,22 @@
extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
-const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_quota_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+int bch2_quota_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-#define bch2_bkey_quota_ops (struct bkey_ops) { \
- .key_invalid = bch2_quota_invalid, \
+#define bch2_bkey_ops_quota ((struct bkey_ops) { \
+ .key_validate = bch2_quota_validate, \
.val_to_text = bch2_quota_to_text, \
-}
-
-enum quota_acct_mode {
- BCH_QUOTA_PREALLOC,
- BCH_QUOTA_WARN,
- BCH_QUOTA_NOCHECK,
-};
+ .min_val_size = 32, \
+})
static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u)
{
return (struct bch_qid) {
.q[QTYP_USR] = u->bi_uid,
.q[QTYP_GRP] = u->bi_gid,
- .q[QTYP_PRJ] = u->bi_project,
+ .q[QTYP_PRJ] = u->bi_project ? u->bi_project - 1 : 0,
};
}
@@ -42,7 +39,7 @@ int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters,
s64, enum quota_acct_mode);
int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid,
- struct bch_qid, u64);
+ struct bch_qid, u64, enum quota_acct_mode);
void bch2_fs_quota_exit(struct bch_fs *);
void bch2_fs_quota_init(struct bch_fs *);
@@ -61,7 +58,8 @@ static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
struct bch_qid dst,
- struct bch_qid src, u64 space)
+ struct bch_qid src, u64 space,
+ enum quota_acct_mode mode)
{
return 0;
}
diff --git a/libbcachefs/quota_format.h b/libbcachefs/quota_format.h
new file mode 100644
index 00000000..dc34347e
--- /dev/null
+++ b/libbcachefs/quota_format.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_QUOTA_FORMAT_H
+#define _BCACHEFS_QUOTA_FORMAT_H
+
+/* KEY_TYPE_quota: */
+
+enum quota_types {
+ QTYP_USR = 0,
+ QTYP_GRP = 1,
+ QTYP_PRJ = 2,
+ QTYP_NR = 3,
+};
+
+enum quota_counters {
+ Q_SPC = 0,
+ Q_INO = 1,
+ Q_COUNTERS = 2,
+};
+
+struct bch_quota_counter {
+ __le64 hardlimit;
+ __le64 softlimit;
+};
+
+struct bch_quota {
+ struct bch_val v;
+ struct bch_quota_counter c[Q_COUNTERS];
+} __packed __aligned(8);
+
+/* BCH_SB_FIELD_quota: */
+
+struct bch_sb_quota_counter {
+ __le32 timelimit;
+ __le32 warnlimit;
+};
+
+struct bch_sb_quota_type {
+ __le64 flags;
+ struct bch_sb_quota_counter c[Q_COUNTERS];
+};
+
+struct bch_sb_field_quota {
+ struct bch_sb_field field;
+ struct bch_sb_quota_type q[QTYP_NR];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_QUOTA_FORMAT_H */
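For illustration, a minimal sketch of how the on-disk quota format above is keyed: each quota entry is an ordinary btree key in BTREE_ID_quotas at POS(quota type, qid), with space limits stored in 512-byte sectors, as the bch2_quota_remove()/bch2_set_quota() hunks earlier in this diff show. Hypothetical values, error handling omitted:

	struct bkey_i_quota q;

	bkey_quota_init(&q.k_i);
	q.k.p = POS(QTYP_USR, 1000);			/* user quota for qid 1000 */
	q.v.c[Q_SPC].softlimit = cpu_to_le64(1 << 21);	/* 1 GiB, expressed in 512-byte sectors */
	q.v.c[Q_INO].hardlimit = cpu_to_le64(10000);	/* at most 10000 inodes */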
diff --git a/libbcachefs/quota_types.h b/libbcachefs/quota_types.h
index bcaed4ea..6a136083 100644
--- a/libbcachefs/quota_types.h
+++ b/libbcachefs/quota_types.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_QUOTA_TYPES_H
#define _BCACHEFS_QUOTA_TYPES_H
@@ -7,6 +8,12 @@ struct bch_qid {
u32 q[QTYP_NR];
};
+enum quota_acct_mode {
+ KEY_TYPE_QUOTA_PREALLOC,
+ KEY_TYPE_QUOTA_WARN,
+ KEY_TYPE_QUOTA_NOCHECK,
+};
+
struct memquota_counter {
u64 v;
u64 hardlimit;
diff --git a/libbcachefs/rcu_pending.c b/libbcachefs/rcu_pending.c
new file mode 100644
index 00000000..67522aa3
--- /dev/null
+++ b/libbcachefs/rcu_pending.c
@@ -0,0 +1,652 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) "%s() " fmt "\n", __func__
+
+#include <linux/generic-radix-tree.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/srcu.h>
+#include <linux/vmalloc.h>
+
+#include "rcu_pending.h"
+#include "darray.h"
+#include "util.h"
+
+#define static_array_for_each(_a, _i) \
+ for (typeof(&(_a)[0]) _i = _a; \
+ _i < (_a) + ARRAY_SIZE(_a); \
+ _i++)
+
+enum rcu_pending_special {
+ RCU_PENDING_KVFREE = 1,
+ RCU_PENDING_CALL_RCU = 2,
+};
+
+#define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE)
+#define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU)
+
+static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp)
+{
+ return ssp
+ ? get_state_synchronize_srcu(ssp)
+ : get_state_synchronize_rcu();
+}
+
+static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp)
+{
+ return ssp
+ ? start_poll_synchronize_srcu(ssp)
+ : start_poll_synchronize_rcu();
+}
+
+static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+ return ssp
+ ? poll_state_synchronize_srcu(ssp, cookie)
+ : poll_state_synchronize_rcu(cookie);
+}
+
+static inline void __rcu_barrier(struct srcu_struct *ssp)
+{
+ return ssp
+ ? srcu_barrier(ssp)
+ : rcu_barrier();
+}
+
+static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp,
+ rcu_callback_t func)
+{
+ if (ssp)
+ call_srcu(ssp, rhp, func);
+ else
+ call_rcu(rhp, func);
+}
+
+struct rcu_pending_seq {
+ /*
+ * We're using a radix tree like a vector - we're just pushing elements
+ * onto the end; we're using a radix tree instead of an actual vector to
+ * avoid reallocation overhead
+ */
+ GENRADIX(struct rcu_head *) objs;
+ size_t nr;
+ struct rcu_head **cursor;
+ unsigned long seq;
+};
+
+struct rcu_pending_list {
+ struct rcu_head *head;
+ struct rcu_head *tail;
+ unsigned long seq;
+};
+
+struct rcu_pending_pcpu {
+ struct rcu_pending *parent;
+ spinlock_t lock;
+ int cpu;
+
+ /*
+ * We can't bound the number of unprocessed gp sequence numbers, and we
+ * can't efficiently merge radix trees for expired grace periods, so we
+ * need darray/vector:
+ */
+ DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs;
+
+ /* Third entry is for expired objects: */
+ struct rcu_pending_list lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1];
+
+ struct rcu_head cb;
+ bool cb_armed;
+ struct work_struct work;
+};
+
+static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p)
+{
+ if (p->objs.nr)
+ return true;
+
+ static_array_for_each(p->lists, i)
+ if (i->head)
+ return true;
+
+ return false;
+}
+
+static void rcu_pending_list_merge(struct rcu_pending_list *l1,
+ struct rcu_pending_list *l2)
+{
+#ifdef __KERNEL__
+ if (!l1->head)
+ l1->head = l2->head;
+ else
+ l1->tail->next = l2->head;
+#else
+ if (!l1->head)
+ l1->head = l2->head;
+ else
+ l1->tail->next.next = (void *) l2->head;
+#endif
+
+ l1->tail = l2->tail;
+ l2->head = l2->tail = NULL;
+}
+
+static void rcu_pending_list_add(struct rcu_pending_list *l,
+ struct rcu_head *n)
+{
+#ifdef __KERNEL__
+ if (!l->head)
+ l->head = n;
+ else
+ l->tail->next = n;
+ l->tail = n;
+ n->next = NULL;
+#else
+ if (!l->head)
+ l->head = n;
+ else
+ l->tail->next.next = (void *) n;
+ l->tail = n;
+ n->next.next = NULL;
+#endif
+}
+
+static void merge_expired_lists(struct rcu_pending_pcpu *p)
+{
+ struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
+
+ for (struct rcu_pending_list *i = p->lists; i < expired; i++)
+ if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq))
+ rcu_pending_list_merge(expired, i);
+}
+
+#ifndef __KERNEL__
+static inline void kfree_bulk(size_t nr, void ** p)
+{
+ while (nr--)
+ kfree(*p);
+}
+
+#define local_irq_save(flags) \
+do { \
+ flags = 0; \
+} while (0)
+#endif
+
+static noinline void __process_finished_items(struct rcu_pending *pending,
+ struct rcu_pending_pcpu *p,
+ unsigned long flags)
+{
+ struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
+ struct rcu_pending_seq objs = {};
+ struct rcu_head *list = NULL;
+
+ if (p->objs.nr &&
+ __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) {
+ objs = p->objs.data[0];
+ darray_remove_item(&p->objs, p->objs.data);
+ }
+
+ merge_expired_lists(p);
+
+ list = expired->head;
+ expired->head = expired->tail = NULL;
+
+ spin_unlock_irqrestore(&p->lock, flags);
+
+ switch ((ulong) pending->process) {
+ case RCU_PENDING_KVFREE:
+ for (size_t i = 0; i < objs.nr; ) {
+ size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i);
+
+ kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i));
+ i += nr_this_node;
+ }
+ genradix_free(&objs.objs);
+
+ while (list) {
+ struct rcu_head *obj = list;
+#ifdef __KERNEL__
+ list = obj->next;
+#else
+ list = (void *) obj->next.next;
+#endif
+
+ /*
+ * low bit of pointer indicates whether rcu_head needs
+ * to be freed - kvfree_rcu_mightsleep()
+ */
+ BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0);
+
+ void *ptr = (void *)(((unsigned long) obj->func) & ~1UL);
+ bool free_head = ((unsigned long) obj->func) & 1UL;
+
+ kvfree(ptr);
+ if (free_head)
+ kfree(obj);
+ }
+
+ break;
+
+ case RCU_PENDING_CALL_RCU:
+ for (size_t i = 0; i < objs.nr; i++) {
+ struct rcu_head *obj = *genradix_ptr(&objs.objs, i);
+ obj->func(obj);
+ }
+ genradix_free(&objs.objs);
+
+ while (list) {
+ struct rcu_head *obj = list;
+#ifdef __KERNEL__
+ list = obj->next;
+#else
+ list = (void *) obj->next.next;
+#endif
+ obj->func(obj);
+ }
+ break;
+
+ default:
+ for (size_t i = 0; i < objs.nr; i++)
+ pending->process(pending, *genradix_ptr(&objs.objs, i));
+ genradix_free(&objs.objs);
+
+ while (list) {
+ struct rcu_head *obj = list;
+#ifdef __KERNEL__
+ list = obj->next;
+#else
+ list = (void *) obj->next.next;
+#endif
+ pending->process(pending, obj);
+ }
+ break;
+ }
+}
+
+static bool process_finished_items(struct rcu_pending *pending,
+ struct rcu_pending_pcpu *p,
+ unsigned long flags)
+{
+ /*
+ * XXX: we should grab the gp seq once and avoid multiple function
+ * calls; this is called from the __rcu_pending_enqueue() fastpath in
+ * may_sleep==true mode
+ */
+ if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) ||
+ (p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) ||
+ (p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) ||
+ p->lists[2].head) {
+ __process_finished_items(pending, p, flags);
+ return true;
+ }
+
+ return false;
+}
+
+static void rcu_pending_work(struct work_struct *work)
+{
+ struct rcu_pending_pcpu *p =
+ container_of(work, struct rcu_pending_pcpu, work);
+ struct rcu_pending *pending = p->parent;
+ unsigned long flags;
+
+ do {
+ spin_lock_irqsave(&p->lock, flags);
+ } while (process_finished_items(pending, p, flags));
+
+ spin_unlock_irqrestore(&p->lock, flags);
+}
+
+static void rcu_pending_rcu_cb(struct rcu_head *rcu)
+{
+ struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb);
+
+ schedule_work_on(p->cpu, &p->work);
+
+ unsigned long flags;
+ spin_lock_irqsave(&p->lock, flags);
+ if (__rcu_pending_has_pending(p)) {
+ spin_unlock_irqrestore(&p->lock, flags);
+ __call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb);
+ } else {
+ p->cb_armed = false;
+ spin_unlock_irqrestore(&p->lock, flags);
+ }
+}
+
+static __always_inline struct rcu_pending_seq *
+get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq)
+{
+ darray_for_each_reverse(p->objs, objs)
+ if (objs->seq == seq)
+ return objs;
+
+ if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC))
+ return NULL;
+
+ return &darray_last(p->objs);
+}
+
+static noinline bool
+rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq,
+ struct rcu_head *head, void *ptr,
+ unsigned long *flags)
+{
+ if (ptr) {
+ if (!head) {
+ /*
+ * kvfree_rcu_mightsleep(): we weren't passed an
+ * rcu_head, but we need one: use the low bit of the
+ * pointer to free to flag that the head needs to be
+ * freed as well:
+ */
+ ptr = (void *)(((unsigned long) ptr)|1UL);
+ head = kmalloc(sizeof(*head), __GFP_NOWARN);
+ if (!head) {
+ spin_unlock_irqrestore(&p->lock, *flags);
+ head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL);
+ /*
+ * dropped lock, did GFP_KERNEL allocation,
+ * check for gp expiration
+ */
+ if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) {
+ kvfree(--ptr);
+ kfree(head);
+ spin_lock_irqsave(&p->lock, *flags);
+ return false;
+ }
+ }
+ }
+
+ head->func = ptr;
+ }
+again:
+ for (struct rcu_pending_list *i = p->lists;
+ i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
+ if (i->seq == seq) {
+ rcu_pending_list_add(i, head);
+ return false;
+ }
+ }
+
+ for (struct rcu_pending_list *i = p->lists;
+ i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
+ if (!i->head) {
+ i->seq = seq;
+ rcu_pending_list_add(i, head);
+ return true;
+ }
+ }
+
+ merge_expired_lists(p);
+ goto again;
+}
+
+/*
+ * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via
+ * pending->process) once a grace period elapses.
+ *
+ * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall
+ * back to a linked list.
+ *
+ * - If @ptr is NULL, we're enqueuing an item for a generic @pending with a
+ * process callback
+ *
+ * - If @ptr and @head are both not NULL, we're kvfree_rcu()
+ *
+ * - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep()
+ *
+ * - If @may_sleep is true, will do GFP_KERNEL memory allocations and process
+ * expired items.
+ */
+static __always_inline void
+__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
+ void *ptr, bool may_sleep)
+{
+
+ struct rcu_pending_pcpu *p;
+ struct rcu_pending_seq *objs;
+ struct genradix_node *new_node = NULL;
+ unsigned long seq, flags;
+ bool start_gp = false;
+
+ BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN));
+
+ local_irq_save(flags);
+ p = this_cpu_ptr(pending->p);
+ spin_lock(&p->lock);
+ seq = __get_state_synchronize_rcu(pending->srcu);
+restart:
+ if (may_sleep &&
+ unlikely(process_finished_items(pending, p, flags)))
+ goto check_expired;
+
+ /*
+ * In kvfree_rcu() mode, the radix tree is only for slab pointers so
+ * that we can do kfree_bulk() - vmalloc pointers always use the linked
+ * list:
+ */
+ if (ptr && unlikely(is_vmalloc_addr(ptr)))
+ goto list_add;
+
+ objs = get_object_radix(p, seq);
+ if (unlikely(!objs))
+ goto list_add;
+
+ if (unlikely(!objs->cursor)) {
+ /*
+ * New radix tree nodes must be added under @p->lock because the
+ * tree root is in a darray that can be resized (typically,
+ * genradix supports concurrent unlocked allocation of new
+ * nodes) - hence preallocation and the retry loop:
+ */
+ objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs,
+ objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN);
+ if (unlikely(!objs->cursor)) {
+ if (may_sleep) {
+ spin_unlock_irqrestore(&p->lock, flags);
+
+ gfp_t gfp = GFP_KERNEL;
+ if (!head)
+ gfp |= __GFP_NOFAIL;
+
+ new_node = genradix_alloc_node(gfp);
+ if (!new_node)
+ may_sleep = false;
+ goto check_expired;
+ }
+list_add:
+ start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags);
+ goto start_gp;
+ }
+ }
+
+ *objs->cursor++ = ptr ?: head;
+ /* zero cursor if we hit the end of a radix tree node: */
+ if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1)))
+ objs->cursor = NULL;
+ start_gp = !objs->nr;
+ objs->nr++;
+start_gp:
+ if (unlikely(start_gp)) {
+ /*
+ * We only have one callback (ideally, we would have one for
+ * every outstanding grace period) - so if our callback is
+ * already in flight, we may still have to start a grace period
+ * (since we used get_state() above, not start_poll())
+ */
+ if (!p->cb_armed) {
+ p->cb_armed = true;
+ spin_unlock_irqrestore(&p->lock, flags);
+ __call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb);
+ goto free_node;
+ } else {
+ __start_poll_synchronize_rcu(pending->srcu);
+ }
+ }
+ spin_unlock_irqrestore(&p->lock, flags);
+free_node:
+ if (new_node)
+ genradix_free_node(new_node);
+ return;
+check_expired:
+ if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) {
+ switch ((ulong) pending->process) {
+ case RCU_PENDING_KVFREE:
+ kvfree(ptr);
+ break;
+ case RCU_PENDING_CALL_RCU:
+ head->func(head);
+ break;
+ default:
+ pending->process(pending, head);
+ break;
+ }
+ goto free_node;
+ }
+
+ local_irq_save(flags);
+ p = this_cpu_ptr(pending->p);
+ spin_lock(&p->lock);
+ goto restart;
+}
+
+void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj)
+{
+ __rcu_pending_enqueue(pending, obj, NULL, true);
+}
+
+static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p)
+{
+ struct rcu_head *ret = NULL;
+
+ spin_lock_irq(&p->lock);
+ darray_for_each(p->objs, objs)
+ if (objs->nr) {
+ ret = *genradix_ptr(&objs->objs, --objs->nr);
+ objs->cursor = NULL;
+ if (!objs->nr)
+ genradix_free(&objs->objs);
+ goto out;
+ }
+
+ static_array_for_each(p->lists, i)
+ if (i->head) {
+ ret = i->head;
+#ifdef __KERNEL__
+ i->head = ret->next;
+#else
+ i->head = (void *) ret->next.next;
+#endif
+ if (!i->head)
+ i->tail = NULL;
+ goto out;
+ }
+out:
+ spin_unlock_irq(&p->lock);
+
+ return ret;
+}
+
+struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending)
+{
+ return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p));
+}
+
+struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending)
+{
+ struct rcu_head *ret = rcu_pending_dequeue(pending);
+
+ if (ret)
+ return ret;
+
+ int cpu;
+ for_each_possible_cpu(cpu) {
+ ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
+static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending)
+{
+ int cpu;
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+ spin_lock_irq(&p->lock);
+ if (__rcu_pending_has_pending(p) || p->cb_armed) {
+ spin_unlock_irq(&p->lock);
+ return true;
+ }
+ spin_unlock_irq(&p->lock);
+ }
+
+ return false;
+}
+
+void rcu_pending_exit(struct rcu_pending *pending)
+{
+ int cpu;
+
+ if (!pending->p)
+ return;
+
+ while (rcu_pending_has_pending_or_armed(pending)) {
+ __rcu_barrier(pending->srcu);
+
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+ flush_work(&p->work);
+ }
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+ flush_work(&p->work);
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+
+ static_array_for_each(p->lists, i)
+ WARN_ON(i->head);
+ WARN_ON(p->objs.nr);
+ darray_exit(&p->objs);
+ }
+ free_percpu(pending->p);
+}
+
+/**
+ * rcu_pending_init - initialize an rcu_pending
+ *
+ * @pending: Object to init
+ * @srcu: May optionally be used with an srcu_struct; if NULL, uses normal
+ * RCU flavor
+ * @process: Callback function invoked on objects once their RCU barriers
+ * have completed; if NULL, kvfree() is used.
+ */
+int rcu_pending_init(struct rcu_pending *pending,
+ struct srcu_struct *srcu,
+ rcu_pending_process_fn process)
+{
+ pending->p = alloc_percpu(struct rcu_pending_pcpu);
+ if (!pending->p)
+ return -ENOMEM;
+
+ int cpu;
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+ p->parent = pending;
+ p->cpu = cpu;
+ spin_lock_init(&p->lock);
+ darray_init(&p->objs);
+ INIT_WORK(&p->work, rcu_pending_work);
+ }
+
+ pending->srcu = srcu;
+ pending->process = process;
+
+ return 0;
+}
diff --git a/libbcachefs/rcu_pending.h b/libbcachefs/rcu_pending.h
new file mode 100644
index 00000000..71a2f4dd
--- /dev/null
+++ b/libbcachefs/rcu_pending.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RCU_PENDING_H
+#define _LINUX_RCU_PENDING_H
+
+#include <linux/rcupdate.h>
+
+struct rcu_pending;
+typedef void (*rcu_pending_process_fn)(struct rcu_pending *, struct rcu_head *);
+
+struct rcu_pending_pcpu;
+
+struct rcu_pending {
+ struct rcu_pending_pcpu __percpu *p;
+ struct srcu_struct *srcu;
+ rcu_pending_process_fn process;
+};
+
+void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj);
+struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending);
+struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending);
+
+void rcu_pending_exit(struct rcu_pending *pending);
+int rcu_pending_init(struct rcu_pending *pending,
+ struct srcu_struct *srcu,
+ rcu_pending_process_fn process);
+
+#endif /* _LINUX_RCU_PENDING_H */
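For illustration, a minimal usage sketch of the rcu_pending interface declared above; struct my_obj, my_obj_process() and the my_* helpers are hypothetical names. Objects are handed to the process callback only once their (S)RCU grace period has elapsed; passing a NULL srcu_struct selects the normal RCU flavor, and rcu_pending_exit() flushes whatever is still pending:

	#include <linux/slab.h>

	#include "rcu_pending.h"

	struct my_obj {
		struct rcu_head	rcu;
		unsigned long	payload;
	};

	static struct rcu_pending my_pending;

	/* invoked after the grace period for each enqueued rcu_head: */
	static void my_obj_process(struct rcu_pending *pending, struct rcu_head *rcu)
	{
		kfree(container_of(rcu, struct my_obj, rcu));
	}

	static int my_init(void)
	{
		/* NULL srcu_struct: use the normal RCU flavor */
		return rcu_pending_init(&my_pending, NULL, my_obj_process);
	}

	static void my_free_deferred(struct my_obj *obj)
	{
		/* obj reaches my_obj_process() only after readers are done */
		rcu_pending_enqueue(&my_pending, &obj->rcu);
	}

	static void my_exit(void)
	{
		rcu_pending_exit(&my_pending);
	}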
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
index 4154b1e9..4adc74cd 100644
--- a/libbcachefs/rebalance.c
+++ b/libbcachefs/rebalance.c
@@ -1,300 +1,629 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "alloc.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
#include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "clock.h"
+#include "compress.h"
#include "disk_groups.h"
-#include "extents.h"
-#include "io.h"
+#include "errcode.h"
+#include "error.h"
+#include "inode.h"
+#include "io_write.h"
#include "move.h"
#include "rebalance.h"
+#include "subvolume.h"
#include "super-io.h"
+#include "trace.h"
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched/cputime.h>
-#include <trace/events/bcachefs.h>
-static inline bool rebalance_ptr_pred(struct bch_fs *c,
- const struct bch_extent_ptr *ptr,
- struct bch_extent_crc_unpacked crc,
- struct bch_io_opts *io_opts)
+/* bch_extent_rebalance: */
+
+static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
{
- if (io_opts->background_target &&
- !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) &&
- !ptr->cached)
- return true;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
- if (io_opts->background_compression &&
- crc.compression_type !=
- bch2_compression_opt_to_type[io_opts->background_compression])
- return true;
+ bkey_extent_entry_for_each(ptrs, entry)
+ if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
+ return &entry->rebalance;
- return false;
+ return NULL;
}
-void bch2_rebalance_add_key(struct bch_fs *c,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts)
+static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
+ struct bch_io_opts *opts,
+ struct bkey_s_c k,
+ struct bkey_ptrs_c ptrs)
{
- const struct bch_extent_ptr *ptr;
- struct bch_extent_crc_unpacked crc;
- struct bkey_s_c_extent e;
+ if (!opts->background_compression)
+ return 0;
- if (!bkey_extent_is_data(k.k))
- return;
+ unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned ptr_bit = 1;
+ unsigned rewrite_ptrs = 0;
- if (!io_opts->background_target &&
- !io_opts->background_compression)
- return;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
+ p.ptr.unwritten)
+ return 0;
- e = bkey_s_c_to_extent(k);
+ if (!p.ptr.cached && p.crc.compression_type != compression_type)
+ rewrite_ptrs |= ptr_bit;
+ ptr_bit <<= 1;
+ }
- extent_for_each_ptr_crc(e, ptr, crc)
- if (rebalance_ptr_pred(c, ptr, crc, io_opts)) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ return rewrite_ptrs;
+}
- if (atomic64_add_return(crc.compressed_size,
- &ca->rebalance_work) ==
- crc.compressed_size)
- rebalance_wakeup(c);
+static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
+ struct bch_io_opts *opts,
+ struct bkey_ptrs_c ptrs)
+{
+ if (!opts->background_target ||
+ !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target))
+ return 0;
+
+ unsigned ptr_bit = 1;
+ unsigned rewrite_ptrs = 0;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target))
+ rewrite_ptrs |= ptr_bit;
+ ptr_bit <<= 1;
+ }
+
+ return rewrite_ptrs;
+}
+
+static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
+ struct bch_io_opts *opts,
+ struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) |
+ bch2_bkey_ptrs_need_move(c, opts, ptrs);
+}
+
+u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
+{
+ const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k);
+ if (!opts)
+ return 0;
+
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ u64 sectors = 0;
+
+ if (opts->background_compression) {
+ unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
+ p.ptr.unwritten) {
+ sectors = 0;
+ goto incompressible;
+ }
+
+ if (!p.ptr.cached && p.crc.compression_type != compression_type)
+ sectors += p.crc.compressed_size;
}
+ }
+incompressible:
+ if (opts->background_target &&
+ bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) {
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
+ sectors += p.crc.compressed_size;
+ }
+
+ return sectors;
}
-void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
+static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts,
+ struct bkey_s_c k)
{
- if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
- sectors)
- rebalance_wakeup(c);
+ if (!bkey_extent_is_direct_data(k.k))
+ return 0;
+
+ const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);
+
+ if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) {
+ struct bch_extent_rebalance new = io_opts_to_rebalance_opts(opts);
+ return old == NULL || memcmp(old, &new, sizeof(new));
+ } else {
+ return old != NULL;
+ }
}
-static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
- enum bkey_type type,
- struct bkey_s_c_extent e,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
+int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts,
+ struct bkey_i *_k)
{
- const struct bch_extent_ptr *ptr;
- struct bch_extent_crc_unpacked crc;
+ if (!bkey_extent_is_direct_data(&_k->k))
+ return 0;
- /* Make sure we have room to add a new pointer: */
- if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
- BKEY_EXTENT_VAL_U64s_MAX)
- return DATA_SKIP;
+ struct bkey_s k = bkey_i_to_s(_k);
+ struct bch_extent_rebalance *old =
+ (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
- extent_for_each_ptr_crc(e, ptr, crc)
- if (rebalance_ptr_pred(c, ptr, crc, io_opts))
- goto found;
+ if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) {
+ if (!old) {
+ old = bkey_val_end(k);
+ k.k->u64s += sizeof(*old) / sizeof(u64);
+ }
- return DATA_SKIP;
-found:
- data_opts->target = io_opts->background_target;
- data_opts->btree_insert_flags = 0;
- return DATA_ADD_REPLICAS;
+ *old = io_opts_to_rebalance_opts(opts);
+ } else {
+ if (old)
+ extent_entry_drop(k, (union bch_extent_entry *) old);
+ }
+
+ return 0;
}
-struct rebalance_work {
- int dev_most_full_idx;
- unsigned dev_most_full_percent;
- u64 dev_most_full_work;
- u64 dev_most_full_capacity;
- u64 total_work;
+int bch2_get_update_rebalance_opts(struct btree_trans *trans,
+ struct bch_io_opts *io_opts,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ BUG_ON(iter->flags & BTREE_ITER_is_extents);
+ BUG_ON(iter->flags & BTREE_ITER_filter_snapshots);
+
+ const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v
+ ? bch2_bkey_rebalance_opts(k) : NULL;
+ if (r) {
+#define x(_name) \
+ if (r->_name##_from_inode) { \
+ io_opts->_name = r->_name; \
+ io_opts->_name##_from_inode = true; \
+ }
+ BCH_REBALANCE_OPTS()
+#undef x
+ }
+
+ if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k))
+ return 0;
+
+ struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8);
+ int ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(n, k);
+
+ /* On successful transaction commit, @k was invalidated: */
+
+ return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?:
+ bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0) ?:
+ -BCH_ERR_transaction_restart_nested;
+}
+
+#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1)
+
+static const char * const bch2_rebalance_state_strs[] = {
+#define x(t) #t,
+ BCH_REBALANCE_STATES()
+ NULL
+#undef x
};
-static void rebalance_work_accumulate(struct rebalance_work *w,
- u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
+int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum)
{
- unsigned percent_full;
- u64 work = dev_work + unknown_dev;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_cookie *cookie;
+ u64 v;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
+ SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+ BTREE_ITER_intent);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ v = k.k->type == KEY_TYPE_cookie
+ ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
+ : 0;
+
+ cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
+ ret = PTR_ERR_OR_ZERO(cookie);
+ if (ret)
+ goto err;
+
+ bkey_cookie_init(&cookie->k_i);
+ cookie->k.p = iter.pos;
+ cookie->v.cookie = cpu_to_le64(v + 1);
+
+ ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
- if (work < dev_work || work < unknown_dev)
- work = U64_MAX;
- work = min(work, capacity);
+int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
+{
+ int ret = bch2_trans_commit_do(c, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc,
+ bch2_set_rebalance_needs_scan_trans(trans, inum));
+ rebalance_wakeup(c);
+ return ret;
+}
+
+int bch2_set_fs_needs_rebalance(struct bch_fs *c)
+{
+ return bch2_set_rebalance_needs_scan(c, 0);
+}
+
+static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 v;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
+ SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+ BTREE_ITER_intent);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ v = k.k->type == KEY_TYPE_cookie
+ ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
+ : 0;
+
+ if (v == cookie)
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
+ struct btree_iter *work_iter)
+{
+ return !kthread_should_stop()
+ ? bch2_btree_iter_peek(work_iter)
+ : bkey_s_c_null;
+}
- percent_full = div_u64(work * 100, capacity);
+static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ if (!bch2_bkey_rebalance_opts(k))
+ return 0;
- if (percent_full >= w->dev_most_full_percent) {
- w->dev_most_full_idx = idx;
- w->dev_most_full_percent = percent_full;
- w->dev_most_full_work = work;
- w->dev_most_full_capacity = capacity;
+ struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
+ int ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ extent_entry_drop(bkey_i_to_s(n),
+ (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
+ return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+}
+
+static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
+ struct bpos work_pos,
+ struct btree_iter *extent_iter,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ struct bch_fs *c = trans->c;
+
+ bch2_trans_iter_exit(trans, extent_iter);
+ bch2_trans_iter_init(trans, extent_iter,
+ work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
+ work_pos,
+ BTREE_ITER_all_snapshots);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(extent_iter);
+ if (bkey_err(k))
+ return k;
+
+ int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k);
+ if (ret)
+ return bkey_s_c_err(ret);
+
+ memset(data_opts, 0, sizeof(*data_opts));
+ data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
+ data_opts->target = io_opts->background_target;
+ data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
+
+ if (!data_opts->rewrite_ptrs) {
+ /*
+ * Is a device we would want to write to offline? Did the devices in the
+ * target change?
+ *
+ * We'll now need a full scan before this extent is picked up
+ * again:
+ */
+ int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
+ if (ret)
+ return bkey_s_c_err(ret);
+ return bkey_s_c_null;
}
- if (w->total_work + dev_work >= w->total_work &&
- w->total_work + dev_work >= dev_work)
- w->total_work += dev_work;
+ if (trace_rebalance_extent_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs);
+ if (p) {
+ prt_str(&buf, "compression=");
+ bch2_compression_opt_to_text(&buf, io_opts->background_compression);
+ prt_str(&buf, " ");
+ bch2_prt_u64_base2(&buf, p);
+ prt_newline(&buf);
+ }
+
+ p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs);
+ if (p) {
+ prt_str(&buf, "move=");
+ bch2_target_to_text(&buf, c, io_opts->background_target);
+ prt_str(&buf, " ");
+ bch2_prt_u64_base2(&buf, p);
+ prt_newline(&buf);
+ }
+
+ trace_rebalance_extent(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ return k;
}
-static struct rebalance_work rebalance_work(struct bch_fs *c)
+noinline_for_stack
+static int do_rebalance_extent(struct moving_context *ctxt,
+ struct bpos work_pos,
+ struct btree_iter *extent_iter)
{
- struct bch_dev *ca;
- struct rebalance_work ret = { .dev_most_full_idx = -1 };
- u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
- unsigned i;
-
- for_each_online_member(ca, c, i)
- rebalance_work_accumulate(&ret,
- atomic64_read(&ca->rebalance_work),
- unknown_dev,
- bucket_to_sector(ca, ca->mi.nbuckets -
- ca->mi.first_bucket),
- i);
-
- rebalance_work_accumulate(&ret,
- unknown_dev, 0, c->capacity, -1);
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct bch_fs_rebalance *r = &trans->c->rebalance;
+ struct data_update_opts data_opts;
+ struct bch_io_opts io_opts;
+ struct bkey_s_c k;
+ struct bkey_buf sk;
+ int ret;
+
+ ctxt->stats = &r->work_stats;
+ r->state = BCH_REBALANCE_working;
+
+ bch2_bkey_buf_init(&sk);
+
+ ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
+ extent_iter, &io_opts, &data_opts));
+ if (ret || !k.k)
+ goto out;
+
+ atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
+
+ /*
+ * The iterator gets unlocked by __bch2_read_extent - need to
+ * save a copy of @k elsewhere:
+ */
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
+
+ ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
+ if (ret) {
+ if (bch2_err_matches(ret, ENOMEM)) {
+ /* memory allocation failure, wait for some IO to finish */
+ bch2_move_ctxt_wait_for_io(ctxt);
+ ret = -BCH_ERR_transaction_restart_nested;
+ }
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto out;
+ /* skip it and continue, XXX signal failure */
+ ret = 0;
+ }
+out:
+ bch2_bkey_buf_exit(&sk, c);
return ret;
}
-static void rebalance_work_reset(struct bch_fs *c)
+static bool rebalance_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
+ data_opts->target = io_opts->background_target;
+ data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
+ return data_opts->rewrite_ptrs != 0;
+}
+
+static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
{
- struct bch_dev *ca;
- unsigned i;
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs_rebalance *r = &trans->c->rebalance;
+ int ret;
+
+ bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
+ ctxt->stats = &r->scan_stats;
+
+ if (!inum) {
+ r->scan_start = BBPOS_MIN;
+ r->scan_end = BBPOS_MAX;
+ } else {
+ r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0));
+ r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
+ }
- for_each_online_member(ca, c, i)
- atomic64_set(&ca->rebalance_work, 0);
+ r->state = BCH_REBALANCE_scanning;
- atomic64_set(&c->rebalance.work_unknown_dev, 0);
+ ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
+ commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_clear_rebalance_needs_scan(trans, inum, cookie));
+
+ bch2_move_stats_exit(&r->scan_stats, trans->c);
+ return ret;
}
-static unsigned long curr_cputime(void)
+static void rebalance_wait(struct bch_fs *c)
{
- u64 utime, stime;
+ struct bch_fs_rebalance *r = &c->rebalance;
+ struct io_clock *clock = &c->io_clock[WRITE];
+ u64 now = atomic64_read(&clock->now);
+ u64 min_member_capacity = bch2_min_rw_member_capacity(c);
+
+ if (min_member_capacity == U64_MAX)
+ min_member_capacity = 128 * 2048;
+
+ r->wait_iotime_end = now + (min_member_capacity >> 6);
- task_cputime_adjusted(current, &utime, &stime);
- return nsecs_to_jiffies(utime + stime);
+ if (r->state != BCH_REBALANCE_waiting) {
+ r->wait_iotime_start = now;
+ r->wait_wallclock_start = ktime_get_real_ns();
+ r->state = BCH_REBALANCE_waiting;
+ }
+
+ bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
}
-static int bch2_rebalance_thread(void *arg)
+static int do_rebalance(struct moving_context *ctxt)
{
- struct bch_fs *c = arg;
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
struct bch_fs_rebalance *r = &c->rebalance;
- struct io_clock *clock = &c->io_clock[WRITE];
- struct rebalance_work w, p;
- unsigned long start, prev_start;
- unsigned long prev_run_time, prev_run_cputime;
- unsigned long cputime, prev_cputime;
- unsigned long io_start;
- long throttle;
+ struct btree_iter rebalance_work_iter, extent_iter = { NULL };
+ struct bkey_s_c k;
+ int ret = 0;
- set_freezable();
+ bch2_trans_begin(trans);
- io_start = atomic_long_read(&clock->now);
- p = rebalance_work(c);
- prev_start = jiffies;
- prev_cputime = curr_cputime();
+ bch2_move_stats_init(&r->work_stats, "rebalance_work");
+ bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
- while (!kthread_wait_freezable(r->enabled)) {
- start = jiffies;
- cputime = curr_cputime();
+ bch2_trans_iter_init(trans, &rebalance_work_iter,
+ BTREE_ID_rebalance_work, POS_MIN,
+ BTREE_ITER_all_snapshots);
+
+ while (!bch2_move_ratelimit(ctxt)) {
+ if (!c->opts.rebalance_enabled) {
+ bch2_moving_ctxt_flush_all(ctxt);
+ kthread_wait_freezable(c->opts.rebalance_enabled ||
+ kthread_should_stop());
+ }
- prev_run_time = start - prev_start;
- prev_run_cputime = cputime - prev_cputime;
+ if (kthread_should_stop())
+ break;
- w = rebalance_work(c);
- BUG_ON(!w.dev_most_full_capacity);
+ bch2_trans_begin(trans);
- if (!w.total_work) {
- r->state = REBALANCE_WAITING;
- kthread_wait_freezable(rebalance_work(c).total_work);
+ ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
- }
+ if (ret || !k.k)
+ break;
- /*
- * If there isn't much work to do, throttle cpu usage:
- */
- throttle = prev_run_cputime * 100 /
- max(1U, w.dev_most_full_percent) -
- prev_run_time;
-
- if (w.dev_most_full_percent < 20 && throttle > 0) {
- r->state = REBALANCE_THROTTLED;
- r->throttled_until_iotime = io_start +
- div_u64(w.dev_most_full_capacity *
- (20 - w.dev_most_full_percent),
- 50);
- r->throttled_until_cputime = start + throttle;
-
- bch2_kthread_io_clock_wait(clock,
- r->throttled_until_iotime,
- throttle);
+ ret = k.k->type == KEY_TYPE_cookie
+ ? do_rebalance_scan(ctxt, k.k->p.inode,
+ le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
+ : do_rebalance_extent(ctxt, k.k->p, &extent_iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
- }
+ if (ret)
+ break;
- /* minimum 1 mb/sec: */
- r->pd.rate.rate =
- max_t(u64, 1 << 11,
- r->pd.rate.rate *
- max(p.dev_most_full_percent, 1U) /
- max(w.dev_most_full_percent, 1U));
-
- io_start = atomic_long_read(&clock->now);
- p = w;
- prev_start = start;
- prev_cputime = cputime;
-
- r->state = REBALANCE_RUNNING;
- memset(&r->move_stats, 0, sizeof(r->move_stats));
- rebalance_work_reset(c);
-
- bch2_move_data(c,
- /* ratelimiting disabled for now */
- NULL, /* &r->pd.rate, */
- writepoint_ptr(&c->rebalance_write_point),
- POS_MIN, POS_MAX,
- rebalance_pred, NULL,
- &r->move_stats);
+ bch2_btree_iter_advance(&rebalance_work_iter);
}
- return 0;
+ bch2_trans_iter_exit(trans, &extent_iter);
+ bch2_trans_iter_exit(trans, &rebalance_work_iter);
+ bch2_move_stats_exit(&r->scan_stats, c);
+
+ if (!ret &&
+ !kthread_should_stop() &&
+ !atomic64_read(&r->work_stats.sectors_seen) &&
+ !atomic64_read(&r->scan_stats.sectors_seen)) {
+ bch2_moving_ctxt_flush_all(ctxt);
+ bch2_trans_unlock_long(trans);
+ rebalance_wait(c);
+ }
+
+ if (!bch2_err_matches(ret, EROFS))
+ bch_err_fn(c, ret);
+ return ret;
}
-ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf)
+static int bch2_rebalance_thread(void *arg)
{
- char *out = buf, *end = out + PAGE_SIZE;
+ struct bch_fs *c = arg;
struct bch_fs_rebalance *r = &c->rebalance;
- struct rebalance_work w = rebalance_work(c);
- char h1[21], h2[21];
+ struct moving_context ctxt;
+
+ set_freezable();
+
+ bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
+ writepoint_ptr(&c->rebalance_write_point),
+ true);
+
+ while (!kthread_should_stop() && !do_rebalance(&ctxt))
+ ;
+
+ bch2_moving_ctxt_exit(&ctxt);
- bch2_hprint(h1, w.dev_most_full_work << 9);
- bch2_hprint(h2, w.dev_most_full_capacity << 9);
- out += scnprintf(out, end - out,
- "fullest_dev (%i):\t%s/%s\n",
- w.dev_most_full_idx, h1, h2);
+ return 0;
+}
- bch2_hprint(h1, w.total_work << 9);
- bch2_hprint(h2, c->capacity << 9);
- out += scnprintf(out, end - out,
- "total work:\t\t%s/%s\n",
- h1, h2);
+void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct bch_fs_rebalance *r = &c->rebalance;
- out += scnprintf(out, end - out,
- "rate:\t\t\t%u\n",
- r->pd.rate.rate);
+ prt_str(out, bch2_rebalance_state_strs[r->state]);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
switch (r->state) {
- case REBALANCE_WAITING:
- out += scnprintf(out, end - out, "waiting\n");
+ case BCH_REBALANCE_waiting: {
+ u64 now = atomic64_read(&c->io_clock[WRITE].now);
+
+ prt_str(out, "io wait duration: ");
+ bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
+ prt_newline(out);
+
+ prt_str(out, "io wait remaining: ");
+ bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
+ prt_newline(out);
+
+ prt_str(out, "duration waited: ");
+ bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
+ prt_newline(out);
break;
- case REBALANCE_THROTTLED:
- bch2_hprint(h1,
- (r->throttled_until_iotime -
- atomic_long_read(&c->io_clock[WRITE].now)) << 9);
- out += scnprintf(out, end - out,
- "throttled for %lu sec or %s io\n",
- (r->throttled_until_cputime - jiffies) / HZ,
- h1);
+ }
+ case BCH_REBALANCE_working:
+ bch2_move_stats_to_text(out, &r->work_stats);
break;
- case REBALANCE_RUNNING:
- out += scnprintf(out, end - out, "running\n");
- out += scnprintf(out, end - out, "pos %llu:%llu\n",
- r->move_stats.iter.pos.inode,
- r->move_stats.iter.pos.offset);
+ case BCH_REBALANCE_scanning:
+ bch2_move_stats_to_text(out, &r->scan_stats);
break;
}
-
- return out - buf;
+ prt_newline(out);
+ printbuf_indent_sub(out, 2);
}
void bch2_rebalance_stop(struct bch_fs *c)
@@ -319,13 +648,19 @@ void bch2_rebalance_stop(struct bch_fs *c)
int bch2_rebalance_start(struct bch_fs *c)
{
struct task_struct *p;
+ int ret;
+
+ if (c->rebalance.thread)
+ return 0;
if (c->opts.nochanges)
return 0;
- p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
- if (IS_ERR(p))
- return PTR_ERR(p);
+ p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
+ ret = PTR_ERR_OR_ZERO(p);
+ bch_err_msg(c, ret, "creating rebalance thread");
+ if (ret)
+ return ret;
get_task_struct(p);
rcu_assign_pointer(c->rebalance.thread, p);
@@ -336,6 +671,4 @@ int bch2_rebalance_start(struct bch_fs *c)
void bch2_fs_rebalance_init(struct bch_fs *c)
{
bch2_pd_controller_init(&c->rebalance.pd);
-
- atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
}
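For illustration, the caller side of the new rebalance_work machinery above: a scan request is just a KEY_TYPE_cookie key at SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), which do_rebalance() routes to do_rebalance_scan(), while ordinary entries are handed to do_rebalance_extent(). A hypothetical sketch of requesting scans (the example_* wrappers are made-up names):

	/* flag one inode's extents for re-evaluation, e.g. after its io options changed: */
	static int example_rescan_inode(struct bch_fs *c, u64 inum)
	{
		return bch2_set_rebalance_needs_scan(c, inum);
	}

	/* inum 0 flags the whole filesystem: */
	static int example_rescan_fs(struct bch_fs *c)
	{
		return bch2_set_fs_needs_rebalance(c);
	}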
diff --git a/libbcachefs/rebalance.h b/libbcachefs/rebalance.h
index 2e6aa677..0a0821ab 100644
--- a/libbcachefs/rebalance.h
+++ b/libbcachefs/rebalance.h
@@ -1,8 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_REBALANCE_H
#define _BCACHEFS_REBALANCE_H
+#include "compress.h"
+#include "disk_groups.h"
#include "rebalance_types.h"
+u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
+int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *);
+int bch2_get_update_rebalance_opts(struct btree_trans *,
+ struct bch_io_opts *,
+ struct btree_iter *,
+ struct bkey_s_c);
+
+int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64);
+int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
+int bch2_set_fs_needs_rebalance(struct bch_fs *);
+
static inline void rebalance_wakeup(struct bch_fs *c)
{
struct task_struct *p;
@@ -14,11 +28,7 @@ static inline void rebalance_wakeup(struct bch_fs *c)
rcu_read_unlock();
}
-void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
- struct bch_io_opts *);
-void bch2_rebalance_add_work(struct bch_fs *, u64);
-
-ssize_t bch2_rebalance_work_show(struct bch_fs *, char *);
+void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *);
void bch2_rebalance_stop(struct bch_fs *);
int bch2_rebalance_start(struct bch_fs *);
diff --git a/libbcachefs/rebalance_format.h b/libbcachefs/rebalance_format.h
new file mode 100644
index 00000000..ff9a1342
--- /dev/null
+++ b/libbcachefs/rebalance_format.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REBALANCE_FORMAT_H
+#define _BCACHEFS_REBALANCE_FORMAT_H
+
+struct bch_extent_rebalance {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:6,
+ unused:3,
+
+ promote_target_from_inode:1,
+ erasure_code_from_inode:1,
+ data_checksum_from_inode:1,
+ background_compression_from_inode:1,
+ data_replicas_from_inode:1,
+ background_target_from_inode:1,
+
+ promote_target:16,
+ erasure_code:1,
+ data_checksum:4,
+ data_replicas:4,
+ background_compression:8, /* enum bch_compression_opt */
+ background_target:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 background_target:16,
+ background_compression:8,
+ data_replicas:4,
+ data_checksum:4,
+ erasure_code:1,
+ promote_target:16,
+
+ background_target_from_inode:1,
+ data_replicas_from_inode:1,
+ background_compression_from_inode:1,
+ data_checksum_from_inode:1,
+ erasure_code_from_inode:1,
+ promote_target_from_inode:1,
+
+ unused:3,
+ type:6;
+#endif
+};
+
+/* subset of BCH_INODE_OPTS */
+#define BCH_REBALANCE_OPTS() \
+ x(data_checksum) \
+ x(background_compression) \
+ x(data_replicas) \
+ x(promote_target) \
+ x(background_target) \
+ x(erasure_code)
+
+#endif /* _BCACHEFS_REBALANCE_FORMAT_H */
+
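For illustration, BCH_REBALANCE_OPTS() above is an x-macro: callers define x(_name), expand the list, then #undef x, as bch2_get_update_rebalance_opts() does earlier in this diff. For a single entry, x(background_target), that expansion produces roughly:

	if (r->background_target_from_inode) {
		io_opts->background_target		= r->background_target;
		io_opts->background_target_from_inode	= true;
	}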
diff --git a/libbcachefs/rebalance_types.h b/libbcachefs/rebalance_types.h
index aaf5b9ca..fe5098c1 100644
--- a/libbcachefs/rebalance_types.h
+++ b/libbcachefs/rebalance_types.h
@@ -1,26 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_REBALANCE_TYPES_H
#define _BCACHEFS_REBALANCE_TYPES_H
+#include "bbpos_types.h"
#include "move_types.h"
-enum rebalance_state {
- REBALANCE_WAITING,
- REBALANCE_THROTTLED,
- REBALANCE_RUNNING,
+#define BCH_REBALANCE_STATES() \
+ x(waiting) \
+ x(working) \
+ x(scanning)
+
+enum bch_rebalance_states {
+#define x(t) BCH_REBALANCE_##t,
+ BCH_REBALANCE_STATES()
+#undef x
};
struct bch_fs_rebalance {
- struct task_struct __rcu *thread;
+ struct task_struct __rcu *thread;
struct bch_pd_controller pd;
- atomic64_t work_unknown_dev;
+ enum bch_rebalance_states state;
+ u64 wait_iotime_start;
+ u64 wait_iotime_end;
+ u64 wait_wallclock_start;
- enum rebalance_state state;
- unsigned long throttled_until_iotime;
- unsigned long throttled_until_cputime;
- struct bch_move_stats move_stats;
+ struct bch_move_stats work_stats;
- unsigned enabled:1;
+ struct bbpos scan_start;
+ struct bbpos scan_end;
+ struct bch_move_stats scan_stats;
};
#endif /* _BCACHEFS_REBALANCE_TYPES_H */
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 0af136d6..a342744f 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -1,281 +1,1062 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "alloc.h"
-#include "btree_gc.h"
+#include "alloc_background.h"
+#include "bkey_buf.h"
+#include "btree_journal_iter.h"
+#include "btree_node_scan.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
+#include "buckets.h"
#include "dirent.h"
+#include "disk_accounting.h"
+#include "errcode.h"
#include "error.h"
-#include "fsck.h"
+#include "fs-common.h"
#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+#include "logged_ops.h"
+#include "move.h"
#include "quota.h"
+#include "rebalance.h"
#include "recovery.h"
+#include "recovery_passes.h"
+#include "replicas.h"
+#include "sb-clean.h"
+#include "sb-downgrade.h"
+#include "snapshot.h"
#include "super-io.h"
+#include <linux/sort.h>
#include <linux/stat.h>
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-struct bkey_i *btree_root_find(struct bch_fs *c,
- struct bch_sb_field_clean *clean,
- struct jset *j,
- enum btree_id id, unsigned *level)
+int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
{
- struct bkey_i *k;
- struct jset_entry *entry, *start, *end;
+ u64 b = BIT_ULL(btree);
+ int ret = 0;
- if (clean) {
- start = clean->start;
- end = vstruct_end(&clean->field);
- } else {
- start = j->start;
- end = vstruct_last(j);
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ if (!(c->sb.btrees_lost_data & b)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_btree_id_to_text(&buf, btree);
+ bch_err(c, "flagging btree %s lost data", buf.buf);
+ printbuf_exit(&buf);
+ ext->btrees_lost_data |= cpu_to_le64(b);
}
- for (entry = start; entry < end; entry = vstruct_next(entry))
- if (entry->type == BCH_JSET_ENTRY_btree_root &&
- entry->btree_id == id)
- goto found;
+ /* Once we have runtime self healing for topology errors we won't need this: */
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret;
+
+ /* Btree node accounting will be off: */
+ __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret;
- return NULL;
-found:
- if (!entry->u64s)
- return ERR_PTR(-EINVAL);
+#ifdef CONFIG_BCACHEFS_DEBUG
+ /*
+ * These are much more minor, and don't need to be corrected right away,
+ * but in debug mode we want the next fsck run to be clean:
+ */
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_lrus) ?: ret;
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret;
+#endif
+
+ switch (btree) {
+ case BTREE_ID_alloc:
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
+ goto out;
+ case BTREE_ID_backpointers:
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret;
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret;
+ goto out;
+ case BTREE_ID_need_discard:
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+ goto out;
+ case BTREE_ID_freespace:
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+ goto out;
+ case BTREE_ID_bucket_gens:
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+ goto out;
+ case BTREE_ID_lru:
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+ goto out;
+ case BTREE_ID_accounting:
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret;
+ goto out;
+ default:
+ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret;
+ goto out;
+ }
+out:
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
- k = entry->start;
- *level = entry->level;
- return k;
+ return ret;
}
-static int verify_superblock_clean(struct bch_fs *c,
- struct bch_sb_field_clean *clean,
- struct jset *j)
+/* for -o reconstruct_alloc: */
+static void bch2_reconstruct_alloc(struct bch_fs *c)
{
- unsigned i;
- int ret = 0;
+ bch2_journal_log_msg(c, "dropping alloc info");
+ bch_info(c, "dropping and reconstructing all alloc info");
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required);
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required);
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required);
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required);
+
+ __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent);
+
+ __set_bit_le64(BCH_FSCK_ERR_dev_usage_buckets_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_dev_usage_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_dev_usage_fragmented_wrong, ext->errors_silent);
+
+ __set_bit_le64(BCH_FSCK_ERR_fs_usage_btree_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_fs_usage_cached_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent);
+
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_to_missing_lru_entry, ext->errors_silent);
+
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+
+ c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ bch2_shoot_down_journal_keys(c, BTREE_ID_alloc,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ bch2_shoot_down_journal_keys(c, BTREE_ID_freespace,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+}
+
+/*
+ * Btree node pointers have a field to stash a pointer to the in-memory btree
+ * node; we need to zero out this field when reading in btree nodes, or when
+ * reading in keys from the journal:
+ */
+static void zero_out_btree_mem_ptr(struct journal_keys *keys)
+{
+ darray_for_each(*keys, i)
+ if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
+ bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
+}
+
+/* journal replay: */
+
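+/*
+ * Advance the journal replay cursor to @seq, releasing our journal pins on
+ * all entries before it:
+ */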
+static void replay_now_at(struct journal *j, u64 seq)
+{
+ BUG_ON(seq < j->replay_journal_seq);
+
+ seq = min(seq, j->replay_journal_seq_end);
+
+ while (j->replay_journal_seq < seq)
+ bch2_journal_pin_put(j, j->replay_journal_seq++);
+}
+
+static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
+ struct journal_key *k)
+{
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+ BTREE_MAX_DEPTH, k->level,
+ BTREE_ITER_intent);
+ int ret = bch2_btree_iter_traverse(&iter);
+ if (ret)
+ goto out;
+
+ struct bkey u;
+ struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u);
+
+ /* Has this delta already been applied to the btree? */
+ if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) {
+ ret = 0;
+ goto out;
+ }
+
+ struct bkey_i *new = k->k;
+ if (old.k->type == KEY_TYPE_accounting) {
+ new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k));
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto out;
+
+ bch2_accounting_accumulate(bkey_i_to_accounting(new),
+ bkey_s_c_to_accounting(old));
+ }
- if (!clean || !j)
+ trans->journal_res.seq = k->journal_seq;
+
+ ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int bch2_journal_replay_key(struct btree_trans *trans,
+ struct journal_key *k)
+{
+ struct btree_iter iter;
+ unsigned iter_flags =
+ BTREE_ITER_intent|
+ BTREE_ITER_not_extents;
+ unsigned update_flags = BTREE_TRIGGER_norun;
+ int ret;
+
+ if (k->overwritten)
return 0;
- if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
- "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
- le64_to_cpu(clean->journal_seq),
- le64_to_cpu(j->seq)))
- bch2_fs_mark_clean(c, false);
+ trans->journal_res.seq = k->journal_seq;
+
+ /*
+ * BTREE_UPDATE_key_cache_reclaim disables key cache lookup/update to
+ * keep the key cache coherent with the underlying btree. Nothing
+ * besides the allocator is doing updates yet so we don't need key cache
+ * coherency for non-alloc btrees, and key cache fills for snapshots
+ * btrees use BTREE_ITER_filter_snapshots, which isn't available until
+ * the snapshots recovery pass runs.
+ */
+ if (!k->level && k->btree_id == BTREE_ID_alloc)
+ iter_flags |= BTREE_ITER_cached;
+ else
+ update_flags |= BTREE_UPDATE_key_cache_reclaim;
+
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+ BTREE_MAX_DEPTH, k->level,
+ iter_flags);
+ ret = bch2_btree_iter_traverse(&iter);
+ if (ret)
+ goto out;
+
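+	/*
+	 * The btree is currently shallower than the level this key was written
+	 * at: increase the btree depth, then restart and retry the update:
+	 */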
+ struct btree_path *path = btree_iter_path(trans, &iter);
+ if (unlikely(!btree_path_node(path, k->level))) {
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+ BTREE_MAX_DEPTH, 0, iter_flags);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_increase_depth(trans, iter.path, 0) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto out;
+ }
+
+ /* Must be checked with btree locked: */
+ if (k->overwritten)
+ goto out;
+
+ if (k->k->k.type == KEY_TYPE_accounting) {
+ ret = bch2_trans_update_buffered(trans, BTREE_ID_accounting, k->k);
+ goto out;
+ }
+
+ ret = bch2_trans_update(trans, &iter, k->k, update_flags);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int journal_sort_seq_cmp(const void *_l, const void *_r)
+{
+ const struct journal_key *l = *((const struct journal_key **)_l);
+ const struct journal_key *r = *((const struct journal_key **)_r);
+
+ /*
+	 * Map 0 to U64_MAX, so that keys with journal_seq == 0 come last
+ *
+ * journal_seq == 0 means that the key comes from early repair, and
+ * should be inserted last so as to avoid overflowing the journal
+ */
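+	/* (relies on unsigned wraparound: (u64) 0 - 1 == U64_MAX) */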
+ return cmp_int(l->journal_seq - 1, r->journal_seq - 1);
+}
+
+int bch2_journal_replay(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ DARRAY(struct journal_key *) keys_sorted = { 0 };
+ struct journal *j = &c->journal;
+ u64 start_seq = c->journal_replay_seq_start;
+	u64 end_seq = c->journal_replay_seq_end;
+ struct btree_trans *trans = NULL;
+ bool immediate_flush = false;
+ int ret = 0;
- mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
- "superblock read clock doesn't match journal after clean shutdown");
- mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
- "superblock read clock doesn't match journal after clean shutdown");
+ if (keys->nr) {
+ ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
+ keys->nr, start_seq, end_seq);
+ if (ret)
+ goto err;
+ }
- for (i = 0; i < BTREE_ID_NR; i++) {
- struct bkey_i *k1, *k2;
- unsigned l1 = 0, l2 = 0;
+ BUG_ON(!atomic_read(&keys->ref));
- k1 = btree_root_find(c, clean, NULL, i, &l1);
- k2 = btree_root_find(c, NULL, j, i, &l2);
+ move_gap(keys, keys->nr);
+ trans = bch2_trans_get(c);
- if (!k1 && !k2)
+ /*
+ * Replay accounting keys first: we can't allow the write buffer to
+ * flush accounting keys until we're done
+ */
+ darray_for_each(*keys, k) {
+ if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated))
continue;
- mustfix_fsck_err_on(!k1 || !k2 ||
- IS_ERR(k1) ||
- IS_ERR(k2) ||
- k1->k.u64s != k2->k.u64s ||
- memcmp(k1, k2, bkey_bytes(k1)) ||
- l1 != l2, c,
- "superblock btree root doesn't match journal after clean shutdown");
+ cond_resched();
+
+ ret = commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_journal_reclaim|
+ BCH_TRANS_COMMIT_skip_accounting_apply|
+ BCH_TRANS_COMMIT_no_journal_res|
+ BCH_WATERMARK_reclaim,
+ bch2_journal_replay_accounting_key(trans, k));
+ if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret)))
+ goto err;
+
+ k->overwritten = true;
}
-fsck_err:
+
+ set_bit(BCH_FS_accounting_replay_done, &c->flags);
+
+ /*
+ * First, attempt to replay keys in sorted order. This is more
+ * efficient - better locality of btree access - but some might fail if
+ * that would cause a journal deadlock.
+ */
+ darray_for_each(*keys, k) {
+ cond_resched();
+
+ /*
+ * k->allocated means the key wasn't read in from the journal,
+ * rather it was from early repair code
+ */
+ if (k->allocated)
+ immediate_flush = true;
+
+ /* Skip fastpath if we're low on space in the journal */
+ ret = c->journal.watermark ? -1 :
+ commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_journal_reclaim|
+ BCH_TRANS_COMMIT_skip_accounting_apply|
+ (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
+ bch2_journal_replay_key(trans, k));
+ BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting);
+ if (ret) {
+ ret = darray_push(&keys_sorted, k);
+ if (ret)
+ goto err;
+ }
+ }
+
+ bch2_trans_unlock_long(trans);
+ /*
+ * Now, replay any remaining keys in the order in which they appear in
+ * the journal, unpinning those journal entries as we go:
+ */
+ sort(keys_sorted.data, keys_sorted.nr,
+ sizeof(keys_sorted.data[0]),
+ journal_sort_seq_cmp, NULL);
+
+ darray_for_each(keys_sorted, kp) {
+ cond_resched();
+
+ struct journal_key *k = *kp;
+
+ if (k->journal_seq)
+ replay_now_at(j, k->journal_seq);
+ else
+ replay_now_at(j, j->replay_journal_seq_end);
+
+ ret = commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_skip_accounting_apply|
+ (!k->allocated
+ ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
+ : 0),
+ bch2_journal_replay_key(trans, k));
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+ bch2_btree_id_level_to_text(&buf, k->btree_id, k->level);
+ bch_err_msg(c, ret, "while replaying key at %s:", buf.buf);
+ printbuf_exit(&buf);
+ goto err;
+ }
+
+ BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten);
+ }
+
+ /*
+ * We need to put our btree_trans before calling flush_all_pins(), since
+ * that will use a btree_trans internally
+ */
+ bch2_trans_put(trans);
+ trans = NULL;
+
+ if (!c->opts.retain_recovery_info &&
+ c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay)
+ bch2_journal_keys_put_initial(c);
+
+ replay_now_at(j, j->replay_journal_seq_end);
+ j->replay_journal_seq = 0;
+
+ bch2_journal_set_replay_done(j);
+
+ /* if we did any repair, flush it immediately */
+ if (immediate_flush) {
+ bch2_journal_flush_all_pins(&c->journal);
+ ret = bch2_journal_meta(&c->journal);
+ }
+
+ if (keys->nr)
+ bch2_journal_log_msg(c, "journal replay finished");
+err:
+ if (trans)
+ bch2_trans_put(trans);
+ darray_exit(&keys_sorted);
+ bch_err_fn(c, ret);
return ret;
}
-static bool journal_empty(struct list_head *journal)
+/* journal replay early: */
+
+static int journal_replay_entry_early(struct bch_fs *c,
+ struct jset_entry *entry)
{
- struct journal_replay *i;
- struct jset_entry *entry;
+ int ret = 0;
- if (list_empty(journal))
- return true;
+ switch (entry->type) {
+ case BCH_JSET_ENTRY_btree_root: {
- i = list_last_entry(journal, struct journal_replay, list);
+ if (unlikely(!entry->u64s))
+ return 0;
- if (i->j.last_seq != i->j.seq)
- return false;
+ if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX,
+ c, invalid_btree_id,
+ "invalid btree id %u (max %u)",
+ entry->btree_id, BTREE_ID_NR_MAX))
+ return 0;
- list_for_each_entry(i, journal, list) {
- vstruct_for_each(&i->j, entry) {
- if (entry->type == BCH_JSET_ENTRY_btree_root)
- continue;
+ while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) {
+ ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL });
+ if (ret)
+ return ret;
+ }
+
+ struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);
+
+ r->level = entry->level;
+ bkey_copy(&r->key, (struct bkey_i *) entry->start);
+ r->error = 0;
+ r->alive = true;
+ break;
+ }
+ case BCH_JSET_ENTRY_usage: {
+ struct jset_entry_usage *u =
+ container_of(entry, struct jset_entry_usage, entry);
+
+ switch (entry->btree_id) {
+ case BCH_FS_USAGE_key_version:
+ atomic64_set(&c->key_version, le64_to_cpu(u->v));
+ break;
+ }
+ break;
+ }
+ case BCH_JSET_ENTRY_blacklist: {
+ struct jset_entry_blacklist *bl_entry =
+ container_of(entry, struct jset_entry_blacklist, entry);
+
+ ret = bch2_journal_seq_blacklist_add(c,
+ le64_to_cpu(bl_entry->seq),
+ le64_to_cpu(bl_entry->seq) + 1);
+ break;
+ }
+ case BCH_JSET_ENTRY_blacklist_v2: {
+ struct jset_entry_blacklist_v2 *bl_entry =
+ container_of(entry, struct jset_entry_blacklist_v2, entry);
+
+ ret = bch2_journal_seq_blacklist_add(c,
+ le64_to_cpu(bl_entry->start),
+ le64_to_cpu(bl_entry->end) + 1);
+ break;
+ }
+ case BCH_JSET_ENTRY_clock: {
+ struct jset_entry_clock *clock =
+ container_of(entry, struct jset_entry_clock, entry);
+
+ atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
+ }
+ }
+fsck_err:
+ return ret;
+}
- if (entry->type == BCH_JSET_ENTRY_btree_keys &&
- !entry->u64s)
+static int journal_replay_early(struct bch_fs *c,
+ struct bch_sb_field_clean *clean)
+{
+ if (clean) {
+ for (struct jset_entry *entry = clean->start;
+ entry != vstruct_end(&clean->field);
+ entry = vstruct_next(entry)) {
+ int ret = journal_replay_entry_early(c, entry);
+ if (ret)
+ return ret;
+ }
+ } else {
+ struct genradix_iter iter;
+ struct journal_replay *i, **_i;
+
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (journal_replay_ignore(i))
continue;
- return false;
+
+ vstruct_for_each(&i->j, entry) {
+ int ret = journal_replay_entry_early(c, entry);
+ if (ret)
+ return ret;
+ }
}
}
- return true;
+ return 0;
}
-int bch2_fs_recovery(struct bch_fs *c)
+/* sb clean section: */
+
+static int read_btree_roots(struct bch_fs *c)
{
- const char *err = "cannot allocate memory";
- struct bch_sb_field_clean *clean = NULL, *sb_clean = NULL;
- LIST_HEAD(journal);
- struct jset *j = NULL;
- unsigned i;
- int ret;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
- mutex_lock(&c->sb_lock);
- if (!bch2_sb_get_replicas(c->disk_sb.sb)) {
- bch_info(c, "building replicas info");
- set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
+ struct btree_root *r = bch2_btree_id_root(c, i);
+
+ if (!r->alive)
+ continue;
+
+ if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc)
+ continue;
+
+ printbuf_reset(&buf);
+ bch2_btree_id_level_to_text(&buf, i, r->level);
+
+ if (mustfix_fsck_err_on((ret = r->error),
+ c, btree_root_bkey_invalid,
+ "invalid btree root %s",
+ buf.buf) ||
+ mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)),
+ c, btree_root_read_error,
+ "error reading btree root %s: %s",
+ buf.buf, bch2_err_str(ret))) {
+ if (btree_id_is_alloc(i))
+ r->error = 0;
+
+ ret = bch2_btree_lost_data(c, i);
+ BUG_ON(ret);
+ }
}
- if (c->sb.clean)
- sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
- if (sb_clean) {
- clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
- GFP_KERNEL);
- if (!clean) {
- ret = -ENOMEM;
- mutex_unlock(&c->sb_lock);
- goto err;
+ for (unsigned i = 0; i < BTREE_ID_NR; i++) {
+ struct btree_root *r = bch2_btree_id_root(c, i);
+
+ if (!r->b && !r->error) {
+ r->alive = false;
+ r->level = 0;
+ bch2_btree_root_alloc_fake(c, i, 0);
}
}
- mutex_unlock(&c->sb_lock);
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
- if (clean)
- bch_info(c, "recovering from clean shutdown, journal seq %llu",
- le64_to_cpu(clean->journal_seq));
+static bool check_version_upgrade(struct bch_fs *c)
+{
+ unsigned latest_version = bcachefs_metadata_version_current;
+ unsigned latest_compatible = min(latest_version,
+ bch2_latest_compatible_version(c->sb.version));
+ unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
+ unsigned new_version = 0;
+
+ if (old_version < bcachefs_metadata_required_upgrade_below) {
+ if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible ||
+ latest_compatible < bcachefs_metadata_required_upgrade_below)
+ new_version = latest_version;
+ else
+ new_version = latest_compatible;
+ } else {
+ switch (c->opts.version_upgrade) {
+ case BCH_VERSION_UPGRADE_compatible:
+ new_version = latest_compatible;
+ break;
+ case BCH_VERSION_UPGRADE_incompatible:
+ new_version = latest_version;
+ break;
+ case BCH_VERSION_UPGRADE_none:
+ new_version = min(old_version, latest_version);
+ break;
+ }
+ }
+
+ if (new_version > old_version) {
+ struct printbuf buf = PRINTBUF;
+
+ if (old_version < bcachefs_metadata_required_upgrade_below)
+ prt_str(&buf, "Version upgrade required:\n");
+
+ if (old_version != c->sb.version) {
+ prt_str(&buf, "Version upgrade from ");
+ bch2_version_to_text(&buf, c->sb.version_upgrade_complete);
+ prt_str(&buf, " to ");
+ bch2_version_to_text(&buf, c->sb.version);
+ prt_str(&buf, " incomplete\n");
+ }
- if (!clean || !c->opts.nofsck) {
- ret = bch2_journal_read(c, &journal);
+ prt_printf(&buf, "Doing %s version upgrade from ",
+ BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version)
+ ? "incompatible" : "compatible");
+ bch2_version_to_text(&buf, old_version);
+ prt_str(&buf, " to ");
+ bch2_version_to_text(&buf, new_version);
+ prt_newline(&buf);
+
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ __le64 passes = ext->recovery_passes_required[0];
+ bch2_sb_set_upgrade(c, old_version, new_version);
+ passes = ext->recovery_passes_required[0] & ~passes;
+
+ if (passes) {
+ prt_str(&buf, " running recovery passes: ");
+ prt_bitflags(&buf, bch2_recovery_passes,
+ bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
+ }
+
+ bch_info(c, "%s", buf.buf);
+
+ bch2_sb_upgrade(c, new_version);
+
+ printbuf_exit(&buf);
+ return true;
+ }
+
+ return false;
+}
+
+int bch2_fs_recovery(struct bch_fs *c)
+{
+ struct bch_sb_field_clean *clean = NULL;
+ struct jset *last_journal_entry = NULL;
+ u64 last_seq = 0, blacklist_seq, journal_seq;
+ int ret = 0;
+
+ if (c->sb.clean) {
+ clean = bch2_read_superblock_clean(c);
+ ret = PTR_ERR_OR_ZERO(clean);
if (ret)
goto err;
- j = &list_entry(journal.prev, struct journal_replay, list)->j;
+ bch_info(c, "recovering from clean shutdown, journal seq %llu",
+ le64_to_cpu(clean->journal_seq));
} else {
- ret = bch2_journal_set_seq(c,
- le64_to_cpu(clean->journal_seq),
- le64_to_cpu(clean->journal_seq));
- BUG_ON(ret);
+ bch_info(c, "recovering from unclean shutdown");
}
- ret = verify_superblock_clean(c, clean, j);
- if (ret)
+ if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
+ bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
+ ret = -EINVAL;
goto err;
+ }
- fsck_err_on(clean && !journal_empty(&journal), c,
- "filesystem marked clean but journal not empty");
+ if (!c->sb.clean &&
+ !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
+ bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
+ ret = -EINVAL;
+ goto err;
+ }
- if (clean) {
- c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
- c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
- } else {
- c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock);
- c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock);
+ if (c->opts.norecovery) {
+ c->opts.recovery_pass_last = c->opts.recovery_pass_last
+ ? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read)
+ : BCH_RECOVERY_PASS_snapshots_read;
+ c->opts.nochanges = true;
+ c->opts.read_only = true;
}
- for (i = 0; i < BTREE_ID_NR; i++) {
- unsigned level;
- struct bkey_i *k;
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ bool write_sb = false;
- k = btree_root_find(c, clean, j, i, &level);
- if (!k)
- continue;
+ if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
+ ext->recovery_passes_required[0] |=
+ cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
+ write_sb = true;
+ }
- err = "invalid btree root pointer";
- if (IS_ERR(k))
+ u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ if (sb_passes) {
+ struct printbuf buf = PRINTBUF;
+ prt_str(&buf, "superblock requires following recovery passes to be run:\n ");
+ prt_bitflags(&buf, bch2_recovery_passes, sb_passes);
+ bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ if (bch2_check_version_downgrade(c)) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "Version downgrade required:");
+
+ __le64 passes = ext->recovery_passes_required[0];
+ bch2_sb_set_downgrade(c,
+ BCH_VERSION_MINOR(bcachefs_metadata_version_current),
+ BCH_VERSION_MINOR(c->sb.version));
+ passes = ext->recovery_passes_required[0] & ~passes;
+ if (passes) {
+ prt_str(&buf, "\n running recovery passes: ");
+ prt_bitflags(&buf, bch2_recovery_passes,
+ bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
+ }
+
+ bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ write_sb = true;
+ }
+
+ if (check_version_upgrade(c))
+ write_sb = true;
+
+ c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+
+ if (write_sb)
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ if (c->opts.fsck)
+ set_bit(BCH_FS_fsck_running, &c->flags);
+ if (c->sb.clean)
+ set_bit(BCH_FS_clean_recovery, &c->flags);
+ set_bit(BCH_FS_recovery_running, &c->flags);
+
+ ret = bch2_blacklist_table_initialize(c);
+ if (ret) {
+ bch_err(c, "error initializing blacklist table");
+ goto err;
+ }
+
+ bch2_journal_pos_from_member_info_resume(c);
+
+ if (!c->sb.clean || c->opts.retain_recovery_info) {
+ struct genradix_iter iter;
+ struct journal_replay **i;
+
+ bch_verbose(c, "starting journal read");
+ ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq);
+ if (ret)
goto err;
- err = "error reading btree root";
- if (bch2_btree_root_read(c, i, k, level)) {
- if (i != BTREE_ID_ALLOC)
+ /*
+ * note: cmd_list_journal needs the blacklist table fully up to date so
+ * it can asterisk ignored journal entries:
+ */
+ if (c->opts.read_journal_only)
+ goto out;
+
+ genradix_for_each_reverse(&c->journal_entries, iter, i)
+ if (!journal_replay_ignore(*i)) {
+ last_journal_entry = &(*i)->j;
+ break;
+ }
+
+ if (mustfix_fsck_err_on(c->sb.clean &&
+ last_journal_entry &&
+ !journal_entry_empty(last_journal_entry), c,
+ clean_but_journal_not_empty,
+ "filesystem marked clean but journal not empty")) {
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+ c->sb.clean = false;
+ }
+
+ if (!last_journal_entry) {
+ fsck_err_on(!c->sb.clean, c,
+ dirty_but_no_journal_entries,
+ "no journal entries found");
+ if (clean)
+ goto use_clean;
+
+ genradix_for_each_reverse(&c->journal_entries, iter, i)
+ if (*i) {
+ last_journal_entry = &(*i)->j;
+ (*i)->ignore_blacklisted = false;
+				(*i)->ignore_not_dirty = false;
+ /*
+ * This was probably a NO_FLUSH entry,
+ * so last_seq was garbage - but we know
+ * we're only using a single journal
+ * entry, set it here:
+ */
+ (*i)->j.last_seq = (*i)->j.seq;
+ break;
+ }
+ }
+
+ ret = bch2_journal_keys_sort(c);
+ if (ret)
+ goto err;
+
+ if (c->sb.clean && last_journal_entry) {
+ ret = bch2_verify_superblock_clean(c, &clean,
+ last_journal_entry);
+ if (ret)
goto err;
+ }
+ } else {
+use_clean:
+ if (!clean) {
+ bch_err(c, "no superblock clean section found");
+ ret = -BCH_ERR_fsck_repair_impossible;
+ goto err;
- mustfix_fsck_err(c, "error reading btree root");
}
+ blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
}
- for (i = 0; i < BTREE_ID_NR; i++)
- if (!c->btree_roots[i].b)
- bch2_btree_root_alloc(c, i);
+ c->journal_replay_seq_start = last_seq;
+ c->journal_replay_seq_end = blacklist_seq - 1;
- err = "error reading allocation information";
- ret = bch2_alloc_read(c, &journal);
- if (ret)
- goto err;
+ if (c->opts.reconstruct_alloc)
+ bch2_reconstruct_alloc(c);
- set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
+ zero_out_btree_mem_ptr(&c->journal_keys);
- bch_verbose(c, "starting mark and sweep:");
- err = "error in recovery";
- ret = bch2_initial_gc(c, &journal);
+ ret = journal_replay_early(c, clean);
if (ret)
goto err;
- bch_verbose(c, "mark and sweep done");
-
- if (c->opts.noreplay)
- goto out;
/*
- * Mark dirty before journal replay, fsck:
- * XXX: after a clean shutdown, this could be done lazily only when fsck
- * finds an error
+	 * After an unclean shutdown, skip the next few journal sequence
+ * numbers as they may have been referenced by btree writes that
+ * happened before their corresponding journal writes - those btree
+ * writes need to be ignored, by skipping and blacklisting the next few
+ * journal sequence numbers:
*/
- bch2_fs_mark_clean(c, false);
+ if (!c->sb.clean)
+ journal_seq += 8;
+
+ if (blacklist_seq != journal_seq) {
+ ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
+ blacklist_seq, journal_seq) ?:
+ bch2_journal_seq_blacklist_add(c,
+ blacklist_seq, journal_seq);
+ if (ret) {
+ bch_err_msg(c, ret, "error creating new journal seq blacklist entry");
+ goto err;
+ }
+ }
+
+ ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu",
+ journal_seq, last_seq, blacklist_seq - 1) ?:
+ bch2_fs_journal_start(&c->journal, journal_seq);
+ if (ret)
+ goto err;
/*
- * bch2_fs_journal_start() can't happen sooner, or btree_gc_finish()
- * will give spurious errors about oldest_gen > bucket_gen -
- * this is a hack but oh well.
+	 * Skip past versions that might have been used (as nonces),
+ * but hadn't had their pointers written:
*/
- bch2_fs_journal_start(&c->journal);
+ if (c->sb.encryption_type && !c->sb.clean)
+ atomic64_add(1 << 16, &c->key_version);
- err = "error starting allocator";
- ret = bch2_fs_allocator_start(c);
+ ret = read_btree_roots(c);
if (ret)
goto err;
- bch_verbose(c, "starting journal replay:");
- err = "journal replay failed";
- ret = bch2_journal_replay(c, &journal);
- if (ret)
- goto err;
- bch_verbose(c, "journal replay done");
+ set_bit(BCH_FS_btree_running, &c->flags);
- if (c->opts.norecovery)
- goto out;
+ ret = bch2_sb_set_upgrade_extra(c);
- err = "error in fsck";
- ret = bch2_fsck(c);
+ ret = bch2_run_recovery_passes(c);
if (ret)
goto err;
- if (!test_bit(BCH_FS_FSCK_UNFIXED_ERRORS, &c->flags)) {
- mutex_lock(&c->sb_lock);
- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;
- mutex_unlock(&c->sb_lock);
+ /*
+ * Normally set by the appropriate recovery pass: when cleared, this
+ * indicates we're in early recovery and btree updates should be done by
+	 * being applied to the journal replay keys. _Must_ be set before
+ * multithreaded use:
+ */
+ set_bit(BCH_FS_may_go_rw, &c->flags);
+ clear_bit(BCH_FS_fsck_running, &c->flags);
+ clear_bit(BCH_FS_recovery_running, &c->flags);
+
+ /* in case we don't run journal replay, i.e. norecovery mode */
+ set_bit(BCH_FS_accounting_replay_done, &c->flags);
+
+ bch2_async_btree_node_rewrites_flush(c);
+
+ /* fsync if we fixed errors */
+ if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
+ bch2_journal_flush_all_pins(&c->journal);
+ bch2_journal_meta(&c->journal);
+ }
+
+ /* If we fixed errors, verify that fs is actually clean now: */
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+ test_bit(BCH_FS_errors_fixed, &c->flags) &&
+ !test_bit(BCH_FS_errors_not_fixed, &c->flags) &&
+ !test_bit(BCH_FS_error, &c->flags)) {
+ bch2_flush_fsck_errs(c);
+
+ bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
+ clear_bit(BCH_FS_errors_fixed, &c->flags);
+
+ c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
+
+ ret = bch2_run_recovery_passes(c);
+ if (ret)
+ goto err;
+
+ if (test_bit(BCH_FS_errors_fixed, &c->flags) ||
+ test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
+ bch_err(c, "Second fsck run was not clean");
+ set_bit(BCH_FS_errors_not_fixed, &c->flags);
+ }
+
+ set_bit(BCH_FS_errors_fixed, &c->flags);
}
if (enabled_qtypes(c)) {
- bch_verbose(c, "reading quotas:");
+ bch_verbose(c, "reading quotas");
ret = bch2_fs_quota_read(c);
if (ret)
goto err;
bch_verbose(c, "quotas done");
}
+ mutex_lock(&c->sb_lock);
+ ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ write_sb = false;
+
+ if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) {
+ SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version));
+ write_sb = true;
+ }
+
+ if (!test_bit(BCH_FS_error, &c->flags) &&
+ !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) {
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
+ write_sb = true;
+ }
+
+ if (!test_bit(BCH_FS_error, &c->flags) &&
+ !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) {
+ memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
+ write_sb = true;
+ }
+
+ if (c->opts.fsck &&
+ !test_bit(BCH_FS_error, &c->flags) &&
+ c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 &&
+ ext->btrees_lost_data) {
+ ext->btrees_lost_data = 0;
+ write_sb = true;
+ }
+
+ if (c->opts.fsck &&
+ !test_bit(BCH_FS_error, &c->flags) &&
+ !test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
+ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
+ SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0);
+ write_sb = true;
+ }
+
+ if (bch2_blacklist_entries_gc(c))
+ write_sb = true;
+
+ if (write_sb)
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
+ c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) {
+ struct bch_move_stats stats;
+
+ bch2_move_stats_init(&stats, "recovery");
+
+ struct printbuf buf = PRINTBUF;
+ bch2_version_to_text(&buf, c->sb.version_min);
+ bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf);
+ printbuf_exit(&buf);
+
+ ret = bch2_fs_read_write_early(c) ?:
+ bch2_scan_old_btree_nodes(c, &stats);
+ if (ret)
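+	/* Errors that are expected while alloc info is being rebuilt; don't report them: */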
+ goto err;
+ bch_info(c, "scanning for old btree nodes done");
+ }
+
+ ret = 0;
out:
- bch2_journal_entries_free(&journal);
- kfree(clean);
+ bch2_flush_fsck_errs(c);
+
+ if (!c->opts.retain_recovery_info) {
+ bch2_journal_keys_put_initial(c);
+ bch2_find_btree_nodes_exit(&c->found_btree_nodes);
+ }
+ if (!IS_ERR(clean))
+ kfree(clean);
+
+ if (!ret &&
+ test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) &&
+ !c->opts.nochanges) {
+ bch2_fs_read_write_early(c);
+ bch2_delete_dead_snapshots_async(c);
+ }
+
+ bch_err_fn(c, ret);
return ret;
err:
fsck_err:
- BUG_ON(!ret);
+ bch2_fs_emergency_read_only(c);
goto out;
}
@@ -283,79 +1064,113 @@ int bch2_fs_initialize(struct bch_fs *c)
{
struct bch_inode_unpacked root_inode, lostfound_inode;
struct bkey_inode_buf packed_inode;
- struct bch_hash_info root_hash_info;
struct qstr lostfound = QSTR("lost+found");
- const char *err = "cannot allocate memory";
- struct bch_dev *ca;
- LIST_HEAD(journal);
- unsigned i;
+ struct bch_member *m;
int ret;
bch_notice(c, "initializing new filesystem");
+ set_bit(BCH_FS_new_fs, &c->flags);
- set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
+ mutex_lock(&c->sb_lock);
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
- ret = bch2_initial_gc(c, &journal);
- if (ret)
- goto err;
+ bch2_check_version_downgrade(c);
- err = "unable to allocate journal buckets";
- for_each_online_member(ca, c, i)
- if (bch2_dev_journal_alloc(ca)) {
- percpu_ref_put(&ca->io_ref);
- goto err;
- }
+ if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) {
+ bch2_sb_upgrade(c, bcachefs_metadata_version_current);
+ SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
+ bch2_write_super(c);
+ }
+
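+	/*
+	 * Freespace btrees don't exist yet: bch2_fs_freespace_init(), below,
+	 * will initialize them once we can do btree updates:
+	 */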
+ for_each_member_device(c, ca) {
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false);
+ ca->mi = bch2_mi_to_cpu(m);
+ }
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ set_bit(BCH_FS_btree_running, &c->flags);
+ set_bit(BCH_FS_may_go_rw, &c->flags);
- for (i = 0; i < BTREE_ID_NR; i++)
- bch2_btree_root_alloc(c, i);
+ for (unsigned i = 0; i < BTREE_ID_NR; i++)
+ bch2_btree_root_alloc_fake(c, i, 0);
+
+ ret = bch2_fs_journal_alloc(c);
+ if (ret)
+ goto err;
/*
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
*/
- bch2_fs_journal_start(&c->journal);
+ bch2_fs_journal_start(&c->journal, 1);
+ set_bit(BCH_FS_accounting_replay_done, &c->flags);
bch2_journal_set_replay_done(&c->journal);
- err = "error starting allocator";
- ret = bch2_fs_allocator_start(c);
+ ret = bch2_fs_read_write_early(c);
if (ret)
goto err;
- bch2_inode_init(c, &root_inode, 0, 0,
- S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
- root_inode.bi_inum = BCACHEFS_ROOT_INO;
- root_inode.bi_nlink++; /* lost+found */
- bch2_inode_pack(&packed_inode, &root_inode);
+ for_each_member_device(c, ca) {
+ ret = bch2_dev_usage_init(ca, false);
+ if (ret) {
+ bch2_dev_put(ca);
+ goto err;
+ }
+ }
- err = "error creating root directory";
- ret = bch2_btree_insert(c, BTREE_ID_INODES,
- &packed_inode.inode.k_i,
- NULL, NULL, NULL, 0);
+ /*
+ * Write out the superblock and journal buckets, now that we can do
+ * btree updates
+ */
+ bch_verbose(c, "marking superblocks");
+ ret = bch2_trans_mark_dev_sbs(c);
+ bch_err_msg(c, ret, "marking superblocks");
if (ret)
goto err;
- bch2_inode_init(c, &lostfound_inode, 0, 0,
- S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0,
- &root_inode);
- lostfound_inode.bi_inum = BCACHEFS_ROOT_INO + 1;
- bch2_inode_pack(&packed_inode, &lostfound_inode);
+ ret = bch2_fs_freespace_init(c);
+ if (ret)
+ goto err;
+
+ ret = bch2_initialize_subvolumes(c);
+ if (ret)
+ goto err;
+
+ bch_verbose(c, "reading snapshots table");
+ ret = bch2_snapshots_read(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "reading snapshots done");
+
+ bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL);
+ root_inode.bi_inum = BCACHEFS_ROOT_INO;
+ root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
+ bch2_inode_pack(&packed_inode, &root_inode);
+ packed_inode.inode.k.p.snapshot = U32_MAX;
- err = "error creating lost+found";
- ret = bch2_btree_insert(c, BTREE_ID_INODES,
- &packed_inode.inode.k_i,
- NULL, NULL, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0, 0);
+ bch_err_msg(c, ret, "creating root directory");
if (ret)
goto err;
- root_hash_info = bch2_hash_info_init(c, &root_inode);
+ bch2_inode_init_early(c, &lostfound_inode);
- ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR,
- &lostfound, lostfound_inode.bi_inum, NULL,
- BTREE_INSERT_NOFAIL);
+ ret = bch2_trans_commit_do(c, NULL, NULL, 0,
+ bch2_create_trans(trans,
+ BCACHEFS_ROOT_SUBVOL_INUM,
+ &root_inode, &lostfound_inode,
+ &lostfound,
+ 0, 0, S_IFDIR|0700, 0,
+ NULL, NULL, (subvol_inum) { 0 }, 0));
+ bch_err_msg(c, ret, "creating lost+found");
if (ret)
goto err;
- atomic_long_set(&c->nr_inodes, 2);
+ c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1;
if (enabled_qtypes(c)) {
ret = bch2_fs_quota_read(c);
@@ -363,21 +1178,21 @@ int bch2_fs_initialize(struct bch_fs *c)
goto err;
}
- err = "error writing first journal entry";
- ret = bch2_journal_meta(&c->journal);
+ ret = bch2_journal_flush(&c->journal);
+ bch_err_msg(c, ret, "writing first journal entry");
if (ret)
goto err;
mutex_lock(&c->sb_lock);
SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
+ c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;
return 0;
err:
- BUG_ON(!ret);
+ bch_err_fn(c, ret);
return ret;
}
diff --git a/libbcachefs/recovery.h b/libbcachefs/recovery.h
index 685507e8..b0d55754 100644
--- a/libbcachefs/recovery.h
+++ b/libbcachefs/recovery.h
@@ -1,6 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_RECOVERY_H
#define _BCACHEFS_RECOVERY_H
+int bch2_btree_lost_data(struct bch_fs *, enum btree_id);
+
+int bch2_journal_replay(struct bch_fs *);
+
int bch2_fs_recovery(struct bch_fs *);
int bch2_fs_initialize(struct bch_fs *);
diff --git a/libbcachefs/recovery_passes.c b/libbcachefs/recovery_passes.c
new file mode 100644
index 00000000..f6d3a99c
--- /dev/null
+++ b/libbcachefs/recovery_passes.c
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "backpointers.h"
+#include "btree_gc.h"
+#include "btree_node_scan.h"
+#include "disk_accounting.h"
+#include "ec.h"
+#include "fsck.h"
+#include "inode.h"
+#include "journal.h"
+#include "lru.h"
+#include "logged_ops.h"
+#include "rebalance.h"
+#include "recovery.h"
+#include "recovery_passes.h"
+#include "snapshot.h"
+#include "subvolume.h"
+#include "super.h"
+#include "super-io.h"
+
+const char * const bch2_recovery_passes[] = {
+#define x(_fn, ...) #_fn,
+ BCH_RECOVERY_PASSES()
+#undef x
+ NULL
+};
+
+/* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */
+static int bch2_recovery_pass_empty(struct bch_fs *c)
+{
+ return 0;
+}
+
+static int bch2_set_may_go_rw(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+
+ /*
+ * After we go RW, the journal keys buffer can't be modified (except for
+	 * setting journal_key->overwritten): it will be accessed by multiple
+ * threads
+ */
+ move_gap(keys, keys->nr);
+
+ set_bit(BCH_FS_may_go_rw, &c->flags);
+
+ if (keys->nr || !c->opts.read_only || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes)
+ return bch2_fs_read_write_early(c);
+ return 0;
+}
+
+struct recovery_pass_fn {
+ int (*fn)(struct bch_fs *);
+ unsigned when;
+};
+
+static struct recovery_pass_fn recovery_pass_fns[] = {
+#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when },
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
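+/*
+ * Map from the in-memory pass enum (which may be reordered between versions)
+ * to the stable identifiers stored in the superblock:
+ */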
+static const u8 passes_to_stable_map[] = {
+#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
+{
+ return passes_to_stable_map[pass];
+}
+
+u64 bch2_recovery_passes_to_stable(u64 v)
+{
+ u64 ret = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
+ if (v & BIT_ULL(i))
+ ret |= BIT_ULL(passes_to_stable_map[i]);
+ return ret;
+}
+
+u64 bch2_recovery_passes_from_stable(u64 v)
+{
+ static const u8 map[] = {
+#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+ };
+
+ u64 ret = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
+ if (v & BIT_ULL(i))
+ ret |= BIT_ULL(map[i]);
+ return ret;
+}
+
+/*
+ * For when we need to rewind recovery passes and run a pass we skipped:
+ */
+static int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ if (c->opts.recovery_passes & BIT_ULL(pass))
+ return 0;
+
+ if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns))
+ return -BCH_ERR_not_in_recovery;
+
+ if (pass < BCH_RECOVERY_PASS_set_may_go_rw &&
+ c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) {
+ bch_info(c, "need recovery pass %s (%u), but already rw",
+ bch2_recovery_passes[pass], pass);
+ return -BCH_ERR_cannot_rewind_recovery;
+ }
+
+ bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
+ bch2_recovery_passes[pass], pass,
+ bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
+
+ c->opts.recovery_passes |= BIT_ULL(pass);
+
+ if (c->curr_recovery_pass >= pass) {
+ c->curr_recovery_pass = pass;
+		c->recovery_passes_complete &= (1ULL << pass) - 1;
+ return -BCH_ERR_restart_recovery;
+ } else {
+ return 0;
+ }
+}
+
+int bch2_run_explicit_recovery_pass(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&c->recovery_pass_lock, flags);
+ int ret = __bch2_run_explicit_recovery_pass(c, pass);
+ spin_unlock_irqrestore(&c->recovery_pass_lock, flags);
+ return ret;
+}
+
+int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ lockdep_assert_held(&c->sb_lock);
+
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required);
+
+ return bch2_run_explicit_recovery_pass(c, pass);
+}
+
+int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ if (!test_bit_le64(s, ext->recovery_passes_required)) {
+ __set_bit_le64(s, ext->recovery_passes_required);
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+
+ return bch2_run_explicit_recovery_pass(c, pass);
+}
+
+static void bch2_clear_recovery_pass_required(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ if (test_bit_le64(s, ext->recovery_passes_required)) {
+ __clear_bit_le64(s, ext->recovery_passes_required);
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+}
+
+u64 bch2_fsck_recovery_passes(void)
+{
+ u64 ret = 0;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
+ if (recovery_pass_fns[i].when & PASS_FSCK)
+ ret |= BIT_ULL(i);
+ return ret;
+}
+
+static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
+
+ if (c->opts.recovery_passes_exclude & BIT_ULL(pass))
+ return false;
+ if (c->opts.recovery_passes & BIT_ULL(pass))
+ return true;
+ if ((p->when & PASS_FSCK) && c->opts.fsck)
+ return true;
+ if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
+ return true;
+ if (p->when & PASS_ALWAYS)
+ return true;
+ return false;
+}
+
+static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
+ int ret;
+
+ if (!(p->when & PASS_SILENT))
+ bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
+ bch2_recovery_passes[pass]);
+ ret = p->fn(c);
+ if (ret)
+ return ret;
+ if (!(p->when & PASS_SILENT))
+ bch2_print(c, KERN_CONT " done\n");
+
+ return 0;
+}
+
+int bch2_run_online_recovery_passes(struct bch_fs *c)
+{
+ int ret = 0;
+
+ down_read(&c->state_lock);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
+ struct recovery_pass_fn *p = recovery_pass_fns + i;
+
+ if (!(p->when & PASS_ONLINE))
+ continue;
+
+ ret = bch2_run_recovery_pass(c, i);
+ if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
+ i = c->curr_recovery_pass;
+ continue;
+ }
+ if (ret)
+ break;
+ }
+
+ up_read(&c->state_lock);
+
+ return ret;
+}
+
+int bch2_run_recovery_passes(struct bch_fs *c)
+{
+ int ret = 0;
+
+ /*
+ * We can't allow set_may_go_rw to be excluded; that would cause us to
+ * use the journal replay keys for updates where it's not expected.
+ */
+	c->opts.recovery_passes_exclude &= ~BIT_ULL(BCH_RECOVERY_PASS_set_may_go_rw);
+
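+	/*
+	 * curr_recovery_pass may be rewound concurrently by
+	 * bch2_run_explicit_recovery_pass() (e.g. from btree node read
+	 * completion), so it is only read and advanced under
+	 * recovery_pass_lock:
+	 */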
+ while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
+ spin_lock_irq(&c->recovery_pass_lock);
+ unsigned pass = c->curr_recovery_pass;
+
+ if (c->opts.recovery_pass_last &&
+ c->curr_recovery_pass > c->opts.recovery_pass_last) {
+ spin_unlock_irq(&c->recovery_pass_lock);
+ break;
+ }
+
+ if (!should_run_recovery_pass(c, pass)) {
+ c->curr_recovery_pass++;
+ c->recovery_pass_done = max(c->recovery_pass_done, pass);
+ spin_unlock_irq(&c->recovery_pass_lock);
+ continue;
+ }
+ spin_unlock_irq(&c->recovery_pass_lock);
+
+ ret = bch2_run_recovery_pass(c, pass) ?:
+ bch2_journal_flush(&c->journal);
+
+ spin_lock_irq(&c->recovery_pass_lock);
+ if (c->curr_recovery_pass < pass) {
+ /*
+ * bch2_run_explicit_recovery_pass() was called: we
+ * can't always catch -BCH_ERR_restart_recovery because
+ * it may have been called from another thread (btree
+ * node read completion)
+ */
+ spin_unlock_irq(&c->recovery_pass_lock);
+ continue;
+ } else if (c->curr_recovery_pass == pass) {
+ c->curr_recovery_pass++;
+ } else {
+ BUG();
+ }
+ spin_unlock_irq(&c->recovery_pass_lock);
+
+ if (ret)
+ break;
+
+ c->recovery_passes_complete |= BIT_ULL(pass);
+ c->recovery_pass_done = max(c->recovery_pass_done, pass);
+
+ if (!test_bit(BCH_FS_error, &c->flags))
+ bch2_clear_recovery_pass_required(c, pass);
+ }
+
+ return ret;
+}
diff --git a/libbcachefs/recovery_passes.h b/libbcachefs/recovery_passes.h
new file mode 100644
index 00000000..7d7339c8
--- /dev/null
+++ b/libbcachefs/recovery_passes.h
@@ -0,0 +1,18 @@
+#ifndef _BCACHEFS_RECOVERY_PASSES_H
+#define _BCACHEFS_RECOVERY_PASSES_H
+
+extern const char * const bch2_recovery_passes[];
+
+u64 bch2_recovery_passes_to_stable(u64 v);
+u64 bch2_recovery_passes_from_stable(u64 v);
+
+u64 bch2_fsck_recovery_passes(void);
+
+int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
+int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *, enum bch_recovery_pass);
+int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass);
+
+int bch2_run_online_recovery_passes(struct bch_fs *);
+int bch2_run_recovery_passes(struct bch_fs *);
+
+#endif /* _BCACHEFS_RECOVERY_PASSES_H */
diff --git a/libbcachefs/recovery_passes_types.h b/libbcachefs/recovery_passes_types.h
new file mode 100644
index 00000000..94dc20ca
--- /dev/null
+++ b/libbcachefs/recovery_passes_types.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H
+#define _BCACHEFS_RECOVERY_PASSES_TYPES_H
+
+#define PASS_SILENT BIT(0)
+#define PASS_FSCK BIT(1)
+#define PASS_UNCLEAN BIT(2)
+#define PASS_ALWAYS BIT(3)
+#define PASS_ONLINE BIT(4)
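+
+/*
+ * PASS_SILENT:  don't log a message when the pass runs
+ * PASS_FSCK:    run when fsck is requested
+ * PASS_UNCLEAN: run when recovering from an unclean shutdown
+ * PASS_ALWAYS:  run on every recovery
+ * PASS_ONLINE:  may also be run on an already-mounted filesystem
+ */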
+
+/*
+ * Passes may be reordered, but the second field is a persistent identifier and
+ * must never change:
+ */
+#define BCH_RECOVERY_PASSES() \
+ x(recovery_pass_empty, 41, PASS_SILENT) \
+ x(scan_for_btree_nodes, 37, 0) \
+ x(check_topology, 4, 0) \
+ x(accounting_read, 39, PASS_ALWAYS) \
+ x(alloc_read, 0, PASS_ALWAYS) \
+ x(stripes_read, 1, PASS_ALWAYS) \
+ x(initialize_subvolumes, 2, 0) \
+ x(snapshots_read, 3, PASS_ALWAYS) \
+ x(check_allocations, 5, PASS_FSCK) \
+ x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \
+ x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \
+ x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \
+ x(journal_replay, 9, PASS_ALWAYS) \
+ x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \
+ x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \
+ x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \
+ x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK) \
+ x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \
+ x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \
+ x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
+ x(bucket_gens_init, 17, 0) \
+ x(reconstruct_snapshots, 38, 0) \
+ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
+ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
+ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
+ x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \
+ x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
+ x(fs_upgrade_for_subvolumes, 22, 0) \
+ x(check_inodes, 24, PASS_FSCK) \
+ x(check_extents, 25, PASS_FSCK) \
+ x(check_indirect_extents, 26, PASS_FSCK) \
+ x(check_dirents, 27, PASS_FSCK) \
+ x(check_xattrs, 28, PASS_FSCK) \
+ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
+ x(check_unreachable_inodes, 40, PASS_ONLINE|PASS_FSCK) \
+ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
+ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
+ x(check_nlinks, 31, PASS_FSCK) \
+ x(resume_logged_ops, 23, PASS_ALWAYS) \
+ x(delete_dead_inodes, 32, PASS_ALWAYS) \
+ x(fix_reflink_p, 33, 0) \
+ x(set_fs_needs_rebalance, 34, 0) \
+
+/* We normally enumerate recovery passes in the order we run them: */
+enum bch_recovery_pass {
+#define x(n, id, when) BCH_RECOVERY_PASS_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+ BCH_RECOVERY_PASS_NR
+};
+
+/* But we also need stable identifiers that can be used in the superblock */
+enum bch_recovery_pass_stable {
+#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id,
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */
diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c
new file mode 100644
index 00000000..e1911b9b
--- /dev/null
+++ b/libbcachefs/reflink.c
@@ -0,0 +1,839 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "error.h"
+#include "extents.h"
+#include "inode.h"
+#include "io_misc.h"
+#include "io_write.h"
+#include "rebalance.h"
+#include "reflink.h"
+#include "subvolume.h"
+#include "super-io.h"
+
+#include <linux/sched/signal.h>
+
+static inline bool bkey_extent_is_reflink_data(const struct bkey *k)
+{
+ switch (k->type) {
+ case KEY_TYPE_reflink_v:
+ case KEY_TYPE_indirect_inline_data:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline unsigned bkey_type_to_indirect(const struct bkey *k)
+{
+ switch (k->type) {
+ case KEY_TYPE_extent:
+ return KEY_TYPE_reflink_v;
+ case KEY_TYPE_inline_data:
+ return KEY_TYPE_indirect_inline_data;
+ default:
+ return 0;
+ }
+}
+
+/* reflink pointers */
+
+int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+ int ret = 0;
+
+ bkey_fsck_err_on(REFLINK_P_IDX(p.v) < le32_to_cpu(p.v->front_pad),
+ c, reflink_p_front_pad_bad,
+ "idx < front_pad (%llu < %u)",
+ REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad));
+fsck_err:
+ return ret;
+}
+
+void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+
+ prt_printf(out, "idx %llu front_pad %u back_pad %u",
+ REFLINK_P_IDX(p.v),
+ le32_to_cpu(p.v->front_pad),
+ le32_to_cpu(p.v->back_pad));
+}
+
+bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
+{
+ struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l);
+ struct bkey_s_c_reflink_p r = bkey_s_c_to_reflink_p(_r);
+
+ /*
+ * Disabled for now, the triggers code needs to be reworked for merging
+ * of reflink pointers to work:
+ */
+ return false;
+
+ if (REFLINK_P_IDX(l.v) + l.k->size != REFLINK_P_IDX(r.v))
+ return false;
+
+ if (REFLINK_P_ERROR(l.v) != REFLINK_P_ERROR(r.v))
+ return false;
+
+ bch2_key_resize(l.k, l.k->size + r.k->size);
+ return true;
+}
+
+/* indirect extents */
+
+int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, REFLINK_P_IDX_MAX)),
+ c, reflink_v_pos_bad,
+ "indirect extent above maximum position 0:%llu",
+ REFLINK_P_IDX_MAX);
+
+ ret = bch2_bkey_ptrs_validate(c, k, from);
+fsck_err:
+ return ret;
+}
+
+void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
+
+ prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount));
+
+ bch2_bkey_ptrs_to_text(out, c, k);
+}
+
+#if 0
+Currently disabled, needs to be debugged:
+
+bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
+{
+ struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l);
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r);
+
+ return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r);
+}
+#endif
+
+/* indirect inline data */
+
+int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ return 0;
+}
+
+void bch2_indirect_inline_data_to_text(struct printbuf *out,
+ struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
+ unsigned datalen = bkey_inline_data_bytes(k.k);
+
+ prt_printf(out, "refcount %llu datalen %u: %*phN",
+ le64_to_cpu(d.v->refcount), datalen,
+ min(datalen, 32U), d.v->data);
+}
+
+/* lookup */
+
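+/*
+ * A reflink pointer that was flagged as pointing to a missing indirect extent
+ * turned out to resolve after all: clear the error flag.
+ */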
+static int bch2_indirect_extent_not_missing(struct btree_trans *trans, struct bkey_s_c_reflink_p p,
+ bool should_commit)
+{
+ struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p);
+ int ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
+
+ SET_REFLINK_P_ERROR(&new->v, false);
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun);
+ if (ret)
+ return ret;
+
+ if (!should_commit)
+ return 0;
+
+ return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_nested;
+}
+
+static int bch2_indirect_extent_missing_error(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p,
+ u64 missing_start, u64 missing_end,
+ bool should_commit)
+{
+ if (REFLINK_P_ERROR(p.v))
+ return -BCH_ERR_missing_indirect_extent;
+
+ struct bch_fs *c = trans->c;
+ u64 live_start = REFLINK_P_IDX(p.v);
+ u64 live_end = REFLINK_P_IDX(p.v) + p.k->size;
+ u64 refd_start = live_start - le32_to_cpu(p.v->front_pad);
+ u64 refd_end = live_end + le32_to_cpu(p.v->back_pad);
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ BUG_ON(missing_start < refd_start);
+ BUG_ON(missing_end > refd_end);
+
+ if (fsck_err(trans, reflink_p_to_missing_reflink_v,
+ "pointer to missing indirect extent\n"
+ " %s\n"
+ " missing range %llu-%llu",
+ (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
+ missing_start, missing_end)) {
+ struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto err;
+
+ /*
+ * Is the missing range not actually needed?
+ *
+ * p.v->idx refers to the data that we actually want, but if the
+ * indirect extent we point to was bigger, front_pad and back_pad
+ * indicate the range we took a reference on.
+ */
+
+ if (missing_end <= live_start) {
+ new->v.front_pad = cpu_to_le32(live_start - missing_end);
+ } else if (missing_start >= live_end) {
+ new->v.back_pad = cpu_to_le32(missing_start - live_end);
+ } else {
+ struct bpos new_start = bkey_start_pos(&new->k);
+ struct bpos new_end = new->k.p;
+
+ if (missing_start > live_start)
+ new_start.offset += missing_start - live_start;
+ if (missing_end < live_end)
+ new_end.offset -= live_end - missing_end;
+
+ bch2_cut_front(new_start, &new->k_i);
+ bch2_cut_back(new_end, &new->k_i);
+
+ SET_REFLINK_P_ERROR(&new->v, true);
+ }
+
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun);
+ if (ret)
+ goto err;
+
+ if (should_commit)
+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_nested;
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/*
+ * This is used from the read path, which doesn't expect to have to do a
+ * transaction commit, and from triggers, which should not be doing a commit:
+ */
+struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans,
+ struct btree_iter *iter,
+ s64 *offset_into_extent,
+ struct bkey_s_c_reflink_p p,
+ bool should_commit,
+ unsigned iter_flags)
+{
+ BUG_ON(*offset_into_extent < -((s64) le32_to_cpu(p.v->front_pad)));
+ BUG_ON(*offset_into_extent >= p.k->size + le32_to_cpu(p.v->back_pad));
+
+ u64 reflink_offset = REFLINK_P_IDX(p.v) + *offset_into_extent;
+
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_reflink,
+ POS(0, reflink_offset), iter_flags);
+ if (bkey_err(k))
+ return k;
+
+ if (unlikely(!bkey_extent_is_reflink_data(k.k))) {
+ bch2_trans_iter_exit(trans, iter);
+
+ unsigned size = min((u64) k.k->size,
+ REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) -
+ reflink_offset);
+ bch2_key_resize(&iter->k, size);
+
+ int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset,
+ k.k->p.offset, should_commit);
+ if (ret)
+ return bkey_s_c_err(ret);
+ } else if (unlikely(REFLINK_P_ERROR(p.v))) {
+ bch2_trans_iter_exit(trans, iter);
+
+ int ret = bch2_indirect_extent_not_missing(trans, p, should_commit);
+ if (ret)
+ return bkey_s_c_err(ret);
+ }
+
+ *offset_into_extent = reflink_offset - bkey_start_offset(k.k);
+ return k;
+}
+
+/* reflink pointer trigger */
+
+static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p, u64 *idx,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ s64 offset_into_extent = *idx - REFLINK_P_IDX(p.v);
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, &offset_into_extent, p, false,
+ BTREE_ITER_intent|
+ BTREE_ITER_with_updates);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (bkey_deleted(k.k)) {
+ if (!(flags & BTREE_TRIGGER_overwrite))
+ ret = -BCH_ERR_missing_indirect_extent;
+ goto next;
+ }
+
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto err;
+
+ __le64 *refcount = bkey_refcount(bkey_i_to_s(new));
+ if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) {
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ log_fsck_err(trans, reflink_refcount_underflow,
+ "indirect extent refcount underflow while marking\n %s",
+ buf.buf);
+ goto next;
+ }
+
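+ /*
+ * On insert, grow front_pad/back_pad so the pointer records the full
+ * extent of the indirect extent whose refcount we're taking - if that
+ * extent is later split, dropping the reference still covers all of it:
+ */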
+ if (flags & BTREE_TRIGGER_insert) {
+ struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
+ u64 pad;
+
+ pad = max_t(s64, le32_to_cpu(v->front_pad),
+ REFLINK_P_IDX(v) - bkey_start_offset(&new->k));
+ BUG_ON(pad > U32_MAX);
+ v->front_pad = cpu_to_le32(pad);
+
+ pad = max_t(s64, le32_to_cpu(v->back_pad),
+ new->k.p.offset - p.k->size - REFLINK_P_IDX(v));
+ BUG_ON(pad > U32_MAX);
+ v->back_pad = cpu_to_le32(pad);
+ }
+
+ le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1);
+
+ bch2_btree_iter_set_pos_to_extent_start(&iter);
+ ret = bch2_trans_update(trans, &iter, new, 0);
+ if (ret)
+ goto err;
+next:
+ *idx = k.k->p.offset;
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p, u64 *idx,
+ enum btree_iter_update_trigger_flags flags,
+ size_t r_idx)
+{
+ struct bch_fs *c = trans->c;
+ struct reflink_gc *r;
+ int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1;
+ u64 next_idx = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad);
+ s64 ret = 0;
+ struct printbuf buf = PRINTBUF;
+
+ if (r_idx >= c->reflink_gc_nr)
+ goto not_found;
+
+ r = genradix_ptr(&c->reflink_gc_table, r_idx);
+ next_idx = min(next_idx, r->offset - r->size);
+ if (*idx < next_idx)
+ goto not_found;
+
+ BUG_ON((s64) r->refcount + add < 0);
+
+ if (flags & BTREE_TRIGGER_gc)
+ r->refcount += add;
+ *idx = r->offset;
+ return 0;
+not_found:
+ if (flags & BTREE_TRIGGER_check_repair) {
+ ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false);
+ if (ret)
+ goto err;
+ }
+
+ *idx = next_idx;
+err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int __trigger_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level, struct bkey_s_c k,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+ int ret = 0;
+
+ u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad);
+ u64 end = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad);
+
+ if (flags & BTREE_TRIGGER_transactional) {
+ while (idx < end && !ret)
+ ret = trans_trigger_reflink_p_segment(trans, p, &idx, flags);
+ }
+
+ if (flags & (BTREE_TRIGGER_check_repair|BTREE_TRIGGER_gc)) {
+ size_t l = 0, r = c->reflink_gc_nr;
+
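+ /*
+ * Binary search for the first reflink_gc entry whose end offset
+ * (r->offset) is greater than idx:
+ */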
+ while (l < r) {
+ size_t m = l + (r - l) / 2;
+ struct reflink_gc *ref = genradix_ptr(&c->reflink_gc_table, m);
+ if (ref->offset <= idx)
+ l = m + 1;
+ else
+ r = m;
+ }
+
+ while (idx < end && !ret)
+ ret = gc_trigger_reflink_p_segment(trans, p, &idx, flags, l++);
+ }
+
+ return ret;
+}
+
+int bch2_trigger_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_s new,
+ enum btree_iter_update_trigger_flags flags)
+{
+ if ((flags & BTREE_TRIGGER_transactional) &&
+ (flags & BTREE_TRIGGER_insert)) {
+ struct bch_reflink_p *v = bkey_s_to_reflink_p(new).v;
+
+ v->front_pad = v->back_pad = 0;
+ }
+
+ return trigger_run_overwrite_then_insert(__trigger_reflink_p, trans, btree_id, level, old, new, flags);
+}
+
+/* indirect extent trigger */
+
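+/*
+ * If an insert would leave an indirect extent with a zero refcount, turn the
+ * update into a deletion and drop the insert trigger flag:
+ */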
+static inline void
+check_indirect_extent_deleting(struct bkey_s new,
+ enum btree_iter_update_trigger_flags *flags)
+{
+ if ((*flags & BTREE_TRIGGER_insert) && !*bkey_refcount(new)) {
+ new.k->type = KEY_TYPE_deleted;
+ new.k->size = 0;
+ set_bkey_val_u64s(new.k, 0);
+ *flags &= ~BTREE_TRIGGER_insert;
+ }
+}
+
+int bch2_trigger_reflink_v(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ enum btree_iter_update_trigger_flags flags)
+{
+ if ((flags & BTREE_TRIGGER_transactional) &&
+ (flags & BTREE_TRIGGER_insert))
+ check_indirect_extent_deleting(new, &flags);
+
+ return bch2_trigger_extent(trans, btree_id, level, old, new, flags);
+}
+
+int bch2_trigger_indirect_inline_data(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ enum btree_iter_update_trigger_flags flags)
+{
+ check_indirect_extent_deleting(new, &flags);
+
+ return 0;
+}
+
+/* create */
+
+static int bch2_make_extent_indirect(struct btree_trans *trans,
+ struct btree_iter *extent_iter,
+ struct bkey_i *orig)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter reflink_iter = { NULL };
+ struct bkey_s_c k;
+ struct bkey_i *r_v;
+ struct bkey_i_reflink_p *r_p;
+ __le64 *refcount;
+ int ret;
+
+ if (orig->k.type == KEY_TYPE_inline_data)
+ bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data);
+
+ bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX,
+ BTREE_ITER_intent);
+ k = bch2_btree_iter_peek_prev(&reflink_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ /*
+ * XXX: we're assuming that 56 bits will be enough for the life of the
+ * filesystem: we need to implement wraparound, with a cursor in the
+ * logged ops btree:
+ */
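+ /*
+ * (For scale: assuming 512-byte sectors, 2^56 sectors of reflink index
+ * space is 32 EiB of indirect extents before wraparound would be
+ * needed.)
+ */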
+ if (bkey_ge(reflink_iter.pos, POS(0, REFLINK_P_IDX_MAX - orig->k.size)))
+ return -ENOSPC;
+
+ r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k));
+ ret = PTR_ERR_OR_ZERO(r_v);
+ if (ret)
+ goto err;
+
+ bkey_init(&r_v->k);
+ r_v->k.type = bkey_type_to_indirect(&orig->k);
+ r_v->k.p = reflink_iter.pos;
+ bch2_key_resize(&r_v->k, orig->k.size);
+ r_v->k.bversion = orig->k.bversion;
+
+ set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k));
+
+ refcount = bkey_refcount(bkey_i_to_s(r_v));
+ *refcount = 0;
+ memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k));
+
+ ret = bch2_trans_update(trans, &reflink_iter, r_v, 0);
+ if (ret)
+ goto err;
+
+ /*
+ * orig is in a bkey_buf which statically allocates 5 64s for the val,
+ * so we know it will be big enough:
+ */
+ orig->k.type = KEY_TYPE_reflink_p;
+ r_p = bkey_i_to_reflink_p(orig);
+ set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
+
+ /* FORTIFY_SOURCE is broken here, and doesn't provide unsafe_memset() */
+#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE)
+ __underlying_memset(&r_p->v, 0, sizeof(r_p->v));
+#else
+ memset(&r_p->v, 0, sizeof(r_p->v));
+#endif
+
+ SET_REFLINK_P_IDX(&r_p->v, bkey_start_offset(&r_v->k));
+
+ ret = bch2_trans_update(trans, extent_iter, &r_p->k_i,
+ BTREE_UPDATE_internal_snapshot_node);
+err:
+ bch2_trans_iter_exit(trans, &reflink_iter);
+
+ return ret;
+}
+
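+/*
+ * Advance to the next written data extent before end, skipping unwritten
+ * extents; returns bkey_s_c_null once the range is exhausted.
+ */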
+static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
+{
+ struct bkey_s_c k;
+ int ret;
+
+ for_each_btree_key_max_continue_norestart(*iter, end, 0, k, ret) {
+ if (bkey_extent_is_unwritten(k))
+ continue;
+
+ if (bkey_extent_is_data(k.k))
+ return k;
+ }
+
+ if (bkey_ge(iter->pos, end))
+ bch2_btree_iter_set_pos(iter, end);
+ return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
+}
+
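+/*
+ * Reflink copy: each source extent is made indirect if it isn't already, and
+ * the destination range gets reflink pointers to it; holes in the source are
+ * punched in the destination, and the destination i_size is updated at the
+ * end. Returns the number of sectors remapped, or an error if nothing was
+ * remapped.
+ */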
+s64 bch2_remap_range(struct bch_fs *c,
+ subvol_inum dst_inum, u64 dst_offset,
+ subvol_inum src_inum, u64 src_offset,
+ u64 remap_sectors,
+ u64 new_i_size, s64 *i_sectors_delta)
+{
+ struct btree_trans *trans;
+ struct btree_iter dst_iter, src_iter;
+ struct bkey_s_c src_k;
+ struct bkey_buf new_dst, new_src;
+ struct bpos dst_start = POS(dst_inum.inum, dst_offset);
+ struct bpos src_start = POS(src_inum.inum, src_offset);
+ struct bpos dst_end = dst_start, src_end = src_start;
+ struct bch_io_opts opts;
+ struct bpos src_want;
+ u64 dst_done = 0;
+ u32 dst_snapshot, src_snapshot;
+ int ret = 0, ret2 = 0;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink))
+ return -BCH_ERR_erofs_no_writes;
+
+ bch2_check_set_feature(c, BCH_FEATURE_reflink);
+
+ dst_end.offset += remap_sectors;
+ src_end.offset += remap_sectors;
+
+ bch2_bkey_buf_init(&new_dst);
+ bch2_bkey_buf_init(&new_src);
+ trans = bch2_trans_get(c);
+
+ ret = bch2_inum_opts_get(trans, src_inum, &opts);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
+ BTREE_ITER_intent);
+ bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
+ BTREE_ITER_intent);
+
+ while ((ret == 0 ||
+ bch2_err_matches(ret, BCH_ERR_transaction_restart)) &&
+ bkey_lt(dst_iter.pos, dst_end)) {
+ struct disk_reservation disk_res = { 0 };
+
+ bch2_trans_begin(trans);
+
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ ret = bch2_subvolume_get_snapshot(trans, src_inum.subvol,
+ &src_snapshot);
+ if (ret)
+ continue;
+
+ bch2_btree_iter_set_snapshot(&src_iter, src_snapshot);
+
+ ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol,
+ &dst_snapshot);
+ if (ret)
+ continue;
+
+ bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
+
+ if (dst_inum.inum < src_inum.inum) {
+ /* Avoid some lock cycle transaction restarts */
+ ret = bch2_btree_iter_traverse(&dst_iter);
+ if (ret)
+ continue;
+ }
+
+ dst_done = dst_iter.pos.offset - dst_start.offset;
+ src_want = POS(src_start.inode, src_start.offset + dst_done);
+ bch2_btree_iter_set_pos(&src_iter, src_want);
+
+ src_k = get_next_src(&src_iter, src_end);
+ ret = bkey_err(src_k);
+ if (ret)
+ continue;
+
+ if (bkey_lt(src_want, src_iter.pos)) {
+ ret = bch2_fpunch_at(trans, &dst_iter, dst_inum,
+ min(dst_end.offset,
+ dst_iter.pos.offset +
+ src_iter.pos.offset - src_want.offset),
+ i_sectors_delta);
+ continue;
+ }
+
+ if (src_k.k->type != KEY_TYPE_reflink_p) {
+ bch2_btree_iter_set_pos_to_extent_start(&src_iter);
+
+ bch2_bkey_buf_reassemble(&new_src, c, src_k);
+ src_k = bkey_i_to_s_c(new_src.k);
+
+ ret = bch2_make_extent_indirect(trans, &src_iter,
+ new_src.k);
+ if (ret)
+ continue;
+
+ BUG_ON(src_k.k->type != KEY_TYPE_reflink_p);
+ }
+
+ if (src_k.k->type == KEY_TYPE_reflink_p) {
+ struct bkey_s_c_reflink_p src_p =
+ bkey_s_c_to_reflink_p(src_k);
+ struct bkey_i_reflink_p *dst_p =
+ bkey_reflink_p_init(new_dst.k);
+
+ u64 offset = REFLINK_P_IDX(src_p.v) +
+ (src_want.offset -
+ bkey_start_offset(src_k.k));
+
+ SET_REFLINK_P_IDX(&dst_p->v, offset);
+ } else {
+ BUG();
+ }
+
+ new_dst.k->k.p = dst_iter.pos;
+ bch2_key_resize(&new_dst.k->k,
+ min(src_k.k->p.offset - src_want.offset,
+ dst_end.offset - dst_iter.pos.offset));
+
+ ret = bch2_bkey_set_needs_rebalance(c, &opts, new_dst.k) ?:
+ bch2_extent_update(trans, dst_inum, &dst_iter,
+ new_dst.k, &disk_res,
+ new_i_size, i_sectors_delta,
+ true);
+ bch2_disk_reservation_put(c, &disk_res);
+ }
+ bch2_trans_iter_exit(trans, &dst_iter);
+ bch2_trans_iter_exit(trans, &src_iter);
+
+ BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end));
+ BUG_ON(bkey_gt(dst_iter.pos, dst_end));
+
+ dst_done = dst_iter.pos.offset - dst_start.offset;
+ new_i_size = min(dst_iter.pos.offset << 9, new_i_size);
+
+ do {
+ struct bch_inode_unpacked inode_u;
+ struct btree_iter inode_iter = { NULL };
+
+ bch2_trans_begin(trans);
+
+ ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u,
+ dst_inum, BTREE_ITER_intent);
+
+ if (!ret2 &&
+ inode_u.bi_size < new_i_size) {
+ inode_u.bi_size = new_i_size;
+ ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc);
+ }
+
+ bch2_trans_iter_exit(trans, &inode_iter);
+ } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
+err:
+ bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&new_src, c);
+ bch2_bkey_buf_exit(&new_dst, c);
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_reflink);
+
+ return dst_done ?: ret ?: ret2;
+}
+
+/* fsck */
+
+static int bch2_gc_write_reflink_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ size_t *idx)
+{
+ struct bch_fs *c = trans->c;
+ const __le64 *refcount = bkey_refcount_c(k);
+ struct printbuf buf = PRINTBUF;
+ struct reflink_gc *r;
+ int ret = 0;
+
+ if (!refcount)
+ return 0;
+
+ while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
+ r->offset < k.k->p.offset)
+ ++*idx;
+
+ if (!r ||
+ r->offset != k.k->p.offset ||
+ r->size != k.k->size) {
+ bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
+ return -EINVAL;
+ }
+
+ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount),
+ trans, reflink_v_refcount_wrong,
+ "reflink key has wrong refcount:\n"
+ " %s\n"
+ " should be %u",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
+ r->refcount)) {
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto out;
+
+ if (!r->refcount)
+ new->k.type = KEY_TYPE_deleted;
+ else
+ *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
+ ret = bch2_trans_update(trans, iter, new, 0);
+ }
+out:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_gc_reflink_done(struct bch_fs *c)
+{
+ size_t idx = 0;
+
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_gc_write_reflink_key(trans, &iter, k, &idx)));
+ c->reflink_gc_nr = 0;
+ return ret;
+}
+
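+/*
+ * Rebuild the in-memory reflink_gc table: one zeroed-refcount entry per
+ * indirect extent. gc then recounts references into it, and
+ * bch2_gc_reflink_done() above writes back any corrections.
+ */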
+int bch2_gc_reflink_start(struct bch_fs *c)
+{
+ c->reflink_gc_nr = 0;
+
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_prefetch, k, ({
+ const __le64 *refcount = bkey_refcount_c(k);
+
+ if (!refcount)
+ continue;
+
+ struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table,
+ c->reflink_gc_nr++, GFP_KERNEL);
+ if (!r) {
+ ret = -BCH_ERR_ENOMEM_gc_reflink_start;
+ break;
+ }
+
+ r->offset = k.k->p.offset;
+ r->size = k.k->size;
+ r->refcount = 0;
+ 0;
+ })));
+
+ bch_err_fn(c, ret);
+ return ret;
+}
diff --git a/libbcachefs/reflink.h b/libbcachefs/reflink.h
new file mode 100644
index 00000000..f119316a
--- /dev/null
+++ b/libbcachefs/reflink.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REFLINK_H
+#define _BCACHEFS_REFLINK_H
+
+int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
+
+#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \
+ .key_validate = bch2_reflink_p_validate, \
+ .val_to_text = bch2_reflink_p_to_text, \
+ .key_merge = bch2_reflink_p_merge, \
+ .trigger = bch2_trigger_reflink_p, \
+ .min_val_size = 16, \
+})
+
+int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
+
+#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \
+ .key_validate = bch2_reflink_v_validate, \
+ .val_to_text = bch2_reflink_v_to_text, \
+ .swab = bch2_ptr_swab, \
+ .trigger = bch2_trigger_reflink_v, \
+ .min_val_size = 8, \
+})
+
+int bch2_indirect_inline_data_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+void bch2_indirect_inline_data_to_text(struct printbuf *,
+ struct bch_fs *, struct bkey_s_c);
+int bch2_trigger_indirect_inline_data(struct btree_trans *,
+ enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
+
+#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \
+ .key_validate = bch2_indirect_inline_data_validate, \
+ .val_to_text = bch2_indirect_inline_data_to_text, \
+ .trigger = bch2_trigger_indirect_inline_data, \
+ .min_val_size = 8, \
+})
+
+static inline const __le64 *bkey_refcount_c(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_reflink_v:
+ return &bkey_s_c_to_reflink_v(k).v->refcount;
+ case KEY_TYPE_indirect_inline_data:
+ return &bkey_s_c_to_indirect_inline_data(k).v->refcount;
+ default:
+ return NULL;
+ }
+}
+
+static inline __le64 *bkey_refcount(struct bkey_s k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_reflink_v:
+ return &bkey_s_to_reflink_v(k).v->refcount;
+ case KEY_TYPE_indirect_inline_data:
+ return &bkey_s_to_indirect_inline_data(k).v->refcount;
+ default:
+ return NULL;
+ }
+}
+
+struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *, struct btree_iter *,
+ s64 *, struct bkey_s_c_reflink_p,
+ bool, unsigned);
+
+s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64,
+ subvol_inum, u64, u64, u64, s64 *);
+
+int bch2_gc_reflink_done(struct bch_fs *);
+int bch2_gc_reflink_start(struct bch_fs *);
+
+#endif /* _BCACHEFS_REFLINK_H */
diff --git a/libbcachefs/reflink_format.h b/libbcachefs/reflink_format.h
new file mode 100644
index 00000000..53502627
--- /dev/null
+++ b/libbcachefs/reflink_format.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REFLINK_FORMAT_H
+#define _BCACHEFS_REFLINK_FORMAT_H
+
+struct bch_reflink_p {
+ struct bch_val v;
+ __le64 idx_flags;
+ /*
+ * A reflink pointer might point to an indirect extent which is then
+ * later split (by copygc or rebalance). If we only pointed to part of
+ * the original indirect extent, and then one of the fragments is
+ * outside the range we point to, we'd leak a refcount: so when creating
+ * reflink pointers, we need to store pad values to remember the full
+ * range we were taking a reference on.
+ */
+ __le32 front_pad;
+ __le32 back_pad;
+} __packed __aligned(8);
+
+LE64_BITMASK(REFLINK_P_IDX, struct bch_reflink_p, idx_flags, 0, 56);
+LE64_BITMASK(REFLINK_P_ERROR, struct bch_reflink_p, idx_flags, 56, 57);
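+/*
+ * Layout of idx_flags, per the bitmasks above: bits 0-55 hold the index into
+ * the reflink btree, bit 56 is the "missing indirect extent" error flag, and
+ * the remaining bits are unused here.
+ */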
+
+struct bch_reflink_v {
+ struct bch_val v;
+ __le64 refcount;
+ union bch_extent_entry start[0];
+ __u64 _data[];
+} __packed __aligned(8);
+
+struct bch_indirect_inline_data {
+ struct bch_val v;
+ __le64 refcount;
+ u8 data[];
+};
+
+#endif /* _BCACHEFS_REFLINK_FORMAT_H */
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index 1e94d35f..477ef099 100644
--- a/libbcachefs/replicas.c
+++ b/libbcachefs/replicas.c
@@ -1,419 +1,646 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "buckets.h"
+#include "disk_accounting.h"
+#include "journal.h"
#include "replicas.h"
#include "super-io.h"
+#include <linux/sort.h>
+
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *);
+/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
+static int bch2_memcmp(const void *l, const void *r, const void *priv)
+{
+ size_t size = (size_t) priv;
+ return memcmp(l, r, size);
+}
+
/* Replicas tracking - in memory: */
-#define for_each_cpu_replicas_entry(_r, _i) \
- for (_i = (_r)->entries; \
- (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
- _i = (void *) (_i) + (_r)->entry_size)
+static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ BUG_ON(!e->nr_devs);
+ BUG_ON(e->nr_required > 1 &&
+ e->nr_required >= e->nr_devs);
+
+ for (unsigned i = 0; i + 1 < e->nr_devs; i++)
+ BUG_ON(e->devs[i] >= e->devs[i + 1]);
+#endif
+}
-static inline struct bch_replicas_cpu_entry *
-cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
+void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
{
- return (void *) r->entries + r->entry_size * i;
+ bubble_sort(e->devs, e->nr_devs, u8_cmp);
}
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
- eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
+ eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
+ bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
+}
+
+static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
+ struct bch_replicas_entry_v0 *e)
+{
+ bch2_prt_data_type(out, e->data_type);
+
+ prt_printf(out, ": %u [", e->nr_devs);
+ for (unsigned i = 0; i < e->nr_devs; i++)
+ prt_printf(out, i ? " %u" : "%u", e->devs[i]);
+ prt_printf(out, "]");
}
-static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
- unsigned dev)
+void bch2_replicas_entry_to_text(struct printbuf *out,
+ struct bch_replicas_entry_v1 *e)
{
- return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
+ bch2_prt_data_type(out, e->data_type);
+
+ prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
+ for (unsigned i = 0; i < e->nr_devs; i++)
+ prt_printf(out, i ? " %u" : "%u", e->devs[i]);
+ prt_printf(out, "]");
}
-static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
- unsigned dev)
+static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r,
+ struct bch_sb *sb,
+ struct printbuf *err)
{
- e->devs[dev >> 3] |= 1 << (dev & 7);
+ if (!r->nr_devs) {
+ prt_printf(err, "no devices in entry ");
+ goto bad;
+ }
+
+ if (r->nr_required > 1 &&
+ r->nr_required >= r->nr_devs) {
+ prt_printf(err, "bad nr_required in entry ");
+ goto bad;
+ }
+
+ for (unsigned i = 0; i < r->nr_devs; i++)
+ if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
+ !bch2_member_exists(sb, r->devs[i])) {
+ prt_printf(err, "invalid device %u in entry ", r->devs[i]);
+ goto bad;
+ }
+
+ return 0;
+bad:
+ bch2_replicas_entry_to_text(err, r);
+ return -BCH_ERR_invalid_replicas_entry;
}
-static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
+int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
+ struct bch_fs *c,
+ struct printbuf *err)
{
- return (r->entry_size -
- offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
+ if (!r->nr_devs) {
+ prt_printf(err, "no devices in entry ");
+ goto bad;
+ }
+
+ if (r->nr_required > 1 &&
+ r->nr_required >= r->nr_devs) {
+ prt_printf(err, "bad nr_required in entry ");
+ goto bad;
+ }
+
+ for (unsigned i = 0; i < r->nr_devs; i++)
+ if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
+ !bch2_dev_exists(c, r->devs[i])) {
+ prt_printf(err, "invalid device %u in entry ", r->devs[i]);
+ goto bad;
+ }
+
+ return 0;
+bad:
+ bch2_replicas_entry_to_text(err, r);
+ return -BCH_ERR_invalid_replicas_entry;
}
-int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r,
- char *buf, size_t size)
+void bch2_cpu_replicas_to_text(struct printbuf *out,
+ struct bch_replicas_cpu *r)
{
- char *out = buf, *end = out + size;
- struct bch_replicas_cpu_entry *e;
+ struct bch_replicas_entry_v1 *e;
bool first = true;
- unsigned i;
for_each_cpu_replicas_entry(r, e) {
- bool first_e = true;
-
if (!first)
- out += scnprintf(out, end - out, " ");
+ prt_printf(out, " ");
first = false;
- out += scnprintf(out, end - out, "%u: [", e->data_type);
-
- for (i = 0; i < replicas_dev_slots(r); i++)
- if (replicas_test_dev(e, i)) {
- if (!first_e)
- out += scnprintf(out, end - out, " ");
- first_e = false;
- out += scnprintf(out, end - out, "%u", i);
- }
- out += scnprintf(out, end - out, "]");
+ bch2_replicas_entry_to_text(out, e);
}
+}
+
+static void extent_to_replicas(struct bkey_s_c k,
+ struct bch_replicas_entry_v1 *r)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
- return out - buf;
+ r->nr_required = 1;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.ptr.cached)
+ continue;
+
+ if (!p.has_ec)
+ replicas_entry_add_dev(r, p.ptr.dev);
+ else
+ r->nr_required = 0;
+ }
}
-static inline unsigned bkey_to_replicas(struct bkey_s_c_extent e,
- enum bch_data_type data_type,
- struct bch_replicas_cpu_entry *r,
- unsigned *max_dev)
+static void stripe_to_replicas(struct bkey_s_c k,
+ struct bch_replicas_entry_v1 *r)
{
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
const struct bch_extent_ptr *ptr;
- unsigned nr = 0;
- BUG_ON(!data_type ||
- data_type == BCH_DATA_SB ||
- data_type >= BCH_DATA_NR);
+ r->nr_required = s.v->nr_blocks - s.v->nr_redundant;
- memset(r, 0, sizeof(*r));
- r->data_type = data_type;
+ for (ptr = s.v->ptrs;
+ ptr < s.v->ptrs + s.v->nr_blocks;
+ ptr++)
+ replicas_entry_add_dev(r, ptr->dev);
+}
- *max_dev = 0;
+void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
+ struct bkey_s_c k)
+{
+ e->nr_devs = 0;
+
+ switch (k.k->type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
+ e->data_type = BCH_DATA_btree;
+ extent_to_replicas(k, e);
+ break;
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v:
+ e->data_type = BCH_DATA_user;
+ extent_to_replicas(k, e);
+ break;
+ case KEY_TYPE_stripe:
+ e->data_type = BCH_DATA_parity;
+ stripe_to_replicas(k, e);
+ break;
+ }
- extent_for_each_ptr(e, ptr)
- if (!ptr->cached) {
- *max_dev = max_t(unsigned, *max_dev, ptr->dev);
- replicas_set_dev(r, ptr->dev);
- nr++;
- }
- return nr;
+ bch2_replicas_entry_sort(e);
}
-static inline void devlist_to_replicas(struct bch_devs_list devs,
- enum bch_data_type data_type,
- struct bch_replicas_cpu_entry *r,
- unsigned *max_dev)
+void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
+ enum bch_data_type data_type,
+ struct bch_devs_list devs)
{
- unsigned i;
-
BUG_ON(!data_type ||
- data_type == BCH_DATA_SB ||
+ data_type == BCH_DATA_sb ||
data_type >= BCH_DATA_NR);
- memset(r, 0, sizeof(*r));
- r->data_type = data_type;
+ e->data_type = data_type;
+ e->nr_devs = 0;
+ e->nr_required = 1;
- *max_dev = 0;
+ darray_for_each(devs, i)
+ replicas_entry_add_dev(e, *i);
- for (i = 0; i < devs.nr; i++) {
- *max_dev = max_t(unsigned, *max_dev, devs.devs[i]);
- replicas_set_dev(r, devs.devs[i]);
- }
+ bch2_replicas_entry_sort(e);
+}
+
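+/*
+ * Returns a copy of *old with new_entry appended and the table re-sorted;
+ * on allocation failure the returned .entries is NULL.
+ */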
+static struct bch_replicas_cpu
+cpu_replicas_add_entry(struct bch_fs *c,
+ struct bch_replicas_cpu *old,
+ struct bch_replicas_entry_v1 *new_entry)
+{
+ struct bch_replicas_cpu new = {
+ .nr = old->nr + 1,
+ .entry_size = max_t(unsigned, old->entry_size,
+ replicas_entry_bytes(new_entry)),
+ };
+
+ new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
+ if (!new.entries)
+ return new;
+
+ for (unsigned i = 0; i < old->nr; i++)
+ memcpy(cpu_replicas_entry(&new, i),
+ cpu_replicas_entry(old, i),
+ old->entry_size);
+
+ memcpy(cpu_replicas_entry(&new, old->nr),
+ new_entry,
+ replicas_entry_bytes(new_entry));
+
+ bch2_cpu_replicas_sort(&new);
+ return new;
}
-static struct bch_replicas_cpu *
-cpu_replicas_add_entry(struct bch_replicas_cpu *old,
- struct bch_replicas_cpu_entry new_entry,
- unsigned max_dev)
+static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
+ struct bch_replicas_entry_v1 *search)
{
- struct bch_replicas_cpu *new;
- unsigned i, nr, entry_size;
+ int idx, entry_size = replicas_entry_bytes(search);
- entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
- DIV_ROUND_UP(max_dev + 1, 8);
- entry_size = max(entry_size, old->entry_size);
- nr = old->nr + 1;
+ if (unlikely(entry_size > r->entry_size))
+ return -1;
- new = kzalloc(sizeof(struct bch_replicas_cpu) +
- nr * entry_size, GFP_NOIO);
- if (!new)
- return NULL;
+#define entry_cmp(_l, _r) memcmp(_l, _r, entry_size)
+ idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
+ entry_cmp, search);
+#undef entry_cmp
- new->nr = nr;
- new->entry_size = entry_size;
+ return idx < r->nr ? idx : -1;
+}
- for (i = 0; i < old->nr; i++)
- memcpy(cpu_replicas_entry(new, i),
- cpu_replicas_entry(old, i),
- min(new->entry_size, old->entry_size));
+int bch2_replicas_entry_idx(struct bch_fs *c,
+ struct bch_replicas_entry_v1 *search)
+{
+ bch2_replicas_entry_sort(search);
- memcpy(cpu_replicas_entry(new, old->nr),
- &new_entry,
- new->entry_size);
+ return __replicas_entry_idx(&c->replicas, search);
+}
- bch2_cpu_replicas_sort(new);
- return new;
+static bool __replicas_has_entry(struct bch_replicas_cpu *r,
+ struct bch_replicas_entry_v1 *search)
+{
+ return __replicas_entry_idx(r, search) >= 0;
+}
+
+bool bch2_replicas_marked_locked(struct bch_fs *c,
+ struct bch_replicas_entry_v1 *search)
+{
+ verify_replicas_entry(search);
+
+ return !search->nr_devs ||
+ (__replicas_has_entry(&c->replicas, search) &&
+ (likely((!c->replicas_gc.entries)) ||
+ __replicas_has_entry(&c->replicas_gc, search)));
}
-static bool replicas_has_entry(struct bch_replicas_cpu *r,
- struct bch_replicas_cpu_entry search,
- unsigned max_dev)
+bool bch2_replicas_marked(struct bch_fs *c,
+ struct bch_replicas_entry_v1 *search)
{
- return max_dev < replicas_dev_slots(r) &&
- eytzinger0_find(r->entries, r->nr,
- r->entry_size,
- memcmp, &search) < r->nr;
+ percpu_down_read(&c->mark_lock);
+ bool ret = bch2_replicas_marked_locked(c, search);
+ percpu_up_read(&c->mark_lock);
+
+ return ret;
}
noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
- struct bch_replicas_cpu_entry new_entry,
- unsigned max_dev)
+ struct bch_replicas_entry_v1 *new_entry)
{
- struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL;
- int ret = -ENOMEM;
+ struct bch_replicas_cpu new_r, new_gc;
+ int ret = 0;
+
+ verify_replicas_entry(new_entry);
+
+ memset(&new_r, 0, sizeof(new_r));
+ memset(&new_gc, 0, sizeof(new_gc));
mutex_lock(&c->sb_lock);
- old_gc = rcu_dereference_protected(c->replicas_gc,
- lockdep_is_held(&c->sb_lock));
- if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
- new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
- if (!new_gc)
+ if (c->replicas_gc.entries &&
+ !__replicas_has_entry(&c->replicas_gc, new_entry)) {
+ new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
+ if (!new_gc.entries) {
+ ret = -BCH_ERR_ENOMEM_cpu_replicas;
goto err;
+ }
}
- old_r = rcu_dereference_protected(c->replicas,
- lockdep_is_held(&c->sb_lock));
- if (!replicas_has_entry(old_r, new_entry, max_dev)) {
- new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
- if (!new_r)
+ if (!__replicas_has_entry(&c->replicas, new_entry)) {
+ new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
+ if (!new_r.entries) {
+ ret = -BCH_ERR_ENOMEM_cpu_replicas;
goto err;
+ }
- ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
+ ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
if (ret)
goto err;
}
+ if (!new_r.entries &&
+ !new_gc.entries)
+ goto out;
+
/* allocations done, now commit: */
- if (new_r)
+ if (new_r.entries)
bch2_write_super(c);
/* don't update in memory replicas until changes are persistent */
+ percpu_down_write(&c->mark_lock);
+ if (new_r.entries)
+ swap(c->replicas, new_r);
+ if (new_gc.entries)
+ swap(new_gc, c->replicas_gc);
+ percpu_up_write(&c->mark_lock);
+out:
+ mutex_unlock(&c->sb_lock);
- if (new_gc) {
- rcu_assign_pointer(c->replicas_gc, new_gc);
- kfree_rcu(old_gc, rcu);
- }
-
- if (new_r) {
- rcu_assign_pointer(c->replicas, new_r);
- kfree_rcu(old_r, rcu);
- }
+ kfree(new_r.entries);
+ kfree(new_gc.entries);
- mutex_unlock(&c->sb_lock);
- return 0;
-err:
- mutex_unlock(&c->sb_lock);
- kfree(new_gc);
- kfree(new_r);
return ret;
+err:
+ bch_err_msg(c, ret, "adding replicas entry");
+ goto out;
}
-int bch2_mark_replicas(struct bch_fs *c,
- enum bch_data_type data_type,
- struct bch_devs_list devs)
+int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{
- struct bch_replicas_cpu_entry search;
- struct bch_replicas_cpu *r, *gc_r;
- unsigned max_dev;
- bool marked;
+ return likely(bch2_replicas_marked(c, r))
+ ? 0 : bch2_mark_replicas_slowpath(c, r);
+}
- if (!devs.nr)
- return 0;
+/*
+ * Old replicas_gc mechanism: only used for journal replicas entries now, should
+ * die at some point:
+ */
- BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
+int bch2_replicas_gc_end(struct bch_fs *c, int ret)
+{
+ lockdep_assert_held(&c->replicas_gc_lock);
- devlist_to_replicas(devs, data_type, &search, &max_dev);
+ mutex_lock(&c->sb_lock);
+ percpu_down_write(&c->mark_lock);
- rcu_read_lock();
- r = rcu_dereference(c->replicas);
- gc_r = rcu_dereference(c->replicas_gc);
- marked = replicas_has_entry(r, search, max_dev) &&
- (!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
- rcu_read_unlock();
+ ret = ret ?:
+ bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
+ if (!ret)
+ swap(c->replicas, c->replicas_gc);
- return likely(marked) ? 0
- : bch2_mark_replicas_slowpath(c, search, max_dev);
-}
+ kfree(c->replicas_gc.entries);
+ c->replicas_gc.entries = NULL;
-int bch2_mark_bkey_replicas(struct bch_fs *c,
- enum bch_data_type data_type,
- struct bkey_s_c k)
-{
- struct bch_devs_list cached = bch2_bkey_cached_devs(k);
- unsigned i;
- int ret;
+ percpu_up_write(&c->mark_lock);
- for (i = 0; i < cached.nr; i++)
- if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
- bch2_dev_list_single(cached.devs[i]))))
- return ret;
+ if (!ret)
+ bch2_write_super(c);
- return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k));
+ mutex_unlock(&c->sb_lock);
+
+ return ret;
}
-int bch2_replicas_gc_end(struct bch_fs *c, int ret)
+int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
- struct bch_replicas_cpu *new_r, *old_r;
+ struct bch_replicas_entry_v1 *e;
+ unsigned i = 0;
lockdep_assert_held(&c->replicas_gc_lock);
mutex_lock(&c->sb_lock);
-
- new_r = rcu_dereference_protected(c->replicas_gc,
- lockdep_is_held(&c->sb_lock));
- rcu_assign_pointer(c->replicas_gc, NULL);
-
- if (ret)
- goto err;
-
- if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
- ret = -ENOSPC;
- goto err;
+ BUG_ON(c->replicas_gc.entries);
+
+ c->replicas_gc.nr = 0;
+ c->replicas_gc.entry_size = 0;
+
+ for_each_cpu_replicas_entry(&c->replicas, e) {
+ /* Preserve unknown data types */
+ if (e->data_type >= BCH_DATA_NR ||
+ !((1 << e->data_type) & typemask)) {
+ c->replicas_gc.nr++;
+ c->replicas_gc.entry_size =
+ max_t(unsigned, c->replicas_gc.entry_size,
+ replicas_entry_bytes(e));
+ }
}
- bch2_write_super(c);
-
- /* don't update in memory replicas until changes are persistent */
+ c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
+ c->replicas_gc.entry_size,
+ GFP_KERNEL);
+ if (!c->replicas_gc.entries) {
+ mutex_unlock(&c->sb_lock);
+ bch_err(c, "error allocating c->replicas_gc");
+ return -BCH_ERR_ENOMEM_replicas_gc;
+ }
- old_r = rcu_dereference_protected(c->replicas,
- lockdep_is_held(&c->sb_lock));
+ for_each_cpu_replicas_entry(&c->replicas, e)
+ if (e->data_type >= BCH_DATA_NR ||
+ !((1 << e->data_type) & typemask))
+ memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
+ e, c->replicas_gc.entry_size);
- rcu_assign_pointer(c->replicas, new_r);
- kfree_rcu(old_r, rcu);
-out:
+ bch2_cpu_replicas_sort(&c->replicas_gc);
mutex_unlock(&c->sb_lock);
- return ret;
-err:
- kfree_rcu(new_r, rcu);
- goto out;
+
+ return 0;
}
-int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
+/*
+ * New much simpler mechanism for clearing out unneeded replicas entries - drop
+ * replicas entries that have 0 sectors used.
+ *
+ * However, we don't track sector counts for journal usage, so this doesn't drop
+ * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism
+ * is retained for that.
+ */
+int bch2_replicas_gc2(struct bch_fs *c)
{
- struct bch_replicas_cpu *dst, *src;
- struct bch_replicas_cpu_entry *e;
-
- lockdep_assert_held(&c->replicas_gc_lock);
+ struct bch_replicas_cpu new = { 0 };
+ unsigned nr;
+ int ret = 0;
+
+ bch2_accounting_mem_gc(c);
+retry:
+ nr = READ_ONCE(c->replicas.nr);
+ new.entry_size = READ_ONCE(c->replicas.entry_size);
+ new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL);
+ if (!new.entries) {
+ bch_err(c, "error allocating c->replicas_gc");
+ return -BCH_ERR_ENOMEM_replicas_gc;
+ }
mutex_lock(&c->sb_lock);
- BUG_ON(c->replicas_gc);
+ percpu_down_write(&c->mark_lock);
- src = rcu_dereference_protected(c->replicas,
- lockdep_is_held(&c->sb_lock));
-
- dst = kzalloc(sizeof(struct bch_replicas_cpu) +
- src->nr * src->entry_size, GFP_NOIO);
- if (!dst) {
+ if (nr != c->replicas.nr ||
+ new.entry_size != c->replicas.entry_size) {
+ percpu_up_write(&c->mark_lock);
mutex_unlock(&c->sb_lock);
- return -ENOMEM;
+ kfree(new.entries);
+ goto retry;
+ }
+
+ for (unsigned i = 0; i < c->replicas.nr; i++) {
+ struct bch_replicas_entry_v1 *e =
+ cpu_replicas_entry(&c->replicas, i);
+
+ struct disk_accounting_pos k = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+
+ unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e),
+ "embedded variable length struct");
+
+ struct bpos p = disk_accounting_pos_to_bpos(&k);
+
+ struct bch_accounting_mem *acc = &c->accounting;
+ bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, &p) >= acc->k.nr;
+
+ if (e->data_type == BCH_DATA_journal || !kill)
+ memcpy(cpu_replicas_entry(&new, new.nr++),
+ e, new.entry_size);
}
- dst->nr = 0;
- dst->entry_size = src->entry_size;
+ bch2_cpu_replicas_sort(&new);
- for_each_cpu_replicas_entry(src, e)
- if (!((1 << e->data_type) & typemask))
- memcpy(cpu_replicas_entry(dst, dst->nr++),
- e, dst->entry_size);
+ ret = bch2_cpu_replicas_to_sb_replicas(c, &new);
- bch2_cpu_replicas_sort(dst);
+ if (!ret)
+ swap(c->replicas, new);
+
+ kfree(new.entries);
+
+ percpu_up_write(&c->mark_lock);
+
+ if (!ret)
+ bch2_write_super(c);
- rcu_assign_pointer(c->replicas_gc, dst);
mutex_unlock(&c->sb_lock);
- return 0;
+ return ret;
}
/* Replicas tracking - superblock: */
-static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
- unsigned *nr,
- unsigned *bytes,
- unsigned *max_dev)
+static int
+__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
+ struct bch_replicas_cpu *cpu_r)
{
- struct bch_replicas_entry *i;
- unsigned j;
+ struct bch_replicas_entry_v1 *e, *dst;
+ unsigned nr = 0, entry_size = 0, idx = 0;
+
+ for_each_replicas_entry(sb_r, e) {
+ entry_size = max_t(unsigned, entry_size,
+ replicas_entry_bytes(e));
+ nr++;
+ }
- *nr = 0;
- *bytes = sizeof(*r);
- *max_dev = 0;
+ cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
+ if (!cpu_r->entries)
+ return -BCH_ERR_ENOMEM_cpu_replicas;
- if (!r)
- return;
+ cpu_r->nr = nr;
+ cpu_r->entry_size = entry_size;
- for_each_replicas_entry(r, i) {
- for (j = 0; j < i->nr; j++)
- *max_dev = max_t(unsigned, *max_dev, i->devs[j]);
- (*nr)++;
+ for_each_replicas_entry(sb_r, e) {
+ dst = cpu_replicas_entry(cpu_r, idx++);
+ memcpy(dst, e, replicas_entry_bytes(e));
+ bch2_replicas_entry_sort(dst);
}
- *bytes = (void *) i - (void *) r;
+ return 0;
}
-static struct bch_replicas_cpu *
-__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
+static int
+__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
+ struct bch_replicas_cpu *cpu_r)
{
- struct bch_replicas_cpu *cpu_r;
- unsigned i, nr, bytes, max_dev, entry_size;
+ struct bch_replicas_entry_v0 *e;
+ unsigned nr = 0, entry_size = 0, idx = 0;
- bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
+ for_each_replicas_entry(sb_r, e) {
+ entry_size = max_t(unsigned, entry_size,
+ replicas_entry_bytes(e));
+ nr++;
+ }
- entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
- DIV_ROUND_UP(max_dev + 1, 8);
+ entry_size += sizeof(struct bch_replicas_entry_v1) -
+ sizeof(struct bch_replicas_entry_v0);
- cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
- nr * entry_size, GFP_NOIO);
- if (!cpu_r)
- return NULL;
+ cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
+ if (!cpu_r->entries)
+ return -BCH_ERR_ENOMEM_cpu_replicas;
cpu_r->nr = nr;
cpu_r->entry_size = entry_size;
- if (nr) {
- struct bch_replicas_cpu_entry *dst =
- cpu_replicas_entry(cpu_r, 0);
- struct bch_replicas_entry *src = sb_r->entries;
-
- while (dst < cpu_replicas_entry(cpu_r, nr)) {
- dst->data_type = src->data_type;
- for (i = 0; i < src->nr; i++)
- replicas_set_dev(dst, src->devs[i]);
-
- src = replicas_entry_next(src);
- dst = (void *) dst + entry_size;
- }
+ for_each_replicas_entry(sb_r, e) {
+ struct bch_replicas_entry_v1 *dst =
+ cpu_replicas_entry(cpu_r, idx++);
+
+ dst->data_type = e->data_type;
+ dst->nr_devs = e->nr_devs;
+ dst->nr_required = 1;
+ memcpy(dst->devs, e->devs, e->nr_devs);
+ bch2_replicas_entry_sort(dst);
}
- bch2_cpu_replicas_sort(cpu_r);
- return cpu_r;
+ return 0;
}
int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
- struct bch_sb_field_replicas *sb_r;
- struct bch_replicas_cpu *cpu_r, *old_r;
+ struct bch_sb_field_replicas *sb_v1;
+ struct bch_sb_field_replicas_v0 *sb_v0;
+ struct bch_replicas_cpu new_r = { 0, 0, NULL };
+ int ret = 0;
+
+ if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas)))
+ ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
+ else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0)))
+ ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
+ if (ret)
+ return ret;
+
+ bch2_cpu_replicas_sort(&new_r);
+
+ percpu_down_write(&c->mark_lock);
+ swap(c->replicas, new_r);
+ percpu_up_write(&c->mark_lock);
+
+ kfree(new_r.entries);
+
+ return 0;
+}
+
+static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
+ struct bch_replicas_cpu *r)
+{
+ struct bch_sb_field_replicas_v0 *sb_r;
+ struct bch_replicas_entry_v0 *dst;
+ struct bch_replicas_entry_v1 *src;
+ size_t bytes;
+
+ bytes = sizeof(struct bch_sb_field_replicas);
+
+ for_each_cpu_replicas_entry(r, src)
+ bytes += replicas_entry_bytes(src) - 1;
+
+ sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
+ DIV_ROUND_UP(bytes, sizeof(u64)));
+ if (!sb_r)
+ return -BCH_ERR_ENOSPC_sb_replicas;
+
+ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
+ sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);
- sb_r = bch2_sb_get_replicas(c->disk_sb.sb);
- cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
- if (!cpu_r)
- return -ENOMEM;
+ memset(&sb_r->entries, 0,
+ vstruct_end(&sb_r->field) -
+ (void *) &sb_r->entries);
+
+ dst = sb_r->entries;
+ for_each_cpu_replicas_entry(r, src) {
+ dst->data_type = src->data_type;
+ dst->nr_devs = src->nr_devs;
+ memcpy(dst->devs, src->devs, src->nr_devs);
- old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
- rcu_assign_pointer(c->replicas, cpu_r);
- if (old_r)
- kfree_rcu(old_r, rcu);
+ dst = replicas_entry_next(dst);
+
+ BUG_ON((void *) dst > vstruct_end(&sb_r->field));
+ }
return 0;
}
@@ -422,276 +649,271 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
struct bch_replicas_cpu *r)
{
struct bch_sb_field_replicas *sb_r;
- struct bch_replicas_entry *sb_e;
- struct bch_replicas_cpu_entry *e;
- size_t i, bytes;
+ struct bch_replicas_entry_v1 *dst, *src;
+ bool need_v1 = false;
+ size_t bytes;
bytes = sizeof(struct bch_sb_field_replicas);
- for_each_cpu_replicas_entry(r, e) {
- bytes += sizeof(struct bch_replicas_entry);
- for (i = 0; i < r->entry_size - 1; i++)
- bytes += hweight8(e->devs[i]);
+ for_each_cpu_replicas_entry(r, src) {
+ bytes += replicas_entry_bytes(src);
+ if (src->nr_required != 1)
+ need_v1 = true;
}
- sb_r = bch2_sb_resize_replicas(&c->disk_sb,
- DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
+ if (!need_v1)
+ return bch2_cpu_replicas_to_sb_replicas_v0(c, r);
+
+ sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
+ DIV_ROUND_UP(bytes, sizeof(u64)));
if (!sb_r)
- return -ENOSPC;
+ return -BCH_ERR_ENOSPC_sb_replicas;
+
+ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
+ sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);
memset(&sb_r->entries, 0,
vstruct_end(&sb_r->field) -
(void *) &sb_r->entries);
- sb_e = sb_r->entries;
- for_each_cpu_replicas_entry(r, e) {
- sb_e->data_type = e->data_type;
-
- for (i = 0; i < replicas_dev_slots(r); i++)
- if (replicas_test_dev(e, i))
- sb_e->devs[sb_e->nr++] = i;
+ dst = sb_r->entries;
+ for_each_cpu_replicas_entry(r, src) {
+ memcpy(dst, src, replicas_entry_bytes(src));
- sb_e = replicas_entry_next(sb_e);
+ dst = replicas_entry_next(dst);
- BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
+ BUG_ON((void *) dst > vstruct_end(&sb_r->field));
}
return 0;
}
-static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f)
+static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
+ struct bch_sb *sb,
+ struct printbuf *err)
{
- struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
- struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
- struct bch_replicas_cpu *cpu_r = NULL;
- struct bch_replicas_entry *e;
- const char *err;
unsigned i;
- for_each_replicas_entry(sb_r, e) {
- err = "invalid replicas entry: invalid data type";
- if (e->data_type >= BCH_DATA_NR)
- goto err;
+ sort_r(cpu_r->entries,
+ cpu_r->nr,
+ cpu_r->entry_size,
+ bch2_memcmp, NULL,
+ (void *)(size_t)cpu_r->entry_size);
- err = "invalid replicas entry: no devices";
- if (!e->nr)
- goto err;
+ for (i = 0; i < cpu_r->nr; i++) {
+ struct bch_replicas_entry_v1 *e =
+ cpu_replicas_entry(cpu_r, i);
- err = "invalid replicas entry: too many devices";
- if (e->nr >= BCH_REPLICAS_MAX)
- goto err;
+ int ret = bch2_replicas_entry_sb_validate(e, sb, err);
+ if (ret)
+ return ret;
- err = "invalid replicas entry: invalid device";
- for (i = 0; i < e->nr; i++)
- if (!bch2_dev_exists(sb, mi, e->devs[i]))
- goto err;
- }
+ if (i + 1 < cpu_r->nr) {
+ struct bch_replicas_entry_v1 *n =
+ cpu_replicas_entry(cpu_r, i + 1);
- err = "cannot allocate memory";
- cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
- if (!cpu_r)
- goto err;
+ BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
- sort_cmp_size(cpu_r->entries,
- cpu_r->nr,
- cpu_r->entry_size,
- memcmp, NULL);
+ if (!memcmp(e, n, cpu_r->entry_size)) {
+ prt_printf(err, "duplicate replicas entry ");
+ bch2_replicas_entry_to_text(err, e);
+ return -BCH_ERR_invalid_sb_replicas;
+ }
+ }
+ }
- for (i = 0; i + 1 < cpu_r->nr; i++) {
- struct bch_replicas_cpu_entry *l =
- cpu_replicas_entry(cpu_r, i);
- struct bch_replicas_cpu_entry *r =
- cpu_replicas_entry(cpu_r, i + 1);
+ return 0;
+}
- BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
+static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
+{
+ struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
+ struct bch_replicas_cpu cpu_r;
+ int ret;
- err = "duplicate replicas entry";
- if (!memcmp(l, r, cpu_r->entry_size))
- goto err;
- }
+ ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r);
+ if (ret)
+ return ret;
- err = NULL;
-err:
- kfree(cpu_r);
- return err;
+ ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
+ kfree(cpu_r.entries);
+ return ret;
}
-const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
- .validate = bch2_sb_validate_replicas,
-};
-
-int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t size)
+static void bch2_sb_replicas_to_text(struct printbuf *out,
+ struct bch_sb *sb,
+ struct bch_sb_field *f)
{
- char *out = buf, *end = out + size;
- struct bch_replicas_entry *e;
+ struct bch_sb_field_replicas *r = field_to_type(f, replicas);
+ struct bch_replicas_entry_v1 *e;
bool first = true;
- unsigned i;
-
- if (!r) {
- out += scnprintf(out, end - out, "(no replicas section found)");
- return out - buf;
- }
for_each_replicas_entry(r, e) {
if (!first)
- out += scnprintf(out, end - out, " ");
+ prt_printf(out, " ");
first = false;
- out += scnprintf(out, end - out, "%u: [", e->data_type);
-
- for (i = 0; i < e->nr; i++)
- out += scnprintf(out, end - out,
- i ? " %u" : "%u", e->devs[i]);
- out += scnprintf(out, end - out, "]");
+ bch2_replicas_entry_to_text(out, e);
}
-
- return out - buf;
+ prt_newline(out);
}
-/* Query replicas: */
+const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
+ .validate = bch2_sb_replicas_validate,
+ .to_text = bch2_sb_replicas_to_text,
+};
-bool bch2_replicas_marked(struct bch_fs *c,
- enum bch_data_type data_type,
- struct bch_devs_list devs)
+static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
- struct bch_replicas_cpu_entry search;
- unsigned max_dev;
- bool ret;
-
- if (!devs.nr)
- return true;
-
- devlist_to_replicas(devs, data_type, &search, &max_dev);
+ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
+ struct bch_replicas_cpu cpu_r;
+ int ret;
- rcu_read_lock();
- ret = replicas_has_entry(rcu_dereference(c->replicas),
- search, max_dev);
- rcu_read_unlock();
+ ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r);
+ if (ret)
+ return ret;
+ ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
+ kfree(cpu_r.entries);
return ret;
}
-bool bch2_bkey_replicas_marked(struct bch_fs *c,
- enum bch_data_type data_type,
- struct bkey_s_c k)
+static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
+ struct bch_sb *sb,
+ struct bch_sb_field *f)
{
- struct bch_devs_list cached = bch2_bkey_cached_devs(k);
- unsigned i;
+ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
+ struct bch_replicas_entry_v0 *e;
+ bool first = true;
- for (i = 0; i < cached.nr; i++)
- if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
- bch2_dev_list_single(cached.devs[i])))
- return false;
+ for_each_replicas_entry(sb_r, e) {
+ if (!first)
+ prt_printf(out, " ");
+ first = false;
- return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k));
+ bch2_replicas_entry_v0_to_text(out, e);
+ }
+ prt_newline(out);
}
-struct replicas_status __bch2_replicas_status(struct bch_fs *c,
- struct bch_devs_mask online_devs)
+const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
+ .validate = bch2_sb_replicas_v0_validate,
+ .to_text = bch2_sb_replicas_v0_to_text,
+};
+
+/* Query replicas: */
+
+bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
+ unsigned flags, bool print)
{
- struct bch_sb_field_members *mi;
- struct bch_replicas_cpu_entry *e;
- struct bch_replicas_cpu *r;
- unsigned i, dev, dev_slots, nr_online, nr_offline;
- struct replicas_status ret;
+ struct bch_replicas_entry_v1 *e;
+ bool ret = true;
- memset(&ret, 0, sizeof(ret));
+ percpu_down_read(&c->mark_lock);
+ for_each_cpu_replicas_entry(&c->replicas, e) {
+ unsigned nr_online = 0, nr_failed = 0, dflags = 0;
+ bool metadata = e->data_type < BCH_DATA_user;
- for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
- ret.replicas[i].nr_online = UINT_MAX;
+ if (e->data_type == BCH_DATA_cached)
+ continue;
- mi = bch2_sb_get_members(c->disk_sb.sb);
- rcu_read_lock();
+ rcu_read_lock();
+ for (unsigned i = 0; i < e->nr_devs; i++) {
+ if (e->devs[i] == BCH_SB_MEMBER_INVALID) {
+ nr_failed++;
+ continue;
+ }
- r = rcu_dereference(c->replicas);
- dev_slots = replicas_dev_slots(r);
+ nr_online += test_bit(e->devs[i], devs.d);
- for_each_cpu_replicas_entry(r, e) {
- if (e->data_type >= ARRAY_SIZE(ret.replicas))
- panic("e %p data_type %u\n", e, e->data_type);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
+ nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
+ }
+ rcu_read_unlock();
- nr_online = nr_offline = 0;
+ if (nr_online + nr_failed == e->nr_devs)
+ continue;
- for (dev = 0; dev < dev_slots; dev++) {
- if (!replicas_test_dev(e, dev))
- continue;
+ if (nr_online < e->nr_required)
+ dflags |= metadata
+ ? BCH_FORCE_IF_METADATA_LOST
+ : BCH_FORCE_IF_DATA_LOST;
- BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, dev));
+ if (nr_online < e->nr_devs)
+ dflags |= metadata
+ ? BCH_FORCE_IF_METADATA_DEGRADED
+ : BCH_FORCE_IF_DATA_DEGRADED;
- if (test_bit(dev, online_devs.d))
- nr_online++;
- else
- nr_offline++;
- }
+ if (dflags & ~flags) {
+ if (print) {
+ struct printbuf buf = PRINTBUF;
- ret.replicas[e->data_type].nr_online =
- min(ret.replicas[e->data_type].nr_online,
- nr_online);
+ bch2_replicas_entry_to_text(&buf, e);
+ bch_err(c, "insufficient devices online (%u) for replicas entry %s",
+ nr_online, buf.buf);
+ printbuf_exit(&buf);
+ }
+ ret = false;
+ break;
+ }
- ret.replicas[e->data_type].nr_offline =
- max(ret.replicas[e->data_type].nr_offline,
- nr_offline);
}
-
- rcu_read_unlock();
+ percpu_up_read(&c->mark_lock);
return ret;
}
-struct replicas_status bch2_replicas_status(struct bch_fs *c)
+unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
{
- return __bch2_replicas_status(c, bch2_online_devs(c));
-}
+ struct bch_sb_field_replicas *replicas;
+ struct bch_sb_field_replicas_v0 *replicas_v0;
+ unsigned data_has = 0;
-static bool have_enough_devs(struct replicas_status s,
- enum bch_data_type type,
- bool force_if_degraded,
- bool force_if_lost)
-{
- return (!s.replicas[type].nr_offline || force_if_degraded) &&
- (s.replicas[type].nr_online || force_if_lost);
-}
+ replicas = bch2_sb_field_get(sb, replicas);
+ replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
-bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
-{
- return (have_enough_devs(s, BCH_DATA_JOURNAL,
- flags & BCH_FORCE_IF_METADATA_DEGRADED,
- flags & BCH_FORCE_IF_METADATA_LOST) &&
- have_enough_devs(s, BCH_DATA_BTREE,
- flags & BCH_FORCE_IF_METADATA_DEGRADED,
- flags & BCH_FORCE_IF_METADATA_LOST) &&
- have_enough_devs(s, BCH_DATA_USER,
- flags & BCH_FORCE_IF_DATA_DEGRADED,
- flags & BCH_FORCE_IF_DATA_LOST));
-}
+ if (replicas) {
+ struct bch_replicas_entry_v1 *r;
-unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
-{
- struct replicas_status s = bch2_replicas_status(c);
+ for_each_replicas_entry(replicas, r) {
+ if (r->data_type >= sizeof(data_has) * 8)
+ continue;
- return meta
- ? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
- s.replicas[BCH_DATA_BTREE].nr_online)
- : s.replicas[BCH_DATA_USER].nr_online;
-}
+ for (unsigned i = 0; i < r->nr_devs; i++)
+ if (r->devs[i] == dev)
+ data_has |= 1 << r->data_type;
+ }
-unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bch_replicas_cpu_entry *e;
- struct bch_replicas_cpu *r;
- unsigned ret = 0;
+ } else if (replicas_v0) {
+ struct bch_replicas_entry_v0 *r;
- rcu_read_lock();
- r = rcu_dereference(c->replicas);
+ for_each_replicas_entry_v0(replicas_v0, r) {
+ if (r->data_type >= sizeof(data_has) * 8)
+ continue;
- if (ca->dev_idx >= replicas_dev_slots(r))
- goto out;
+ for (unsigned i = 0; i < r->nr_devs; i++)
+ if (r->devs[i] == dev)
+ data_has |= 1 << r->data_type;
+ }
+ }
- for_each_cpu_replicas_entry(r, e)
- if (replicas_test_dev(e, ca->dev_idx))
- ret |= 1 << e->data_type;
-out:
- rcu_read_unlock();
+
+ return data_has;
+}
+
+unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
+{
+ mutex_lock(&c->sb_lock);
+ unsigned ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
+ mutex_unlock(&c->sb_lock);
return ret;
}
+
+void bch2_fs_replicas_exit(struct bch_fs *c)
+{
+ kfree(c->replicas.entries);
+ kfree(c->replicas_gc.entries);
+}
diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h
index 49f114b0..5aba2c1c 100644
--- a/libbcachefs/replicas.h
+++ b/libbcachefs/replicas.h
@@ -1,51 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_REPLICAS_H
#define _BCACHEFS_REPLICAS_H
-bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
- struct bch_devs_list);
-bool bch2_bkey_replicas_marked(struct bch_fs *, enum bch_data_type,
- struct bkey_s_c);
-int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
- struct bch_devs_list);
-int bch2_mark_bkey_replicas(struct bch_fs *, enum bch_data_type,
- struct bkey_s_c);
-
-int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t);
-int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *, char *, size_t);
-
-struct replicas_status {
- struct {
- unsigned nr_online;
- unsigned nr_offline;
- } replicas[BCH_DATA_NR];
-};
-
-struct replicas_status __bch2_replicas_status(struct bch_fs *,
- struct bch_devs_mask);
-struct replicas_status bch2_replicas_status(struct bch_fs *);
-bool bch2_have_enough_devs(struct replicas_status, unsigned);
-
-unsigned bch2_replicas_online(struct bch_fs *, bool);
+#include "bkey.h"
+#include "eytzinger.h"
+#include "replicas_types.h"
+
+void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
+void bch2_replicas_entry_to_text(struct printbuf *,
+ struct bch_replicas_entry_v1 *);
+int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *,
+ struct bch_fs *, struct printbuf *);
+void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
+
+static inline struct bch_replicas_entry_v1 *
+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
+{
+ return (void *) r->entries + r->entry_size * i;
+}
+
+int bch2_replicas_entry_idx(struct bch_fs *,
+ struct bch_replicas_entry_v1 *);
+
+void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *,
+ enum bch_data_type,
+ struct bch_devs_list);
+
+bool bch2_replicas_marked_locked(struct bch_fs *,
+ struct bch_replicas_entry_v1 *);
+bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *);
+int bch2_mark_replicas(struct bch_fs *,
+ struct bch_replicas_entry_v1 *);
+
+void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c);
+
+static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e,
+ unsigned dev)
+{
+ e->data_type = BCH_DATA_cached;
+ e->nr_devs = 1;
+ e->nr_required = 1;
+ e->devs[0] = dev;
+}
+
+bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask,
+ unsigned, bool);
+
+unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned);
unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
int bch2_replicas_gc_end(struct bch_fs *, int);
int bch2_replicas_gc_start(struct bch_fs *, unsigned);
+int bch2_replicas_gc2(struct bch_fs *);
+
+#define for_each_cpu_replicas_entry(_r, _i) \
+ for (_i = (_r)->entries; \
+ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
+ _i = (void *) (_i) + (_r)->entry_size)
/* iterate over superblock replicas - used by userspace tools: */
-static inline struct bch_replicas_entry *
-replicas_entry_next(struct bch_replicas_entry *i)
-{
- return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
-}
+#define replicas_entry_next(_i) \
+ ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i)))
#define for_each_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
(_i) = replicas_entry_next(_i))
+#define for_each_replicas_entry_v0(_r, _i) \
+ for (_i = (_r)->entries; \
+ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
+ (_i) = replicas_entry_next(_i))
+
int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
extern const struct bch_sb_field_ops bch_sb_field_ops_replicas;
+extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0;
+
+void bch2_fs_replicas_exit(struct bch_fs *);
#endif /* _BCACHEFS_REPLICAS_H */
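
cpu_replicas_entry() above indexes the in-memory replicas table (struct bch_replicas_cpu, added in replicas_types.h below) by plain pointer arithmetic: every entry is padded out to one entry_size, so slot i starts at entries + entry_size * i, and a uniform stride is what lets the table be kept sorted and searched efficiently (note the eytzinger.h include). A minimal sketch of that fixed-stride layout, with made-up names:

    /* Sketch: a flat table of entries all padded to one stride. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct cpu_table {
        unsigned nr;
        unsigned entry_size;    /* every slot padded to this size */
        unsigned char *entries;
    };

    static void *table_entry(struct cpu_table *t, unsigned i)
    {
        return t->entries + (size_t) t->entry_size * i;
    }

    int main(void)
    {
        struct cpu_table t = { .nr = 3, .entry_size = 8 };

        t.entries = calloc(t.nr, t.entry_size);
        strcpy(table_entry(&t, 1), "btree");    /* fits in one 8-byte slot */

        for (unsigned i = 0; i < t.nr; i++)
            printf("slot %u: \"%s\"\n", i, (char *) table_entry(&t, i));

        free(t.entries);
        return 0;
    }
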
diff --git a/libbcachefs/replicas_format.h b/libbcachefs/replicas_format.h
new file mode 100644
index 00000000..b7eff904
--- /dev/null
+++ b/libbcachefs/replicas_format.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REPLICAS_FORMAT_H
+#define _BCACHEFS_REPLICAS_FORMAT_H
+
+struct bch_replicas_entry_v0 {
+ __u8 data_type;
+ __u8 nr_devs;
+ __u8 devs[] __counted_by(nr_devs);
+} __packed;
+
+struct bch_sb_field_replicas_v0 {
+ struct bch_sb_field field;
+ struct bch_replicas_entry_v0 entries[];
+} __packed __aligned(8);
+
+struct bch_replicas_entry_v1 {
+ __u8 data_type;
+ __u8 nr_devs;
+ __u8 nr_required;
+ __u8 devs[] __counted_by(nr_devs);
+} __packed;
+
+struct bch_sb_field_replicas {
+ struct bch_sb_field field;
+ struct bch_replicas_entry_v1 entries[];
+} __packed __aligned(8);
+
+#define replicas_entry_bytes(_i) \
+ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
+
+#define replicas_entry_add_dev(e, d) ({ \
+ (e)->nr_devs++; \
+ (e)->devs[(e)->nr_devs - 1] = (d); \
+})
+
+#endif /* _BCACHEFS_REPLICAS_FORMAT_H */
diff --git a/libbcachefs/replicas_types.h b/libbcachefs/replicas_types.h
new file mode 100644
index 00000000..fed71c86
--- /dev/null
+++ b/libbcachefs/replicas_types.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REPLICAS_TYPES_H
+#define _BCACHEFS_REPLICAS_TYPES_H
+
+struct bch_replicas_cpu {
+ unsigned nr;
+ unsigned entry_size;
+ struct bch_replicas_entry_v1 *entries;
+};
+
+#endif /* _BCACHEFS_REPLICAS_TYPES_H */
diff --git a/libbcachefs/sb-clean.c b/libbcachefs/sb-clean.c
new file mode 100644
index 00000000..00527528
--- /dev/null
+++ b/libbcachefs/sb-clean.c
@@ -0,0 +1,336 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "error.h"
+#include "journal_io.h"
+#include "replicas.h"
+#include "sb-clean.h"
+#include "super-io.h"
+
+/*
+ * BCH_SB_FIELD_clean:
+ *
+ * Btree roots, and a few other things, are recovered from the journal after an
+ * unclean shutdown - but after a clean shutdown, to avoid having to read the
+ * journal, we can store them in the superblock.
+ *
+ * bch_sb_field_clean simply contains a list of journal entries, stored exactly
+ * as they would be in the journal:
+ */
+
+int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean,
+ int write)
+{
+ struct jset_entry *entry;
+ int ret;
+
+ for (entry = clean->start;
+ entry < (struct jset_entry *) vstruct_end(&clean->field);
+ entry = vstruct_next(entry)) {
+ if (vstruct_end(entry) > vstruct_end(&clean->field)) {
+ bch_err(c, "journal entry (u64s %u) overran end of superblock clean section (u64s %u) by %zu",
+ le16_to_cpu(entry->u64s), le32_to_cpu(clean->field.u64s),
+ (u64 *) vstruct_end(entry) - (u64 *) vstruct_end(&clean->field));
+ bch2_sb_error_count(c, BCH_FSCK_ERR_sb_clean_entry_overrun);
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
+
+ ret = bch2_journal_entry_validate(c, NULL, entry,
+ le16_to_cpu(c->disk_sb.sb->version),
+ BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
+ write);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static struct bkey_i *btree_root_find(struct bch_fs *c,
+ struct bch_sb_field_clean *clean,
+ struct jset *j,
+ enum btree_id id, unsigned *level)
+{
+ struct bkey_i *k;
+ struct jset_entry *entry, *start, *end;
+
+ if (clean) {
+ start = clean->start;
+ end = vstruct_end(&clean->field);
+ } else {
+ start = j->start;
+ end = vstruct_last(j);
+ }
+
+ for (entry = start; entry < end; entry = vstruct_next(entry))
+ if (entry->type == BCH_JSET_ENTRY_btree_root &&
+ entry->btree_id == id)
+ goto found;
+
+ return NULL;
+found:
+ if (!entry->u64s)
+ return ERR_PTR(-EINVAL);
+
+ k = entry->start;
+ *level = entry->level;
+ return k;
+}
+
+int bch2_verify_superblock_clean(struct bch_fs *c,
+ struct bch_sb_field_clean **cleanp,
+ struct jset *j)
+{
+ unsigned i;
+ struct bch_sb_field_clean *clean = *cleanp;
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+ int ret = 0;
+
+ if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
+ sb_clean_journal_seq_mismatch,
+ "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
+ le64_to_cpu(clean->journal_seq),
+ le64_to_cpu(j->seq))) {
+ kfree(clean);
+ *cleanp = NULL;
+ return 0;
+ }
+
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ struct bkey_i *k1, *k2;
+ unsigned l1 = 0, l2 = 0;
+
+ k1 = btree_root_find(c, clean, NULL, i, &l1);
+ k2 = btree_root_find(c, NULL, j, i, &l2);
+
+ if (!k1 && !k2)
+ continue;
+
+ printbuf_reset(&buf1);
+ printbuf_reset(&buf2);
+
+ if (k1)
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1));
+ else
+ prt_printf(&buf1, "(none)");
+
+ if (k2)
+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2));
+ else
+ prt_printf(&buf2, "(none)");
+
+ mustfix_fsck_err_on(!k1 || !k2 ||
+ IS_ERR(k1) ||
+ IS_ERR(k2) ||
+ k1->k.u64s != k2->k.u64s ||
+ memcmp(k1, k2, bkey_bytes(&k1->k)) ||
+ l1 != l2, c,
+ sb_clean_btree_root_mismatch,
+ "superblock btree root %u doesn't match journal after clean shutdown\n"
+ "sb: l=%u %s\n"
+ "journal: l=%u %s\n", i,
+ l1, buf1.buf,
+ l2, buf2.buf);
+ }
+fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ return ret;
+}
+
+struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c)
+{
+ struct bch_sb_field_clean *clean, *sb_clean;
+ int ret;
+
+ mutex_lock(&c->sb_lock);
+ sb_clean = bch2_sb_field_get(c->disk_sb.sb, clean);
+
+ if (fsck_err_on(!sb_clean, c,
+ sb_clean_missing,
+ "superblock marked clean but clean section not present")) {
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+ c->sb.clean = false;
+ mutex_unlock(&c->sb_lock);
+ return ERR_PTR(-BCH_ERR_invalid_sb_clean);
+ }
+
+ clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
+ GFP_KERNEL);
+ if (!clean) {
+ mutex_unlock(&c->sb_lock);
+ return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean);
+ }
+
+ ret = bch2_sb_clean_validate_late(c, clean, READ);
+ if (ret) {
+ kfree(clean);
+ mutex_unlock(&c->sb_lock);
+ return ERR_PTR(ret);
+ }
+
+ mutex_unlock(&c->sb_lock);
+
+ return clean;
+fsck_err:
+ mutex_unlock(&c->sb_lock);
+ return ERR_PTR(ret);
+}
+
+void bch2_journal_super_entries_add_common(struct bch_fs *c,
+ struct jset_entry **end,
+ u64 journal_seq)
+{
+ {
+ struct jset_entry_usage *u =
+ container_of(jset_entry_init(end, sizeof(*u)),
+ struct jset_entry_usage, entry);
+
+ u->entry.type = BCH_JSET_ENTRY_usage;
+ u->entry.btree_id = BCH_FS_USAGE_key_version;
+ u->v = cpu_to_le64(atomic64_read(&c->key_version));
+ }
+
+ for (unsigned i = 0; i < 2; i++) {
+ struct jset_entry_clock *clock =
+ container_of(jset_entry_init(end, sizeof(*clock)),
+ struct jset_entry_clock, entry);
+
+ clock->entry.type = BCH_JSET_ENTRY_clock;
+ clock->rw = i;
+ clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now));
+ }
+}
+
+static int bch2_sb_clean_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
+{
+ struct bch_sb_field_clean *clean = field_to_type(f, clean);
+
+ if (vstruct_bytes(&clean->field) < sizeof(*clean)) {
+ prt_printf(err, "wrong size (got %zu should be %zu)",
+ vstruct_bytes(&clean->field), sizeof(*clean));
+ return -BCH_ERR_invalid_sb_clean;
+ }
+
+ for (struct jset_entry *entry = clean->start;
+ entry != vstruct_end(&clean->field);
+ entry = vstruct_next(entry)) {
+ if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) {
+ prt_str(err, "entry type ");
+ bch2_prt_jset_entry_type(err, entry->type);
+ prt_str(err, " overruns end of section");
+ return -BCH_ERR_invalid_sb_clean;
+ }
+ }
+
+ return 0;
+}
+
+static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_clean *clean = field_to_type(f, clean);
+ struct jset_entry *entry;
+
+ prt_printf(out, "flags: %x\n", le32_to_cpu(clean->flags));
+ prt_printf(out, "journal_seq: %llu\n", le64_to_cpu(clean->journal_seq));
+
+ for (entry = clean->start;
+ entry != vstruct_end(&clean->field);
+ entry = vstruct_next(entry)) {
+ if ((void *) vstruct_next(entry) > vstruct_end(&clean->field))
+ break;
+
+ if (entry->type == BCH_JSET_ENTRY_btree_keys &&
+ !entry->u64s)
+ continue;
+
+ bch2_journal_entry_to_text(out, NULL, entry);
+ prt_newline(out);
+ }
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_clean = {
+ .validate = bch2_sb_clean_validate,
+ .to_text = bch2_sb_clean_to_text,
+};
+
+int bch2_fs_mark_dirty(struct bch_fs *c)
+{
+ int ret;
+
+ /*
+ * Unconditionally write superblock, to verify it hasn't changed before
+ * we go rw:
+ */
+
+ mutex_lock(&c->sb_lock);
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS);
+
+ ret = bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ return ret;
+}
+
+void bch2_fs_mark_clean(struct bch_fs *c)
+{
+ struct bch_sb_field_clean *sb_clean;
+ struct jset_entry *entry;
+ unsigned u64s;
+ int ret;
+
+ mutex_lock(&c->sb_lock);
+ if (BCH_SB_CLEAN(c->disk_sb.sb))
+ goto out;
+
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
+
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata);
+ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates));
+ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled));
+
+ u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
+
+ sb_clean = bch2_sb_field_resize(&c->disk_sb, clean, u64s);
+ if (!sb_clean) {
+ bch_err(c, "error resizing superblock while setting filesystem clean");
+ goto out;
+ }
+
+ sb_clean->flags = 0;
+ sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq));
+
+ /* Trying to catch outstanding bug: */
+ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
+
+ entry = sb_clean->start;
+ bch2_journal_super_entries_add_common(c, &entry, 0);
+ entry = bch2_btree_roots_to_journal_entries(c, entry, 0);
+ BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
+
+ memset(entry, 0,
+ vstruct_end(&sb_clean->field) - (void *) entry);
+
+ /*
+ * this should be in the write path, and we should be validating every
+ * superblock section:
+ */
+ ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE);
+ if (ret) {
+ bch_err(c, "error writing marking filesystem clean: validate error");
+ goto out;
+ }
+
+ bch2_journal_pos_from_member_info_set(c);
+
+ bch2_write_super(c);
+out:
+ mutex_unlock(&c->sb_lock);
+}
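
As the comment at the top of sb-clean.c explains, the clean section is a list of journal entries stored verbatim in the superblock, and bch2_sb_clean_validate_late() walks them by their u64s length while checking that none overruns the section. A simplified, self-contained version of that walk (struct record and record_next() are stand-ins, not the real jset_entry/vstruct helpers):

    /*
     * Sketch: records sized in 8-byte units, walked back to back with an
     * overrun check, the way the clean-section entries are validated.
     */
    #include <stdint.h>
    #include <stdio.h>

    struct record {
        uint16_t u64s;          /* length of data[], in 8-byte units */
        uint16_t type;
        uint32_t pad;
        uint64_t data[];
    };

    #define record_next(r) \
        ((struct record *) ((uint64_t *) ((r) + 1) + (r)->u64s))

    static int validate(struct record *r, void *end)
    {
        while ((void *) r < end) {
            if ((void *) record_next(r) > end) {
                fprintf(stderr, "record overruns section\n");
                return -1;
            }
            printf("record type %u, %u u64s\n", r->type, r->u64s);
            r = record_next(r);
        }
        return 0;
    }

    int main(void)
    {
        uint64_t buf[8] = { 0 };
        struct record *r = (void *) buf;

        r->u64s = 1;
        r->type = 7;
        /* the next record starts right after the single u64 of payload */
        record_next(r)->u64s = 0;
        record_next(r)->type = 9;

        return validate(r, buf + 3);
    }
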
diff --git a/libbcachefs/sb-clean.h b/libbcachefs/sb-clean.h
new file mode 100644
index 00000000..71caef28
--- /dev/null
+++ b/libbcachefs/sb-clean.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_CLEAN_H
+#define _BCACHEFS_SB_CLEAN_H
+
+int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int);
+int bch2_verify_superblock_clean(struct bch_fs *, struct bch_sb_field_clean **,
+ struct jset *);
+struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *);
+void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_clean;
+
+int bch2_fs_mark_dirty(struct bch_fs *);
+void bch2_fs_mark_clean(struct bch_fs *);
+
+#endif /* _BCACHEFS_SB_CLEAN_H */
diff --git a/libbcachefs/sb-counters.c b/libbcachefs/sb-counters.c
new file mode 100644
index 00000000..6992e746
--- /dev/null
+++ b/libbcachefs/sb-counters.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "super-io.h"
+#include "sb-counters.h"
+
+/* BCH_SB_FIELD_counters */
+
+static const char * const bch2_counter_names[] = {
+#define x(t, n, ...) (#t),
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ NULL
+};
+
+static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs)
+{
+ if (!ctrs)
+ return 0;
+
+ return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0];
+};
+
+static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
+{
+ return 0;
+};
+
+static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_counters *ctrs = field_to_type(f, counters);
+ unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+
+ for (unsigned i = 0; i < nr; i++)
+ prt_printf(out, "%s \t%llu\n",
+ i < BCH_COUNTER_NR ? bch2_counter_names[i] : "(unknown)",
+ le64_to_cpu(ctrs->d[i]));
+};
+
+int bch2_sb_counters_to_cpu(struct bch_fs *c)
+{
+ struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
+ unsigned int i;
+ unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+ u64 val = 0;
+
+ for (i = 0; i < BCH_COUNTER_NR; i++)
+ c->counters_on_mount[i] = 0;
+
+ for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) {
+ val = le64_to_cpu(ctrs->d[i]);
+ percpu_u64_set(&c->counters[i], val);
+ c->counters_on_mount[i] = val;
+ }
+ return 0;
+};
+
+int bch2_sb_counters_from_cpu(struct bch_fs *c)
+{
+ struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
+ struct bch_sb_field_counters *ret;
+ unsigned int i;
+ unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+
+ if (nr < BCH_COUNTER_NR) {
+ ret = bch2_sb_field_resize(&c->disk_sb, counters,
+ sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
+
+ if (ret) {
+ ctrs = ret;
+ nr = bch2_sb_counter_nr_entries(ctrs);
+ }
+ }
+
+
+ for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++)
+ ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i]));
+ return 0;
+}
+
+void bch2_fs_counters_exit(struct bch_fs *c)
+{
+ free_percpu(c->counters);
+}
+
+int bch2_fs_counters_init(struct bch_fs *c)
+{
+ c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64));
+ if (!c->counters)
+ return -BCH_ERR_ENOMEM_fs_counters_init;
+
+ return bch2_sb_counters_to_cpu(c);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_counters = {
+ .validate = bch2_sb_counters_validate,
+ .to_text = bch2_sb_counters_to_text,
+};
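
bch2_sb_counters_to_cpu()/from_cpu() treat the counters section as a bare array of __le64: how many counters are present on disk is derived from the section size, and only min(on-disk count, BCH_COUNTER_NR) entries are copied, so mounting with an older or newer tools version degrades gracefully. A sketch of that size-derived, clamped copy (endian conversion omitted; names and sizes are illustrative):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define KNOWN_COUNTERS 4

    /* number of u64 entries after the field header */
    static unsigned field_nr_entries(size_t field_bytes, size_t header_bytes)
    {
        return (field_bytes - header_bytes) / sizeof(uint64_t);
    }

    int main(void)
    {
        /* pretend the on-disk field carries 6 counters after an 8-byte header */
        uint64_t on_disk[1 + 6] = { 0, 10, 20, 30, 40, 50, 60 };
        uint64_t cpu[KNOWN_COUNTERS] = { 0 };

        unsigned nr = field_nr_entries(sizeof(on_disk), sizeof(uint64_t));
        unsigned copy = nr < KNOWN_COUNTERS ? nr : KNOWN_COUNTERS;

        memcpy(cpu, on_disk + 1, copy * sizeof(uint64_t));

        for (unsigned i = 0; i < KNOWN_COUNTERS; i++)
            printf("counter %u = %llu\n", i, (unsigned long long) cpu[i]);
        return 0;
    }
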
diff --git a/libbcachefs/sb-counters.h b/libbcachefs/sb-counters.h
new file mode 100644
index 00000000..81f8aec9
--- /dev/null
+++ b/libbcachefs/sb-counters.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_COUNTERS_H
+#define _BCACHEFS_SB_COUNTERS_H
+
+#include "bcachefs.h"
+#include "super-io.h"
+
+int bch2_sb_counters_to_cpu(struct bch_fs *);
+int bch2_sb_counters_from_cpu(struct bch_fs *);
+
+void bch2_fs_counters_exit(struct bch_fs *);
+int bch2_fs_counters_init(struct bch_fs *);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
+
+#endif // _BCACHEFS_SB_COUNTERS_H
diff --git a/libbcachefs/sb-counters_format.h b/libbcachefs/sb-counters_format.h
new file mode 100644
index 00000000..62ea4782
--- /dev/null
+++ b/libbcachefs/sb-counters_format.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H
+#define _BCACHEFS_SB_COUNTERS_FORMAT_H
+
+#define BCH_PERSISTENT_COUNTERS() \
+ x(io_read, 0) \
+ x(io_write, 1) \
+ x(io_move, 2) \
+ x(bucket_invalidate, 3) \
+ x(bucket_discard, 4) \
+ x(bucket_alloc, 5) \
+ x(bucket_alloc_fail, 6) \
+ x(btree_cache_scan, 7) \
+ x(btree_cache_reap, 8) \
+ x(btree_cache_cannibalize, 9) \
+ x(btree_cache_cannibalize_lock, 10) \
+ x(btree_cache_cannibalize_lock_fail, 11) \
+ x(btree_cache_cannibalize_unlock, 12) \
+ x(btree_node_write, 13) \
+ x(btree_node_read, 14) \
+ x(btree_node_compact, 15) \
+ x(btree_node_merge, 16) \
+ x(btree_node_split, 17) \
+ x(btree_node_rewrite, 18) \
+ x(btree_node_alloc, 19) \
+ x(btree_node_free, 20) \
+ x(btree_node_set_root, 21) \
+ x(btree_path_relock_fail, 22) \
+ x(btree_path_upgrade_fail, 23) \
+ x(btree_reserve_get_fail, 24) \
+ x(journal_entry_full, 25) \
+ x(journal_full, 26) \
+ x(journal_reclaim_finish, 27) \
+ x(journal_reclaim_start, 28) \
+ x(journal_write, 29) \
+ x(read_promote, 30) \
+ x(read_bounce, 31) \
+ x(read_split, 33) \
+ x(read_retry, 32) \
+ x(read_reuse_race, 34) \
+ x(move_extent_read, 35) \
+ x(move_extent_write, 36) \
+ x(move_extent_finish, 37) \
+ x(move_extent_fail, 38) \
+ x(move_extent_start_fail, 39) \
+ x(copygc, 40) \
+ x(copygc_wait, 41) \
+ x(gc_gens_end, 42) \
+ x(gc_gens_start, 43) \
+ x(trans_blocked_journal_reclaim, 44) \
+ x(trans_restart_btree_node_reused, 45) \
+ x(trans_restart_btree_node_split, 46) \
+ x(trans_restart_fault_inject, 47) \
+ x(trans_restart_iter_upgrade, 48) \
+ x(trans_restart_journal_preres_get, 49) \
+ x(trans_restart_journal_reclaim, 50) \
+ x(trans_restart_journal_res_get, 51) \
+ x(trans_restart_key_cache_key_realloced, 52) \
+ x(trans_restart_key_cache_raced, 53) \
+ x(trans_restart_mark_replicas, 54) \
+ x(trans_restart_mem_realloced, 55) \
+ x(trans_restart_memory_allocation_failure, 56) \
+ x(trans_restart_relock, 57) \
+ x(trans_restart_relock_after_fill, 58) \
+ x(trans_restart_relock_key_cache_fill, 59) \
+ x(trans_restart_relock_next_node, 60) \
+ x(trans_restart_relock_parent_for_fill, 61) \
+ x(trans_restart_relock_path, 62) \
+ x(trans_restart_relock_path_intent, 63) \
+ x(trans_restart_too_many_iters, 64) \
+ x(trans_restart_traverse, 65) \
+ x(trans_restart_upgrade, 66) \
+ x(trans_restart_would_deadlock, 67) \
+ x(trans_restart_would_deadlock_write, 68) \
+ x(trans_restart_injected, 69) \
+ x(trans_restart_key_cache_upgrade, 70) \
+ x(trans_traverse_all, 71) \
+ x(transaction_commit, 72) \
+ x(write_super, 73) \
+ x(trans_restart_would_deadlock_recursion_limit, 74) \
+ x(trans_restart_write_buffer_flush, 75) \
+ x(trans_restart_split_race, 76) \
+ x(write_buffer_flush_slowpath, 77) \
+ x(write_buffer_flush_sync, 78)
+
+enum bch_persistent_counters {
+#define x(t, n, ...) BCH_COUNTER_##t,
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ BCH_COUNTER_NR
+};
+
+struct bch_sb_field_counters {
+ struct bch_sb_field field;
+ __le64 d[];
+};
+
+#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */
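
BCH_PERSISTENT_COUNTERS() is an x-macro: the same list expands once into enum bch_persistent_counters here and once into the bch2_counter_names[] string table in sb-counters.c, so the two can never drift apart; BCH_SB_ERRS() further down uses the same trick for error ids and strings. A minimal, self-contained example of the pattern, with a made-up COUNTERS() list:

    #include <stdio.h>

    #define COUNTERS()      \
        x(io_read,  0)      \
        x(io_write, 1)      \
        x(io_move,  2)

    enum counter {
    #define x(t, n) COUNTER_##t = n,
        COUNTERS()
    #undef x
        COUNTER_NR
    };

    static const char * const counter_names[] = {
    #define x(t, n) [n] = #t,
        COUNTERS()
    #undef x
    };

    int main(void)
    {
        for (unsigned i = 0; i < COUNTER_NR; i++)
            printf("%u: %s\n", i, counter_names[i]);
        return 0;
    }
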
diff --git a/libbcachefs/sb-downgrade.c b/libbcachefs/sb-downgrade.c
new file mode 100644
index 00000000..fe453e17
--- /dev/null
+++ b/libbcachefs/sb-downgrade.c
@@ -0,0 +1,438 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Superblock section that contains a list of recovery passes to run when
+ * downgrading past a given version
+ */
+
+#include "bcachefs.h"
+#include "darray.h"
+#include "recovery_passes.h"
+#include "sb-downgrade.h"
+#include "sb-errors.h"
+#include "super-io.h"
+
+#define RECOVERY_PASS_ALL_FSCK BIT_ULL(63)
+
+/*
+ * Upgrade, downgrade tables - run certain recovery passes, fix certain errors
+ *
+ * x(version, recovery_passes, errors...)
+ */
+#define UPGRADE_TABLE() \
+ x(backpointers, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(inode_v3, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(unwritten_extents, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(bucket_gens, \
+ BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(lru_v2, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(fragmentation_lru, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(no_bps_in_alloc_keys, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(snapshot_trees, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(snapshot_skiplists, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_snapshots), \
+ BCH_FSCK_ERR_snapshot_bad_depth, \
+ BCH_FSCK_ERR_snapshot_bad_skiplist) \
+ x(deleted_inodes, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
+ BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \
+ x(rebalance_work, \
+ BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \
+ x(subvolume_fs_parent, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \
+ BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \
+ x(btree_subvolume_children, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \
+ BCH_FSCK_ERR_subvol_children_not_set) \
+ x(mi_btree_bitmap, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_btree_bitmap_not_marked) \
+ x(disk_accounting_v2, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_bkey_version_in_future, \
+ BCH_FSCK_ERR_dev_usage_buckets_wrong, \
+ BCH_FSCK_ERR_dev_usage_sectors_wrong, \
+ BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
+ BCH_FSCK_ERR_accounting_mismatch) \
+ x(disk_accounting_v3, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_bkey_version_in_future, \
+ BCH_FSCK_ERR_dev_usage_buckets_wrong, \
+ BCH_FSCK_ERR_dev_usage_sectors_wrong, \
+ BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
+ BCH_FSCK_ERR_accounting_mismatch, \
+ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
+ BCH_FSCK_ERR_accounting_key_replicas_nr_required_bad, \
+ BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \
+ BCH_FSCK_ERR_accounting_key_junk_at_end) \
+ x(disk_accounting_inum, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_accounting_mismatch) \
+ x(rebalance_work_acct_fix, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_accounting_mismatch) \
+ x(inode_has_child_snapshots, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
+ BCH_FSCK_ERR_inode_has_child_snapshots_wrong) \
+ x(backpointer_bucket_gen, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_backpointers_to_extents)|\
+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
+ BCH_FSCK_ERR_backpointer_to_missing_ptr, \
+ BCH_FSCK_ERR_ptr_to_missing_backpointer) \
+ x(disk_accounting_big_endian, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_accounting_mismatch, \
+ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
+ BCH_FSCK_ERR_accounting_key_junk_at_end)
+
+#define DOWNGRADE_TABLE() \
+ x(bucket_stripe_sectors, \
+ 0) \
+ x(disk_accounting_v2, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_dev_usage_buckets_wrong, \
+ BCH_FSCK_ERR_dev_usage_sectors_wrong, \
+ BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
+ BCH_FSCK_ERR_fs_usage_hidden_wrong, \
+ BCH_FSCK_ERR_fs_usage_btree_wrong, \
+ BCH_FSCK_ERR_fs_usage_data_wrong, \
+ BCH_FSCK_ERR_fs_usage_cached_wrong, \
+ BCH_FSCK_ERR_fs_usage_reserved_wrong, \
+ BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \
+ BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \
+ BCH_FSCK_ERR_fs_usage_replicas_wrong, \
+ BCH_FSCK_ERR_bkey_version_in_future) \
+ x(disk_accounting_v3, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_dev_usage_buckets_wrong, \
+ BCH_FSCK_ERR_dev_usage_sectors_wrong, \
+ BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
+ BCH_FSCK_ERR_fs_usage_hidden_wrong, \
+ BCH_FSCK_ERR_fs_usage_btree_wrong, \
+ BCH_FSCK_ERR_fs_usage_data_wrong, \
+ BCH_FSCK_ERR_fs_usage_cached_wrong, \
+ BCH_FSCK_ERR_fs_usage_reserved_wrong, \
+ BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \
+ BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \
+ BCH_FSCK_ERR_fs_usage_replicas_wrong, \
+ BCH_FSCK_ERR_accounting_replicas_not_marked, \
+ BCH_FSCK_ERR_bkey_version_in_future) \
+ x(rebalance_work_acct_fix, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_accounting_mismatch, \
+ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
+ BCH_FSCK_ERR_accounting_key_junk_at_end) \
+ x(backpointer_bucket_gen, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
+ BCH_FSCK_ERR_backpointer_bucket_offset_wrong, \
+ BCH_FSCK_ERR_backpointer_to_missing_ptr, \
+ BCH_FSCK_ERR_ptr_to_missing_backpointer) \
+ x(disk_accounting_big_endian, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_accounting_mismatch, \
+ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
+ BCH_FSCK_ERR_accounting_key_junk_at_end)
+
+struct upgrade_downgrade_entry {
+ u64 recovery_passes;
+ u16 version;
+ u16 nr_errors;
+ const u16 *errors;
+};
+
+#define x(ver, passes, ...) static const u16 upgrade_##ver##_errors[] = { __VA_ARGS__ };
+UPGRADE_TABLE()
+#undef x
+
+static const struct upgrade_downgrade_entry upgrade_table[] = {
+#define x(ver, passes, ...) { \
+ .recovery_passes = passes, \
+ .version = bcachefs_metadata_version_##ver,\
+ .nr_errors = ARRAY_SIZE(upgrade_##ver##_errors), \
+ .errors = upgrade_##ver##_errors, \
+},
+UPGRADE_TABLE()
+#undef x
+};
+
+static int have_stripes(struct bch_fs *c)
+{
+ if (IS_ERR_OR_NULL(c->btree_roots_known[BTREE_ID_stripes].b))
+ return 0;
+
+ return !btree_node_fake(c->btree_roots_known[BTREE_ID_stripes].b);
+}
+
+int bch2_sb_set_upgrade_extra(struct bch_fs *c)
+{
+ unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
+ unsigned new_version = c->sb.version;
+ bool write_sb = false;
+ int ret = 0;
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ if (old_version < bcachefs_metadata_version_bucket_stripe_sectors &&
+ new_version >= bcachefs_metadata_version_bucket_stripe_sectors &&
+ (ret = have_stripes(c) > 0)) {
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_sectors_wrong, ext->errors_silent);
+ write_sb = true;
+ }
+
+ if (write_sb)
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ return ret < 0 ? ret : 0;
+}
+
+void bch2_sb_set_upgrade(struct bch_fs *c,
+ unsigned old_version,
+ unsigned new_version)
+{
+ lockdep_assert_held(&c->sb_lock);
+
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ for (const struct upgrade_downgrade_entry *i = upgrade_table;
+ i < upgrade_table + ARRAY_SIZE(upgrade_table);
+ i++)
+ if (i->version > old_version && i->version <= new_version) {
+ u64 passes = i->recovery_passes;
+
+ if (passes & RECOVERY_PASS_ALL_FSCK)
+ passes |= bch2_fsck_recovery_passes();
+ passes &= ~RECOVERY_PASS_ALL_FSCK;
+
+ ext->recovery_passes_required[0] |=
+ cpu_to_le64(bch2_recovery_passes_to_stable(passes));
+
+ for (const u16 *e = i->errors; e < i->errors + i->nr_errors; e++)
+ __set_bit_le64(*e, ext->errors_silent);
+ }
+}
+
+#define x(ver, passes, ...) static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ };
+DOWNGRADE_TABLE()
+#undef x
+
+static const struct upgrade_downgrade_entry downgrade_table[] = {
+#define x(ver, passes, ...) { \
+ .recovery_passes = passes, \
+ .version = bcachefs_metadata_version_##ver,\
+ .nr_errors = ARRAY_SIZE(downgrade_##ver##_errors), \
+ .errors = downgrade_##ver##_errors, \
+},
+DOWNGRADE_TABLE()
+#undef x
+};
+
+static int downgrade_table_extra(struct bch_fs *c, darray_char *table)
+{
+ struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table);
+ unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
+ int ret = 0;
+
+ unsigned nr_errors = le16_to_cpu(dst->nr_errors);
+
+ switch (le16_to_cpu(dst->version)) {
+ case bcachefs_metadata_version_bucket_stripe_sectors:
+ if (have_stripes(c)) {
+ bytes += sizeof(dst->errors[0]) * 2;
+
+ ret = darray_make_room(table, bytes);
+ if (ret)
+ return ret;
+
+ /* open coded __set_bit_le64, as dst is packed and
+ * dst->recovery_passes is misaligned */
+ unsigned b = BCH_RECOVERY_PASS_STABLE_check_allocations;
+ dst->recovery_passes[b / 64] |= cpu_to_le64(BIT_ULL(b % 64));
+
+ dst->errors[nr_errors++] = cpu_to_le16(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong);
+ }
+ break;
+ }
+
+ dst->nr_errors = cpu_to_le16(nr_errors);
+ return ret;
+}
+
+static inline const struct bch_sb_field_downgrade_entry *
+downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e)
+{
+ return (void *) &e->errors[le16_to_cpu(e->nr_errors)];
+}
+
+#define for_each_downgrade_entry(_d, _i) \
+ for (const struct bch_sb_field_downgrade_entry *_i = (_d)->entries; \
+ (void *) _i < vstruct_end(&(_d)->field) && \
+ (void *) &_i->errors[0] <= vstruct_end(&(_d)->field) && \
+ (void *) downgrade_entry_next_c(_i) <= vstruct_end(&(_d)->field); \
+ _i = downgrade_entry_next_c(_i))
+
+static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
+{
+ struct bch_sb_field_downgrade *e = field_to_type(f, downgrade);
+
+ for (const struct bch_sb_field_downgrade_entry *i = e->entries;
+ (void *) i < vstruct_end(&e->field);
+ i = downgrade_entry_next_c(i)) {
+ /*
+ * Careful: sb_field_downgrade_entry is only 2 byte aligned, but
+ * section sizes are 8 byte aligned - an empty entry spanning
+ * the end of the section is allowed (and ignored):
+ */
+ if ((void *) &i->errors[0] > vstruct_end(&e->field))
+ break;
+
+ if (flags & BCH_VALIDATE_write &&
+ (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field)) {
+ prt_printf(err, "downgrade entry overruns end of superblock section");
+ return -BCH_ERR_invalid_sb_downgrade;
+ }
+
+ if (BCH_VERSION_MAJOR(le16_to_cpu(i->version)) !=
+ BCH_VERSION_MAJOR(le16_to_cpu(sb->version))) {
+ prt_printf(err, "downgrade entry with mismatched major version (%u != %u)",
+ BCH_VERSION_MAJOR(le16_to_cpu(i->version)),
+ BCH_VERSION_MAJOR(le16_to_cpu(sb->version)));
+ return -BCH_ERR_invalid_sb_downgrade;
+ }
+ }
+
+ return 0;
+}
+
+static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_downgrade *e = field_to_type(f, downgrade);
+
+ if (out->nr_tabstops <= 1)
+ printbuf_tabstop_push(out, 16);
+
+ for_each_downgrade_entry(e, i) {
+ prt_str(out, "version:\t");
+ bch2_version_to_text(out, le16_to_cpu(i->version));
+ prt_newline(out);
+
+ prt_str(out, "recovery passes:\t");
+ prt_bitflags(out, bch2_recovery_passes,
+ bch2_recovery_passes_from_stable(le64_to_cpu(i->recovery_passes[0])));
+ prt_newline(out);
+
+ prt_str(out, "errors:\t");
+ bool first = true;
+ for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
+ if (!first)
+ prt_char(out, ',');
+ first = false;
+ bch2_sb_error_id_to_text(out, le16_to_cpu(i->errors[j]));
+ }
+ prt_newline(out);
+ }
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_downgrade = {
+ .validate = bch2_sb_downgrade_validate,
+ .to_text = bch2_sb_downgrade_to_text,
+};
+
+int bch2_sb_downgrade_update(struct bch_fs *c)
+{
+ if (!test_bit(BCH_FS_btree_running, &c->flags))
+ return 0;
+
+ darray_char table = {};
+ int ret = 0;
+
+ for (const struct upgrade_downgrade_entry *src = downgrade_table;
+ src < downgrade_table + ARRAY_SIZE(downgrade_table);
+ src++) {
+ if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
+ continue;
+
+ struct bch_sb_field_downgrade_entry *dst;
+ unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * src->nr_errors;
+
+ ret = darray_make_room(&table, bytes);
+ if (ret)
+ goto out;
+
+ dst = (void *) &darray_top(table);
+ dst->version = cpu_to_le16(src->version);
+ dst->recovery_passes[0] = cpu_to_le64(bch2_recovery_passes_to_stable(src->recovery_passes));
+ dst->recovery_passes[1] = 0;
+ dst->nr_errors = cpu_to_le16(src->nr_errors);
+ for (unsigned i = 0; i < src->nr_errors; i++)
+ dst->errors[i] = cpu_to_le16(src->errors[i]);
+
+ ret = downgrade_table_extra(c, &table);
+ if (ret)
+ goto out;
+
+ if (!dst->recovery_passes[0] &&
+ !dst->recovery_passes[1] &&
+ !dst->nr_errors)
+ continue;
+
+ table.nr += sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
+ }
+
+ struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade);
+
+ unsigned sb_u64s = DIV_ROUND_UP(sizeof(*d) + table.nr, sizeof(u64));
+
+ if (d && le32_to_cpu(d->field.u64s) > sb_u64s)
+ goto out;
+
+ d = bch2_sb_field_resize(&c->disk_sb, downgrade, sb_u64s);
+ if (!d) {
+ ret = -BCH_ERR_ENOSPC_sb_downgrade;
+ goto out;
+ }
+
+ memcpy(d->entries, table.data, table.nr);
+ memset_u64s_tail(d->entries, 0, table.nr);
+out:
+ darray_exit(&table);
+ return ret;
+}
+
+void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_minor)
+{
+ struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade);
+ if (!d)
+ return;
+
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ for_each_downgrade_entry(d, i) {
+ unsigned minor = BCH_VERSION_MINOR(le16_to_cpu(i->version));
+ if (new_minor < minor && minor <= old_minor) {
+ ext->recovery_passes_required[0] |= i->recovery_passes[0];
+ ext->recovery_passes_required[1] |= i->recovery_passes[1];
+
+ for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
+ unsigned e = le16_to_cpu(i->errors[j]);
+ if (e < BCH_FSCK_ERR_MAX)
+ __set_bit(e, c->sb.errors_silent);
+ if (e < sizeof(ext->errors_silent) * 8)
+ __set_bit_le64(e, ext->errors_silent);
+ }
+ }
+ }
+}
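
Both bch2_sb_set_upgrade() and bch2_sb_set_downgrade() walk a static table and accumulate work from every entry whose version falls inside the range being crossed: recovery pass bits are ORed together and the listed errors are marked silent. The sketch below shows just the version-range selection and bitmask accumulation; the table contents and names are invented.

    #include <stdint.h>
    #include <stdio.h>

    struct dg_entry {
        unsigned version;
        uint64_t recovery_passes;
    };

    static const struct dg_entry table[] = {
        { .version = 28, .recovery_passes = 1ULL << 3 },
        { .version = 31, .recovery_passes = 1ULL << 5 },
        { .version = 34, .recovery_passes = (1ULL << 5) | (1ULL << 7) },
    };

    static uint64_t passes_for_downgrade(unsigned new_v, unsigned old_v)
    {
        uint64_t passes = 0;

        for (unsigned i = 0; i < sizeof(table) / sizeof(table[0]); i++)
            if (new_v < table[i].version && table[i].version <= old_v)
                passes |= table[i].recovery_passes;
        return passes;
    }

    int main(void)
    {
        /* downgrading from 34 to 30 crosses the 31 and 34 entries */
        printf("passes: %#llx\n",
               (unsigned long long) passes_for_downgrade(30, 34));
        return 0;
    }

Here the result is bits 5 and 7, the union of the passes required by the two entries crossed.
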
diff --git a/libbcachefs/sb-downgrade.h b/libbcachefs/sb-downgrade.h
new file mode 100644
index 00000000..095b7cc9
--- /dev/null
+++ b/libbcachefs/sb-downgrade.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_DOWNGRADE_H
+#define _BCACHEFS_SB_DOWNGRADE_H
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade;
+
+int bch2_sb_downgrade_update(struct bch_fs *);
+void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned);
+int bch2_sb_set_upgrade_extra(struct bch_fs *);
+void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned);
+
+#endif /* _BCACHEFS_SB_DOWNGRADE_H */
diff --git a/libbcachefs/sb-downgrade_format.h b/libbcachefs/sb-downgrade_format.h
new file mode 100644
index 00000000..cffd932b
--- /dev/null
+++ b/libbcachefs/sb-downgrade_format.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_DOWNGRADE_FORMAT_H
+#define _BCACHEFS_SB_DOWNGRADE_FORMAT_H
+
+struct bch_sb_field_downgrade_entry {
+ __le16 version;
+ __le64 recovery_passes[2];
+ __le16 nr_errors;
+ __le16 errors[] __counted_by(nr_errors);
+} __packed __aligned(2);
+
+struct bch_sb_field_downgrade {
+ struct bch_sb_field field;
+ struct bch_sb_field_downgrade_entry entries[];
+};
+
+#endif /* _BCACHEFS_SB_DOWNGRADE_FORMAT_H */
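
bch_sb_field_downgrade_entry is __packed __aligned(2), so the __le64 recovery_passes[] words inside it are not guaranteed to be naturally aligned; that is why downgrade_table_extra() above open-codes the bit set instead of calling __set_bit_le64(). The underlying property of a little-endian bitmap is that bit n always lives at a fixed byte and bit offset regardless of word alignment, as in this byte-wise sketch (the real code still works on __le64 words through the packed member):

    #include <stdint.h>
    #include <stdio.h>

    static void set_bit_le(unsigned nr, uint8_t *bitmap)
    {
        bitmap[nr / 8] |= 1U << (nr % 8);
    }

    int main(void)
    {
        uint8_t bitmap[16] = { 0 };     /* two le64 words worth of bits */

        set_bit_le(3, bitmap);
        set_bit_le(70, bitmap);         /* lands in the second word */

        for (unsigned i = 0; i < sizeof(bitmap); i++)
            printf("%02x%s", bitmap[i], i + 1 == sizeof(bitmap) ? "\n" : " ");
        return 0;
    }
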
diff --git a/libbcachefs/sb-errors.c b/libbcachefs/sb-errors.c
new file mode 100644
index 00000000..013a9688
--- /dev/null
+++ b/libbcachefs/sb-errors.c
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "sb-errors.h"
+#include "super-io.h"
+
+const char * const bch2_sb_error_strs[] = {
+#define x(t, n, ...) [n] = #t,
+ BCH_SB_ERRS()
+#undef x
+};
+
+void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id)
+{
+ if (id < BCH_FSCK_ERR_MAX)
+ prt_str(out, bch2_sb_error_strs[id]);
+ else
+ prt_printf(out, "(unknown error %u)", id);
+}
+
+static inline unsigned bch2_sb_field_errors_nr_entries(struct bch_sb_field_errors *e)
+{
+ return bch2_sb_field_nr_entries(e);
+}
+
+static inline unsigned bch2_sb_field_errors_u64s(unsigned nr)
+{
+ return (sizeof(struct bch_sb_field_errors) +
+ sizeof(struct bch_sb_field_error_entry) * nr) / sizeof(u64);
+}
+
+static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
+{
+ struct bch_sb_field_errors *e = field_to_type(f, errors);
+ unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
+
+ for (i = 0; i < nr; i++) {
+ if (!BCH_SB_ERROR_ENTRY_NR(&e->entries[i])) {
+ prt_printf(err, "entry with count 0 (id ");
+ bch2_sb_error_id_to_text(err, BCH_SB_ERROR_ENTRY_ID(&e->entries[i]));
+ prt_printf(err, ")");
+ return -BCH_ERR_invalid_sb_errors;
+ }
+
+ if (i + 1 < nr &&
+ BCH_SB_ERROR_ENTRY_ID(&e->entries[i]) >=
+ BCH_SB_ERROR_ENTRY_ID(&e->entries[i + 1])) {
+ prt_printf(err, "entries out of order");
+ return -BCH_ERR_invalid_sb_errors;
+ }
+ }
+
+ return 0;
+}
+
+static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_errors *e = field_to_type(f, errors);
+ unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
+
+ if (out->nr_tabstops <= 1)
+ printbuf_tabstop_push(out, 16);
+
+ for (i = 0; i < nr; i++) {
+ bch2_sb_error_id_to_text(out, BCH_SB_ERROR_ENTRY_ID(&e->entries[i]));
+ prt_tab(out);
+ prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i]));
+ prt_tab(out);
+ bch2_prt_datetime(out, le64_to_cpu(e->entries[i].last_error_time));
+ prt_newline(out);
+ }
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_errors = {
+ .validate = bch2_sb_errors_validate,
+ .to_text = bch2_sb_errors_to_text,
+};
+
+void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err)
+{
+ bch_sb_errors_cpu *e = &c->fsck_error_counts;
+ struct bch_sb_error_entry_cpu n = {
+ .id = err,
+ .nr = 1,
+ .last_error_time = ktime_get_real_seconds()
+ };
+ unsigned i;
+
+ mutex_lock(&c->fsck_error_counts_lock);
+ for (i = 0; i < e->nr; i++) {
+ if (err == e->data[i].id) {
+ e->data[i].nr++;
+ e->data[i].last_error_time = n.last_error_time;
+ goto out;
+ }
+ if (err < e->data[i].id)
+ break;
+ }
+
+ if (darray_make_room(e, 1))
+ goto out;
+
+ darray_insert_item(e, i, n);
+out:
+ mutex_unlock(&c->fsck_error_counts_lock);
+}
+
+void bch2_sb_errors_from_cpu(struct bch_fs *c)
+{
+ bch_sb_errors_cpu *src = &c->fsck_error_counts;
+ struct bch_sb_field_errors *dst;
+ unsigned i;
+
+ mutex_lock(&c->fsck_error_counts_lock);
+
+ dst = bch2_sb_field_resize(&c->disk_sb, errors,
+ bch2_sb_field_errors_u64s(src->nr));
+
+ if (!dst)
+ goto err;
+
+ for (i = 0; i < src->nr; i++) {
+ SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id);
+ SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr);
+ dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time);
+ }
+
+err:
+ mutex_unlock(&c->fsck_error_counts_lock);
+}
+
+static int bch2_sb_errors_to_cpu(struct bch_fs *c)
+{
+ struct bch_sb_field_errors *src = bch2_sb_field_get(c->disk_sb.sb, errors);
+ bch_sb_errors_cpu *dst = &c->fsck_error_counts;
+ unsigned i, nr = bch2_sb_field_errors_nr_entries(src);
+ int ret;
+
+ if (!nr)
+ return 0;
+
+ mutex_lock(&c->fsck_error_counts_lock);
+ ret = darray_make_room(dst, nr);
+ if (ret)
+ goto err;
+
+ dst->nr = nr;
+
+ for (i = 0; i < nr; i++) {
+ dst->data[i].id = BCH_SB_ERROR_ENTRY_ID(&src->entries[i]);
+ dst->data[i].nr = BCH_SB_ERROR_ENTRY_NR(&src->entries[i]);
+ dst->data[i].last_error_time = le64_to_cpu(src->entries[i].last_error_time);
+ }
+err:
+ mutex_unlock(&c->fsck_error_counts_lock);
+
+ return ret;
+}
+
+void bch2_fs_sb_errors_exit(struct bch_fs *c)
+{
+ darray_exit(&c->fsck_error_counts);
+}
+
+void bch2_fs_sb_errors_init_early(struct bch_fs *c)
+{
+ mutex_init(&c->fsck_error_counts_lock);
+ darray_init(&c->fsck_error_counts);
+}
+
+int bch2_fs_sb_errors_init(struct bch_fs *c)
+{
+ return bch2_sb_errors_to_cpu(c);
+}
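
bch2_sb_error_count() keeps the in-memory counters (c->fsck_error_counts, a darray) sorted by error id: counting an error either bumps the matching entry or inserts a new one at its sorted position, which lets bch2_sb_errors_from_cpu() write the section out already ordered. A fixed-size array stands in for the darray in this sketch:

    #include <stdio.h>
    #include <string.h>

    struct err_count { unsigned id; unsigned long nr; };

    static struct err_count counts[32];
    static unsigned nr_counts;

    static void count_error(unsigned id)
    {
        unsigned i;

        for (i = 0; i < nr_counts; i++) {
            if (counts[i].id == id) {
                counts[i].nr++;
                return;
            }
            if (id < counts[i].id)
                break;
        }

        if (nr_counts == sizeof(counts) / sizeof(counts[0]))
            return;     /* the real code grows the darray instead */

        memmove(&counts[i + 1], &counts[i], (nr_counts - i) * sizeof(counts[0]));
        counts[i] = (struct err_count) { .id = id, .nr = 1 };
        nr_counts++;
    }

    int main(void)
    {
        count_error(42);
        count_error(7);
        count_error(42);

        for (unsigned i = 0; i < nr_counts; i++)
            printf("err %u seen %lu times\n", counts[i].id, counts[i].nr);
        return 0;
    }
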
diff --git a/libbcachefs/sb-errors.h b/libbcachefs/sb-errors.h
new file mode 100644
index 00000000..b2357b8e
--- /dev/null
+++ b/libbcachefs/sb-errors.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_ERRORS_H
+#define _BCACHEFS_SB_ERRORS_H
+
+#include "sb-errors_types.h"
+
+extern const char * const bch2_sb_error_strs[];
+
+void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_errors;
+
+void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id);
+
+void bch2_sb_errors_from_cpu(struct bch_fs *);
+
+void bch2_fs_sb_errors_exit(struct bch_fs *);
+void bch2_fs_sb_errors_init_early(struct bch_fs *);
+int bch2_fs_sb_errors_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_SB_ERRORS_H */
diff --git a/libbcachefs/sb-errors_format.h b/libbcachefs/sb-errors_format.h
new file mode 100644
index 00000000..3bbda181
--- /dev/null
+++ b/libbcachefs/sb-errors_format.h
@@ -0,0 +1,333 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_ERRORS_FORMAT_H
+#define _BCACHEFS_SB_ERRORS_FORMAT_H
+
+enum bch_fsck_flags {
+ FSCK_CAN_FIX = 1 << 0,
+ FSCK_CAN_IGNORE = 1 << 1,
+ FSCK_NO_RATELIMIT = 1 << 2,
+ FSCK_AUTOFIX = 1 << 3,
+};
+
+#define BCH_SB_ERRS() \
+ x(clean_but_journal_not_empty, 0, 0) \
+ x(dirty_but_no_journal_entries, 1, 0) \
+ x(dirty_but_no_journal_entries_post_drop_nonflushes, 2, 0) \
+ x(sb_clean_journal_seq_mismatch, 3, 0) \
+ x(sb_clean_btree_root_mismatch, 4, 0) \
+ x(sb_clean_missing, 5, 0) \
+ x(jset_unsupported_version, 6, 0) \
+ x(jset_unknown_csum, 7, 0) \
+ x(jset_last_seq_newer_than_seq, 8, 0) \
+ x(jset_past_bucket_end, 9, 0) \
+ x(jset_seq_blacklisted, 10, 0) \
+ x(journal_entries_missing, 11, 0) \
+ x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \
+ x(journal_entry_past_jset_end, 13, 0) \
+ x(journal_entry_replicas_data_mismatch, 14, 0) \
+ x(journal_entry_bkey_u64s_0, 15, 0) \
+ x(journal_entry_bkey_past_end, 16, 0) \
+ x(journal_entry_bkey_bad_format, 17, 0) \
+ x(journal_entry_bkey_invalid, 18, 0) \
+ x(journal_entry_btree_root_bad_size, 19, 0) \
+ x(journal_entry_blacklist_bad_size, 20, 0) \
+ x(journal_entry_blacklist_v2_bad_size, 21, 0) \
+ x(journal_entry_blacklist_v2_start_past_end, 22, 0) \
+ x(journal_entry_usage_bad_size, 23, 0) \
+ x(journal_entry_data_usage_bad_size, 24, 0) \
+ x(journal_entry_clock_bad_size, 25, 0) \
+ x(journal_entry_clock_bad_rw, 26, 0) \
+ x(journal_entry_dev_usage_bad_size, 27, 0) \
+ x(journal_entry_dev_usage_bad_dev, 28, 0) \
+ x(journal_entry_dev_usage_bad_pad, 29, 0) \
+ x(btree_node_unreadable, 30, 0) \
+ x(btree_node_fault_injected, 31, 0) \
+ x(btree_node_bad_magic, 32, 0) \
+ x(btree_node_bad_seq, 33, 0) \
+ x(btree_node_unsupported_version, 34, 0) \
+ x(btree_node_bset_older_than_sb_min, 35, 0) \
+ x(btree_node_bset_newer_than_sb, 36, 0) \
+ x(btree_node_data_missing, 37, 0) \
+ x(btree_node_bset_after_end, 38, 0) \
+ x(btree_node_replicas_sectors_written_mismatch, 39, 0) \
+ x(btree_node_replicas_data_mismatch, 40, 0) \
+ x(bset_unknown_csum, 41, 0) \
+ x(bset_bad_csum, 42, 0) \
+ x(bset_past_end_of_btree_node, 43, 0) \
+ x(bset_wrong_sector_offset, 44, 0) \
+ x(bset_empty, 45, 0) \
+ x(bset_bad_seq, 46, 0) \
+ x(bset_blacklisted_journal_seq, 47, 0) \
+ x(first_bset_blacklisted_journal_seq, 48, 0) \
+ x(btree_node_bad_btree, 49, 0) \
+ x(btree_node_bad_level, 50, 0) \
+ x(btree_node_bad_min_key, 51, 0) \
+ x(btree_node_bad_max_key, 52, 0) \
+ x(btree_node_bad_format, 53, 0) \
+ x(btree_node_bkey_past_bset_end, 54, 0) \
+ x(btree_node_bkey_bad_format, 55, 0) \
+ x(btree_node_bad_bkey, 56, 0) \
+ x(btree_node_bkey_out_of_order, 57, FSCK_AUTOFIX) \
+ x(btree_root_bkey_invalid, 58, FSCK_AUTOFIX) \
+ x(btree_root_read_error, 59, FSCK_AUTOFIX) \
+ x(btree_root_bad_min_key, 60, 0) \
+ x(btree_root_bad_max_key, 61, 0) \
+ x(btree_node_read_error, 62, FSCK_AUTOFIX) \
+ x(btree_node_topology_bad_min_key, 63, FSCK_AUTOFIX) \
+ x(btree_node_topology_bad_max_key, 64, FSCK_AUTOFIX) \
+ x(btree_node_topology_overwritten_by_prev_node, 65, FSCK_AUTOFIX) \
+ x(btree_node_topology_overwritten_by_next_node, 66, FSCK_AUTOFIX) \
+ x(btree_node_topology_interior_node_empty, 67, FSCK_AUTOFIX) \
+ x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \
+ x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \
+ x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \
+ x(fs_usage_cached_wrong, 71, FSCK_AUTOFIX) \
+ x(fs_usage_reserved_wrong, 72, FSCK_AUTOFIX) \
+ x(fs_usage_persistent_reserved_wrong, 73, FSCK_AUTOFIX) \
+ x(fs_usage_nr_inodes_wrong, 74, FSCK_AUTOFIX) \
+ x(fs_usage_replicas_wrong, 75, FSCK_AUTOFIX) \
+ x(dev_usage_buckets_wrong, 76, FSCK_AUTOFIX) \
+ x(dev_usage_sectors_wrong, 77, FSCK_AUTOFIX) \
+ x(dev_usage_fragmented_wrong, 78, FSCK_AUTOFIX) \
+ x(dev_usage_buckets_ec_wrong, 79, FSCK_AUTOFIX) \
+ x(bkey_version_in_future, 80, 0) \
+ x(bkey_u64s_too_small, 81, 0) \
+ x(bkey_invalid_type_for_btree, 82, 0) \
+ x(bkey_extent_size_zero, 83, 0) \
+ x(bkey_extent_size_greater_than_offset, 84, 0) \
+ x(bkey_size_nonzero, 85, 0) \
+ x(bkey_snapshot_nonzero, 86, 0) \
+ x(bkey_snapshot_zero, 87, 0) \
+ x(bkey_at_pos_max, 88, 0) \
+ x(bkey_before_start_of_btree_node, 89, 0) \
+ x(bkey_after_end_of_btree_node, 90, 0) \
+ x(bkey_val_size_nonzero, 91, 0) \
+ x(bkey_val_size_too_small, 92, 0) \
+ x(alloc_v1_val_size_bad, 93, 0) \
+ x(alloc_v2_unpack_error, 94, 0) \
+ x(alloc_v3_unpack_error, 95, 0) \
+ x(alloc_v4_val_size_bad, 96, 0) \
+ x(alloc_v4_backpointers_start_bad, 97, 0) \
+ x(alloc_key_data_type_bad, 98, 0) \
+ x(alloc_key_empty_but_have_data, 99, 0) \
+ x(alloc_key_dirty_sectors_0, 100, 0) \
+ x(alloc_key_data_type_inconsistency, 101, 0) \
+ x(alloc_key_to_missing_dev_bucket, 102, 0) \
+ x(alloc_key_cached_inconsistency, 103, 0) \
+ x(alloc_key_cached_but_read_time_zero, 104, FSCK_AUTOFIX) \
+ x(alloc_key_to_missing_lru_entry, 105, FSCK_AUTOFIX) \
+ x(alloc_key_data_type_wrong, 106, FSCK_AUTOFIX) \
+ x(alloc_key_gen_wrong, 107, FSCK_AUTOFIX) \
+ x(alloc_key_dirty_sectors_wrong, 108, FSCK_AUTOFIX) \
+ x(alloc_key_cached_sectors_wrong, 109, FSCK_AUTOFIX) \
+ x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \
+ x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \
+ x(alloc_key_journal_seq_in_future, 298, FSCK_AUTOFIX) \
+ x(bucket_sector_count_overflow, 112, 0) \
+ x(bucket_metadata_type_mismatch, 113, 0) \
+ x(need_discard_key_wrong, 114, FSCK_AUTOFIX) \
+ x(freespace_key_wrong, 115, FSCK_AUTOFIX) \
+ x(freespace_hole_missing, 116, FSCK_AUTOFIX) \
+ x(bucket_gens_val_size_bad, 117, 0) \
+ x(bucket_gens_key_wrong, 118, FSCK_AUTOFIX) \
+ x(bucket_gens_hole_wrong, 119, FSCK_AUTOFIX) \
+ x(bucket_gens_to_invalid_dev, 120, FSCK_AUTOFIX) \
+ x(bucket_gens_to_invalid_buckets, 121, FSCK_AUTOFIX) \
+ x(bucket_gens_nonzero_for_invalid_buckets, 122, FSCK_AUTOFIX) \
+ x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \
+ x(need_discard_freespace_key_bad, 124, 0) \
+ x(discarding_bucket_not_in_need_discard_btree, 291, 0) \
+ x(backpointer_bucket_offset_wrong, 125, 0) \
+ x(backpointer_level_bad, 294, 0) \
+ x(backpointer_dev_bad, 297, 0) \
+ x(backpointer_to_missing_device, 126, 0) \
+ x(backpointer_to_missing_alloc, 127, 0) \
+ x(backpointer_to_missing_ptr, 128, FSCK_AUTOFIX) \
+ x(lru_entry_at_time_0, 129, FSCK_AUTOFIX) \
+ x(lru_entry_to_invalid_bucket, 130, FSCK_AUTOFIX) \
+ x(lru_entry_bad, 131, FSCK_AUTOFIX) \
+ x(btree_ptr_val_too_big, 132, 0) \
+ x(btree_ptr_v2_val_too_big, 133, 0) \
+ x(btree_ptr_has_non_ptr, 134, 0) \
+ x(extent_ptrs_invalid_entry, 135, 0) \
+ x(extent_ptrs_no_ptrs, 136, 0) \
+ x(extent_ptrs_too_many_ptrs, 137, 0) \
+ x(extent_ptrs_redundant_crc, 138, 0) \
+ x(extent_ptrs_redundant_stripe, 139, 0) \
+ x(extent_ptrs_unwritten, 140, 0) \
+ x(extent_ptrs_written_and_unwritten, 141, 0) \
+ x(ptr_to_invalid_device, 142, 0) \
+ x(ptr_to_duplicate_device, 143, 0) \
+ x(ptr_after_last_bucket, 144, 0) \
+ x(ptr_before_first_bucket, 145, 0) \
+ x(ptr_spans_multiple_buckets, 146, 0) \
+ x(ptr_to_missing_backpointer, 147, FSCK_AUTOFIX) \
+ x(ptr_to_missing_alloc_key, 148, FSCK_AUTOFIX) \
+ x(ptr_to_missing_replicas_entry, 149, FSCK_AUTOFIX) \
+ x(ptr_to_missing_stripe, 150, 0) \
+ x(ptr_to_incorrect_stripe, 151, 0) \
+ x(ptr_gen_newer_than_bucket_gen, 152, 0) \
+ x(ptr_too_stale, 153, 0) \
+ x(stale_dirty_ptr, 154, 0) \
+ x(ptr_bucket_data_type_mismatch, 155, 0) \
+ x(ptr_cached_and_erasure_coded, 156, 0) \
+ x(ptr_crc_uncompressed_size_too_small, 157, 0) \
+ x(ptr_crc_uncompressed_size_too_big, 161, 0) \
+ x(ptr_crc_uncompressed_size_mismatch, 300, 0) \
+ x(ptr_crc_csum_type_unknown, 158, 0) \
+ x(ptr_crc_compression_type_unknown, 159, 0) \
+ x(ptr_crc_redundant, 160, 0) \
+ x(ptr_crc_nonce_mismatch, 162, 0) \
+ x(ptr_stripe_redundant, 163, 0) \
+ x(reservation_key_nr_replicas_invalid, 164, 0) \
+ x(reflink_v_refcount_wrong, 165, 0) \
+ x(reflink_v_pos_bad, 292, 0) \
+ x(reflink_p_to_missing_reflink_v, 166, 0) \
+ x(reflink_refcount_underflow, 293, 0) \
+ x(stripe_pos_bad, 167, 0) \
+ x(stripe_val_size_bad, 168, 0) \
+ x(stripe_csum_granularity_bad, 290, 0) \
+ x(stripe_sector_count_wrong, 169, 0) \
+ x(snapshot_tree_pos_bad, 170, 0) \
+ x(snapshot_tree_to_missing_snapshot, 171, 0) \
+ x(snapshot_tree_to_missing_subvol, 172, 0) \
+ x(snapshot_tree_to_wrong_subvol, 173, 0) \
+ x(snapshot_tree_to_snapshot_subvol, 174, 0) \
+ x(snapshot_pos_bad, 175, 0) \
+ x(snapshot_parent_bad, 176, 0) \
+ x(snapshot_children_not_normalized, 177, 0) \
+ x(snapshot_child_duplicate, 178, 0) \
+ x(snapshot_child_bad, 179, 0) \
+ x(snapshot_skiplist_not_normalized, 180, 0) \
+ x(snapshot_skiplist_bad, 181, 0) \
+ x(snapshot_should_not_have_subvol, 182, 0) \
+ x(snapshot_to_bad_snapshot_tree, 183, FSCK_AUTOFIX) \
+ x(snapshot_bad_depth, 184, 0) \
+ x(snapshot_bad_skiplist, 185, 0) \
+ x(subvol_pos_bad, 186, 0) \
+ x(subvol_not_master_and_not_snapshot, 187, 0) \
+ x(subvol_to_missing_root, 188, 0) \
+ x(subvol_root_wrong_bi_subvol, 189, 0) \
+ x(bkey_in_missing_snapshot, 190, 0) \
+ x(inode_pos_inode_nonzero, 191, 0) \
+ x(inode_pos_blockdev_range, 192, 0) \
+ x(inode_unpack_error, 193, 0) \
+ x(inode_str_hash_invalid, 194, 0) \
+ x(inode_v3_fields_start_bad, 195, 0) \
+ x(inode_snapshot_mismatch, 196, 0) \
+ x(inode_unlinked_but_clean, 197, 0) \
+ x(inode_unlinked_but_nlink_nonzero, 198, 0) \
+ x(inode_unlinked_and_not_open, 281, 0) \
+ x(inode_unlinked_but_has_dirent, 285, 0) \
+ x(inode_checksum_type_invalid, 199, 0) \
+ x(inode_compression_type_invalid, 200, 0) \
+ x(inode_subvol_root_but_not_dir, 201, 0) \
+ x(inode_i_size_dirty_but_clean, 202, FSCK_AUTOFIX) \
+ x(inode_i_sectors_dirty_but_clean, 203, FSCK_AUTOFIX) \
+ x(inode_i_sectors_wrong, 204, FSCK_AUTOFIX) \
+ x(inode_dir_wrong_nlink, 205, FSCK_AUTOFIX) \
+ x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \
+ x(inode_dir_missing_backpointer, 284, FSCK_AUTOFIX) \
+ x(inode_dir_unlinked_but_not_empty, 286, FSCK_AUTOFIX) \
+ x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \
+ x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \
+ x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \
+ x(inode_has_child_snapshots_wrong, 287, 0) \
+ x(inode_unreachable, 210, FSCK_AUTOFIX) \
+ x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \
+ x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \
+ x(deleted_inode_missing, 212, FSCK_AUTOFIX) \
+ x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \
+ x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \
+ x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \
+ x(extent_overlapping, 215, 0) \
+ x(key_in_missing_inode, 216, 0) \
+ x(key_in_wrong_inode_type, 217, 0) \
+ x(extent_past_end_of_inode, 218, 0) \
+ x(dirent_empty_name, 219, 0) \
+ x(dirent_val_too_big, 220, 0) \
+ x(dirent_name_too_long, 221, 0) \
+ x(dirent_name_embedded_nul, 222, 0) \
+ x(dirent_name_dot_or_dotdot, 223, 0) \
+ x(dirent_name_has_slash, 224, 0) \
+ x(dirent_d_type_wrong, 225, 0) \
+ x(inode_bi_parent_wrong, 226, 0) \
+ x(dirent_in_missing_dir_inode, 227, 0) \
+ x(dirent_in_non_dir_inode, 228, 0) \
+ x(dirent_to_missing_inode, 229, 0) \
+ x(dirent_to_missing_subvol, 230, 0) \
+ x(dirent_to_itself, 231, 0) \
+ x(quota_type_invalid, 232, 0) \
+ x(xattr_val_size_too_small, 233, 0) \
+ x(xattr_val_size_too_big, 234, 0) \
+ x(xattr_invalid_type, 235, 0) \
+ x(xattr_name_invalid_chars, 236, 0) \
+ x(xattr_in_missing_inode, 237, 0) \
+ x(root_subvol_missing, 238, 0) \
+ x(root_dir_missing, 239, 0) \
+ x(root_inode_not_dir, 240, 0) \
+ x(dir_loop, 241, 0) \
+ x(hash_table_key_duplicate, 242, 0) \
+ x(hash_table_key_wrong_offset, 243, 0) \
+ x(unlinked_inode_not_on_deleted_list, 244, FSCK_AUTOFIX) \
+ x(reflink_p_front_pad_bad, 245, 0) \
+ x(journal_entry_dup_same_device, 246, 0) \
+ x(inode_bi_subvol_missing, 247, 0) \
+ x(inode_bi_subvol_wrong, 248, 0) \
+ x(inode_points_to_missing_dirent, 249, FSCK_AUTOFIX) \
+ x(inode_points_to_wrong_dirent, 250, FSCK_AUTOFIX) \
+ x(inode_bi_parent_nonzero, 251, 0) \
+ x(dirent_to_missing_parent_subvol, 252, 0) \
+ x(dirent_not_visible_in_parent_subvol, 253, 0) \
+ x(subvol_fs_path_parent_wrong, 254, 0) \
+ x(subvol_root_fs_path_parent_nonzero, 255, 0) \
+ x(subvol_children_not_set, 256, 0) \
+ x(subvol_children_bad, 257, 0) \
+ x(subvol_loop, 258, 0) \
+ x(subvol_unreachable, 259, FSCK_AUTOFIX) \
+ x(btree_node_bkey_bad_u64s, 260, 0) \
+ x(btree_node_topology_empty_interior_node, 261, 0) \
+ x(btree_ptr_v2_min_key_bad, 262, 0) \
+ x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \
+ x(snapshot_node_missing, 264, 0) \
+ x(dup_backpointer_to_bad_csum_extent, 265, 0) \
+ x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \
+ x(sb_clean_entry_overrun, 267, 0) \
+ x(btree_ptr_v2_written_0, 268, 0) \
+ x(subvol_snapshot_bad, 269, 0) \
+ x(subvol_inode_bad, 270, 0) \
+ x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \
+ x(accounting_mismatch, 272, FSCK_AUTOFIX) \
+ x(accounting_replicas_not_marked, 273, 0) \
+ x(accounting_to_invalid_device, 289, 0) \
+ x(invalid_btree_id, 274, 0) \
+ x(alloc_key_io_time_bad, 275, 0) \
+ x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \
+ x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \
+ x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \
+ x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \
+ x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
+ x(accounting_key_version_0, 282, FSCK_AUTOFIX) \
+ x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
+ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \
+ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \
+ x(MAX, 301, 0)
+
+enum bch_sb_error_id {
+#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
+ BCH_SB_ERRS()
+#undef x
+};
+
+struct bch_sb_field_errors {
+ struct bch_sb_field field;
+ struct bch_sb_field_error_entry {
+ __le64 v;
+ __le64 last_error_time;
+ } entries[];
+};
+
+LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16);
+LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64);
+
+#endif /* _BCACHEFS_SB_ERRORS_FORMAT_H */
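
The on-disk error entry packs two values into the single __le64 v: BCH_SB_ERROR_ENTRY_ID takes the low 16 bits for the error id and BCH_SB_ERROR_ENTRY_NR takes the remaining 48 for the count, matching the id:16/nr:48 bitfields of the CPU-side struct in sb-errors_types.h below. Roughly what those generated accessors do, minus the endian conversion (names here are illustrative):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define ID_BITS     16
    #define ID_MASK     ((1ULL << ID_BITS) - 1)

    static uint64_t entry_id(uint64_t v)    { return v & ID_MASK; }
    static uint64_t entry_nr(uint64_t v)    { return v >> ID_BITS; }

    static void set_entry(uint64_t *v, uint64_t id, uint64_t nr)
    {
        *v = (id & ID_MASK) | (nr << ID_BITS);
    }

    int main(void)
    {
        uint64_t v;

        set_entry(&v, 244, 12345);      /* error id 244 counted 12345 times */
        printf("id %" PRIu64 ", nr %" PRIu64 "\n", entry_id(v), entry_nr(v));
        return 0;
    }
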
diff --git a/libbcachefs/sb-errors_types.h b/libbcachefs/sb-errors_types.h
new file mode 100644
index 00000000..40325239
--- /dev/null
+++ b/libbcachefs/sb-errors_types.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_ERRORS_TYPES_H
+#define _BCACHEFS_SB_ERRORS_TYPES_H
+
+#include "darray.h"
+
+struct bch_sb_error_entry_cpu {
+ u64 id:16,
+ nr:48;
+ u64 last_error_time;
+};
+
+typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu;
+
+#endif /* _BCACHEFS_SB_ERRORS_TYPES_H */
diff --git a/libbcachefs/sb-members.c b/libbcachefs/sb-members.c
new file mode 100644
index 00000000..116131f9
--- /dev/null
+++ b/libbcachefs/sb-members.c
@@ -0,0 +1,532 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_cache.h"
+#include "disk_groups.h"
+#include "error.h"
+#include "opts.h"
+#include "replicas.h"
+#include "sb-members.h"
+#include "super-io.h"
+
+void bch2_dev_missing(struct bch_fs *c, unsigned dev)
+{
+ if (dev != BCH_SB_MEMBER_INVALID)
+ bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev);
+}
+
+void bch2_dev_bucket_missing(struct bch_fs *c, struct bpos bucket)
+{
+ bch2_fs_inconsistent(c, "pointer to nonexistent bucket %llu:%llu", bucket.inode, bucket.offset);
+}
+
+#define x(t, n, ...) [n] = #t,
+static const char * const bch2_iops_measurements[] = {
+ BCH_IOPS_MEASUREMENTS()
+ NULL
+};
+
+char * const bch2_member_error_strs[] = {
+ BCH_MEMBER_ERROR_TYPES()
+ NULL
+};
+#undef x
+
+/* Code for bch_sb_field_members_v1: */
+
+struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i)
+{
+ return __bch2_members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i);
+}
+
+static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i)
+{
+ struct bch_member ret, *p = __bch2_members_v2_get_mut(mi, i);
+ memset(&ret, 0, sizeof(ret));
+ memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret)));
+ return ret;
+}
+
+static struct bch_member *members_v1_get_mut(struct bch_sb_field_members_v1 *mi, int i)
+{
+ return (void *) mi->_members + (i * BCH_MEMBER_V1_BYTES);
+}
+
+static struct bch_member members_v1_get(struct bch_sb_field_members_v1 *mi, int i)
+{
+ struct bch_member ret, *p = members_v1_get_mut(mi, i);
+ memset(&ret, 0, sizeof(ret));
+ memcpy(&ret, p, min_t(size_t, BCH_MEMBER_V1_BYTES, sizeof(ret)));
+ return ret;
+}
+
+struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i)
+{
+ struct bch_sb_field_members_v2 *mi2 = bch2_sb_field_get(sb, members_v2);
+ if (mi2)
+ return members_v2_get(mi2, i);
+ struct bch_sb_field_members_v1 *mi1 = bch2_sb_field_get(sb, members_v1);
+ return members_v1_get(mi1, i);
+}
+
+static int sb_members_v2_resize_entries(struct bch_fs *c)
+{
+ struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+
+ if (le16_to_cpu(mi->member_bytes) < sizeof(struct bch_member)) {
+ unsigned u64s = DIV_ROUND_UP((sizeof(*mi) + sizeof(mi->_members[0]) *
+ c->disk_sb.sb->nr_devices), 8);
+
+ mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
+ if (!mi)
+ return -BCH_ERR_ENOSPC_sb_members_v2;
+
+ for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) {
+ void *dst = (void *) mi->_members + (i * sizeof(struct bch_member));
+ memmove(dst, __bch2_members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes));
+ memset(dst + le16_to_cpu(mi->member_bytes),
+ 0, (sizeof(struct bch_member) - le16_to_cpu(mi->member_bytes)));
+ }
+ mi->member_bytes = cpu_to_le16(sizeof(struct bch_member));
+ }
+ return 0;
+}
+
+int bch2_sb_members_v2_init(struct bch_fs *c)
+{
+ struct bch_sb_field_members_v1 *mi1;
+ struct bch_sb_field_members_v2 *mi2;
+
+ if (!bch2_sb_field_get(c->disk_sb.sb, members_v2)) {
+ mi2 = bch2_sb_field_resize(&c->disk_sb, members_v2,
+ DIV_ROUND_UP(sizeof(*mi2) +
+ sizeof(struct bch_member) * c->sb.nr_devices,
+ sizeof(u64)));
+ mi1 = bch2_sb_field_get(c->disk_sb.sb, members_v1);
+ memcpy(&mi2->_members[0], &mi1->_members[0],
+ BCH_MEMBER_V1_BYTES * c->sb.nr_devices);
+ memset(&mi2->pad[0], 0, sizeof(mi2->pad));
+ mi2->member_bytes = cpu_to_le16(BCH_MEMBER_V1_BYTES);
+ }
+
+ return sb_members_v2_resize_entries(c);
+}
+
+int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb)
+{
+ struct bch_sb_field_members_v1 *mi1;
+ struct bch_sb_field_members_v2 *mi2;
+
+ mi1 = bch2_sb_field_resize(disk_sb, members_v1,
+ DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES *
+ disk_sb->sb->nr_devices, sizeof(u64)));
+ if (!mi1)
+ return -BCH_ERR_ENOSPC_sb_members;
+
+ mi2 = bch2_sb_field_get(disk_sb->sb, members_v2);
+
+ for (unsigned i = 0; i < disk_sb->sb->nr_devices; i++)
+ memcpy(members_v1_get_mut(mi1, i), __bch2_members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES);
+
+ return 0;
+}
+
+static int validate_member(struct printbuf *err,
+ struct bch_member m,
+ struct bch_sb *sb,
+ int i)
+{
+ if (le64_to_cpu(m.nbuckets) > BCH_MEMBER_NBUCKETS_MAX) {
+ prt_printf(err, "device %u: too many buckets (got %llu, max %u)",
+ i, le64_to_cpu(m.nbuckets), BCH_MEMBER_NBUCKETS_MAX);
+ return -BCH_ERR_invalid_sb_members;
+ }
+
+ if (le64_to_cpu(m.nbuckets) -
+ le16_to_cpu(m.first_bucket) < BCH_MIN_NR_NBUCKETS) {
+ prt_printf(err, "device %u: not enough buckets (got %llu, max %u)",
+ i, le64_to_cpu(m.nbuckets), BCH_MIN_NR_NBUCKETS);
+ return -BCH_ERR_invalid_sb_members;
+ }
+
+ if (le16_to_cpu(m.bucket_size) <
+ le16_to_cpu(sb->block_size)) {
+ prt_printf(err, "device %u: bucket size %u smaller than block size %u",
+ i, le16_to_cpu(m.bucket_size), le16_to_cpu(sb->block_size));
+ return -BCH_ERR_invalid_sb_members;
+ }
+
+ if (le16_to_cpu(m.bucket_size) <
+ BCH_SB_BTREE_NODE_SIZE(sb)) {
+ prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu",
+ i, le16_to_cpu(m.bucket_size), BCH_SB_BTREE_NODE_SIZE(sb));
+ return -BCH_ERR_invalid_sb_members;
+ }
+
+ if (m.btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX) {
+ prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift);
+ return -BCH_ERR_invalid_sb_members;
+ }
+
+ return 0;
+}
+
+static void member_to_text(struct printbuf *out,
+ struct bch_member m,
+ struct bch_sb_field_disk_groups *gi,
+ struct bch_sb *sb,
+ int i)
+{
+ unsigned data_have = bch2_sb_dev_has_data(sb, i);
+ u64 bucket_size = le16_to_cpu(m.bucket_size);
+ u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size;
+
+ if (!bch2_member_alive(&m))
+ return;
+
+ prt_printf(out, "Device:\t%u\n", i);
+
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "Label:\t");
+ if (BCH_MEMBER_GROUP(&m)) {
+ unsigned idx = BCH_MEMBER_GROUP(&m) - 1;
+
+ if (idx < disk_groups_nr(gi))
+ prt_printf(out, "%s (%u)",
+ gi->entries[idx].label, idx);
+ else
+ prt_printf(out, "(bad disk labels section)");
+ } else {
+ prt_printf(out, "(none)");
+ }
+ prt_newline(out);
+
+ prt_printf(out, "UUID:\t");
+ pr_uuid(out, m.uuid.b);
+ prt_newline(out);
+
+ prt_printf(out, "Size:\t");
+ prt_units_u64(out, device_size << 9);
+ prt_newline(out);
+
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
+ prt_printf(out, "%s errors:\t%llu\n", bch2_member_error_strs[i], le64_to_cpu(m.errors[i]));
+
+ for (unsigned i = 0; i < BCH_IOPS_NR; i++)
+ prt_printf(out, "%s iops:\t%u\n", bch2_iops_measurements[i], le32_to_cpu(m.iops[i]));
+
+ prt_printf(out, "Bucket size:\t");
+ prt_units_u64(out, bucket_size << 9);
+ prt_newline(out);
+
+ prt_printf(out, "First bucket:\t%u\n", le16_to_cpu(m.first_bucket));
+ prt_printf(out, "Buckets:\t%llu\n", le64_to_cpu(m.nbuckets));
+
+ prt_printf(out, "Last mount:\t");
+ if (m.last_mount)
+ bch2_prt_datetime(out, le64_to_cpu(m.last_mount));
+ else
+ prt_printf(out, "(never)");
+ prt_newline(out);
+
+ prt_printf(out, "Last superblock write:\t%llu\n", le64_to_cpu(m.seq));
+
+ prt_printf(out, "State:\t%s\n",
+ BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR
+ ? bch2_member_states[BCH_MEMBER_STATE(&m)]
+ : "unknown");
+
+ prt_printf(out, "Data allowed:\t");
+ if (BCH_MEMBER_DATA_ALLOWED(&m))
+ prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
+ else
+ prt_printf(out, "(none)");
+ prt_newline(out);
+
+ prt_printf(out, "Has data:\t");
+ if (data_have)
+ prt_bitflags(out, __bch2_data_types, data_have);
+ else
+ prt_printf(out, "(none)");
+ prt_newline(out);
+
+ prt_printf(out, "Btree allocated bitmap blocksize:\t");
+ if (m.btree_bitmap_shift < 64)
+ prt_units_u64(out, 1ULL << m.btree_bitmap_shift);
+ else
+ prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift);
+ prt_newline(out);
+
+ prt_printf(out, "Btree allocated bitmap:\t");
+ bch2_prt_u64_base2_nbits(out, le64_to_cpu(m.btree_allocated_bitmap), 64);
+ prt_newline(out);
+
+ prt_printf(out, "Durability:\t%llu\n", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1);
+
+ prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m));
+ prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m));
+
+ printbuf_indent_sub(out, 2);
+}
+
+static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
+{
+ struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
+ unsigned i;
+
+ if ((void *) members_v1_get_mut(mi, sb->nr_devices) > vstruct_end(&mi->field)) {
+ prt_printf(err, "too many devices for section size");
+ return -BCH_ERR_invalid_sb_members;
+ }
+
+ for (i = 0; i < sb->nr_devices; i++) {
+ struct bch_member m = members_v1_get(mi, i);
+
+ int ret = validate_member(err, m, sb, i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
+ struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
+ unsigned i;
+
+ for (i = 0; i < sb->nr_devices; i++)
+ member_to_text(out, members_v1_get(mi, i), gi, sb, i);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_members_v1 = {
+ .validate = bch2_sb_members_v1_validate,
+ .to_text = bch2_sb_members_v1_to_text,
+};
+
+static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
+ struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
+ unsigned i;
+
+ for (i = 0; i < sb->nr_devices; i++)
+ member_to_text(out, members_v2_get(mi, i), gi, sb, i);
+}
+
+static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
+{
+ struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
+ size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) -
+ (void *) mi;
+
+ if (mi_bytes > vstruct_bytes(&mi->field)) {
+ prt_printf(err, "section too small (%zu > %zu)",
+ mi_bytes, vstruct_bytes(&mi->field));
+ return -BCH_ERR_invalid_sb_members;
+ }
+
+ for (unsigned i = 0; i < sb->nr_devices; i++) {
+ int ret = validate_member(err, members_v2_get(mi, i), sb, i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = {
+ .validate = bch2_sb_members_v2_validate,
+ .to_text = bch2_sb_members_v2_to_text,
+};
+
+void bch2_sb_members_from_cpu(struct bch_fs *c)
+{
+ struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+
+ rcu_read_lock();
+ for_each_member_device_rcu(c, ca, NULL) {
+ struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx);
+
+ for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++)
+ m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e]));
+ }
+ rcu_read_unlock();
+}
+
+void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+ struct bch_member m;
+
+ mutex_lock(&ca->fs->sb_lock);
+ m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);
+ mutex_unlock(&ca->fs->sb_lock);
+
+ printbuf_tabstop_push(out, 12);
+
+ prt_str(out, "IO errors since filesystem creation");
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
+ prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], atomic64_read(&ca->errors[i]));
+ printbuf_indent_sub(out, 2);
+
+ prt_str(out, "IO errors since ");
+ bch2_pr_time_units(out, (ktime_get_real_seconds() - le64_to_cpu(m.errors_reset_time)) * NSEC_PER_SEC);
+ prt_str(out, " ago");
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
+ prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i],
+ atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i]));
+ printbuf_indent_sub(out, 2);
+}
+
+void bch2_dev_errors_reset(struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+ struct bch_member *m;
+
+ mutex_lock(&c->sb_lock);
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++)
+ m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i]));
+ m->errors_reset_time = cpu_to_le64(ktime_get_real_seconds());
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+}
+
+/*
+ * Per member "range has btree nodes" bitmap:
+ *
+ * This is so that if we ever have to run the btree node scan for repair, we
+ * don't have to scan full devices:
+ */
+
+bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k)
+{
+ bool ret = true;
+ rcu_read_lock();
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ca)
+ continue;
+
+ if (!bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) {
+ ret = false;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev,
+ u64 start, unsigned sectors)
+{
+ struct bch_member *m = __bch2_members_v2_get_mut(mi, dev);
+ u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap);
+
+ u64 end = start + sectors;
+
+ int resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6);
+ if (resize > 0) {
+ u64 new_bitmap = 0;
+
+ for (unsigned i = 0; i < 64; i++)
+ if (bitmap & BIT_ULL(i))
+ new_bitmap |= BIT_ULL(i >> resize);
+ bitmap = new_bitmap;
+ m->btree_bitmap_shift += resize;
+ }
+
+ BUG_ON(m->btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX);
+ BUG_ON(end > 64ULL << m->btree_bitmap_shift);
+
+ for (unsigned bit = start >> m->btree_bitmap_shift;
+ (u64) bit << m->btree_bitmap_shift < end;
+ bit++)
+ bitmap |= BIT_ULL(bit);
+
+ m->btree_allocated_bitmap = cpu_to_le64(bitmap);
+}
+
+void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k)
+{
+ lockdep_assert_held(&c->sb_lock);
+
+ struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
+ if (!bch2_member_exists(c->disk_sb.sb, ptr->dev))
+ continue;
+
+ __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c));
+ }
+}
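A worked sketch of the bitmap math above (illustrative, not part of the patch): with btree_bitmap_shift = 16 each bit covers 1 << 16 = 65536 sectors, i.e. 32 MiB at 512-byte sectors, so a btree node at sectors [200000, 200512) sets only bit 200000 >> 16 = 3. Computing the bits for a range, assuming it already fits under 64ULL << shift, reduces to:

        static u64 btree_bitmap_bits_sketch(unsigned shift, u64 start, unsigned sectors)
        {
                u64 bits = 0, end = start + sectors;

                for (unsigned bit = start >> shift;
                     (u64) bit << shift < end;
                     bit++)
                        bits |= BIT_ULL(bit);
                return bits;
        }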
+
+unsigned bch2_sb_nr_devices(const struct bch_sb *sb)
+{
+ unsigned nr = 0;
+
+ for (unsigned i = 0; i < sb->nr_devices; i++)
+ nr += bch2_member_exists((struct bch_sb *) sb, i);
+ return nr;
+}
+
+int bch2_sb_member_alloc(struct bch_fs *c)
+{
+ unsigned dev_idx = c->sb.nr_devices;
+ struct bch_sb_field_members_v2 *mi;
+ unsigned nr_devices;
+ unsigned u64s;
+ int best = -1;
+ u64 best_last_mount = 0;
+
+ if (dev_idx < BCH_SB_MEMBERS_MAX)
+ goto have_slot;
+
+ for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) {
+ /* eventually BCH_SB_MEMBERS_MAX will be raised */
+ if (dev_idx == BCH_SB_MEMBER_INVALID)
+ continue;
+
+ struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
+ if (bch2_member_alive(&m))
+ continue;
+
+ u64 last_mount = le64_to_cpu(m.last_mount);
+ if (best < 0 || last_mount < best_last_mount) {
+ best = dev_idx;
+ best_last_mount = last_mount;
+ }
+ }
+ if (best >= 0) {
+ dev_idx = best;
+ goto have_slot;
+ }
+
+ return -BCH_ERR_ENOSPC_sb_members;
+have_slot:
+ nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
+
+ mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
+ le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
+
+ mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
+ if (!mi)
+ return -BCH_ERR_ENOSPC_sb_members;
+
+ c->disk_sb.sb->nr_devices = nr_devices;
+ return dev_idx;
+}
diff --git a/libbcachefs/sb-members.h b/libbcachefs/sb-members.h
new file mode 100644
index 00000000..762083b5
--- /dev/null
+++ b/libbcachefs/sb-members.h
@@ -0,0 +1,367 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_MEMBERS_H
+#define _BCACHEFS_SB_MEMBERS_H
+
+#include "darray.h"
+#include "bkey_types.h"
+
+extern char * const bch2_member_error_strs[];
+
+static inline struct bch_member *
+__bch2_members_v2_get_mut(struct bch_sb_field_members_v2 *mi, unsigned i)
+{
+ return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes));
+}
+
+int bch2_sb_members_v2_init(struct bch_fs *c);
+int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb);
+struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i);
+struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i);
+
+static inline bool bch2_dev_is_online(struct bch_dev *ca)
+{
+ return !percpu_ref_is_zero(&ca->io_ref);
+}
+
+static inline bool bch2_dev_is_readable(struct bch_dev *ca)
+{
+ return bch2_dev_is_online(ca) &&
+ ca->mi.state != BCH_MEMBER_STATE_failed;
+}
+
+static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
+{
+ return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
+}
+
+static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
+ unsigned dev)
+{
+ darray_for_each(devs, i)
+ if (*i == dev)
+ return true;
+ return false;
+}
+
+static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
+ unsigned dev)
+{
+ darray_for_each(*devs, i)
+ if (*i == dev) {
+ darray_remove_item(devs, i);
+ return;
+ }
+}
+
+static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
+ unsigned dev)
+{
+ if (!bch2_dev_list_has_dev(*devs, dev)) {
+ BUG_ON(devs->nr >= ARRAY_SIZE(devs->data));
+ devs->data[devs->nr++] = dev;
+ }
+}
+
+static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
+{
+ return (struct bch_devs_list) { .nr = 1, .data[0] = dev };
+}
+
+static inline struct bch_dev *__bch2_next_dev_idx(struct bch_fs *c, unsigned idx,
+ const struct bch_devs_mask *mask)
+{
+ struct bch_dev *ca = NULL;
+
+ while ((idx = mask
+ ? find_next_bit(mask->d, c->sb.nr_devices, idx)
+ : idx) < c->sb.nr_devices &&
+ !(ca = rcu_dereference_check(c->devs[idx],
+ lockdep_is_held(&c->state_lock))))
+ idx++;
+
+ return ca;
+}
+
+static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev *ca,
+ const struct bch_devs_mask *mask)
+{
+ return __bch2_next_dev_idx(c, ca ? ca->dev_idx + 1 : 0, mask);
+}
+
+#define for_each_member_device_rcu(_c, _ca, _mask) \
+ for (struct bch_dev *_ca = NULL; \
+ (_ca = __bch2_next_dev((_c), _ca, (_mask)));)
+
+static inline void bch2_dev_get(struct bch_dev *ca)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ BUG_ON(atomic_long_inc_return(&ca->ref) <= 1L);
+#else
+ percpu_ref_get(&ca->ref);
+#endif
+}
+
+static inline void __bch2_dev_put(struct bch_dev *ca)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ long r = atomic_long_dec_return(&ca->ref);
+ if (r < (long) !ca->dying)
+ panic("bch_dev->ref underflow, last put: %pS\n", (void *) ca->last_put);
+ ca->last_put = _THIS_IP_;
+ if (!r)
+ complete(&ca->ref_completion);
+#else
+ percpu_ref_put(&ca->ref);
+#endif
+}
+
+static inline void bch2_dev_put(struct bch_dev *ca)
+{
+ if (ca)
+ __bch2_dev_put(ca);
+}
+
+static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca)
+{
+ rcu_read_lock();
+ bch2_dev_put(ca);
+ if ((ca = __bch2_next_dev(c, ca, NULL)))
+ bch2_dev_get(ca);
+ rcu_read_unlock();
+
+ return ca;
+}
+
+/*
+ * If you break early, you must drop your ref on the current device
+ */
+#define __for_each_member_device(_c, _ca) \
+ for (; (_ca = bch2_get_next_dev(_c, _ca));)
+
+#define for_each_member_device(_c, _ca) \
+ for (struct bch_dev *_ca = NULL; \
+ (_ca = bch2_get_next_dev(_c, _ca));)
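A hedged usage sketch for the iterator above (not part of the patch): for_each_member_device() takes a ref on each device as it walks, so a caller that leaves the loop early still owns the ref on the current device and must hand it on or put it:

        static struct bch_dev *first_rw_dev_sketch(struct bch_fs *c)
        {
                for_each_member_device(c, ca)
                        if (ca->mi.state == BCH_MEMBER_STATE_rw)
                                return ca;      /* ref held; caller must bch2_dev_put(ca) */
                return NULL;
        }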
+
+static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
+ struct bch_dev *ca,
+ unsigned state_mask)
+{
+ rcu_read_lock();
+ if (ca)
+ percpu_ref_put(&ca->io_ref);
+
+ while ((ca = __bch2_next_dev(c, ca, NULL)) &&
+ (!((1 << ca->mi.state) & state_mask) ||
+ !percpu_ref_tryget(&ca->io_ref)))
+ ;
+ rcu_read_unlock();
+
+ return ca;
+}
+
+#define __for_each_online_member(_c, _ca, state_mask) \
+ for (struct bch_dev *_ca = NULL; \
+ (_ca = bch2_get_next_online_dev(_c, _ca, state_mask));)
+
+#define for_each_online_member(c, ca) \
+ __for_each_online_member(c, ca, ~0)
+
+#define for_each_rw_member(c, ca) \
+ __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw))
+
+#define for_each_readable_member(c, ca) \
+ __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro))
+
+static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev)
+{
+ return dev < c->sb.nr_devices && c->devs[dev];
+}
+
+static inline bool bucket_valid(const struct bch_dev *ca, u64 b)
+{
+ return b - ca->mi.first_bucket < ca->mi.nbuckets_minus_first;
+}
+
+static inline struct bch_dev *bch2_dev_have_ref(const struct bch_fs *c, unsigned dev)
+{
+ EBUG_ON(!bch2_dev_exists(c, dev));
+
+ return rcu_dereference_check(c->devs[dev], 1);
+}
+
+static inline struct bch_dev *bch2_dev_locked(struct bch_fs *c, unsigned dev)
+{
+ EBUG_ON(!bch2_dev_exists(c, dev));
+
+ return rcu_dereference_protected(c->devs[dev],
+ lockdep_is_held(&c->sb_lock) ||
+ lockdep_is_held(&c->state_lock));
+}
+
+static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned dev)
+{
+ return c && dev < c->sb.nr_devices
+ ? rcu_dereference(c->devs[dev])
+ : NULL;
+}
+
+void bch2_dev_missing(struct bch_fs *, unsigned);
+
+static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
+{
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
+ if (unlikely(!ca))
+ bch2_dev_missing(c, dev);
+ return ca;
+}
+
+static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev)
+{
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
+ if (ca)
+ bch2_dev_get(ca);
+ rcu_read_unlock();
+ return ca;
+}
+
+static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev)
+{
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev);
+ if (unlikely(!ca))
+ bch2_dev_missing(c, dev);
+ return ca;
+}
+
+static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket)
+{
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode);
+ if (ca && !bucket_valid(ca, bucket.offset)) {
+ bch2_dev_put(ca);
+ ca = NULL;
+ }
+ return ca;
+}
+
+void bch2_dev_bucket_missing(struct bch_fs *, struct bpos);
+
+static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket)
+{
+ struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, bucket);
+ if (!ca)
+ bch2_dev_bucket_missing(c, bucket);
+ return ca;
+}
+
+static inline struct bch_dev *bch2_dev_iterate_noerror(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx)
+{
+ if (ca && ca->dev_idx == dev_idx)
+ return ca;
+ bch2_dev_put(ca);
+ return bch2_dev_tryget_noerror(c, dev_idx);
+}
+
+static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx)
+{
+ if (ca && ca->dev_idx == dev_idx)
+ return ca;
+ bch2_dev_put(ca);
+ return bch2_dev_tryget(c, dev_idx);
+}
+
+static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw)
+{
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, dev);
+ if (ca && !percpu_ref_tryget(&ca->io_ref))
+ ca = NULL;
+ rcu_read_unlock();
+
+ if (ca &&
+ (ca->mi.state == BCH_MEMBER_STATE_rw ||
+ (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)))
+ return ca;
+
+ if (ca)
+ percpu_ref_put(&ca->io_ref);
+ return NULL;
+}
+
+/* XXX kill, move to struct bch_fs */
+static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
+{
+ struct bch_devs_mask devs;
+
+ memset(&devs, 0, sizeof(devs));
+ for_each_online_member(c, ca)
+ __set_bit(ca->dev_idx, devs.d);
+ return devs;
+}
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1;
+extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2;
+
+static inline bool bch2_member_alive(struct bch_member *m)
+{
+ return !bch2_is_zero(&m->uuid, sizeof(m->uuid));
+}
+
+static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev)
+{
+ if (dev < sb->nr_devices) {
+ struct bch_member m = bch2_sb_member_get(sb, dev);
+ return bch2_member_alive(&m);
+ }
+ return false;
+}
+
+unsigned bch2_sb_nr_devices(const struct bch_sb *);
+
+static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
+{
+ return (struct bch_member_cpu) {
+ .nbuckets = le64_to_cpu(mi->nbuckets),
+ .nbuckets_minus_first = le64_to_cpu(mi->nbuckets) -
+ le16_to_cpu(mi->first_bucket),
+ .first_bucket = le16_to_cpu(mi->first_bucket),
+ .bucket_size = le16_to_cpu(mi->bucket_size),
+ .group = BCH_MEMBER_GROUP(mi),
+ .state = BCH_MEMBER_STATE(mi),
+ .discard = BCH_MEMBER_DISCARD(mi),
+ .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi),
+ .durability = BCH_MEMBER_DURABILITY(mi)
+ ? BCH_MEMBER_DURABILITY(mi) - 1
+ : 1,
+ .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
+ .valid = bch2_member_alive(mi),
+ .btree_bitmap_shift = mi->btree_bitmap_shift,
+ .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap),
+ };
+}
+
+void bch2_sb_members_from_cpu(struct bch_fs *);
+
+void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *);
+void bch2_dev_errors_reset(struct bch_dev *);
+
+static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, unsigned sectors)
+{
+ u64 end = start + sectors;
+
+ if (end > 64ULL << ca->mi.btree_bitmap_shift)
+ return false;
+
+ for (unsigned bit = start >> ca->mi.btree_bitmap_shift;
+ (u64) bit << ca->mi.btree_bitmap_shift < end;
+ bit++)
+ if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit)))
+ return false;
+ return true;
+}
+
+bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
+void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);
+
+int bch2_sb_member_alloc(struct bch_fs *);
+
+#endif /* _BCACHEFS_SB_MEMBERS_H */
diff --git a/libbcachefs/sb-members_format.h b/libbcachefs/sb-members_format.h
new file mode 100644
index 00000000..2adf1221
--- /dev/null
+++ b/libbcachefs/sb-members_format.h
@@ -0,0 +1,121 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_MEMBERS_FORMAT_H
+#define _BCACHEFS_SB_MEMBERS_FORMAT_H
+
+/*
+ * We refer to members with bitmasks in various places - but we need to get rid
+ * of this limit:
+ */
+#define BCH_SB_MEMBERS_MAX 64
+
+/*
+ * Sentinel value - indicates a device that does not exist
+ */
+#define BCH_SB_MEMBER_INVALID 255
+
+#define BCH_MIN_NR_NBUCKETS (1 << 6)
+
+#define BCH_IOPS_MEASUREMENTS() \
+ x(seqread, 0) \
+ x(seqwrite, 1) \
+ x(randread, 2) \
+ x(randwrite, 3)
+
+enum bch_iops_measurement {
+#define x(t, n) BCH_IOPS_##t = n,
+ BCH_IOPS_MEASUREMENTS()
+#undef x
+ BCH_IOPS_NR
+};
+
+#define BCH_MEMBER_ERROR_TYPES() \
+ x(read, 0) \
+ x(write, 1) \
+ x(checksum, 2)
+
+enum bch_member_error_type {
+#define x(t, n) BCH_MEMBER_ERROR_##t = n,
+ BCH_MEMBER_ERROR_TYPES()
+#undef x
+ BCH_MEMBER_ERROR_NR
+};
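The x-macro lists above expand both into enums (as here) and into string tables elsewhere; a minimal sketch of the second expansion, mirroring bch2_member_error_strs in sb-members.c (illustrative, not part of the patch):

        #define x(t, n) [n] = #t,
        static const char * const member_error_strs_sketch[] = {
                BCH_MEMBER_ERROR_TYPES()        /* [0] = "read", [1] = "write", [2] = "checksum" */
                NULL
        };
        #undef x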
+
+struct bch_member {
+ __uuid_t uuid;
+ __le64 nbuckets; /* device size */
+ __le16 first_bucket; /* index of first bucket used */
+ __le16 bucket_size; /* sectors */
+ __u8 btree_bitmap_shift;
+ __u8 pad[3];
+ __le64 last_mount; /* time_t */
+
+ __le64 flags;
+ __le32 iops[4];
+ __le64 errors[BCH_MEMBER_ERROR_NR];
+ __le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
+ __le64 errors_reset_time;
+ __le64 seq;
+ __le64 btree_allocated_bitmap;
+ /*
+ * On recovery from a clean shutdown we don't normally read the journal,
+ * but we still want to resume writing from where we left off so we
+ * don't overwrite more than is necessary, for list journal debugging:
+ */
+ __le32 last_journal_bucket;
+ __le32 last_journal_bucket_offset;
+};
+
+/*
+ * btree_allocated_bitmap must be able to cover any u64 sector address with its
+ * 64 bits, so the maximum shift is 64 - ilog2(64) = 58:
+ */
+#define BCH_MI_BTREE_BITMAP_SHIFT_MAX 58
+
+/*
+ * This limit comes from the bucket_gens array - it's a single allocation, and
+ * kernel allocation are limited to INT_MAX
+ */
+#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64)
+
+#define BCH_MEMBER_V1_BYTES 56
+
+LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4)
+/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
+LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15)
+LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20)
+LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28)
+LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30)
+LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
+ struct bch_member, flags, 30, 31)
+
+#if 0
+LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
+LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
+#endif
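Illustrative only (not part of the patch): LE64_BITMASK() is assumed to generate the usual getter plus SET_ accessor pair, so member flag fields are read and written as below. Note that durability is stored offset by one so that 0 can mean "unset, default to 1", matching bch2_mi_to_cpu():

        static unsigned member_durability_sketch(const struct bch_member *m)
        {
                return BCH_MEMBER_DURABILITY(m)
                        ? BCH_MEMBER_DURABILITY(m) - 1
                        : 1;
        }

        static void member_set_durability_sketch(struct bch_member *m, unsigned durability)
        {
                SET_BCH_MEMBER_DURABILITY(m, durability + 1);
        }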
+
+#define BCH_MEMBER_STATES() \
+ x(rw, 0) \
+ x(ro, 1) \
+ x(failed, 2) \
+ x(spare, 3)
+
+enum bch_member_state {
+#define x(t, n) BCH_MEMBER_STATE_##t = n,
+ BCH_MEMBER_STATES()
+#undef x
+ BCH_MEMBER_STATE_NR
+};
+
+struct bch_sb_field_members_v1 {
+ struct bch_sb_field field;
+ struct bch_member _members[]; // members are now variable size
+};
+
+struct bch_sb_field_members_v2 {
+ struct bch_sb_field field;
+ __le16 member_bytes; // size of a single member entry
+ u8 pad[6];
+ struct bch_member _members[];
+};
+
+#endif /* _BCACHEFS_SB_MEMBERS_FORMAT_H */
diff --git a/libbcachefs/sb-members_types.h b/libbcachefs/sb-members_types.h
new file mode 100644
index 00000000..c0eda888
--- /dev/null
+++ b/libbcachefs/sb-members_types.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_MEMBERS_TYPES_H
+#define _BCACHEFS_SB_MEMBERS_TYPES_H
+
+struct bch_member_cpu {
+ u64 nbuckets; /* device size */
+ u64 nbuckets_minus_first;
+ u16 first_bucket; /* index of first bucket used */
+ u16 bucket_size; /* sectors */
+ u16 group;
+ u8 state;
+ u8 discard;
+ u8 data_allowed;
+ u8 durability;
+ u8 freespace_initialized;
+ u8 valid;
+ u8 btree_bitmap_shift;
+ u64 btree_allocated_bitmap;
+};
+
+#endif /* _BCACHEFS_SB_MEMBERS_TYPES_H */
diff --git a/libbcachefs/seqmutex.h b/libbcachefs/seqmutex.h
new file mode 100644
index 00000000..c4b3d8d3
--- /dev/null
+++ b/libbcachefs/seqmutex.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SEQMUTEX_H
+#define _BCACHEFS_SEQMUTEX_H
+
+#include <linux/mutex.h>
+
+struct seqmutex {
+ struct mutex lock;
+ u32 seq;
+};
+
+#define seqmutex_init(_lock) mutex_init(&(_lock)->lock)
+
+static inline bool seqmutex_trylock(struct seqmutex *lock)
+{
+ return mutex_trylock(&lock->lock);
+}
+
+static inline void seqmutex_lock(struct seqmutex *lock)
+{
+ mutex_lock(&lock->lock);
+ lock->seq++;
+}
+
+static inline u32 seqmutex_unlock(struct seqmutex *lock)
+{
+ u32 seq = lock->seq;
+ mutex_unlock(&lock->lock);
+ return seq;
+}
+
+static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq)
+{
+ if (lock->seq != seq || !mutex_trylock(&lock->lock))
+ return false;
+
+ if (lock->seq != seq) {
+ mutex_unlock(&lock->lock);
+ return false;
+ }
+
+ return true;
+}
+
+#endif /* _BCACHEFS_SEQMUTEX_H */
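A hedged usage sketch of the seqmutex API above (not part of the patch): the sequence number returned by seqmutex_unlock() lets a caller later detect whether anyone else took the mutex in the meantime, so state guarded by it can be revalidated cheaply:

        struct guarded_sketch {
                struct seqmutex lock;
                int             value;
        };

        static bool read_if_unchanged_sketch(struct guarded_sketch *g, u32 seq, int *v)
        {
                if (!seqmutex_relock(&g->lock, seq))
                        return false;           /* mutex was retaken; caller must revalidate */

                *v = g->value;
                seqmutex_unlock(&g->lock);
                return true;
        }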
diff --git a/libbcachefs/siphash.c b/libbcachefs/siphash.c
index 3a6c9c82..a1cc44e6 100644
--- a/libbcachefs/siphash.c
+++ b/libbcachefs/siphash.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: BSD-3-Clause
/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */
/*-
@@ -44,7 +45,7 @@
*/
#include <asm/byteorder.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/bitops.h>
#include <linux/string.h>
@@ -159,7 +160,7 @@ u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]);
memset(ctx, 0, sizeof(*ctx));
- return (r);
+ return r;
}
u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
diff --git a/libbcachefs/siphash.h b/libbcachefs/siphash.h
index 7a4b2241..3dfaf34a 100644
--- a/libbcachefs/siphash.h
+++ b/libbcachefs/siphash.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */
/*-
* Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
diff --git a/libbcachefs/six.c b/libbcachefs/six.c
index afa59a47..617d07e5 100644
--- a/libbcachefs/six.c
+++ b/libbcachefs/six.c
@@ -1,239 +1,386 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/export.h>
#include <linux/log2.h>
+#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include <linux/sched/rt.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+
+#include <trace/events/lock.h>
#include "six.h"
-#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_)
-#define six_release(l) lock_release(l, 0, _RET_IP_)
+#ifdef DEBUG
+#define EBUG_ON(cond) BUG_ON(cond)
+#else
+#define EBUG_ON(cond) do {} while (0)
+#endif
+
+#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip)
+#define six_release(l, ip) lock_release(l, ip)
+
+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type);
+
+#define SIX_LOCK_HELD_read_OFFSET 0
+#define SIX_LOCK_HELD_read ~(~0U << 26)
+#define SIX_LOCK_HELD_intent (1U << 26)
+#define SIX_LOCK_HELD_write (1U << 27)
+#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read))
+#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write))
+#define SIX_LOCK_NOSPIN (1U << 31)
struct six_lock_vals {
/* Value we add to the lock in order to take the lock: */
- u64 lock_val;
+ u32 lock_val;
/* If the lock has this value (used as a mask), taking the lock fails: */
- u64 lock_fail;
-
- /* Value we add to the lock in order to release the lock: */
- u64 unlock_val;
+ u32 lock_fail;
/* Mask that indicates lock is held for this type: */
- u64 held_mask;
+ u32 held_mask;
/* Waitlist we wakeup when releasing the lock: */
enum six_lock_type unlock_wakeup;
};
-#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0)
-#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0)
-#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1)
-
-#define LOCK_VALS { \
- [SIX_LOCK_read] = { \
- .lock_val = __SIX_VAL(read_lock, 1), \
- .lock_fail = __SIX_LOCK_HELD_write, \
- .unlock_val = -__SIX_VAL(read_lock, 1), \
- .held_mask = __SIX_LOCK_HELD_read, \
- .unlock_wakeup = SIX_LOCK_write, \
- }, \
- [SIX_LOCK_intent] = { \
- .lock_val = __SIX_VAL(intent_lock, 1), \
- .lock_fail = __SIX_LOCK_HELD_intent, \
- .unlock_val = -__SIX_VAL(intent_lock, 1), \
- .held_mask = __SIX_LOCK_HELD_intent, \
- .unlock_wakeup = SIX_LOCK_intent, \
- }, \
- [SIX_LOCK_write] = { \
- .lock_val = __SIX_VAL(seq, 1), \
- .lock_fail = __SIX_LOCK_HELD_read, \
- .unlock_val = __SIX_VAL(seq, 1), \
- .held_mask = __SIX_LOCK_HELD_write, \
- .unlock_wakeup = SIX_LOCK_read, \
- }, \
+static const struct six_lock_vals l[] = {
+ [SIX_LOCK_read] = {
+ .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET,
+ .lock_fail = SIX_LOCK_HELD_write,
+ .held_mask = SIX_LOCK_HELD_read,
+ .unlock_wakeup = SIX_LOCK_write,
+ },
+ [SIX_LOCK_intent] = {
+ .lock_val = SIX_LOCK_HELD_intent,
+ .lock_fail = SIX_LOCK_HELD_intent,
+ .held_mask = SIX_LOCK_HELD_intent,
+ .unlock_wakeup = SIX_LOCK_intent,
+ },
+ [SIX_LOCK_write] = {
+ .lock_val = SIX_LOCK_HELD_write,
+ .lock_fail = SIX_LOCK_HELD_read,
+ .held_mask = SIX_LOCK_HELD_write,
+ .unlock_wakeup = SIX_LOCK_read,
+ },
+};
+
+static inline void six_set_bitmask(struct six_lock *lock, u32 mask)
+{
+ if ((atomic_read(&lock->state) & mask) != mask)
+ atomic_or(mask, &lock->state);
+}
+
+static inline void six_clear_bitmask(struct six_lock *lock, u32 mask)
+{
+ if (atomic_read(&lock->state) & mask)
+ atomic_and(~mask, &lock->state);
}
static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
- union six_lock_state old)
+ u32 old, struct task_struct *owner)
{
if (type != SIX_LOCK_intent)
return;
- if (!old.intent_lock) {
+ if (!(old & SIX_LOCK_HELD_intent)) {
EBUG_ON(lock->owner);
- lock->owner = current;
+ lock->owner = owner;
} else {
EBUG_ON(lock->owner != current);
}
}
-static inline void six_clear_owner(struct six_lock *lock, enum six_lock_type type)
+static inline unsigned pcpu_read_count(struct six_lock *lock)
{
- if (type != SIX_LOCK_intent)
- return;
+ unsigned read_count = 0;
+ int cpu;
- EBUG_ON(lock->owner != current);
-
- if (lock->state.intent_lock == 1)
- lock->owner = NULL;
+ for_each_possible_cpu(cpu)
+ read_count += *per_cpu_ptr(lock->readers, cpu);
+ return read_count;
}
-static __always_inline bool do_six_trylock_type(struct six_lock *lock,
- enum six_lock_type type)
+/*
+ * __do_six_trylock() - main trylock routine
+ *
+ * Returns 1 on success, 0 on failure
+ *
+ * In percpu reader mode, a failed trylock may cause a spurious trylock failure
+ * for another thread taking the competing lock type, and we may have to do a
+ * wakeup: when a wakeup is required, we return -1 - wakeup_type.
+ */
+static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
+ struct task_struct *task, bool try)
{
- const struct six_lock_vals l[] = LOCK_VALS;
- union six_lock_state old;
- u64 v = READ_ONCE(lock->state.v);
+ int ret;
+ u32 old;
- EBUG_ON(type == SIX_LOCK_write && lock->owner != current);
+ EBUG_ON(type == SIX_LOCK_write && lock->owner != task);
+ EBUG_ON(type == SIX_LOCK_write &&
+ (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write)));
- do {
- old.v = v;
+ /*
+ * Percpu reader mode:
+ *
+ * The basic idea behind this algorithm is that you can implement a lock
+ * between two threads without any atomics, just memory barriers:
+ *
+ * For two threads you'll need two variables, one variable for "thread a
+ * has the lock" and another for "thread b has the lock".
+ *
+ * To take the lock, a thread sets its variable indicating that it holds
+ * the lock, then issues a full memory barrier, then reads from the
+ * other thread's variable to check if the other thread thinks it has
+ * the lock. If we raced, we backoff and retry/sleep.
+ *
+ * Failure to take the lock may cause a spurious trylock failure in
+ * another thread, because we temporarily set the lock to indicate that
+ * we held it. This would be a problem for a thread in six_lock() that
+ * calls trylock after adding itself to the waitlist and prior to
+ * sleeping.
+ *
+ * Therefore, if we fail to get the lock, and there were waiters of the
+ * type we conflict with, we will have to issue a wakeup.
+ *
+ * Since we may be called under wait_lock (and by the wakeup code
+ * itself), we return that the wakeup has to be done instead of doing it
+ * here.
+ */
+ if (type == SIX_LOCK_read && lock->readers) {
+ preempt_disable();
+ this_cpu_inc(*lock->readers); /* signal that we own lock */
- EBUG_ON(type == SIX_LOCK_write &&
- ((old.v & __SIX_LOCK_HELD_write) ||
- !(old.v & __SIX_LOCK_HELD_intent)));
+ smp_mb();
- if (old.v & l[type].lock_fail)
- return false;
- } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
- old.v,
- old.v + l[type].lock_val)) != old.v);
+ old = atomic_read(&lock->state);
+ ret = !(old & l[type].lock_fail);
- six_set_owner(lock, type, old);
- return true;
-}
+ this_cpu_sub(*lock->readers, !ret);
+ preempt_enable();
-__always_inline __flatten
-static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type)
-{
- if (!do_six_trylock_type(lock, type))
- return false;
+ if (!ret) {
+ smp_mb();
+ if (atomic_read(&lock->state) & SIX_LOCK_WAITING_write)
+ ret = -1 - SIX_LOCK_write;
+ }
+ } else if (type == SIX_LOCK_write && lock->readers) {
+ if (try)
+ atomic_add(SIX_LOCK_HELD_write, &lock->state);
- six_acquire(&lock->dep_map, 1);
- return true;
+ /*
+ * Make sure atomic_add happens before pcpu_read_count and
+ * six_set_bitmask in slow path happens before pcpu_read_count.
+ *
+ * Paired with the smp_mb() in read lock fast path (per-cpu mode)
+ * and the one before atomic_read in read unlock path.
+ */
+ smp_mb();
+ ret = !pcpu_read_count(lock);
+
+ if (try && !ret) {
+ old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state);
+ if (old & SIX_LOCK_WAITING_read)
+ ret = -1 - SIX_LOCK_read;
+ }
+ } else {
+ old = atomic_read(&lock->state);
+ do {
+ ret = !(old & l[type].lock_fail);
+ if (!ret || (type == SIX_LOCK_write && !try)) {
+ smp_mb();
+ break;
+ }
+ } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val));
+
+ EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask));
+ }
+
+ if (ret > 0)
+ six_set_owner(lock, type, old, task);
+
+ EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 &&
+ (atomic_read(&lock->state) & SIX_LOCK_HELD_write));
+
+ return ret;
}
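A toy sketch of the two-thread, barrier-only locking idea described in the comment above (illustrative, not part of the patch; the real code uses per-cpu read counters and lock->state rather than plain flags):

        static bool toy_trylock_sketch(bool *mine, bool *theirs)
        {
                WRITE_ONCE(*mine, true);        /* claim the lock */
                smp_mb();                       /* order our claim before reading theirs */
                if (READ_ONCE(*theirs)) {       /* raced: back off, caller retries or sleeps */
                        WRITE_ONCE(*mine, false);
                        return false;
                }
                return true;
        }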
-__always_inline __flatten
-static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
- unsigned seq)
+static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type)
{
- const struct six_lock_vals l[] = LOCK_VALS;
- union six_lock_state old;
- u64 v = READ_ONCE(lock->state.v);
+ struct six_lock_waiter *w, *next;
+ struct task_struct *task;
+ bool saw_one;
+ int ret;
+again:
+ ret = 0;
+ saw_one = false;
+ raw_spin_lock(&lock->wait_lock);
- do {
- old.v = v;
+ list_for_each_entry_safe(w, next, &lock->wait_list, list) {
+ if (w->lock_want != lock_type)
+ continue;
- if (old.seq != seq || old.v & l[type].lock_fail)
- return false;
- } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
- old.v,
- old.v + l[type].lock_val)) != old.v);
+ if (saw_one && lock_type != SIX_LOCK_read)
+ goto unlock;
+ saw_one = true;
- six_set_owner(lock, type, old);
- six_acquire(&lock->dep_map, 1);
- return true;
-}
+ ret = __do_six_trylock(lock, lock_type, w->task, false);
+ if (ret <= 0)
+ goto unlock;
-struct six_lock_waiter {
- struct list_head list;
- struct task_struct *task;
-};
+ /*
+ * Similar to percpu_rwsem_wake_function(), we need to guard
+ * against the wakee noticing w->lock_acquired, returning, and
+ * then exiting before we do the wakeup:
+ */
+ task = get_task_struct(w->task);
+ __list_del(w->list.prev, w->list.next);
+ /*
+ * The release barrier here ensures the ordering of the
+ * __list_del before setting w->lock_acquired; @w is on the
+ * stack of the thread doing the waiting and will be reused
+ * after it sees w->lock_acquired with no other locking:
+ * pairs with smp_load_acquire() in six_lock_slowpath()
+ */
+ smp_store_release(&w->lock_acquired, true);
+ wake_up_process(task);
+ put_task_struct(task);
+ }
-/* This is probably up there with the more evil things I've done */
-#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
+ six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type);
+unlock:
+ raw_spin_unlock(&lock->wait_lock);
-#ifdef CONFIG_LOCK_SPIN_ON_OWNER
+ if (ret < 0) {
+ lock_type = -ret - 1;
+ goto again;
+ }
+}
-static inline int six_can_spin_on_owner(struct six_lock *lock)
+__always_inline
+static void six_lock_wakeup(struct six_lock *lock, u32 state,
+ enum six_lock_type lock_type)
{
- struct task_struct *owner;
- int retval = 1;
+ if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read))
+ return;
- if (need_resched())
- return 0;
+ if (!(state & (SIX_LOCK_WAITING_read << lock_type)))
+ return;
- rcu_read_lock();
- owner = READ_ONCE(lock->owner);
- if (owner)
- retval = owner->on_cpu;
- rcu_read_unlock();
- /*
- * if lock->owner is not set, the mutex owner may have just acquired
- * it and not set the owner yet or the mutex has been released.
- */
- return retval;
+ __six_lock_wakeup(lock, lock_type);
}
-static inline bool six_spin_on_owner(struct six_lock *lock,
- struct task_struct *owner)
+__always_inline
+static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try)
{
- bool ret = true;
+ int ret;
- rcu_read_lock();
- while (lock->owner == owner) {
- /*
- * Ensure we emit the owner->on_cpu, dereference _after_
- * checking lock->owner still matches owner. If that fails,
- * owner might point to freed memory. If it still matches,
- * the rcu_read_lock() ensures the memory stays valid.
- */
- barrier();
+ ret = __do_six_trylock(lock, type, current, try);
+ if (ret < 0)
+ __six_lock_wakeup(lock, -ret - 1);
- if (!owner->on_cpu || need_resched()) {
- ret = false;
- break;
- }
+ return ret > 0;
+}
- cpu_relax();
+/**
+ * six_trylock_ip - attempt to take a six lock without blocking
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: true on success, false on failure.
+ */
+bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
+{
+ if (!do_six_trylock(lock, type, true))
+ return false;
+
+ if (type != SIX_LOCK_write)
+ six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
+ return true;
+}
+EXPORT_SYMBOL_GPL(six_trylock_ip);
+
+/**
+ * six_relock_ip - attempt to re-take a lock that was held previously
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @seq: lock sequence number obtained from six_lock_seq() while lock was
+ * held previously
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: true on success, false on failure.
+ */
+bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
+ unsigned seq, unsigned long ip)
+{
+ if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip))
+ return false;
+
+ if (six_lock_seq(lock) != seq) {
+ six_unlock_ip(lock, type, ip);
+ return false;
}
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(six_relock_ip);
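A hedged usage sketch of the trylock/relock pattern documented above (not part of the patch); six_lock_seq() is the helper referenced in the kernel-doc:

        static bool peek_then_reacquire_sketch(struct six_lock *lock)
        {
                if (!six_trylock_ip(lock, SIX_LOCK_read, _THIS_IP_))
                        return false;

                unsigned seq = six_lock_seq(lock);      /* remember the lock generation */
                six_unlock_ip(lock, SIX_LOCK_read, _THIS_IP_);

                /* ... work that doesn't need the lock ... */

                /* succeeds only if no write lock was taken in the meantime */
                return six_relock_ip(lock, SIX_LOCK_read, seq, _THIS_IP_);
        }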
+
+#ifdef CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN
+
+static inline bool six_owner_running(struct six_lock *lock)
+{
+ /*
+ * When there's no owner, we might have preempted between the owner
+ * acquiring the lock and setting the owner field. If we're an RT task
+ * that will live-lock because we won't let the owner complete.
+ */
+ rcu_read_lock();
+ struct task_struct *owner = READ_ONCE(lock->owner);
+ bool ret = owner ? owner_on_cpu(owner) : !rt_or_dl_task(current);
rcu_read_unlock();
return ret;
}
-static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+static inline bool six_optimistic_spin(struct six_lock *lock,
+ struct six_lock_waiter *wait,
+ enum six_lock_type type)
{
- struct task_struct *task = current;
+ unsigned loop = 0;
+ u64 end_time;
if (type == SIX_LOCK_write)
return false;
- preempt_disable();
- if (!six_can_spin_on_owner(lock))
- goto fail;
+ if (lock->wait_list.next != &wait->list)
+ return false;
- if (!osq_lock(&lock->osq))
- goto fail;
+ if (atomic_read(&lock->state) & SIX_LOCK_NOSPIN)
+ return false;
- while (1) {
- struct task_struct *owner;
+ preempt_disable();
+ end_time = sched_clock() + 10 * NSEC_PER_USEC;
+ while (!need_resched() && six_owner_running(lock)) {
/*
- * If there's an owner, wait for it to either
- * release the lock or go to sleep.
+ * Ensures that writes to the waitlist entry happen after we see
+ * wait->lock_acquired: pairs with the smp_store_release in
+ * __six_lock_wakeup
*/
- owner = READ_ONCE(lock->owner);
- if (owner && !six_spin_on_owner(lock, owner))
- break;
-
- if (do_six_trylock_type(lock, type)) {
- osq_unlock(&lock->osq);
+ if (smp_load_acquire(&wait->lock_acquired)) {
preempt_enable();
return true;
}
- /*
- * When there's no owner, we might have preempted between the
- * owner acquiring the lock and setting the owner field. If
- * we're an RT task that will live-lock because we won't let
- * the owner complete.
- */
- if (!owner && (need_resched() || rt_task(task)))
+ if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) {
+ six_set_bitmask(lock, SIX_LOCK_NOSPIN);
break;
+ }
/*
* The cpu_relax() call is a compiler barrier which forces
@@ -244,24 +391,15 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type
cpu_relax();
}
- osq_unlock(&lock->osq);
-fail:
preempt_enable();
-
- /*
- * If we fell out of the spin path because of need_resched(),
- * reschedule now, before we try-lock again. This avoids getting
- * scheduled out right after we obtained the lock.
- */
- if (need_resched())
- schedule();
-
return false;
}
#else /* CONFIG_LOCK_SPIN_ON_OWNER */
-static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+static inline bool six_optimistic_spin(struct six_lock *lock,
+ struct six_lock_waiter *wait,
+ enum six_lock_type type)
{
return false;
}
@@ -269,219 +407,288 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type
#endif
noinline
-static void __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type)
+static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
{
- const struct six_lock_vals l[] = LOCK_VALS;
- union six_lock_state old, new;
- struct six_lock_waiter wait;
- u64 v;
+ int ret = 0;
- if (six_optimistic_spin(lock, type))
- return;
-
- lock_contended(&lock->dep_map, _RET_IP_);
-
- INIT_LIST_HEAD(&wait.list);
- wait.task = current;
+ if (type == SIX_LOCK_write) {
+ EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
+ atomic_add(SIX_LOCK_HELD_write, &lock->state);
+ smp_mb__after_atomic();
+ }
- while (1) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (type == SIX_LOCK_write)
- EBUG_ON(lock->owner != current);
- else if (list_empty_careful(&wait.list)) {
- raw_spin_lock(&lock->wait_lock);
- list_add_tail(&wait.list, &lock->wait_list[type]);
- raw_spin_unlock(&lock->wait_lock);
- }
+ trace_contention_begin(lock, 0);
+ lock_contended(&lock->dep_map, ip);
- v = READ_ONCE(lock->state.v);
- do {
- new.v = old.v = v;
-
- if (!(old.v & l[type].lock_fail))
- new.v += l[type].lock_val;
- else if (!(new.waiters & (1 << type)))
- new.waiters |= 1 << type;
- else
- break; /* waiting bit already set */
- } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
- old.v, new.v)) != old.v);
-
- if (!(old.v & l[type].lock_fail))
- break;
+ wait->task = current;
+ wait->lock_want = type;
+ wait->lock_acquired = false;
- schedule();
- }
+ raw_spin_lock(&lock->wait_lock);
+ six_set_bitmask(lock, SIX_LOCK_WAITING_read << type);
+ /*
+ * Retry taking the lock after taking waitlist lock, in case we raced
+ * with an unlock:
+ */
+ ret = __do_six_trylock(lock, type, current, false);
+ if (ret <= 0) {
+ wait->start_time = local_clock();
- six_set_owner(lock, type, old);
+ if (!list_empty(&lock->wait_list)) {
+ struct six_lock_waiter *last =
+ list_last_entry(&lock->wait_list,
+ struct six_lock_waiter, list);
- __set_current_state(TASK_RUNNING);
+ if (time_before_eq64(wait->start_time, last->start_time))
+ wait->start_time = last->start_time + 1;
+ }
- if (!list_empty_careful(&wait.list)) {
- raw_spin_lock(&lock->wait_lock);
- list_del_init(&wait.list);
- raw_spin_unlock(&lock->wait_lock);
+ list_add_tail(&wait->list, &lock->wait_list);
}
-}
-
-__always_inline
-static void __six_lock_type(struct six_lock *lock, enum six_lock_type type)
-{
- six_acquire(&lock->dep_map, 0);
+ raw_spin_unlock(&lock->wait_lock);
- if (!do_six_trylock_type(lock, type))
- __six_lock_type_slowpath(lock, type);
+ if (unlikely(ret > 0)) {
+ ret = 0;
+ goto out;
+ }
- lock_acquired(&lock->dep_map, _RET_IP_);
-}
+ if (unlikely(ret < 0)) {
+ __six_lock_wakeup(lock, -ret - 1);
+ ret = 0;
+ }
-static inline void six_lock_wakeup(struct six_lock *lock,
- union six_lock_state state,
- unsigned waitlist_id)
-{
- struct list_head *wait_list = &lock->wait_list[waitlist_id];
- struct six_lock_waiter *w, *next;
+ if (six_optimistic_spin(lock, wait, type))
+ goto out;
- if (waitlist_id == SIX_LOCK_write && state.read_lock)
- return;
+ while (1) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
- if (!(state.waiters & (1 << waitlist_id)))
- return;
+ /*
+ * Ensures that writes to the waitlist entry happen after we see
+ * wait->lock_acquired: pairs with the smp_store_release in
+ * __six_lock_wakeup
+ */
+ if (smp_load_acquire(&wait->lock_acquired))
+ break;
- clear_bit(waitlist_bitnr(waitlist_id),
- (unsigned long *) &lock->state.v);
+ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
+ if (unlikely(ret)) {
+ bool acquired;
+
+ /*
+ * If should_sleep_fn() returns an error, we are
+ * required to return that error even if we already
+ * acquired the lock - should_sleep_fn() might have
+ * modified external state (e.g. when the deadlock cycle
+ * detector in bcachefs issued a transaction restart)
+ */
+ raw_spin_lock(&lock->wait_lock);
+ acquired = wait->lock_acquired;
+ if (!acquired)
+ list_del(&wait->list);
+ raw_spin_unlock(&lock->wait_lock);
- if (waitlist_id == SIX_LOCK_write) {
- struct task_struct *p = READ_ONCE(lock->owner);
+ if (unlikely(acquired))
+ do_six_unlock_type(lock, type);
+ break;
+ }
- if (p)
- wake_up_process(p);
- return;
+ schedule();
}
- raw_spin_lock(&lock->wait_lock);
-
- list_for_each_entry_safe(w, next, wait_list, list) {
- list_del_init(&w->list);
-
- if (wake_up_process(w->task) &&
- waitlist_id != SIX_LOCK_read) {
- if (!list_empty(wait_list))
- set_bit(waitlist_bitnr(waitlist_id),
- (unsigned long *) &lock->state.v);
- break;
- }
+ __set_current_state(TASK_RUNNING);
+out:
+ if (ret && type == SIX_LOCK_write) {
+ six_clear_bitmask(lock, SIX_LOCK_HELD_write);
+ six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read);
}
+ trace_contention_end(lock, 0);
- raw_spin_unlock(&lock->wait_lock);
+ return ret;
}
-__always_inline __flatten
-static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+/**
+ * six_lock_ip_waiter - take a lock, with full waitlist interface
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @wait: pointer to wait object, which will be added to lock's waitlist
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ * to scheduling
+ * @p: passed through to @should_sleep_fn
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * This is the most general six_lock() variant, with parameters to support full
+ * cycle detection for deadlock avoidance.
+ *
+ * The code calling this function must implement tracking of held locks, and the
+ * @wait object should be embedded into the struct that tracks held locks -
+ * which must also be accessible in a thread-safe way.
+ *
+ * @should_sleep_fn should invoke the cycle detector; it should walk each
+ * lock's waiters, and for each waiter recursively walk their held locks.
+ *
+ * When this function must block, @wait will be added to @lock's waitlist before
+ * calling trylock, and before calling @should_sleep_fn, and @wait will not be
+ * removed from the lock waitlist until the lock has been successfully acquired,
+ * or we abort.
+ *
+ * @wait.start_time will be monotonically increasing for any given waitlist, and
+ * thus may be used as a loop cursor.
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
{
- const struct six_lock_vals l[] = LOCK_VALS;
- union six_lock_state state;
+ int ret;
- EBUG_ON(!(lock->state.v & l[type].held_mask));
- EBUG_ON(type == SIX_LOCK_write &&
- !(lock->state.v & __SIX_LOCK_HELD_intent));
+ wait->start_time = 0;
- six_clear_owner(lock, type);
+ if (type != SIX_LOCK_write)
+ six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip);
- state.v = atomic64_add_return_release(l[type].unlock_val,
- &lock->state.counter);
- six_release(&lock->dep_map);
- six_lock_wakeup(lock, state, l[type].unlock_wakeup);
-}
+ ret = do_six_trylock(lock, type, true) ? 0
+ : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip);
+
+ if (ret && type != SIX_LOCK_write)
+ six_release(&lock->dep_map, ip);
+ if (!ret)
+ lock_acquired(&lock->dep_map, ip);
-#ifdef SIX_LOCK_SEPARATE_LOCKFNS
-
-#define __SIX_LOCK(type) \
-bool six_trylock_##type(struct six_lock *lock) \
-{ \
- return __six_trylock_type(lock, SIX_LOCK_##type); \
-} \
- \
-bool six_relock_##type(struct six_lock *lock, u32 seq) \
-{ \
- return __six_relock_type(lock, SIX_LOCK_##type, seq); \
-} \
- \
-void six_lock_##type(struct six_lock *lock) \
-{ \
- __six_lock_type(lock, SIX_LOCK_##type); \
-} \
- \
-void six_unlock_##type(struct six_lock *lock) \
-{ \
- __six_unlock_type(lock, SIX_LOCK_##type); \
+ return ret;
}
+EXPORT_SYMBOL_GPL(six_lock_ip_waiter);
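A hedged sketch of the waiter interface documented above (not part of the patch); struct six_lock_waiter and the should_sleep_fn callback type come from six.h, and the deadlock check here is only a stand-in:

        static int abort_on_cycle_sketch(struct six_lock *lock, void *p)
        {
                /* walk held locks and waiters here; a nonzero return aborts the lock attempt */
                return 0;
        }

        static int lock_with_waiter_sketch(struct six_lock *lock, void *trans)
        {
                struct six_lock_waiter wait;    /* normally embedded in the held-locks tracker */

                return six_lock_ip_waiter(lock, SIX_LOCK_intent, &wait,
                                          abort_on_cycle_sketch, trans, _THIS_IP_);
        }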
-__SIX_LOCK(read)
-__SIX_LOCK(intent)
-__SIX_LOCK(write)
+__always_inline
+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ u32 state;
-#undef __SIX_LOCK
+ if (type == SIX_LOCK_intent)
+ lock->owner = NULL;
-#else
+ if (type == SIX_LOCK_read &&
+ lock->readers) {
+ smp_mb(); /* unlock barrier */
+ this_cpu_dec(*lock->readers);
+ smp_mb(); /* between unlocking and checking for waiters */
+ state = atomic_read(&lock->state);
+ } else {
+ u32 v = l[type].lock_val;
-bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
-{
- return __six_trylock_type(lock, type);
-}
+ if (type != SIX_LOCK_read)
+ v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN;
-bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
- unsigned seq)
-{
- return __six_relock_type(lock, type, seq);
+ EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask));
+ state = atomic_sub_return_release(v, &lock->state);
+ }
+ six_lock_wakeup(lock, state, l[type].unlock_wakeup);
}
-void six_lock_type(struct six_lock *lock, enum six_lock_type type)
+/**
+ * six_unlock_ip - drop a six lock
+ * @lock: lock to unlock
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * When a lock is held multiple times (because six_lock_increment() was used),
+ * this decrements the 'lock held' counter by one.
+ *
+ * For example:
+ * six_lock_read(&foo->lock); read count 1
+ * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2
+ * six_unlock_read(&foo->lock); read count 1
+ * six_unlock_read(&foo->lock); read count 0
+ */
+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
{
- __six_lock_type(lock, type);
-}
+ EBUG_ON(type == SIX_LOCK_write &&
+ !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
+ EBUG_ON((type == SIX_LOCK_write ||
+ type == SIX_LOCK_intent) &&
+ lock->owner != current);
+
+ if (type != SIX_LOCK_write)
+ six_release(&lock->dep_map, ip);
+ else
+ lock->seq++;
+
+ if (type == SIX_LOCK_intent &&
+ lock->intent_lock_recurse) {
+ --lock->intent_lock_recurse;
+ return;
+ }
-void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
-{
- __six_unlock_type(lock, type);
+ do_six_unlock_type(lock, type);
}
+EXPORT_SYMBOL_GPL(six_unlock_ip);
-#endif
-
-/* Convert from intent to read: */
+/**
+ * six_lock_downgrade - convert an intent lock to a read lock
+ * @lock: lock to downgrade
+ *
+ * @lock will have read count incremented and intent count decremented
+ */
void six_lock_downgrade(struct six_lock *lock)
{
six_lock_increment(lock, SIX_LOCK_read);
six_unlock_intent(lock);
}
-
+EXPORT_SYMBOL_GPL(six_lock_downgrade);
+
+/**
+ * six_lock_tryupgrade - attempt to convert read lock to an intent lock
+ * @lock: lock to upgrade
+ *
+ * On success, @lock will have intent count incremented and read count
+ * decremented
+ *
+ * Return: true on success, false on failure
+ */
bool six_lock_tryupgrade(struct six_lock *lock)
{
- const struct six_lock_vals l[] = LOCK_VALS;
- union six_lock_state old, new;
- u64 v = READ_ONCE(lock->state.v);
+ u32 old = atomic_read(&lock->state), new;
do {
- new.v = old.v = v;
+ new = old;
- EBUG_ON(!(old.v & l[SIX_LOCK_read].held_mask));
+ if (new & SIX_LOCK_HELD_intent)
+ return false;
- new.v += l[SIX_LOCK_read].unlock_val;
+ if (!lock->readers) {
+ EBUG_ON(!(new & SIX_LOCK_HELD_read));
+ new -= l[SIX_LOCK_read].lock_val;
+ }
- if (new.v & l[SIX_LOCK_intent].lock_fail)
- return false;
+ new |= SIX_LOCK_HELD_intent;
+ } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new));
- new.v += l[SIX_LOCK_intent].lock_val;
- } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
- old.v, new.v)) != old.v);
+ if (lock->readers)
+ this_cpu_dec(*lock->readers);
- six_set_owner(lock, SIX_LOCK_intent, old);
- six_lock_wakeup(lock, new, l[SIX_LOCK_read].unlock_wakeup);
+ six_set_owner(lock, SIX_LOCK_intent, old, current);
return true;
}
-
+EXPORT_SYMBOL_GPL(six_lock_tryupgrade);
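
/*
 * Editorial sketch, not part of this patch: a common pattern around
 * six_lock_tryupgrade().  If the in-place upgrade fails, remember the
 * sequence number, drop the read lock and take an intent lock; the sequence
 * number then tells the caller whether a writer got in while the lock was
 * dropped.  "struct foo" (a struct embedding a struct six_lock named "lock")
 * is hypothetical, and a NULL should_sleep_fn is assumed to mean "just block".
 */
static bool foo_upgrade_to_intent(struct foo *f)
{
	if (six_lock_tryupgrade(&f->lock))
		return true;

	u32 seq = six_lock_seq(&f->lock);

	six_unlock_read(&f->lock);
	six_lock_intent(&f->lock, NULL, NULL);

	/* true if no write lock was taken while we weren't holding the lock: */
	return six_lock_seq(&f->lock) == seq;
}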
+
+/**
+ * six_trylock_convert - attempt to convert a held lock from one type to another
+ * @lock: lock to upgrade
+ * @from: SIX_LOCK_read or SIX_LOCK_intent
+ * @to: SIX_LOCK_read or SIX_LOCK_intent
+ *
+ * On success, @lock will be held for @to, and no longer held for @from
+ *
+ * Return: true on success, false on failure
+ */
bool six_trylock_convert(struct six_lock *lock,
enum six_lock_type from,
enum six_lock_type to)
@@ -498,19 +705,169 @@ bool six_trylock_convert(struct six_lock *lock,
return six_lock_tryupgrade(lock);
}
}
-
-/*
- * Increment read/intent lock count, assuming we already have it read or intent
- * locked:
+EXPORT_SYMBOL_GPL(six_trylock_convert);
+
+/**
+ * six_lock_increment - increase held lock count on a lock that is already held
+ * @lock: lock to increment
+ * @type: SIX_LOCK_read or SIX_LOCK_intent
+ *
+ * @lock must already be held, with a lock type that is greater than or equal to
+ * @type
+ *
+ * A corresponding six_unlock_type() call will be required for @lock to be fully
+ * unlocked.
*/
void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
{
- const struct six_lock_vals l[] = LOCK_VALS;
-
- EBUG_ON(type == SIX_LOCK_write);
- six_acquire(&lock->dep_map, 0);
+ six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_);
/* XXX: assert already locked, and that we don't overflow: */
- atomic64_add(l[type].lock_val, &lock->state.counter);
+ switch (type) {
+ case SIX_LOCK_read:
+ if (lock->readers) {
+ this_cpu_inc(*lock->readers);
+ } else {
+ EBUG_ON(!(atomic_read(&lock->state) &
+ (SIX_LOCK_HELD_read|
+ SIX_LOCK_HELD_intent)));
+ atomic_add(l[type].lock_val, &lock->state);
+ }
+ break;
+ case SIX_LOCK_intent:
+ EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
+ lock->intent_lock_recurse++;
+ break;
+ case SIX_LOCK_write:
+ BUG();
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(six_lock_increment);
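
/*
 * Editorial sketch, not part of this patch: six_lock_increment() used for
 * reentrancy by a caller that already knows it holds the lock for read (or
 * intent).  "struct foo" (embedding a struct six_lock named "lock") is
 * hypothetical.
 */
static void foo_nested_read(struct foo *f)
{
	/* caller already holds f->lock for read or intent: */
	six_lock_increment(&f->lock, SIX_LOCK_read);

	/* ... work with f ... */

	/* one extra unlock is now required to fully unlock: */
	six_unlock_read(&f->lock);
}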
+
+/**
+ * six_lock_wakeup_all - wake up all waiters on @lock
+ * @lock: lock to wake up waiters for
+ *
+ * Waking up waiters will cause them to re-run should_sleep_fn, which may then
+ * abort the lock operation.
+ *
+ * This function is never needed in a bug-free program; it's only useful in
+ * debug code, e.g. to determine if a cycle detector is at fault.
+ */
+void six_lock_wakeup_all(struct six_lock *lock)
+{
+ u32 state = atomic_read(&lock->state);
+ struct six_lock_waiter *w;
+
+ six_lock_wakeup(lock, state, SIX_LOCK_read);
+ six_lock_wakeup(lock, state, SIX_LOCK_intent);
+ six_lock_wakeup(lock, state, SIX_LOCK_write);
+
+ raw_spin_lock(&lock->wait_lock);
+ list_for_each_entry(w, &lock->wait_list, list)
+ wake_up_process(w->task);
+ raw_spin_unlock(&lock->wait_lock);
+}
+EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
+
+/**
+ * six_lock_counts - return held lock counts, for each lock type
+ * @lock: lock to return counters for
+ *
+ * Return: the number of times a lock is held for read, intent and write.
+ */
+struct six_lock_count six_lock_counts(struct six_lock *lock)
+{
+ struct six_lock_count ret;
+
+ ret.n[SIX_LOCK_read] = !lock->readers
+ ? atomic_read(&lock->state) & SIX_LOCK_HELD_read
+ : pcpu_read_count(lock);
+ ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) +
+ lock->intent_lock_recurse;
+ ret.n[SIX_LOCK_write] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(six_lock_counts);
+
+/**
+ * six_lock_readers_add - directly manipulate reader count of a lock
+ * @lock: lock to add/subtract readers for
+ * @nr: reader count to add/subtract
+ *
+ * When an upper layer is implementing lock reentrancy, we may have both read
+ * and intent locks on the same lock.
+ *
+ * When we need to take a write lock, the read locks will cause self-deadlock,
+ * because six locks themselves do not track which read locks are held by the
+ * current thread and which are held by a different thread - they do no
+ * per-thread tracking of held locks.
+ *
+ * The upper layer that is tracking held locks may however, if trylock() has
+ * failed, count up its own read locks, subtract them, take the write lock, and
+ * then re-add them.
+ *
+ * As in any other situation when taking a write lock, @lock must be held for
+ * intent one (or more) times, so @lock will never be left unlocked.
+ */
+void six_lock_readers_add(struct six_lock *lock, int nr)
+{
+ if (lock->readers) {
+ this_cpu_add(*lock->readers, nr);
+ } else {
+ EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0);
+ /* reader count starts at bit 0 */
+ atomic_add(nr, &lock->state);
+ }
+}
+EXPORT_SYMBOL_GPL(six_lock_readers_add);
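
/*
 * Editorial sketch, not part of this patch: how an upper layer that does its
 * own tracking of held locks might use six_lock_readers_add() to take a write
 * lock while the same thread also holds read locks on @lock.  The caller must
 * hold the lock for intent; "nr_read_held" is the caller's own count of the
 * read locks it holds, and a NULL should_sleep_fn is assumed to mean "just
 * block".
 */
static void foo_lock_write_reentrant(struct six_lock *lock, unsigned nr_read_held)
{
	/* temporarily hide our own readers so they can't deadlock us: */
	six_lock_readers_add(lock, -(int) nr_read_held);
	six_lock_write(lock, NULL, NULL);
	six_lock_readers_add(lock, nr_read_held);
}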
+
+/**
+ * six_lock_exit - release resources held by a lock prior to freeing
+ * @lock: lock to exit
+ *
+ * When a lock was initialized in percpu mode (SIX_LOCK_INIT_PCPU), this is
+ * required to free the percpu read counts.
+ */
+void six_lock_exit(struct six_lock *lock)
+{
+ WARN_ON(lock->readers && pcpu_read_count(lock));
+ WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read);
+
+ free_percpu(lock->readers);
+ lock->readers = NULL;
+}
+EXPORT_SYMBOL_GPL(six_lock_exit);
+
+void __six_lock_init(struct six_lock *lock, const char *name,
+ struct lock_class_key *key, enum six_lock_init_flags flags)
+{
+ atomic_set(&lock->state, 0);
+ raw_spin_lock_init(&lock->wait_lock);
+ INIT_LIST_HEAD(&lock->wait_list);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *) lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+
+ /*
+ * Don't assume that we have real percpu variables available in
+ * userspace:
+ */
+#ifdef __KERNEL__
+ if (flags & SIX_LOCK_INIT_PCPU) {
+ /*
+ * We don't return an error here on memory allocation failure
+ * since percpu is an optimization, and locks will work with the
+ * same semantics in non-percpu mode: callers can check for
+ * failure if they wish by checking lock->readers, but generally
+ * will not want to treat it as an error.
+ */
+ lock->readers = alloc_percpu(unsigned);
+ }
+#endif
}
+EXPORT_SYMBOL_GPL(__six_lock_init);
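
/*
 * Editorial sketch, not part of this patch: initializing a six lock in percpu
 * reader mode and tearing it down again.  "struct foo" is hypothetical;
 * six_lock_exit() is only strictly needed to free the percpu read counts.
 */
struct foo {
	struct six_lock	lock;
};

static void foo_init(struct foo *f)
{
	six_lock_init(&f->lock, SIX_LOCK_INIT_PCPU);
}

static void foo_free(struct foo *f)
{
	six_lock_exit(&f->lock);
}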
diff --git a/libbcachefs/six.h b/libbcachefs/six.h
index 999c49db..68d46fd7 100644
--- a/libbcachefs/six.h
+++ b/libbcachefs/six.h
@@ -1,103 +1,132 @@
-#ifndef _BCACHEFS_SIX_H
-#define _BCACHEFS_SIX_H
+/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Shared/intent/exclusive locks: sleepable read/write locks, much like rw
- * semaphores, except with a third intermediate state, intent. Basic operations
- * are:
+#ifndef _LINUX_SIX_H
+#define _LINUX_SIX_H
+
+/**
+ * DOC: SIX locks overview
*
- * six_lock_read(&foo->lock);
- * six_unlock_read(&foo->lock);
+ * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores
+ * but with a third intermediate state - the states are read/shared, intent, exclusive/write
*
- * six_lock_intent(&foo->lock);
- * six_unlock_intent(&foo->lock);
+ * The purpose of the intent state is to allow for greater concurrency on tree
+ * structures without deadlocking. In general, a read can't be upgraded to a
+ * write lock without deadlocking, so an operation that updates multiple nodes
+ * will have to take write locks for the full duration of the operation.
*
- * six_lock_write(&foo->lock);
- * six_unlock_write(&foo->lock);
+ * But by adding an intent state, which is exclusive with other intent locks but
+ * not with readers, we can take intent locks at the start of the operation,
+ * and then take write locks only for the actual update to each individual
+ * node, without deadlocking.
*
- * Intent locks block other intent locks, but do not block read locks, and you
- * must have an intent lock held before taking a write lock, like so:
+ * Example usage:
+ * six_lock_read(&foo->lock);
+ * six_unlock_read(&foo->lock);
*
- * six_lock_intent(&foo->lock);
- * six_lock_write(&foo->lock);
- * six_unlock_write(&foo->lock);
- * six_unlock_intent(&foo->lock);
+ * An intent lock must be held before taking a write lock:
+ * six_lock_intent(&foo->lock);
+ * six_lock_write(&foo->lock);
+ * six_unlock_write(&foo->lock);
+ * six_unlock_intent(&foo->lock);
*
* Other operations:
- *
* six_trylock_read()
* six_trylock_intent()
* six_trylock_write()
*
- * six_lock_downgrade(): convert from intent to read
- * six_lock_tryupgrade(): attempt to convert from read to intent
- *
- * Locks also embed a sequence number, which is incremented when the lock is
- * locked or unlocked for write. The current sequence number can be grabbed
- * while a lock is held from lock->state.seq; then, if you drop the lock you can
- * use six_relock_(read|intent_write)(lock, seq) to attempt to retake the lock
- * iff it hasn't been locked for write in the meantime.
- *
- * There are also operations that take the lock type as a parameter, where the
- * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write:
- *
- * six_lock_type(lock, type)
- * six_unlock_type(lock, type)
- * six_relock(lock, type, seq)
- * six_trylock_type(lock, type)
- * six_trylock_convert(lock, from, to)
- *
- * A lock may be held multiple types by the same thread (for read or intent,
- * not write) - up to SIX_LOCK_MAX_RECURSE. However, the six locks code does
- * _not_ implement the actual recursive checks itself though - rather, if your
- * code (e.g. btree iterator code) knows that the current thread already has a
- * lock held, and for the correct type, six_lock_increment() may be used to
- * bump up the counter for that type - the only effect is that one more call to
- * unlock will be required before the lock is unlocked.
+ * six_lock_downgrade() convert from intent to read
+ * six_lock_tryupgrade() attempt to convert from read to intent, may fail
+ *
+ * There are also interfaces that take the lock type as an enum:
+ *
+ * six_lock_type(&foo->lock, SIX_LOCK_read);
+ * six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent)
+ * six_lock_type(&foo->lock, SIX_LOCK_write);
+ * six_unlock_type(&foo->lock, SIX_LOCK_write);
+ * six_unlock_type(&foo->lock, SIX_LOCK_intent);
+ *
+ * Lock sequence numbers - unlock(), relock():
+ *
+ * Locks embed sequence numbers, which are incremented on write lock/unlock.
+ * This allows locks to be dropped and then retaken iff the state they protect
+ * hasn't changed; this makes it much easier to avoid holding locks while e.g.
+ * doing IO or allocating memory.
+ *
+ * Example usage:
+ * six_lock_read(&foo->lock);
+ * u32 seq = six_lock_seq(&foo->lock);
+ * six_unlock_read(&foo->lock);
+ *
+ * some_operation_that_may_block();
+ *
+ * if (six_relock_read(&foo->lock, seq)) { ... }
+ *
+ * If the relock operation succeeds, it is as if the lock was never unlocked.
+ *
+ * Reentrancy:
+ *
+ * Six locks are not by themselves reentrant, but have counters for both the
+ * read and intent states that can be used to provide reentrancy by an upper
+ * layer that tracks held locks. If a lock is known to already be held in the
+ * read or intent state, six_lock_increment() can be used to bump the "lock
+ * held in this state" counter, increasing the number of unlock calls that
+ * will be required to fully unlock it.
+ *
+ * Example usage:
+ * six_lock_read(&foo->lock);
+ * six_lock_increment(&foo->lock, SIX_LOCK_read);
+ * six_unlock_read(&foo->lock);
+ * six_unlock_read(&foo->lock);
+ * foo->lock is now fully unlocked.
+ *
+ * Since the intent state supersedes read, it's legal to increment the read
+ * counter when holding an intent lock, but not the reverse.
+ *
+ * A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write)
+ * is not legal.
+ *
+ * should_sleep_fn:
+ *
+ * There is a six_lock() variant that takes a function pointer that is called
+ * immediately prior to schedule() when blocking, and may return an error to
+ * abort.
+ *
+ * One possible use for this feature is when objects being locked are part of
+ * a cache and may reused, and lock ordering is based on a property of the
+ * object that will change when the object is reused - i.e. logical key order.
+ *
+ * If looking up an object in the cache may race with object reuse, and lock
+ * ordering is required to prevent deadlock, object reuse may change the
+ * correct lock order for that object and cause a deadlock. should_sleep_fn
+ * can be used to check if the object is still the object we want and avoid
+ * this deadlock.
+ *
+ * Wait list entry interface:
+ *
+ * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a
+ * wait list entry. By embedding six_lock_waiter into another object, and by
+ * traversing lock waitlists, it is then possible for an upper layer to
+ * implement full cycle detection for deadlock avoidance.
+ *
+ * should_sleep_fn should be used for invoking the cycle detector, walking the
+ * graph of held locks to check for a deadlock. The upper layer must track
+ * held locks for each thread, and each thread's held locks must be reachable
+ * from its six_lock_waiter object.
+ *
+ * six_lock_waiter() will add the wait object to the waitlist before re-trying
+ * to take the lock and before calling should_sleep_fn, and the wait object will
+ * not be removed from the waitlist until either the lock has been successfully
+ * acquired, or the attempt was aborted because should_sleep_fn returned an error.
+ *
+ * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will
+ * have timestamps in strictly ascending order - this is so the timestamp can
+ * be used as a cursor for lock graph traversal.
*/
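
/*
 * Editorial sketch, not part of this patch: what a should_sleep_fn callback
 * might look like.  It runs after the waiter has been added to the waitlist,
 * immediately before scheduling, and a nonzero return aborts the lock attempt
 * with that error.  "struct foo" and foo_is_stale() are hypothetical.
 */
static int foo_should_sleep(struct six_lock *lock, void *p)
{
	struct foo *f = p;

	/* abort if the object was reused out from under us: */
	return foo_is_stale(f) ? -EINTR : 0;
}

/* typical call: ret = six_lock_intent(&f->lock, foo_should_sleep, f); */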
#include <linux/lockdep.h>
-#include <linux/osq_lock.h>
#include <linux/sched.h>
#include <linux/types.h>
-#include "util.h"
-
-#define SIX_LOCK_SEPARATE_LOCKFNS
-
-union six_lock_state {
- struct {
- atomic64_t counter;
- };
-
- struct {
- u64 v;
- };
-
- struct {
- /* for waitlist_bitnr() */
- unsigned long l;
- };
-
- struct {
- unsigned read_lock:26;
- unsigned intent_lock:3;
- unsigned waiters:3;
- /*
- * seq works much like in seqlocks: it's incremented every time
- * we lock and unlock for write.
- *
- * If it's odd write lock is held, even unlocked.
- *
- * Thus readers can unlock, and then lock again later iff it
- * hasn't been modified in the meantime.
- */
- u32 seq;
- };
-};
-
-#define SIX_LOCK_MAX_RECURSE ((1 << 3) - 1)
-
enum six_lock_type {
SIX_LOCK_read,
SIX_LOCK_intent,
@@ -105,112 +134,232 @@ enum six_lock_type {
};
struct six_lock {
- union six_lock_state state;
+ atomic_t state;
+ u32 seq;
+ unsigned intent_lock_recurse;
struct task_struct *owner;
- struct optimistic_spin_queue osq;
-
+ unsigned __percpu *readers;
raw_spinlock_t wait_lock;
- struct list_head wait_list[2];
+ struct list_head wait_list;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
};
-static __always_inline void __six_lock_init(struct six_lock *lock,
- const char *name,
- struct lock_class_key *key)
-{
- atomic64_set(&lock->state.counter, 0);
- raw_spin_lock_init(&lock->wait_lock);
- INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]);
- INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- debug_check_no_locks_freed((void *) lock, sizeof(*lock));
- lockdep_init_map(&lock->dep_map, name, key, 0);
-#endif
-}
+struct six_lock_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ enum six_lock_type lock_want;
+ bool lock_acquired;
+ u64 start_time;
+};
+
+typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
+
+void six_lock_exit(struct six_lock *lock);
+
+enum six_lock_init_flags {
+ SIX_LOCK_INIT_PCPU = 1U << 0,
+};
-#define six_lock_init(lock) \
+void __six_lock_init(struct six_lock *lock, const char *name,
+ struct lock_class_key *key, enum six_lock_init_flags flags);
+
+/**
+ * six_lock_init - initialize a six lock
+ * @lock: lock to initialize
+ * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU
+ */
+#define six_lock_init(lock, flags) \
do { \
static struct lock_class_key __key; \
\
- __six_lock_init((lock), #lock, &__key); \
+ __six_lock_init((lock), #lock, &__key, flags); \
} while (0)
-#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v)
-
-#ifdef SIX_LOCK_SEPARATE_LOCKFNS
+/**
+ * six_lock_seq - obtain current lock sequence number
+ * @lock: six_lock to obtain sequence number for
+ *
+ * @lock should be held for read or intent, and not write
+ *
+ * By saving the lock sequence number, we can unlock @lock and then (typically
+ * after some blocking operation) attempt to relock it: the relock will succeed
+ * if the sequence number hasn't changed, meaning no write locks have been taken
+ * and state corresponding to what @lock protects is still valid.
+ */
+static inline u32 six_lock_seq(const struct six_lock *lock)
+{
+ return lock->seq;
+}
-#define __SIX_LOCK(type) \
-bool six_trylock_##type(struct six_lock *); \
-bool six_relock_##type(struct six_lock *, u32); \
-void six_lock_##type(struct six_lock *); \
-void six_unlock_##type(struct six_lock *);
+bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
-__SIX_LOCK(read)
-__SIX_LOCK(intent)
-__SIX_LOCK(write)
-#undef __SIX_LOCK
+/**
+ * six_trylock_type - attempt to take a six lock without blocking
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ *
+ * Return: true on success, false on failure.
+ */
+static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ return six_trylock_ip(lock, type, _THIS_IP_);
+}
-#define SIX_LOCK_DISPATCH(type, fn, ...) \
- switch (type) { \
- case SIX_LOCK_read: \
- return fn##_read(__VA_ARGS__); \
- case SIX_LOCK_intent: \
- return fn##_intent(__VA_ARGS__); \
- case SIX_LOCK_write: \
- return fn##_write(__VA_ARGS__); \
- default: \
- BUG(); \
- }
+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip);
-static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
+/**
+ * six_lock_waiter - take a lock, with full waitlist interface
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @wait: pointer to wait object, which will be added to lock's waitlist
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ * to scheduling
+ * @p: passed through to @should_sleep_fn
+ *
+ * This is a convenience wrapper around six_lock_ip_waiter(), see that function
+ * for full documentation.
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
{
- SIX_LOCK_DISPATCH(type, six_trylock, lock);
+ return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_);
}
-static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
- unsigned seq)
+/**
+ * six_lock_ip - take a six lock
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ * to scheduling
+ * @p: passed through to @should_sleep_fn
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type,
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
{
- SIX_LOCK_DISPATCH(type, six_relock, lock, seq);
+ struct six_lock_waiter wait;
+
+ return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip);
}
-static inline void six_lock_type(struct six_lock *lock, enum six_lock_type type)
+/**
+ * six_lock_type - take a six lock
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ * to scheduling
+ * @p: passed through to @should_sleep_fn
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
{
- SIX_LOCK_DISPATCH(type, six_lock, lock);
+ struct six_lock_waiter wait;
+
+ return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_);
}
-static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
+ unsigned seq, unsigned long ip);
+
+/**
+ * six_relock_type - attempt to re-take a lock that was held previously
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @seq: lock sequence number obtained from six_lock_seq() while lock was
+ * held previously
+ *
+ * Return: true on success, false on failure.
+ */
+static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
+ unsigned seq)
{
- SIX_LOCK_DISPATCH(type, six_unlock, lock);
+ return six_relock_ip(lock, type, seq, _THIS_IP_);
}
-#else
+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
-bool six_trylock_type(struct six_lock *, enum six_lock_type);
-bool six_relock_type(struct six_lock *, enum six_lock_type, unsigned);
-void six_lock_type(struct six_lock *, enum six_lock_type);
-void six_unlock_type(struct six_lock *, enum six_lock_type);
+/**
+ * six_unlock_type - drop a six lock
+ * @lock: lock to unlock
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ *
+ * When a lock is held multiple times (because six_lock_increment() was used),
+ * this decrements the 'lock held' counter by one.
+ *
+ * For example:
+ * six_lock_read(&foo->lock); read count 1
+ * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2
+ *   six_unlock_type(&foo->lock, SIX_LOCK_read);	read count 1
+ *   six_unlock_type(&foo->lock, SIX_LOCK_read);	read count 0
+ */
+static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ six_unlock_ip(lock, type, _THIS_IP_);
+}
#define __SIX_LOCK(type) \
-static __always_inline bool six_trylock_##type(struct six_lock *lock) \
+static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\
+{ \
+ return six_trylock_ip(lock, SIX_LOCK_##type, ip); \
+} \
+ \
+static inline bool six_trylock_##type(struct six_lock *lock) \
+{ \
+ return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \
+} \
+ \
+static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \
+ struct six_lock_waiter *wait, \
+ six_lock_should_sleep_fn should_sleep_fn, void *p,\
+ unsigned long ip) \
+{ \
+ return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\
+} \
+ \
+static inline int six_lock_ip_##type(struct six_lock *lock, \
+ six_lock_should_sleep_fn should_sleep_fn, void *p, \
+ unsigned long ip) \
{ \
- return six_trylock_type(lock, SIX_LOCK_##type); \
+ return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\
} \
\
-static __always_inline bool six_relock_##type(struct six_lock *lock, u32 seq)\
+static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\
{ \
- return six_relock_type(lock, SIX_LOCK_##type, seq); \
+ return six_relock_ip(lock, SIX_LOCK_##type, seq, ip); \
} \
\
-static __always_inline void six_lock_##type(struct six_lock *lock) \
+static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \
{ \
- six_lock_type(lock, SIX_LOCK_##type); \
+ return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_); \
} \
\
-static __always_inline void six_unlock_##type(struct six_lock *lock) \
+static inline int six_lock_##type(struct six_lock *lock, \
+ six_lock_should_sleep_fn fn, void *p)\
{ \
- six_unlock_type(lock, SIX_LOCK_##type); \
+ return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \
+} \
+ \
+static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \
+{ \
+ six_unlock_ip(lock, SIX_LOCK_##type, ip); \
+} \
+ \
+static inline void six_unlock_##type(struct six_lock *lock) \
+{ \
+ six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \
}
__SIX_LOCK(read)
@@ -218,8 +367,6 @@ __SIX_LOCK(intent)
__SIX_LOCK(write)
#undef __SIX_LOCK
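
/*
 * Editorial sketch, not part of this patch: the per-type wrappers generated
 * above take a should_sleep_fn and a context pointer; passing NULL for both
 * is assumed to simply block until the lock is acquired.  An intent lock must
 * be held before taking the write lock.  "struct foo" (embedding a struct
 * six_lock named "lock") is hypothetical.
 */
static void foo_update(struct foo *f)
{
	six_lock_intent(&f->lock, NULL, NULL);
	six_lock_write(&f->lock, NULL, NULL);

	/* ... modify the structure protected by f->lock ... */

	six_unlock_write(&f->lock);
	six_unlock_intent(&f->lock);
}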
-#endif
-
void six_lock_downgrade(struct six_lock *);
bool six_lock_tryupgrade(struct six_lock *);
bool six_trylock_convert(struct six_lock *, enum six_lock_type,
@@ -227,4 +374,13 @@ bool six_trylock_convert(struct six_lock *, enum six_lock_type,
void six_lock_increment(struct six_lock *, enum six_lock_type);
-#endif /* _BCACHEFS_SIX_H */
+void six_lock_wakeup_all(struct six_lock *);
+
+struct six_lock_count {
+ unsigned n[3];
+};
+
+struct six_lock_count six_lock_counts(struct six_lock *);
+void six_lock_readers_add(struct six_lock *, int);
+
+#endif /* _LINUX_SIX_H */
diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c
new file mode 100644
index 00000000..f368270d
--- /dev/null
+++ b/libbcachefs/snapshot.c
@@ -0,0 +1,1823 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "errcode.h"
+#include "error.h"
+#include "fs.h"
+#include "recovery_passes.h"
+#include "snapshot.h"
+
+#include <linux/random.h>
+
+/*
+ * Snapshot trees:
+ *
+ * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
+ * exist to provide a stable identifier for the whole lifetime of a snapshot
+ * tree.
+ */
+
+void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);
+
+ prt_printf(out, "subvol %u root snapshot %u",
+ le32_to_cpu(t.v->master_subvol),
+ le32_to_cpu(t.v->root_snapshot));
+}
+
+int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
+ bkey_lt(k.k->p, POS(0, 1)),
+ c, snapshot_tree_pos_bad,
+ "bad pos");
+fsck_err:
+ return ret;
+}
+
+int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
+ struct bch_snapshot_tree *s)
+{
+ int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
+ BTREE_ITER_with_updates, snapshot_tree, s);
+
+ if (bch2_err_matches(ret, ENOENT))
+ ret = -BCH_ERR_ENOENT_snapshot_tree;
+ return ret;
+}
+
+struct bkey_i_snapshot_tree *
+__bch2_snapshot_tree_create(struct btree_trans *trans)
+{
+ struct btree_iter iter;
+ int ret = bch2_bkey_get_empty_slot(trans, &iter,
+ BTREE_ID_snapshot_trees, POS(0, U32_MAX));
+ struct bkey_i_snapshot_tree *s_t;
+
+ if (ret == -BCH_ERR_ENOSPC_btree_slot)
+ ret = -BCH_ERR_ENOSPC_snapshot_tree;
+ if (ret)
+ return ERR_PTR(ret);
+
+ s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
+ ret = PTR_ERR_OR_ZERO(s_t);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret ? ERR_PTR(ret) : s_t;
+}
+
+static int bch2_snapshot_tree_create(struct btree_trans *trans,
+ u32 root_id, u32 subvol_id, u32 *tree_id)
+{
+ struct bkey_i_snapshot_tree *n_tree =
+ __bch2_snapshot_tree_create(trans);
+
+ if (IS_ERR(n_tree))
+ return PTR_ERR(n_tree);
+
+ n_tree->v.master_subvol = cpu_to_le32(subvol_id);
+ n_tree->v.root_snapshot = cpu_to_le32(root_id);
+ *tree_id = n_tree->k.p.offset;
+ return 0;
+}
+
+/* Snapshot nodes: */
+
+static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
+{
+ while (id && id < ancestor) {
+ const struct snapshot_t *s = __snapshot_t(t, id);
+ id = s ? s->parent : 0;
+ }
+ return id == ancestor;
+}
+
+static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
+{
+ rcu_read_lock();
+ bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
+{
+ const struct snapshot_t *s = __snapshot_t(t, id);
+ if (!s)
+ return 0;
+
+ if (s->skip[2] <= ancestor)
+ return s->skip[2];
+ if (s->skip[1] <= ancestor)
+ return s->skip[1];
+ if (s->skip[0] <= ancestor)
+ return s->skip[0];
+ return s->parent;
+}
+
+static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
+{
+ const struct snapshot_t *s = __snapshot_t(t, id);
+ if (!s)
+ return false;
+
+ return test_bit(ancestor - id - 1, s->is_ancestor);
+}
+
+bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
+{
+ bool ret;
+
+ rcu_read_lock();
+ struct snapshot_table *t = rcu_dereference(c->snapshots);
+
+ if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
+ ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
+ goto out;
+ }
+
+ while (id && id < ancestor - IS_ANCESTOR_BITMAP)
+ id = get_ancestor_below(t, id, ancestor);
+
+ ret = id && id < ancestor
+ ? test_ancestor_bitmap(t, id, ancestor)
+ : id == ancestor;
+
+ EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
+out:
+ rcu_read_unlock();
+
+ return ret;
+}
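
/*
 * Editorial note, not part of this patch: __bch2_snapshot_is_ancestor() first
 * uses each node's skip[] pointers to jump up the tree until @id is within
 * IS_ANCESTOR_BITMAP ids of @ancestor, then answers the final step from the
 * per-node is_ancestor bitmap - so the check is roughly logarithmic in tree
 * depth rather than walking every parent pointer.
 */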
+
+static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
+{
+ size_t idx = U32_MAX - id;
+ struct snapshot_table *new, *old;
+
+ size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
+ size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);
+
+ if (unlikely(new_bytes > INT_MAX))
+ return NULL;
+
+ new = kvzalloc(new_bytes, GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ new->nr = new_size;
+
+ old = rcu_dereference_protected(c->snapshots, true);
+ if (old)
+ memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);
+
+ rcu_assign_pointer(c->snapshots, new);
+ kvfree_rcu(old, rcu);
+
+ return &rcu_dereference_protected(c->snapshots,
+ lockdep_is_held(&c->snapshot_table_lock))->s[idx];
+}
+
+static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
+{
+ size_t idx = U32_MAX - id;
+ struct snapshot_table *table =
+ rcu_dereference_protected(c->snapshots,
+ lockdep_is_held(&c->snapshot_table_lock));
+
+ lockdep_assert_held(&c->snapshot_table_lock);
+
+ if (likely(table && idx < table->nr))
+ return &table->s[idx];
+
+ return __snapshot_t_mut(c, id);
+}
+
+void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
+
+ prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
+ BCH_SNAPSHOT_SUBVOL(s.v),
+ BCH_SNAPSHOT_DELETED(s.v),
+ le32_to_cpu(s.v->parent),
+ le32_to_cpu(s.v->children[0]),
+ le32_to_cpu(s.v->children[1]),
+ le32_to_cpu(s.v->subvol),
+ le32_to_cpu(s.v->tree));
+
+ if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
+ prt_printf(out, " depth %u skiplist %u %u %u",
+ le32_to_cpu(s.v->depth),
+ le32_to_cpu(s.v->skip[0]),
+ le32_to_cpu(s.v->skip[1]),
+ le32_to_cpu(s.v->skip[2]));
+}
+
+int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ struct bkey_s_c_snapshot s;
+ u32 i, id;
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
+ bkey_lt(k.k->p, POS(0, 1)),
+ c, snapshot_pos_bad,
+ "bad pos");
+
+ s = bkey_s_c_to_snapshot(k);
+
+ id = le32_to_cpu(s.v->parent);
+ bkey_fsck_err_on(id && id <= k.k->p.offset,
+ c, snapshot_parent_bad,
+ "bad parent node (%u <= %llu)",
+ id, k.k->p.offset);
+
+ bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]),
+ c, snapshot_children_not_normalized,
+ "children not normalized");
+
+ bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1],
+ c, snapshot_child_duplicate,
+ "duplicate child nodes");
+
+ for (i = 0; i < 2; i++) {
+ id = le32_to_cpu(s.v->children[i]);
+
+ bkey_fsck_err_on(id >= k.k->p.offset,
+ c, snapshot_child_bad,
+ "bad child node (%u >= %llu)",
+ id, k.k->p.offset);
+ }
+
+ if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
+ bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
+ le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]),
+ c, snapshot_skiplist_not_normalized,
+ "skiplist not normalized");
+
+ for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
+ id = le32_to_cpu(s.v->skip[i]);
+
+ bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent),
+ c, snapshot_skiplist_bad,
+ "bad skiplist node %u", id);
+ }
+ }
+fsck_err:
+ return ret;
+}
+
+static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
+{
+ struct snapshot_t *t = snapshot_t_mut(c, id);
+ u32 parent = id;
+
+ while ((parent = bch2_snapshot_parent_early(c, parent)) &&
+ parent - id - 1 < IS_ANCESTOR_BITMAP)
+ __set_bit(parent - id - 1, t->is_ancestor);
+}
+
+static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
+{
+ mutex_lock(&c->snapshot_table_lock);
+ __set_is_ancestor_bitmap(c, id);
+ mutex_unlock(&c->snapshot_table_lock);
+}
+
+static int __bch2_mark_snapshot(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct bch_fs *c = trans->c;
+ struct snapshot_t *t;
+ u32 id = new.k->p.offset;
+ int ret = 0;
+
+ mutex_lock(&c->snapshot_table_lock);
+
+ t = snapshot_t_mut(c, id);
+ if (!t) {
+ ret = -BCH_ERR_ENOMEM_mark_snapshot;
+ goto err;
+ }
+
+ if (new.k->type == KEY_TYPE_snapshot) {
+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
+
+ t->parent = le32_to_cpu(s.v->parent);
+ t->children[0] = le32_to_cpu(s.v->children[0]);
+ t->children[1] = le32_to_cpu(s.v->children[1]);
+ t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
+ t->tree = le32_to_cpu(s.v->tree);
+
+ if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
+ t->depth = le32_to_cpu(s.v->depth);
+ t->skip[0] = le32_to_cpu(s.v->skip[0]);
+ t->skip[1] = le32_to_cpu(s.v->skip[1]);
+ t->skip[2] = le32_to_cpu(s.v->skip[2]);
+ } else {
+ t->depth = 0;
+ t->skip[0] = 0;
+ t->skip[1] = 0;
+ t->skip[2] = 0;
+ }
+
+ __set_is_ancestor_bitmap(c, id);
+
+ if (BCH_SNAPSHOT_DELETED(s.v)) {
+ set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
+ bch2_delete_dead_snapshots_async(c);
+ }
+ } else {
+ memset(t, 0, sizeof(*t));
+ }
+err:
+ mutex_unlock(&c->snapshot_table_lock);
+ return ret;
+}
+
+int bch2_mark_snapshot(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ enum btree_iter_update_trigger_flags flags)
+{
+ return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
+}
+
+int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
+ struct bch_snapshot *s)
+{
+ return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
+ BTREE_ITER_with_updates, snapshot, s);
+}
+
+static int bch2_snapshot_live(struct btree_trans *trans, u32 id)
+{
+ struct bch_snapshot v;
+ int ret;
+
+ if (!id)
+ return 0;
+
+ ret = bch2_snapshot_lookup(trans, id, &v);
+ if (bch2_err_matches(ret, ENOENT))
+ bch_err(trans->c, "snapshot node %u not found", id);
+ if (ret)
+ return ret;
+
+ return !BCH_SNAPSHOT_DELETED(&v);
+}
+
+/*
+ * If @k is a snapshot with just one live child, it's part of a linear chain,
+ * which we consider to be an equivalence class: and then after snapshot
+ * deletion cleanup, there should only be a single key at a given position in
+ * this equivalence class.
+ *
+ * This sets the equivalence class of @k to be the child's equivalence class, if
+ * it's part of such a linear chain: this correctly sets equivalence classes on
+ * startup if we run leaf to root (i.e. in natural key order).
+ */
+static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ unsigned i, nr_live = 0, live_idx = 0;
+ struct bkey_s_c_snapshot snap;
+ u32 id = k.k->p.offset, child[2];
+
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
+
+ snap = bkey_s_c_to_snapshot(k);
+
+ child[0] = le32_to_cpu(snap.v->children[0]);
+ child[1] = le32_to_cpu(snap.v->children[1]);
+
+ for (i = 0; i < 2; i++) {
+ int ret = bch2_snapshot_live(trans, child[i]);
+
+ if (ret < 0)
+ return ret;
+
+ if (ret)
+ live_idx = i;
+ nr_live += ret;
+ }
+
+ mutex_lock(&c->snapshot_table_lock);
+
+ snapshot_t_mut(c, id)->equiv = nr_live == 1
+ ? snapshot_t_mut(c, child[live_idx])->equiv
+ : id;
+
+ mutex_unlock(&c->snapshot_table_lock);
+
+ return 0;
+}
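
/*
 * Editorial note, not part of this patch: a worked example of the above.
 * With a linear chain 10 -> 5 -> 2 (parent ids are always greater than child
 * ids), iterating the snapshots btree in ascending key order visits 2, then 5,
 * then 10: equiv(2) = 2, equiv(5) = equiv(2) = 2, equiv(10) = equiv(5) = 2,
 * so the whole chain shares the leaf's equivalence class.
 */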
+
+/* fsck: */
+
+static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
+{
+ return snapshot_t(c, id)->children[child];
+}
+
+static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
+{
+ return bch2_snapshot_child(c, id, 0);
+}
+
+static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
+{
+ return bch2_snapshot_child(c, id, 1);
+}
+
+static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
+{
+ u32 n, parent;
+
+ n = bch2_snapshot_left_child(c, id);
+ if (n)
+ return n;
+
+ while ((parent = bch2_snapshot_parent(c, id))) {
+ n = bch2_snapshot_right_child(c, parent);
+ if (n && n != id)
+ return n;
+ id = parent;
+ }
+
+ return 0;
+}
+
+static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
+{
+ u32 id = snapshot_root;
+ u32 subvol = 0, s;
+
+ rcu_read_lock();
+ while (id) {
+ s = snapshot_t(c, id)->subvol;
+
+ if (s && (!subvol || s < subvol))
+ subvol = s;
+
+ id = bch2_snapshot_tree_next(c, id);
+ }
+ rcu_read_unlock();
+
+ return subvol;
+}
+
+static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
+ u32 snapshot_root, u32 *subvol_id)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ bool found = false;
+ int ret;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
+ 0, k, ret) {
+ if (k.k->type != KEY_TYPE_subvolume)
+ continue;
+
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+ if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
+ continue;
+ if (!BCH_SUBVOLUME_SNAP(s.v)) {
+ *subvol_id = s.k->p.offset;
+ found = true;
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (!ret && !found) {
+ struct bkey_i_subvolume *u;
+
+ *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);
+
+ u = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, *subvol_id),
+ 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
+
+ SET_BCH_SUBVOLUME_SNAP(&u->v, false);
+ }
+
+ return ret;
+}
+
+static int check_snapshot_tree(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c_snapshot_tree st;
+ struct bch_snapshot s;
+ struct bch_subvolume subvol;
+ struct printbuf buf = PRINTBUF;
+ struct btree_iter snapshot_iter = {};
+ u32 root_id;
+ int ret;
+
+ if (k.k->type != KEY_TYPE_snapshot_tree)
+ return 0;
+
+ st = bkey_s_c_to_snapshot_tree(k);
+ root_id = le32_to_cpu(st.v->root_snapshot);
+
+ struct bkey_s_c_snapshot snapshot_k =
+ bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots,
+ POS(0, root_id), 0, snapshot);
+ ret = bkey_err(snapshot_k);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
+
+ if (!ret)
+ bkey_val_copy(&s, snapshot_k);
+
+ if (fsck_err_on(ret ||
+ root_id != bch2_snapshot_root(c, root_id) ||
+ st.k->p.offset != le32_to_cpu(s.tree),
+ trans, snapshot_tree_to_missing_snapshot,
+ "snapshot tree points to missing/incorrect snapshot:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, st.s_c),
+ prt_newline(&buf),
+ ret
+ ? prt_printf(&buf, "(%s)", bch2_err_str(ret))
+ : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c),
+ buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter, 0);
+ goto err;
+ }
+
+ ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol),
+ false, 0, &subvol);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
+
+ if (fsck_err_on(ret,
+ trans, snapshot_tree_to_missing_subvol,
+ "snapshot tree points to missing subvolume:\n %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
+ fsck_err_on(!bch2_snapshot_is_ancestor(c,
+ le32_to_cpu(subvol.snapshot),
+ root_id),
+ trans, snapshot_tree_to_wrong_subvol,
+ "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
+ fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
+ trans, snapshot_tree_to_snapshot_subvol,
+ "snapshot tree points to snapshot subvolume:\n %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
+ struct bkey_i_snapshot_tree *u;
+ u32 subvol_id;
+
+ ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
+ bch_err_fn(c, ret);
+
+ if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
+ ret = 0;
+ goto err;
+ }
+
+ if (ret)
+ goto err;
+
+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto err;
+
+ u->v.master_subvol = cpu_to_le32(subvol_id);
+ st = snapshot_tree_i_to_s_c(u);
+ }
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &snapshot_iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/*
+ * For each snapshot_tree, make sure it points to the root of a snapshot tree
+ * and that snapshot entry points back to it, or delete it.
+ *
+ * And, make sure it points to a subvolume within that snapshot tree, or correct
+ * it to point to the oldest subvolume within that snapshot tree.
+ */
+int bch2_check_snapshot_trees(struct bch_fs *c)
+{
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_snapshot_trees, POS_MIN,
+ BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_snapshot_tree(trans, &iter, k)));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+/*
+ * Look up snapshot tree for @tree_id and find root,
+ * make sure @snap_id is a descendant:
+ */
+static int snapshot_tree_ptr_good(struct btree_trans *trans,
+ u32 snap_id, u32 tree_id)
+{
+ struct bch_snapshot_tree s_t;
+ int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
+
+ if (bch2_err_matches(ret, ENOENT))
+ return 0;
+ if (ret)
+ return ret;
+
+ return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
+}
+
+u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
+{
+ const struct snapshot_t *s;
+
+ if (!id)
+ return 0;
+
+ rcu_read_lock();
+ s = snapshot_t(c, id);
+ if (s->parent)
+ id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
+ rcu_read_unlock();
+
+ return id;
+}
+
+static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
+{
+ unsigned i;
+
+ for (i = 0; i < 3; i++)
+ if (!s.parent) {
+ if (s.skip[i])
+ return false;
+ } else {
+ if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
+ * its snapshot_tree pointer is correct (allocate new one if necessary), then
+ * update this node's pointer to root node's pointer:
+ */
+static int snapshot_tree_ptr_repair(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bch_snapshot *s)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter root_iter;
+ struct bch_snapshot_tree s_t;
+ struct bkey_s_c_snapshot root;
+ struct bkey_i_snapshot *u;
+ u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
+ int ret;
+
+ root = bch2_bkey_get_iter_typed(trans, &root_iter,
+ BTREE_ID_snapshots, POS(0, root_id),
+ BTREE_ITER_with_updates, snapshot);
+ ret = bkey_err(root);
+ if (ret)
+ goto err;
+
+ tree_id = le32_to_cpu(root.v->tree);
+
+ ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
+ u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(u) ?:
+ bch2_snapshot_tree_create(trans, root_id,
+ bch2_snapshot_tree_oldest_subvol(c, root_id),
+ &tree_id);
+ if (ret)
+ goto err;
+
+ u->v.tree = cpu_to_le32(tree_id);
+ if (k.k->p.offset == root_id)
+ *s = u->v;
+ }
+
+ if (k.k->p.offset != root_id) {
+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto err;
+
+ u->v.tree = cpu_to_le32(tree_id);
+ *s = u->v;
+ }
+err:
+ bch2_trans_iter_exit(trans, &root_iter);
+ return ret;
+}
+
+static int check_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_snapshot s;
+ struct bch_subvolume subvol;
+ struct bch_snapshot v;
+ struct bkey_i_snapshot *u;
+ u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
+ u32 real_depth;
+ struct printbuf buf = PRINTBUF;
+ u32 i, id;
+ int ret = 0;
+
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
+
+ memset(&s, 0, sizeof(s));
+ memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));
+
+ id = le32_to_cpu(s.parent);
+ if (id) {
+ ret = bch2_snapshot_lookup(trans, id, &v);
+ if (bch2_err_matches(ret, ENOENT))
+ bch_err(c, "snapshot with nonexistent parent:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ if (ret)
+ goto err;
+
+ if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
+ le32_to_cpu(v.children[1]) != k.k->p.offset) {
+ bch_err(c, "snapshot parent %u missing pointer to child %llu",
+ id, k.k->p.offset);
+ ret = -EINVAL;
+ goto err;
+ }
+ }
+
+ for (i = 0; i < 2 && s.children[i]; i++) {
+ id = le32_to_cpu(s.children[i]);
+
+ ret = bch2_snapshot_lookup(trans, id, &v);
+ if (bch2_err_matches(ret, ENOENT))
+ bch_err(c, "snapshot node %llu has nonexistent child %u",
+ k.k->p.offset, id);
+ if (ret)
+ goto err;
+
+ if (le32_to_cpu(v.parent) != k.k->p.offset) {
+ bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
+ id, le32_to_cpu(v.parent), k.k->p.offset);
+ ret = -EINVAL;
+ goto err;
+ }
+ }
+
+ bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
+ !BCH_SNAPSHOT_DELETED(&s);
+
+ if (should_have_subvol) {
+ id = le32_to_cpu(s.subvol);
+ ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
+ if (bch2_err_matches(ret, ENOENT))
+ bch_err(c, "snapshot points to nonexistent subvolume:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ if (ret)
+ goto err;
+
+ if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
+ bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
+ k.k->p.offset);
+ ret = -EINVAL;
+ goto err;
+ }
+ } else {
+ if (fsck_err_on(s.subvol,
+ trans, snapshot_should_not_have_subvol,
+ "snapshot should not point to subvol:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto err;
+
+ u->v.subvol = 0;
+ s = u->v;
+ }
+ }
+
+ ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
+ if (ret < 0)
+ goto err;
+
+ if (fsck_err_on(!ret,
+ trans, snapshot_to_bad_snapshot_tree,
+ "snapshot points to missing/incorrect tree:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
+ if (ret)
+ goto err;
+ }
+ ret = 0;
+
+ real_depth = bch2_snapshot_depth(c, parent_id);
+
+ if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
+ trans, snapshot_bad_depth,
+ "snapshot with incorrect depth field, should be %u:\n %s",
+ real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto err;
+
+ u->v.depth = cpu_to_le32(real_depth);
+ s = u->v;
+ }
+
+ ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
+ if (ret < 0)
+ goto err;
+
+ if (fsck_err_on(!ret,
+ trans, snapshot_bad_skiplist,
+ "snapshot with bad skiplist field:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto err;
+
+ for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
+ u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));
+
+ bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
+ s = u->v;
+ }
+ ret = 0;
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_check_snapshots(struct bch_fs *c)
+{
+ /*
+ * We iterate backwards as checking/fixing the depth field requires that
+ * the parent's depth already be correct:
+ */
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_reverse_commit(trans, iter,
+ BTREE_ID_snapshots, POS_MAX,
+ BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_snapshot(trans, &iter, k)));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int check_snapshot_exists(struct btree_trans *trans, u32 id)
+{
+ struct bch_fs *c = trans->c;
+
+ if (bch2_snapshot_equiv(c, id))
+ return 0;
+
+ /* Do we need to reconstruct the snapshot_tree entry as well? */
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+ u32 tree_id = 0;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
+ 0, k, ret) {
+ if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
+ tree_id = k.k->p.offset;
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ return ret;
+
+ if (!tree_id) {
+ ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
+ if (ret)
+ return ret;
+ }
+
+ struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
+ ret = PTR_ERR_OR_ZERO(snapshot);
+ if (ret)
+ return ret;
+
+ bkey_snapshot_init(&snapshot->k_i);
+ snapshot->k.p = POS(0, id);
+ snapshot->v.tree = cpu_to_le32(tree_id);
+ snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c));
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
+ 0, k, ret) {
+ if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
+ snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
+ SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
+ bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
+ bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?:
+ bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i));
+}
+
+/* Figure out which snapshot nodes belong in the same tree: */
+struct snapshot_tree_reconstruct {
+ enum btree_id btree;
+ struct bpos cur_pos;
+ snapshot_id_list cur_ids;
+ DARRAY(snapshot_id_list) trees;
+};
+
+static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
+{
+ darray_for_each(r->trees, i)
+ darray_exit(i);
+ darray_exit(&r->trees);
+ darray_exit(&r->cur_ids);
+}
+
+static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
+{
+ return r->btree == BTREE_ID_inodes
+ ? r->cur_pos.offset == pos.offset
+ : r->cur_pos.inode == pos.inode;
+}
+
+static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
+{
+ darray_for_each(*l, i)
+ if (snapshot_list_has_id(r, *i))
+ return true;
+ return false;
+}
+
+static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
+{
+ bool first = true;
+ darray_for_each(*s, i) {
+ if (!first)
+ prt_char(out, ' ');
+ first = false;
+ prt_printf(out, "%u", *i);
+ }
+}
+
+static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
+{
+ if (r->cur_ids.nr) {
+ darray_for_each(r->trees, i)
+ if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
+ int ret = snapshot_list_merge(c, i, &r->cur_ids);
+ if (ret)
+ return ret;
+ goto out;
+ }
+ darray_push(&r->trees, r->cur_ids);
+ darray_init(&r->cur_ids);
+ }
+out:
+ r->cur_ids.nr = 0;
+ return 0;
+}
+
+static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
+{
+ if (!same_snapshot(r, pos))
+ snapshot_tree_reconstruct_next(c, r);
+ r->cur_pos = pos;
+ return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
+}
+
+int bch2_reconstruct_snapshots(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct printbuf buf = PRINTBUF;
+ struct snapshot_tree_reconstruct r = {};
+ int ret = 0;
+
+ for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
+ if (btree_type_has_snapshots(btree)) {
+ r.btree = btree;
+
+ ret = for_each_btree_key(trans, iter, btree, POS_MIN,
+ BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
+ get_snapshot_trees(c, &r, k.k->p);
+ }));
+ if (ret)
+ goto err;
+
+ snapshot_tree_reconstruct_next(c, &r);
+ }
+ }
+
+ darray_for_each(r.trees, t) {
+ printbuf_reset(&buf);
+ snapshot_id_list_to_text(&buf, t);
+
+ darray_for_each(*t, id) {
+ if (fsck_err_on(!bch2_snapshot_equiv(c, *id),
+ trans, snapshot_node_missing,
+ "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
+ if (t->nr > 1) {
+ bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto err;
+ }
+
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_snapshot_exists(trans, *id));
+ if (ret)
+ goto err;
+ }
+ }
+ }
+fsck_err:
+err:
+ bch2_trans_put(trans);
+ snapshot_tree_reconstruct_exit(&r);
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_check_key_has_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot),
+ trans, bkey_in_missing_snapshot,
+ "key in missing snapshot %s, delete?",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_internal_snapshot_node) ?: 1;
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/*
+ * Mark a snapshot as deleted, for future cleanup:
+ */
+int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
+{
+ struct btree_iter iter;
+ struct bkey_i_snapshot *s;
+ int ret = 0;
+
+ s = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_snapshots, POS(0, id),
+ 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(s);
+ if (unlikely(ret)) {
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
+ trans->c, "missing snapshot %u", id);
+ return ret;
+ }
+
+ /* already deleted? */
+ if (BCH_SNAPSHOT_DELETED(&s->v))
+ goto err;
+
+ SET_BCH_SNAPSHOT_DELETED(&s->v, true);
+ SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
+ s->v.subvol = 0;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
+{
+ if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
+ swap(s->children[0], s->children[1]);
+}
+
+static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
+ struct btree_iter c_iter = (struct btree_iter) { NULL };
+ struct btree_iter tree_iter = (struct btree_iter) { NULL };
+ struct bkey_s_c_snapshot s;
+ u32 parent_id, child_id;
+ unsigned i;
+ int ret = 0;
+
+ s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
+ BTREE_ITER_intent, snapshot);
+ ret = bkey_err(s);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+ "missing snapshot %u", id);
+
+ if (ret)
+ goto err;
+
+ BUG_ON(s.v->children[1]);
+
+ parent_id = le32_to_cpu(s.v->parent);
+ child_id = le32_to_cpu(s.v->children[0]);
+
+ if (parent_id) {
+ struct bkey_i_snapshot *parent;
+
+ parent = bch2_bkey_get_mut_typed(trans, &p_iter,
+ BTREE_ID_snapshots, POS(0, parent_id),
+ 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(parent);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+ "missing snapshot %u", parent_id);
+ if (unlikely(ret))
+ goto err;
+
+ /* find entry in parent->children for node being deleted */
+ for (i = 0; i < 2; i++)
+ if (le32_to_cpu(parent->v.children[i]) == id)
+ break;
+
+ if (bch2_fs_inconsistent_on(i == 2, c,
+ "snapshot %u missing child pointer to %u",
+ parent_id, id))
+ goto err;
+
+ parent->v.children[i] = cpu_to_le32(child_id);
+
+ normalize_snapshot_child_pointers(&parent->v);
+ }
+
+ if (child_id) {
+ struct bkey_i_snapshot *child;
+
+ child = bch2_bkey_get_mut_typed(trans, &c_iter,
+ BTREE_ID_snapshots, POS(0, child_id),
+ 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(child);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+ "missing snapshot %u", child_id);
+ if (unlikely(ret))
+ goto err;
+
+ child->v.parent = cpu_to_le32(parent_id);
+
+ if (!child->v.parent) {
+ child->v.skip[0] = 0;
+ child->v.skip[1] = 0;
+ child->v.skip[2] = 0;
+ }
+ }
+
+ if (!parent_id) {
+ /*
+ * We're deleting the root of a snapshot tree: update the
+ * snapshot_tree entry to point to the new root, or delete it if
+ * this is the last snapshot ID in this tree:
+ */
+ struct bkey_i_snapshot_tree *s_t;
+
+ BUG_ON(s.v->children[1]);
+
+ s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
+ BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
+ 0, snapshot_tree);
+ ret = PTR_ERR_OR_ZERO(s_t);
+ if (ret)
+ goto err;
+
+ if (s.v->children[0]) {
+ s_t->v.root_snapshot = s.v->children[0];
+ } else {
+ s_t->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&s_t->k, 0);
+ }
+ }
+
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &tree_iter);
+ bch2_trans_iter_exit(trans, &p_iter);
+ bch2_trans_iter_exit(trans, &c_iter);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
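+/*
+ * Allocate and initialize @nr_snapids new snapshot nodes as children of
+ * @parent in tree @tree, returning the new IDs in @new_snapids:
+ */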
+static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
+ u32 *new_snapids,
+ u32 *snapshot_subvols,
+ unsigned nr_snapids)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_i_snapshot *n;
+ struct bkey_s_c k;
+ unsigned i, j;
+ u32 depth = bch2_snapshot_depth(c, parent);
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
+ POS_MIN, BTREE_ITER_intent);
+ k = bch2_btree_iter_peek(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ for (i = 0; i < nr_snapids; i++) {
+ k = bch2_btree_iter_prev_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k || !k.k->p.offset) {
+ ret = -BCH_ERR_ENOSPC_snapshot_create;
+ goto err;
+ }
+
+ n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ n->v.flags = 0;
+ n->v.parent = cpu_to_le32(parent);
+ n->v.subvol = cpu_to_le32(snapshot_subvols[i]);
+ n->v.tree = cpu_to_le32(tree);
+ n->v.depth = cpu_to_le32(depth);
+ n->v.btime.lo = cpu_to_le64(bch2_current_time(c));
+ n->v.btime.hi = 0;
+
+ for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
+ n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
+
+ bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
+ SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
+
+ ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
+ bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
+ if (ret)
+ goto err;
+
+ new_snapids[i] = iter.pos.offset;
+
+ mutex_lock(&c->snapshot_table_lock);
+ snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i];
+ mutex_unlock(&c->snapshot_table_lock);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/*
+ * Create new snapshot IDs as children of an existing snapshot ID:
+ */
+static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
+ u32 *new_snapids,
+ u32 *snapshot_subvols,
+ unsigned nr_snapids)
+{
+ struct btree_iter iter;
+ struct bkey_i_snapshot *n_parent;
+ int ret = 0;
+
+ n_parent = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_snapshots, POS(0, parent),
+ 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(n_parent);
+ if (unlikely(ret)) {
+ if (bch2_err_matches(ret, ENOENT))
+ bch_err(trans->c, "snapshot %u not found", parent);
+ return ret;
+ }
+
+ if (n_parent->v.children[0] || n_parent->v.children[1]) {
+ bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
+ new_snapids, snapshot_subvols, nr_snapids);
+ if (ret)
+ goto err;
+
+ n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
+ n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
+ n_parent->v.subvol = 0;
+ SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/*
+ * Create a snapshot node that is the root of a new tree:
+ */
+static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
+ u32 *new_snapids,
+ u32 *snapshot_subvols,
+ unsigned nr_snapids)
+{
+ struct bkey_i_snapshot_tree *n_tree;
+ int ret;
+
+ n_tree = __bch2_snapshot_tree_create(trans);
+ ret = PTR_ERR_OR_ZERO(n_tree) ?:
+ create_snapids(trans, 0, n_tree->k.p.offset,
+ new_snapids, snapshot_subvols, nr_snapids);
+ if (ret)
+ return ret;
+
+ n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]);
+ n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]);
+ return 0;
+}
+
+int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
+ u32 *new_snapids,
+ u32 *snapshot_subvols,
+ unsigned nr_snapids)
+{
+ BUG_ON((parent == 0) != (nr_snapids == 1));
+ BUG_ON((parent != 0) != (nr_snapids == 2));
+
+ return parent
+ ? bch2_snapshot_node_create_children(trans, parent,
+ new_snapids, snapshot_subvols, nr_snapids)
+ : bch2_snapshot_node_create_tree(trans,
+ new_snapids, snapshot_subvols, nr_snapids);
+
+}
+
+/*
+ * If we have an unlinked inode in an internal snapshot node, and the inode
+ * really has been deleted in all child snapshots, how does this get cleaned up?
+ *
+ * First, there is the problem of how keys that have been overwritten in all
+ * child snapshots get deleted (unimplemented?), but inodes may perhaps be
+ * special?
+ *
+ * Also: an unlinked inode in an internal snapshot appears not to be deleted
+ * correctly if the inode doesn't exist in leaf snapshots.
+ *
+ * solution:
+ *
+ * for a key in an interior snapshot node that needs work done on it which
+ * requires mutating it: iterate over all descendant leaf nodes and copy that
+ * key to the snapshot leaf nodes, where we can mutate it
+ */
+
+static int delete_dead_snapshots_process_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ snapshot_id_list *deleted,
+ snapshot_id_list *equiv_seen,
+ struct bpos *last_pos)
+{
+ int ret = bch2_check_key_has_snapshot(trans, iter, k);
+ if (ret)
+ return ret < 0 ? ret : 0;
+
+ struct bch_fs *c = trans->c;
+ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
+ if (!equiv) /* key for invalid snapshot node, but we chose not to delete */
+ return 0;
+
+ if (!bkey_eq(k.k->p, *last_pos))
+ equiv_seen->nr = 0;
+
+ if (snapshot_list_has_id(deleted, k.k->p.snapshot))
+ return bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_internal_snapshot_node);
+
+ if (!bpos_eq(*last_pos, k.k->p) &&
+ snapshot_list_has_id(equiv_seen, equiv))
+ return bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_internal_snapshot_node);
+
+ *last_pos = k.k->p;
+
+ ret = snapshot_list_add_nodup(c, equiv_seen, equiv);
+ if (ret)
+ return ret;
+
+ /*
+ * When we have a linear chain of snapshot nodes, we consider
+ * those to form an equivalence class: we're going to collapse
+ * them all down to a single node, and keep the leaf-most node -
+ * which has the same id as the equivalence class id.
+ *
+ * If there are multiple keys in different snapshots at the same
+ * position, we're only going to keep the one in the newest
+ * snapshot (we delete the others above) - the rest have been
+ * overwritten and are redundant - and the key we're going to keep
+ * needs to be moved to the equivalence class ID if it's not there
+ * already.
+ */
+ if (equiv != k.k->p.snapshot) {
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ int ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
+
+ new->k.p.snapshot = equiv;
+
+ struct btree_iter new_iter;
+ bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p,
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_cached|
+ BTREE_ITER_intent);
+
+ ret = bch2_btree_iter_traverse(&new_iter) ?:
+ bch2_trans_update(trans, &new_iter, new,
+ BTREE_UPDATE_internal_snapshot_node) ?:
+ bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_internal_snapshot_node);
+ bch2_trans_iter_exit(trans, &new_iter);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bkey_s_c_snapshot snap;
+ u32 children[2];
+ int ret;
+
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
+
+ snap = bkey_s_c_to_snapshot(k);
+ if (BCH_SNAPSHOT_DELETED(snap.v) ||
+ BCH_SNAPSHOT_SUBVOL(snap.v))
+ return 0;
+
+ children[0] = le32_to_cpu(snap.v->children[0]);
+ children[1] = le32_to_cpu(snap.v->children[1]);
+
+ ret = bch2_snapshot_live(trans, children[0]) ?:
+ bch2_snapshot_live(trans, children[1]);
+ if (ret < 0)
+ return ret;
+ return !ret;
+}
+
+/*
+ * For a given snapshot, if it doesn't have a subvolume that points to it, and
+ * it doesn't have child snapshot nodes - it's now redundant and we can mark it
+ * as deleted.
+ */
+static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k)
+{
+ int ret = bch2_snapshot_needs_delete(trans, k);
+
+ return ret <= 0
+ ? ret
+ : bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
+}
+
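+/* Get the nth ancestor of @id, skipping over any nodes in @skip: */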
+static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
+ snapshot_id_list *skip)
+{
+ rcu_read_lock();
+ while (snapshot_list_has_id(skip, id))
+ id = __bch2_snapshot_parent(c, id);
+
+ while (n--) {
+ do {
+ id = __bch2_snapshot_parent(c, id);
+ } while (snapshot_list_has_id(skip, id));
+ }
+ rcu_read_unlock();
+
+ return id;
+}
+
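+/*
+ * For a snapshot node that survives deletion of some of its ancestors: reduce
+ * its depth by the number of deleted ancestors, and repair skiplist entries
+ * that point to deleted nodes:
+ */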
+static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter, struct bkey_s_c k,
+ snapshot_id_list *deleted)
+{
+ struct bch_fs *c = trans->c;
+ u32 nr_deleted_ancestors = 0;
+ struct bkey_i_snapshot *s;
+ int ret;
+
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
+
+ if (snapshot_list_has_id(deleted, k.k->p.offset))
+ return 0;
+
+ s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
+ ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ return ret;
+
+ darray_for_each(*deleted, i)
+ nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i);
+
+ if (!nr_deleted_ancestors)
+ return 0;
+
+ le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);
+
+ if (!s->v.depth) {
+ s->v.skip[0] = 0;
+ s->v.skip[1] = 0;
+ s->v.skip[2] = 0;
+ } else {
+ u32 depth = le32_to_cpu(s->v.depth);
+ u32 parent = bch2_snapshot_parent(c, s->k.p.offset);
+
+ for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
+ u32 id = le32_to_cpu(s->v.skip[j]);
+
+ if (snapshot_list_has_id(deleted, id)) {
+ id = bch2_snapshot_nth_parent_skip(c,
+ parent,
+ depth > 1
+ ? get_random_u32_below(depth - 1)
+ : 0,
+ deleted);
+ s->v.skip[j] = cpu_to_le32(id);
+ }
+ }
+
+ bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
+ }
+
+ return bch2_trans_update(trans, iter, &s->k_i, 0);
+}
+
+int bch2_delete_dead_snapshots(struct bch_fs *c)
+{
+ struct btree_trans *trans;
+ snapshot_id_list deleted = { 0 };
+ snapshot_id_list deleted_interior = { 0 };
+ int ret = 0;
+
+ if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
+ return 0;
+
+ trans = bch2_trans_get(c);
+
+ /*
+ * For every snapshot node: if it has no live children and it's not
+ * pointed to by a subvolume, delete it:
+ */
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
+ NULL, NULL, 0,
+ bch2_delete_redundant_snapshot(trans, k));
+ bch_err_msg(c, ret, "deleting redundant snapshots");
+ if (ret)
+ goto err;
+
+ ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
+ bch2_snapshot_set_equiv(trans, k));
+ bch_err_msg(c, ret, "in bch2_snapshots_set_equiv");
+ if (ret)
+ goto err;
+
+ ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ({
+ if (k.k->type != KEY_TYPE_snapshot)
+ continue;
+
+ BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)
+ ? snapshot_list_add(c, &deleted, k.k->p.offset)
+ : 0;
+ }));
+ bch_err_msg(c, ret, "walking snapshots");
+ if (ret)
+ goto err;
+
+ for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
+ struct bpos last_pos = POS_MIN;
+ snapshot_id_list equiv_seen = { 0 };
+ struct disk_reservation res = { 0 };
+
+ if (!btree_type_has_snapshots(btree))
+ continue;
+
+ ret = for_each_btree_key_commit(trans, iter,
+ btree, POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
+ &res, NULL, BCH_TRANS_COMMIT_no_enospc,
+ delete_dead_snapshots_process_key(trans, &iter, k, &deleted,
+ &equiv_seen, &last_pos));
+
+ bch2_disk_reservation_put(c, &res);
+ darray_exit(&equiv_seen);
+
+ bch_err_msg(c, ret, "deleting keys from dying snapshots");
+ if (ret)
+ goto err;
+ }
+
+ bch2_trans_unlock(trans);
+ down_write(&c->snapshot_create_lock);
+
+ ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ({
+ u32 snapshot = k.k->p.offset;
+ u32 equiv = bch2_snapshot_equiv(c, snapshot);
+
+ equiv != snapshot
+ ? snapshot_list_add(c, &deleted_interior, snapshot)
+ : 0;
+ }));
+
+ bch_err_msg(c, ret, "walking snapshots");
+ if (ret)
+ goto err_create_lock;
+
+ /*
+ * Fixing children of deleted snapshots can't be done completely
+ * atomically; if we crash between here and when we delete the interior
+ * nodes, some depth fields will be off:
+ */
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
+ BTREE_ITER_intent, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
+ if (ret)
+ goto err_create_lock;
+
+ darray_for_each(deleted, i) {
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_snapshot_node_delete(trans, *i));
+ bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ if (ret)
+ goto err_create_lock;
+ }
+
+ darray_for_each(deleted_interior, i) {
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_snapshot_node_delete(trans, *i));
+ bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ if (ret)
+ goto err_create_lock;
+ }
+err_create_lock:
+ up_write(&c->snapshot_create_lock);
+err:
+ darray_exit(&deleted_interior);
+ darray_exit(&deleted);
+ bch2_trans_put(trans);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+void bch2_delete_dead_snapshots_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
+
+ set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);
+
+ bch2_delete_dead_snapshots(c);
+ bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
+}
+
+void bch2_delete_dead_snapshots_async(struct bch_fs *c)
+{
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots))
+ return;
+
+ BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
+
+ if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
+}
+
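+/*
+ * Returns 1 if a key exists at the same position in a descendant snapshot,
+ * i.e. the key at @pos has been overwritten:
+ */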
+int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
+ enum btree_id id,
+ struct bpos pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos),
+ BTREE_ITER_not_extents|
+ BTREE_ITER_all_snapshots,
+ k, ret) {
+ if (!bkey_eq(pos, k.k->p))
+ break;
+
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
+ ret = 1;
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c_snapshot snap;
+ int ret = 0;
+
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
+
+ snap = bkey_s_c_to_snapshot(k);
+ if (BCH_SNAPSHOT_DELETED(snap.v) ||
+ bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset ||
+ (ret = bch2_snapshot_needs_delete(trans, k)) > 0) {
+ set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
+ return 0;
+ }
+
+ return ret;
+}
+
+int bch2_snapshots_read(struct bch_fs *c)
+{
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
+ __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
+ bch2_snapshot_set_equiv(trans, k) ?:
+ bch2_check_snapshot_needs_deletion(trans, k)) ?:
+ for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
+ (set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
+ bch_err_fn(c, ret);
+
+ /*
+ * It's important that we check whether we need to reconstruct snapshots
+ * before going RW, so we mark that pass as required in the superblock -
+ * otherwise, we could end up deleting keys with missing snapshot nodes
+ * instead of reconstructing them.
+ */
+ BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
+ test_bit(BCH_FS_may_go_rw, &c->flags));
+
+ if (bch2_err_matches(ret, EIO) ||
+ (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
+ ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);
+
+ return ret;
+}
+
+void bch2_fs_snapshots_exit(struct bch_fs *c)
+{
+ kvfree(rcu_dereference_protected(c->snapshots, true));
+}
diff --git a/libbcachefs/snapshot.h b/libbcachefs/snapshot.h
new file mode 100644
index 00000000..ae23d45f
--- /dev/null
+++ b/libbcachefs/snapshot.h
@@ -0,0 +1,264 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SNAPSHOT_H
+#define _BCACHEFS_SNAPSHOT_H
+
+void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+int bch2_snapshot_tree_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+
+#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \
+ .key_validate = bch2_snapshot_tree_validate, \
+ .val_to_text = bch2_snapshot_tree_to_text, \
+ .min_val_size = 8, \
+})
+
+struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *);
+
+int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *);
+
+void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
+
+#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \
+ .key_validate = bch2_snapshot_validate, \
+ .val_to_text = bch2_snapshot_to_text, \
+ .trigger = bch2_mark_snapshot, \
+ .min_val_size = 24, \
+})
+
+static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id)
+{
+ u32 idx = U32_MAX - id;
+
+ return likely(t && idx < t->nr)
+ ? &t->s[idx]
+ : NULL;
+}
+
+static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
+{
+ return __snapshot_t(rcu_dereference(c->snapshots), id);
+}
+
+static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
+{
+ rcu_read_lock();
+ const struct snapshot_t *s = snapshot_t(c, id);
+ id = s ? s->tree : 0;
+ rcu_read_unlock();
+
+ return id;
+}
+
+static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
+{
+ const struct snapshot_t *s = snapshot_t(c, id);
+ return s ? s->parent : 0;
+}
+
+static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
+{
+ rcu_read_lock();
+ id = __bch2_snapshot_parent_early(c, id);
+ rcu_read_unlock();
+
+ return id;
+}
+
+static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
+{
+ const struct snapshot_t *s = snapshot_t(c, id);
+ if (!s)
+ return 0;
+
+ u32 parent = s->parent;
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+ parent &&
+ s->depth != snapshot_t(c, parent)->depth + 1)
+ panic("id %u depth=%u parent %u depth=%u\n",
+ id, snapshot_t(c, id)->depth,
+ parent, snapshot_t(c, parent)->depth);
+
+ return parent;
+}
+
+static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
+{
+ rcu_read_lock();
+ id = __bch2_snapshot_parent(c, id);
+ rcu_read_unlock();
+
+ return id;
+}
+
+static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n)
+{
+ rcu_read_lock();
+ while (n--)
+ id = __bch2_snapshot_parent(c, id);
+ rcu_read_unlock();
+
+ return id;
+}
+
+u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32);
+
+static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
+{
+ u32 parent;
+
+ rcu_read_lock();
+ while ((parent = __bch2_snapshot_parent(c, id)))
+ id = parent;
+ rcu_read_unlock();
+
+ return id;
+}
+
+static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id)
+{
+ const struct snapshot_t *s = snapshot_t(c, id);
+ return s ? s->equiv : 0;
+}
+
+static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id)
+{
+ rcu_read_lock();
+ id = __bch2_snapshot_equiv(c, id);
+ rcu_read_unlock();
+
+ return id;
+}
+
+static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
+{
+ rcu_read_lock();
+ const struct snapshot_t *s = snapshot_t(c, id);
+ int ret = s ? s->children[0] : -BCH_ERR_invalid_snapshot_node;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
+{
+ int ret = bch2_snapshot_is_internal_node(c, id);
+ if (ret < 0)
+ return ret;
+ return !ret;
+}
+
+static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
+{
+ u32 depth;
+
+ rcu_read_lock();
+ depth = parent ? snapshot_t(c, parent)->depth + 1 : 0;
+ rcu_read_unlock();
+
+ return depth;
+}
+
+bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32);
+
+static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
+{
+ return id == ancestor
+ ? true
+ : __bch2_snapshot_is_ancestor(c, id, ancestor);
+}
+
+static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
+{
+ rcu_read_lock();
+ const struct snapshot_t *t = snapshot_t(c, id);
+ bool ret = t && (t->children[0]|t->children[1]) != 0;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
+{
+ darray_for_each(*s, i)
+ if (*i == id)
+ return true;
+ return false;
+}
+
+static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id)
+{
+ darray_for_each(*s, i)
+ if (bch2_snapshot_is_ancestor(c, id, *i))
+ return true;
+ return false;
+}
+
+static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id)
+{
+ BUG_ON(snapshot_list_has_id(s, id));
+ int ret = darray_push(s, id);
+ if (ret)
+ bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
+ return ret;
+}
+
+static inline int snapshot_list_add_nodup(struct bch_fs *c, snapshot_id_list *s, u32 id)
+{
+ int ret = snapshot_list_has_id(s, id)
+ ? 0
+ : darray_push(s, id);
+ if (ret)
+ bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
+ return ret;
+}
+
+static inline int snapshot_list_merge(struct bch_fs *c, snapshot_id_list *dst, snapshot_id_list *src)
+{
+ darray_for_each(*src, i) {
+ int ret = snapshot_list_add_nodup(c, dst, *i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
+ struct bch_snapshot *s);
+int bch2_snapshot_get_subvol(struct btree_trans *, u32,
+ struct bch_subvolume *);
+
+/* only exported for tests: */
+int bch2_snapshot_node_create(struct btree_trans *, u32,
+ u32 *, u32 *, unsigned);
+
+int bch2_check_snapshot_trees(struct bch_fs *);
+int bch2_check_snapshots(struct bch_fs *);
+int bch2_reconstruct_snapshots(struct bch_fs *);
+int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c);
+
+int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
+void bch2_delete_dead_snapshots_work(struct work_struct *);
+
+int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos);
+
+static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
+ enum btree_id id,
+ struct bpos pos)
+{
+ if (!btree_type_has_snapshots(id) ||
+ bch2_snapshot_is_leaf(trans->c, pos.snapshot) > 0)
+ return 0;
+
+ return __bch2_key_has_snapshot_overwrites(trans, id, pos);
+}
+
+int bch2_snapshots_read(struct bch_fs *);
+void bch2_fs_snapshots_exit(struct bch_fs *);
+
+#endif /* _BCACHEFS_SNAPSHOT_H */
diff --git a/libbcachefs/snapshot_format.h b/libbcachefs/snapshot_format.h
new file mode 100644
index 00000000..aabcd3a7
--- /dev/null
+++ b/libbcachefs/snapshot_format.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SNAPSHOT_FORMAT_H
+#define _BCACHEFS_SNAPSHOT_FORMAT_H
+
+struct bch_snapshot {
+ struct bch_val v;
+ __le32 flags;
+ __le32 parent;
+ __le32 children[2];
+ __le32 subvol;
+ /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
+ __le32 tree;
+ __le32 depth;
+ __le32 skip[3];
+ bch_le128 btime;
+};
+
+LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
+
+/* True if a subvolume points to this snapshot node: */
+LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
+
+/*
+ * Snapshot trees:
+ *
+ * The snapshot_trees btree gives us a persistent identifier for each tree of
+ * bch_snapshot nodes, and allows us to record and easily find the root/master
+ * subvolume that other snapshots were created from:
+ */
+struct bch_snapshot_tree {
+ struct bch_val v;
+ __le32 master_subvol;
+ __le32 root_snapshot;
+};
+
+#endif /* _BCACHEFS_SNAPSHOT_FORMAT_H */
diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h
index 99f1fe87..00c78505 100644
--- a/libbcachefs/str_hash.h
+++ b/libbcachefs/str_hash.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_STR_HASH_H
#define _BCACHEFS_STR_HASH_H
@@ -7,48 +8,57 @@
#include "error.h"
#include "inode.h"
#include "siphash.h"
+#include "subvolume.h"
#include "super.h"
#include <linux/crc32c.h>
#include <crypto/hash.h>
+#include <crypto/sha2.h>
+
+static inline enum bch_str_hash_type
+bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
+{
+ switch (opt) {
+ case BCH_STR_HASH_OPT_crc32c:
+ return BCH_STR_HASH_crc32c;
+ case BCH_STR_HASH_OPT_crc64:
+ return BCH_STR_HASH_crc64;
+ case BCH_STR_HASH_OPT_siphash:
+ return c->sb.features & (1ULL << BCH_FEATURE_new_siphash)
+ ? BCH_STR_HASH_siphash
+ : BCH_STR_HASH_siphash_old;
+ default:
+ BUG();
+ }
+}
struct bch_hash_info {
u8 type;
- union {
- __le64 crc_key;
- SIPHASH_KEY siphash_key;
- };
+ /*
+ * For crc32 or crc64 string hashes, the first key value of the
+ * siphash_key (k0) is used as the key.
+ */
+ SIPHASH_KEY siphash_key;
};
static inline struct bch_hash_info
-bch2_hash_info_init(struct bch_fs *c,
- const struct bch_inode_unpacked *bi)
+bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
{
/* XXX ick */
struct bch_hash_info info = {
- .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) &
- ~(~0U << INODE_STR_HASH_BITS)
+ .type = INODE_STR_HASH(bi),
+ .siphash_key = { .k0 = bi->bi_hash_seed }
};
- switch (info.type) {
- case BCH_STR_HASH_CRC32C:
- case BCH_STR_HASH_CRC64:
- info.crc_key = bi->bi_hash_seed;
- break;
- case BCH_STR_HASH_SIPHASH: {
+ if (unlikely(info.type == BCH_STR_HASH_siphash_old)) {
SHASH_DESC_ON_STACK(desc, c->sha256);
- u8 digest[crypto_shash_digestsize(c->sha256)];
+ u8 digest[SHA256_DIGEST_SIZE];
desc->tfm = c->sha256;
- desc->flags = 0;
crypto_shash_digest(desc, (void *) &bi->bi_hash_seed,
sizeof(bi->bi_hash_seed), digest);
memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
- break;
- }
- default:
- BUG();
}
return info;
@@ -66,13 +76,16 @@ static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx,
const struct bch_hash_info *info)
{
switch (info->type) {
- case BCH_STR_HASH_CRC32C:
- ctx->crc32c = crc32c(~0, &info->crc_key, sizeof(info->crc_key));
+ case BCH_STR_HASH_crc32c:
+ ctx->crc32c = crc32c(~0, &info->siphash_key.k0,
+ sizeof(info->siphash_key.k0));
break;
- case BCH_STR_HASH_CRC64:
- ctx->crc64 = bch2_crc64_update(~0, &info->crc_key, sizeof(info->crc_key));
+ case BCH_STR_HASH_crc64:
+ ctx->crc64 = crc64_be(~0, &info->siphash_key.k0,
+ sizeof(info->siphash_key.k0));
break;
- case BCH_STR_HASH_SIPHASH:
+ case BCH_STR_HASH_siphash_old:
+ case BCH_STR_HASH_siphash:
SipHash24_Init(&ctx->siphash, &info->siphash_key);
break;
default:
@@ -85,13 +98,14 @@ static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx,
const void *data, size_t len)
{
switch (info->type) {
- case BCH_STR_HASH_CRC32C:
+ case BCH_STR_HASH_crc32c:
ctx->crc32c = crc32c(ctx->crc32c, data, len);
break;
- case BCH_STR_HASH_CRC64:
- ctx->crc64 = bch2_crc64_update(ctx->crc64, data, len);
+ case BCH_STR_HASH_crc64:
+ ctx->crc64 = crc64_be(ctx->crc64, data, len);
break;
- case BCH_STR_HASH_SIPHASH:
+ case BCH_STR_HASH_siphash_old:
+ case BCH_STR_HASH_siphash:
SipHash24_Update(&ctx->siphash, data, len);
break;
default:
@@ -103,11 +117,12 @@ static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
const struct bch_hash_info *info)
{
switch (info->type) {
- case BCH_STR_HASH_CRC32C:
+ case BCH_STR_HASH_crc32c:
return ctx->crc32c;
- case BCH_STR_HASH_CRC64:
+ case BCH_STR_HASH_crc64:
return ctx->crc64 >> 1;
- case BCH_STR_HASH_SIPHASH:
+ case BCH_STR_HASH_siphash_old:
+ case BCH_STR_HASH_siphash:
return SipHash24_End(&ctx->siphash) >> 1;
default:
BUG();
@@ -117,119 +132,146 @@ static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
struct bch_hash_desc {
enum btree_id btree_id;
u8 key_type;
- u8 whiteout_type;
u64 (*hash_key)(const struct bch_hash_info *, const void *);
u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c);
bool (*cmp_key)(struct bkey_s_c, const void *);
bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
+ bool (*is_visible)(subvol_inum inum, struct bkey_s_c);
};
-static inline struct btree_iter *
-bch2_hash_lookup(struct btree_trans *trans,
+static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k)
+{
+ return k.k->type == desc.key_type &&
+ (!desc.is_visible ||
+ !inum.inum ||
+ desc.is_visible(inum, k));
+}
+
+static __always_inline struct bkey_s_c
+bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
- u64 inode, const void *key,
- unsigned flags)
+ subvol_inum inum, const void *key,
+ enum btree_iter_update_trigger_flags flags,
+ u32 snapshot)
{
- struct btree_iter *iter;
struct bkey_s_c k;
+ int ret;
- iter = bch2_trans_get_iter(trans, desc.btree_id,
- POS(inode, desc.hash_key(info, key)),
- BTREE_ITER_SLOTS|flags);
- if (IS_ERR(iter))
- return iter;
-
- for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
- if (iter->pos.inode != inode)
- break;
-
- if (k.k->type == desc.key_type) {
+ for_each_btree_key_max_norestart(trans, *iter, desc.btree_id,
+ SPOS(inum.inum, desc.hash_key(info, key), snapshot),
+ POS(inum.inum, U64_MAX),
+ BTREE_ITER_slots|flags, k, ret) {
+ if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_key(k, key))
- return iter;
- } else if (k.k->type == desc.whiteout_type) {
+ return k;
+ } else if (k.k->type == KEY_TYPE_hash_whiteout) {
;
} else {
/* hole, not found */
break;
}
}
+ bch2_trans_iter_exit(trans, iter);
- return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOENT);
+ return bkey_s_c_err(ret ?: -BCH_ERR_ENOENT_str_hash_lookup);
}
-static inline struct btree_iter *
+static __always_inline struct bkey_s_c
+bch2_hash_lookup(struct btree_trans *trans,
+ struct btree_iter *iter,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, const void *key,
+ enum btree_iter_update_trigger_flags flags)
+{
+ u32 snapshot;
+ int ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return bkey_s_c_err(ret);
+
+ return bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot);
+}
+
+static __always_inline int
bch2_hash_hole(struct btree_trans *trans,
+ struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
- u64 inode, const void *key)
+ subvol_inum inum, const void *key)
{
- struct btree_iter *iter;
struct bkey_s_c k;
+ u32 snapshot;
+ int ret;
- iter = bch2_trans_get_iter(trans, desc.btree_id,
- POS(inode, desc.hash_key(info, key)),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return iter;
-
- for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
- if (iter->pos.inode != inode)
- break;
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
- if (k.k->type != desc.key_type)
- return iter;
- }
+ for_each_btree_key_max_norestart(trans, *iter, desc.btree_id,
+ SPOS(inum.inum, desc.hash_key(info, key), snapshot),
+ POS(inum.inum, U64_MAX),
+ BTREE_ITER_slots|BTREE_ITER_intent, k, ret)
+ if (!is_visible_key(desc, inum, k))
+ return 0;
+ bch2_trans_iter_exit(trans, iter);
- return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOSPC);
+ return ret ?: -BCH_ERR_ENOSPC_str_hash_create;
}
-static inline int bch2_hash_needs_whiteout(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct btree_iter *start)
+static __always_inline
+int bch2_hash_needs_whiteout(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct btree_iter *start)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
+ int ret;
- iter = bch2_trans_copy_iter(trans, start);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
+ bch2_trans_copy_iter(&iter, start);
- bch2_btree_iter_next_slot(iter);
+ bch2_btree_iter_advance(&iter);
- for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
+ for_each_btree_key_continue_norestart(iter, BTREE_ITER_slots, k, ret) {
if (k.k->type != desc.key_type &&
- k.k->type != desc.whiteout_type)
- return false;
+ k.k->type != KEY_TYPE_hash_whiteout)
+ break;
if (k.k->type == desc.key_type &&
- desc.hash_bkey(info, k) <= start->pos.offset)
- return true;
+ desc.hash_bkey(info, k) <= start->pos.offset) {
+ ret = 1;
+ break;
+ }
}
- return btree_iter_err(k);
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
-static inline int __bch2_hash_set(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- u64 inode, struct bkey_i *insert, int flags)
+static __always_inline
+struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, u32 snapshot,
+ struct bkey_i *insert,
+ enum btree_iter_update_trigger_flags flags)
{
- struct btree_iter *iter, *slot = NULL;
+ struct btree_iter slot = {};
struct bkey_s_c k;
+ bool found = false;
+ int ret;
- iter = bch2_trans_get_iter(trans, desc.btree_id,
- POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
-
- for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
- if (iter->pos.inode != inode)
- break;
-
- if (k.k->type == desc.key_type) {
+ for_each_btree_key_max_norestart(trans, *iter, desc.btree_id,
+ SPOS(insert->k.p.inode,
+ desc.hash_bkey(info, bkey_i_to_s_c(insert)),
+ snapshot),
+ POS(insert->k.p.inode, U64_MAX),
+ BTREE_ITER_slots|BTREE_ITER_intent|flags, k, ret) {
+ if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
goto found;
@@ -237,82 +279,118 @@ static inline int __bch2_hash_set(struct btree_trans *trans,
continue;
}
- if (!slot &&
- !(flags & BCH_HASH_SET_MUST_REPLACE)) {
- slot = bch2_trans_copy_iter(trans, iter);
- if (IS_ERR(slot))
- return PTR_ERR(slot);
- }
+ if (!slot.path && !(flags & STR_HASH_must_replace))
+ bch2_trans_copy_iter(&slot, iter);
- if (k.k->type != desc.whiteout_type)
+ if (k.k->type != KEY_TYPE_hash_whiteout)
goto not_found;
}
- return btree_iter_err(k) ?: -ENOSPC;
+ if (!ret)
+ ret = -BCH_ERR_ENOSPC_str_hash_create;
+out:
+ bch2_trans_iter_exit(trans, &slot);
+ bch2_trans_iter_exit(trans, iter);
+ return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
+found:
+ found = true;
not_found:
- if (flags & BCH_HASH_SET_MUST_REPLACE)
- return -ENOENT;
+ if (found && (flags & STR_HASH_must_create)) {
+ bch2_trans_iter_exit(trans, &slot);
+ return k;
+ } else if (!found && (flags & STR_HASH_must_replace)) {
+ ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
+ } else {
+ if (!found && slot.path)
+ swap(*iter, slot);
+
+ insert->k.p = iter->pos;
+ ret = bch2_trans_update(trans, iter, insert, flags);
+ }
- insert->k.p = slot->pos;
- bch2_trans_update(trans, slot, insert, 0);
- return 0;
-found:
- if (flags & BCH_HASH_SET_MUST_CREATE)
- return -EEXIST;
+ goto out;
+}
+
+static __always_inline
+int bch2_hash_set_in_snapshot(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, u32 snapshot,
+ struct bkey_i *insert,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, info, inum,
+ snapshot, insert, flags);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+ if (k.k) {
+ bch2_trans_iter_exit(trans, &iter);
+ return -BCH_ERR_EEXIST_str_hash_set;
+ }
- insert->k.p = iter->pos;
- bch2_trans_update(trans, iter, insert, 0);
return 0;
}
-static inline int bch2_hash_set(const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct bch_fs *c, u64 inode,
- u64 *journal_seq,
- struct bkey_i *insert, int flags)
+static __always_inline
+int bch2_hash_set(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum,
+ struct bkey_i *insert,
+ enum btree_iter_update_trigger_flags flags)
{
- return bch2_trans_do(c, journal_seq, flags|BTREE_INSERT_ATOMIC,
- __bch2_hash_set(&trans, desc, info,
- inode, insert, flags));
+ insert->k.p.inode = inum.inum;
+
+ u32 snapshot;
+ return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
+ bch2_hash_set_in_snapshot(trans, desc, info, inum,
+ snapshot, insert, flags);
}
-static inline int bch2_hash_delete_at(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct btree_iter *iter)
+static __always_inline
+int bch2_hash_delete_at(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct btree_iter *iter,
+ enum btree_iter_update_trigger_flags flags)
{
struct bkey_i *delete;
int ret;
+ delete = bch2_trans_kmalloc(trans, sizeof(*delete));
+ ret = PTR_ERR_OR_ZERO(delete);
+ if (ret)
+ return ret;
+
ret = bch2_hash_needs_whiteout(trans, desc, info, iter);
if (ret < 0)
return ret;
- delete = bch2_trans_kmalloc(trans, sizeof(*delete));
- if (IS_ERR(delete))
- return PTR_ERR(delete);
-
bkey_init(&delete->k);
delete->k.p = iter->pos;
- delete->k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED;
+ delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted;
- bch2_trans_update(trans, iter, delete, 0);
- return 0;
+ return bch2_trans_update(trans, iter, delete, flags);
}
-static inline int bch2_hash_delete(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- u64 inode, const void *key)
+static __always_inline
+int bch2_hash_delete(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, const void *key)
{
- struct btree_iter *iter;
-
- iter = bch2_hash_lookup(trans, desc, info, inode, key,
- BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
+ BTREE_ITER_intent);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
- return bch2_hash_delete_at(trans, desc, info, iter);
+ ret = bch2_hash_delete_at(trans, desc, info, &iter, 0);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
#endif /* _BCACHEFS_STR_HASH_H */
diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c
new file mode 100644
index 00000000..5e5ae405
--- /dev/null
+++ b/libbcachefs/subvolume.c
@@ -0,0 +1,691 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "errcode.h"
+#include "error.h"
+#include "fs.h"
+#include "snapshot.h"
+#include "subvolume.h"
+
+#include <linux/random.h>
+
+static int bch2_subvolume_delete(struct btree_trans *, u32);
+
+static struct bpos subvolume_children_pos(struct bkey_s_c k)
+{
+ if (k.k->type != KEY_TYPE_subvolume)
+ return POS_MIN;
+
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+ if (!s.v->fs_path_parent)
+ return POS_MIN;
+ return POS(le32_to_cpu(s.v->fs_path_parent), s.k->p.offset);
+}
+
+static int check_subvol(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c_subvolume subvol;
+ struct btree_iter subvol_children_iter = {};
+ struct bch_snapshot snapshot;
+ struct printbuf buf = PRINTBUF;
+ unsigned snapid;
+ int ret = 0;
+
+ if (k.k->type != KEY_TYPE_subvolume)
+ return 0;
+
+ subvol = bkey_s_c_to_subvolume(k);
+ snapid = le32_to_cpu(subvol.v->snapshot);
+ ret = bch2_snapshot_lookup(trans, snapid, &snapshot);
+
+ if (bch2_err_matches(ret, ENOENT))
+ bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
+ k.k->p.offset, snapid);
+ if (ret)
+ return ret;
+
+ if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
+ ret = bch2_subvolume_delete(trans, iter->pos.offset);
+ bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
+ return ret ?: -BCH_ERR_transaction_restart_nested;
+ }
+
+ if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL &&
+ subvol.v->fs_path_parent,
+ trans, subvol_root_fs_path_parent_nonzero,
+ "root subvolume has nonzero fs_path_parent\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ struct bkey_i_subvolume *n =
+ bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ n->v.fs_path_parent = 0;
+ }
+
+ if (subvol.v->fs_path_parent) {
+ struct bpos pos = subvolume_children_pos(k);
+
+ struct bkey_s_c subvol_children_k =
+ bch2_bkey_get_iter(trans, &subvol_children_iter,
+ BTREE_ID_subvolume_children, pos, 0);
+ ret = bkey_err(subvol_children_k);
+ if (ret)
+ goto err;
+
+ if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set,
+ trans, subvol_children_not_set,
+ "subvolume not set in subvolume_children btree at %llu:%llu\n%s",
+ pos.inode, pos.offset,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true);
+ if (ret)
+ goto err;
+ }
+ }
+
+ struct bch_inode_unpacked inode;
+ ret = bch2_inode_find_by_inum_nowarn_trans(trans,
+ (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) },
+ &inode);
+ if (!ret) {
+ if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
+ trans, subvol_root_wrong_bi_subvol,
+ "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
+ inode.bi_inum, inode.bi_snapshot,
+ inode.bi_subvol, subvol.k->p.offset)) {
+ inode.bi_subvol = subvol.k->p.offset;
+ inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot);
+ ret = __bch2_fsck_write_inode(trans, &inode);
+ if (ret)
+ goto err;
+ }
+ } else if (bch2_err_matches(ret, ENOENT)) {
+ if (fsck_err(trans, subvol_to_missing_root,
+ "subvolume %llu points to missing subvolume root %llu:%u",
+ k.k->p.offset, le64_to_cpu(subvol.v->inode),
+ le32_to_cpu(subvol.v->snapshot))) {
+ ret = bch2_subvolume_delete(trans, iter->pos.offset);
+ bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
+ ret = ret ?: -BCH_ERR_transaction_restart_nested;
+ goto err;
+ }
+ } else {
+ goto err;
+ }
+
+ if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
+ u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
+ u32 snapshot_tree;
+ struct bch_snapshot_tree st;
+
+ rcu_read_lock();
+ snapshot_tree = snapshot_t(c, snapshot_root)->tree;
+ rcu_read_unlock();
+
+ ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st);
+
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+ "%s: snapshot tree %u not found", __func__, snapshot_tree);
+
+ if (ret)
+ goto err;
+
+ if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset,
+ trans, subvol_not_master_and_not_snapshot,
+ "subvolume %llu is not set as snapshot but is not master subvolume",
+ k.k->p.offset)) {
+ struct bkey_i_subvolume *s =
+ bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ goto err;
+
+ SET_BCH_SUBVOLUME_SNAP(&s->v, true);
+ }
+ }
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &subvol_children_iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_check_subvols(struct bch_fs *c)
+{
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_subvol(trans, &iter, k)));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int check_subvol_child(struct btree_trans *trans,
+ struct btree_iter *child_iter,
+ struct bkey_s_c child_k)
+{
+ struct bch_subvolume s;
+ int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset),
+ 0, subvolume, &s);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (fsck_err_on(ret ||
+ le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode,
+ trans, subvol_children_bad,
+ "incorrect entry in subvolume_children btree %llu:%llu",
+ child_k.k->p.inode, child_k.k->p.offset)) {
+ ret = bch2_btree_delete_at(trans, child_iter, 0);
+ if (ret)
+ goto err;
+ }
+err:
+fsck_err:
+ return ret;
+}
+
+int bch2_check_subvol_children(struct bch_fs *c)
+{
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_subvol_child(trans, &iter, k)));
+ bch_err_fn(c, ret);
+ return 0;
+}
+
+/* Subvolumes: */
+
+int bch2_subvolume_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k);
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
+ bkey_gt(k.k->p, SUBVOL_POS_MAX),
+ c, subvol_pos_bad,
+ "invalid pos");
+
+ bkey_fsck_err_on(!subvol.v->snapshot,
+ c, subvol_snapshot_bad,
+ "invalid snapshot");
+
+ bkey_fsck_err_on(!subvol.v->inode,
+ c, subvol_inode_bad,
+ "invalid inode");
+fsck_err:
+ return ret;
+}
+
+void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+
+ prt_printf(out, "root %llu snapshot id %u",
+ le64_to_cpu(s.v->inode),
+ le32_to_cpu(s.v->snapshot));
+
+ if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) {
+ prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent));
+ prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent));
+ }
+}
+
+static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set)
+{
+ return !bpos_eq(pos, POS_MIN)
+ ? bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, set)
+ : 0;
+}
+
+int bch2_subvolume_trigger(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ enum btree_iter_update_trigger_flags flags)
+{
+ if (flags & BTREE_TRIGGER_transactional) {
+ struct bpos children_pos_old = subvolume_children_pos(old);
+ struct bpos children_pos_new = subvolume_children_pos(new.s_c);
+
+ if (!bpos_eq(children_pos_old, children_pos_new)) {
+ int ret = subvolume_children_mod(trans, children_pos_old, false) ?:
+ subvolume_children_mod(trans, children_pos_new, true);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol)
+{
+ struct btree_iter iter;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0);
+ struct bkey_s_c k = bch2_btree_iter_peek(&iter);
+ bch2_trans_iter_exit(trans, &iter);
+
+ return bkey_err(k) ?: k.k && k.k->p.inode == subvol
+ ? -BCH_ERR_ENOTEMPTY_subvol_not_empty
+ : 0;
+}
+
+static __always_inline int
+bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol,
+ bool inconsistent_if_not_found,
+ int iter_flags,
+ struct bch_subvolume *s)
+{
+ int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol),
+ iter_flags, subvolume, s);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) &&
+ inconsistent_if_not_found,
+ trans->c, "missing subvolume %u", subvol);
+ return ret;
+}
+
+int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
+ bool inconsistent_if_not_found,
+ int iter_flags,
+ struct bch_subvolume *s)
+{
+ return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s);
+}
+
+int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol)
+{
+ struct bch_subvolume s;
+ int ret = bch2_subvolume_get_inlined(trans, subvol, true, 0, &s);
+ if (ret)
+ return ret;
+
+ if (BCH_SUBVOLUME_RO(&s))
+ return -EROFS;
+ return 0;
+}
+
+int bch2_subvol_is_ro(struct bch_fs *c, u32 subvol)
+{
+ return bch2_trans_do(c, bch2_subvol_is_ro_trans(trans, subvol));
+}
+
+int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
+ struct bch_subvolume *subvol)
+{
+ struct bch_snapshot snap;
+
+ return bch2_snapshot_lookup(trans, snapshot, &snap) ?:
+ bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol);
+}
+
+int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
+ u32 *snapid, bool warn)
+{
+ struct btree_iter iter;
+ struct bkey_s_c_subvolume subvol;
+ int ret;
+
+ subvol = bch2_bkey_get_iter_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, subvolid),
+ BTREE_ITER_cached|BTREE_ITER_with_updates,
+ subvolume);
+ ret = bkey_err(subvol);
+
+ bch2_fs_inconsistent_on(warn && bch2_err_matches(ret, ENOENT), trans->c,
+ "missing subvolume %u", subvolid);
+
+ if (likely(!ret))
+ *snapid = le32_to_cpu(subvol.v->snapshot);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
+ u32 *snapid)
+{
+ return __bch2_subvolume_get_snapshot(trans, subvolid, snapid, true);
+}
+
+static int bch2_subvolume_reparent(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ u32 old_parent, u32 new_parent)
+{
+ struct bkey_i_subvolume *s;
+ int ret;
+
+ if (k.k->type != KEY_TYPE_subvolume)
+ return 0;
+
+ if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, creation_parent) &&
+ le32_to_cpu(bkey_s_c_to_subvolume(k).v->creation_parent) != old_parent)
+ return 0;
+
+ s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ return ret;
+
+ s->v.creation_parent = cpu_to_le32(new_parent);
+ return 0;
+}
+
+/*
+ * Separate from the snapshot tree in the snapshots btree, we record the tree
+ * structure of how snapshot subvolumes were created - the parent subvolume of
+ * each snapshot subvolume.
+ *
+ * When a subvolume is deleted, we scan for child subvolumes and reparent them,
+ * to avoid dangling references:
+ */
+static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete)
+{
+ struct bch_subvolume s;
+
+ return lockrestart_do(trans,
+ bch2_subvolume_get(trans, subvolid_to_delete, true,
+ BTREE_ITER_cached, &s)) ?:
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_subvolume_reparent(trans, &iter, k,
+ subvolid_to_delete, le32_to_cpu(s.creation_parent)));
+}
+
+/*
+ * Delete subvolume, mark snapshot ID as deleted, queue up snapshot
+ * deletion/cleanup:
+ */
+static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
+{
+ struct btree_iter iter;
+ struct bkey_s_c_subvolume subvol;
+ u32 snapid;
+ int ret = 0;
+
+ subvol = bch2_bkey_get_iter_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, subvolid),
+ BTREE_ITER_cached|BTREE_ITER_intent,
+ subvolume);
+ ret = bkey_err(subvol);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+ "missing subvolume %u", subvolid);
+ if (ret)
+ return ret;
+
+ snapid = le32_to_cpu(subvol.v->snapshot);
+
+ ret = bch2_btree_delete_at(trans, &iter, 0) ?:
+ bch2_snapshot_node_set_deleted(trans, snapid);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
+{
+ return bch2_subvolumes_reparent(trans, subvolid) ?:
+ commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ __bch2_subvolume_delete(trans, subvolid));
+}
+
+static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs,
+ snapshot_wait_for_pagecache_and_delete_work);
+ snapshot_id_list s;
+ u32 *id;
+ int ret = 0;
+
+ while (!ret) {
+ mutex_lock(&c->snapshots_unlinked_lock);
+ s = c->snapshots_unlinked;
+ darray_init(&c->snapshots_unlinked);
+ mutex_unlock(&c->snapshots_unlinked_lock);
+
+ if (!s.nr)
+ break;
+
+ bch2_evict_subvolume_inodes(c, &s);
+
+ for (id = s.data; id < s.data + s.nr; id++) {
+ ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id));
+ bch_err_msg(c, ret, "deleting subvolume %u", *id);
+ if (ret)
+ break;
+ }
+
+ darray_exit(&s);
+ }
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
+}
+
+struct subvolume_unlink_hook {
+ struct btree_trans_commit_hook h;
+ u32 subvol;
+};
+
+static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
+ struct btree_trans_commit_hook *_h)
+{
+ struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h);
+ struct bch_fs *c = trans->c;
+ int ret = 0;
+
+ mutex_lock(&c->snapshots_unlinked_lock);
+ if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
+ ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol);
+ mutex_unlock(&c->snapshots_unlinked_lock);
+
+ if (ret)
+ return ret;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache))
+ return -EROFS;
+
+ if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
+ return 0;
+}
+
+int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
+{
+ struct btree_iter iter;
+ struct bkey_i_subvolume *n;
+ struct subvolume_unlink_hook *h;
+ int ret = 0;
+
+ h = bch2_trans_kmalloc(trans, sizeof(*h));
+ ret = PTR_ERR_OR_ZERO(h);
+ if (ret)
+ return ret;
+
+ h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook;
+ h->subvol = subvolid;
+ bch2_trans_commit_hook(trans, &h->h);
+
+ n = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, subvolid),
+ BTREE_ITER_cached, subvolume);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (unlikely(ret)) {
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+ "missing subvolume %u", subvolid);
+ return ret;
+ }
+
+ SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
+ u32 parent_subvolid,
+ u32 src_subvolid,
+ u32 *new_subvolid,
+ u32 *new_snapshotid,
+ bool ro)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL };
+ struct bkey_i_subvolume *new_subvol = NULL;
+ struct bkey_i_subvolume *src_subvol = NULL;
+ u32 parent = 0, new_nodes[2], snapshot_subvols[2];
+ int ret = 0;
+
+ ret = bch2_bkey_get_empty_slot(trans, &dst_iter,
+ BTREE_ID_subvolumes, POS(0, U32_MAX));
+ if (ret == -BCH_ERR_ENOSPC_btree_slot)
+ ret = -BCH_ERR_ENOSPC_subvolume_create;
+ if (ret)
+ return ret;
+
+ snapshot_subvols[0] = dst_iter.pos.offset;
+ snapshot_subvols[1] = src_subvolid;
+
+ if (src_subvolid) {
+ /* Creating a snapshot: */
+
+ src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter,
+ BTREE_ID_subvolumes, POS(0, src_subvolid),
+ BTREE_ITER_cached, subvolume);
+ ret = PTR_ERR_OR_ZERO(src_subvol);
+ if (unlikely(ret)) {
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+ "subvolume %u not found", src_subvolid);
+ goto err;
+ }
+
+ parent = le32_to_cpu(src_subvol->v.snapshot);
+ }
+
+ ret = bch2_snapshot_node_create(trans, parent, new_nodes,
+ snapshot_subvols,
+ src_subvolid ? 2 : 1);
+ if (ret)
+ goto err;
+
+ if (src_subvolid) {
+ src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]);
+ ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0);
+ if (ret)
+ goto err;
+ }
+
+ new_subvol = bch2_bkey_alloc(trans, &dst_iter, 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(new_subvol);
+ if (ret)
+ goto err;
+
+ new_subvol->v.flags = 0;
+ new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]);
+ new_subvol->v.inode = cpu_to_le64(inode);
+ new_subvol->v.creation_parent = cpu_to_le32(src_subvolid);
+ new_subvol->v.fs_path_parent = cpu_to_le32(parent_subvolid);
+ new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c));
+ new_subvol->v.otime.hi = 0;
+
+ SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
+ SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
+
+ *new_subvolid = new_subvol->k.p.offset;
+ *new_snapshotid = new_nodes[0];
+err:
+ bch2_trans_iter_exit(trans, &src_iter);
+ bch2_trans_iter_exit(trans, &dst_iter);
+ return ret;
+}
+
+int bch2_initialize_subvolumes(struct bch_fs *c)
+{
+ struct bkey_i_snapshot_tree root_tree;
+ struct bkey_i_snapshot root_snapshot;
+ struct bkey_i_subvolume root_volume;
+ int ret;
+
+ bkey_snapshot_tree_init(&root_tree.k_i);
+ root_tree.k.p.offset = 1;
+ root_tree.v.master_subvol = cpu_to_le32(1);
+ root_tree.v.root_snapshot = cpu_to_le32(U32_MAX);
+
+ bkey_snapshot_init(&root_snapshot.k_i);
+ root_snapshot.k.p.offset = U32_MAX;
+ root_snapshot.v.flags = 0;
+ root_snapshot.v.parent = 0;
+ root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
+ root_snapshot.v.tree = cpu_to_le32(1);
+ SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
+
+ bkey_subvolume_init(&root_volume.k_i);
+ root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+ root_volume.v.flags = 0;
+ root_volume.v.snapshot = cpu_to_le32(U32_MAX);
+ root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);
+
+ ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0, 0) ?:
+ bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0, 0) ?:
+ bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0, 0);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_inode_unpacked inode;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (!bkey_is_inode(k.k)) {
+ bch_err(trans->c, "root inode not found");
+ ret = -BCH_ERR_ENOENT_inode;
+ goto err;
+ }
+
+ ret = bch2_inode_unpack(k, &inode);
+ BUG_ON(ret);
+
+ inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
+
+ ret = bch2_inode_write(trans, &iter, &inode);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/* set bi_subvol on root inode */
+int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
+{
+ int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ __bch2_fs_upgrade_for_subvolumes(trans));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_fs_subvolumes_init(struct bch_fs *c)
+{
+ INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
+ INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
+ bch2_subvolume_wait_for_pagecache_and_delete);
+ mutex_init(&c->snapshots_unlinked_lock);
+ return 0;
+}
diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h
new file mode 100644
index 00000000..d53d292c
--- /dev/null
+++ b/libbcachefs/subvolume.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_H
+#define _BCACHEFS_SUBVOLUME_H
+
+#include "darray.h"
+#include "subvolume_types.h"
+
+int bch2_check_subvols(struct bch_fs *);
+int bch2_check_subvol_children(struct bch_fs *);
+
+int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s,
+ enum btree_iter_update_trigger_flags);
+
+#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \
+ .key_validate = bch2_subvolume_validate, \
+ .val_to_text = bch2_subvolume_to_text, \
+ .trigger = bch2_subvolume_trigger, \
+ .min_val_size = 16, \
+})
+
+int bch2_subvol_has_children(struct btree_trans *, u32);
+int bch2_subvolume_get(struct btree_trans *, unsigned,
+ bool, int, struct bch_subvolume *);
+int __bch2_subvolume_get_snapshot(struct btree_trans *, u32,
+ u32 *, bool);
+int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
+
+int bch2_subvol_is_ro_trans(struct btree_trans *, u32);
+int bch2_subvol_is_ro(struct bch_fs *, u32);
+
+static inline struct bkey_s_c
+bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos end,
+ u32 subvolid, unsigned flags)
+{
+ u32 snapshot;
+ int ret = bch2_subvolume_get_snapshot(iter->trans, subvolid, &snapshot);
+ if (ret)
+ return bkey_s_c_err(ret);
+
+ bch2_btree_iter_set_snapshot(iter, snapshot);
+ return bch2_btree_iter_peek_max_type(iter, end, flags);
+}
+
+#define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \
+ _end, _subvolid, _flags, _k, _do) \
+({ \
+ struct bkey_s_c _k; \
+ int _ret3 = 0; \
+ \
+ do { \
+ _ret3 = lockrestart_do(_trans, ({ \
+ (_k) = bch2_btree_iter_peek_in_subvolume_max_type(&(_iter), \
+ _end, _subvolid, (_flags)); \
+ if (!(_k).k) \
+ break; \
+ \
+ bkey_err(_k) ?: (_do); \
+ })); \
+ } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \
+ \
+ bch2_trans_iter_exit((_trans), &(_iter)); \
+ _ret3; \
+})
+
+#define for_each_btree_key_in_subvolume_max(_trans, _iter, _btree_id, \
+ _start, _end, _subvolid, _flags, _k, _do) \
+({ \
+ struct btree_iter _iter; \
+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ \
+ for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \
+ _end, _subvolid, _flags, _k, _do); \
+})
+
+int bch2_delete_dead_snapshots(struct bch_fs *);
+void bch2_delete_dead_snapshots_async(struct bch_fs *);
+
+int bch2_subvolume_unlink(struct btree_trans *, u32);
+int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool);
+
+int bch2_initialize_subvolumes(struct bch_fs *);
+int bch2_fs_upgrade_for_subvolumes(struct bch_fs *);
+
+int bch2_fs_subvolumes_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_SUBVOLUME_H */
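
A note on the for_each_btree_key_in_subvolume_max() helper defined above: it
resolves the subvolume ID to its snapshot, then iterates keys up to _end,
declaring and exiting its own iterator and bkey. A minimal usage sketch, not
part of this patch (the function name and the extent-counting body are
illustrative only):

	static int count_subvol_extents(struct btree_trans *trans,
					subvol_inum inum, u64 *nr)
	{
		/* iter and k are declared by the macro; _do must yield an int */
		return for_each_btree_key_in_subvolume_max(trans, iter,
				BTREE_ID_extents,
				POS(inum.inum, 0),
				POS(inum.inum, U64_MAX),
				inum.subvol, 0, k, ({
			(*nr)++;
			0;
		}));
	}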
diff --git a/libbcachefs/subvolume_format.h b/libbcachefs/subvolume_format.h
new file mode 100644
index 00000000..e029df7b
--- /dev/null
+++ b/libbcachefs/subvolume_format.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_FORMAT_H
+#define _BCACHEFS_SUBVOLUME_FORMAT_H
+
+#define SUBVOL_POS_MIN POS(0, 1)
+#define SUBVOL_POS_MAX POS(0, S32_MAX)
+#define BCACHEFS_ROOT_SUBVOL 1
+
+struct bch_subvolume {
+ struct bch_val v;
+ __le32 flags;
+ __le32 snapshot;
+ __le64 inode;
+ /*
+ * Snapshot subvolumes form a tree, separate from the snapshot nodes
+ * tree - if this subvolume is a snapshot, this is the ID of the
+ * subvolume it was created from:
+ *
+ * This is _not_ necessarily the subvolume of the directory containing
+ * this subvolume:
+ */
+ __le32 creation_parent;
+ __le32 fs_path_parent;
+ bch_le128 otime;
+};
+
+LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1)
+/*
+ * We need to know whether a subvolume is a snapshot so we can know whether we
+ * can delete it (or whether it should just be rm -rf'd)
+ */
+LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2)
+LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3)
+
+#endif /* _BCACHEFS_SUBVOLUME_FORMAT_H */
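
The LE32_BITMASK() lines above generate paired accessors over the little-endian
flags word: BCH_SUBVOLUME_RO() reads bit 0 and SET_BCH_SUBVOLUME_RO() writes it,
which is how bch2_subvolume_create() sets BCH_SUBVOLUME_RO/SNAP and
bch2_subvolume_unlink() sets BCH_SUBVOLUME_UNLINKED earlier in this patch.
Illustrative sketch only (the helper name is hypothetical):

	static void subvolume_set_ro(struct bkey_i_subvolume *s, bool ro)
	{
		/* the generated accessors handle the le32 byte order */
		if (BCH_SUBVOLUME_RO(&s->v) != ro)
			SET_BCH_SUBVOLUME_RO(&s->v, ro);
	}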
diff --git a/libbcachefs/subvolume_types.h b/libbcachefs/subvolume_types.h
new file mode 100644
index 00000000..f2ec4277
--- /dev/null
+++ b/libbcachefs/subvolume_types.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_TYPES_H
+#define _BCACHEFS_SUBVOLUME_TYPES_H
+
+#include "darray.h"
+
+typedef DARRAY(u32) snapshot_id_list;
+
+#define IS_ANCESTOR_BITMAP 128
+
+struct snapshot_t {
+ u32 parent;
+ u32 skip[3];
+ u32 depth;
+ u32 children[2];
+ u32 subvol; /* Nonzero only if a subvolume points to this node: */
+ u32 tree;
+ u32 equiv;
+ unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)];
+};
+
+struct snapshot_table {
+ struct rcu_head rcu;
+ size_t nr;
+#ifndef RUST_BINDGEN
+ DECLARE_FLEX_ARRAY(struct snapshot_t, s);
+#else
+ struct snapshot_t s[0];
+#endif
+};
+
+typedef struct {
+ /* we can't have padding in this struct: */
+ u64 subvol;
+ u64 inum;
+} subvol_inum;
+
+#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */
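
subvol_inum pairs a subvolume ID with an inode number (and, per the comment
above, must stay free of padding). A hedged sketch of how a caller might pair
the outputs of bch2_subvolume_create() with the root inode it passed in; the
function name is hypothetical and transaction/commit handling is omitted:

	static int create_subvol_sketch(struct btree_trans *trans, u64 root_inum,
					u32 parent_subvolid, subvol_inum *out)
	{
		u32 subvolid, snapshotid;
		int ret = bch2_subvolume_create(trans, root_inum, parent_subvolid,
						0,	/* src_subvolid: 0 = not a snapshot */
						&subvolid, &snapshotid,
						false);	/* create read-write */
		if (ret)
			return ret;

		*out = (subvol_inum) { .subvol = subvolid, .inum = root_inum };
		return 0;
	}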
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index 54de9fac..6a086c1c 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -1,18 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "checksum.h"
#include "disk_groups.h"
+#include "ec.h"
#include "error.h"
-#include "io.h"
#include "journal.h"
+#include "journal_sb.h"
+#include "journal_seq_blacklist.h"
+#include "recovery_passes.h"
#include "replicas.h"
#include "quota.h"
+#include "sb-clean.h"
+#include "sb-counters.h"
+#include "sb-downgrade.h"
+#include "sb-errors.h"
+#include "sb-members.h"
#include "super-io.h"
#include "super.h"
+#include "trace.h"
#include "vstructs.h"
#include <linux/backing-dev.h>
#include <linux/sort.h>
+#include <linux/string_choices.h>
+
+static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
+};
+
+struct bch2_metadata_version {
+ u16 version;
+ const char *name;
+};
+
+static const struct bch2_metadata_version bch2_metadata_versions[] = {
+#define x(n, v) { \
+ .version = v, \
+ .name = #n, \
+},
+ BCH_METADATA_VERSIONS()
+#undef x
+};
+
+void bch2_version_to_text(struct printbuf *out, unsigned v)
+{
+ const char *str = "(unknown version)";
+
+ for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++)
+ if (bch2_metadata_versions[i].version == v) {
+ str = bch2_metadata_versions[i].name;
+ break;
+ }
+
+ prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str);
+}
+
+unsigned bch2_latest_compatible_version(unsigned v)
+{
+ if (!BCH_VERSION_MAJOR(v))
+ return v;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++)
+ if (bch2_metadata_versions[i].version > v &&
+ BCH_VERSION_MAJOR(bch2_metadata_versions[i].version) ==
+ BCH_VERSION_MAJOR(v))
+ v = bch2_metadata_versions[i].version;
+
+ return v;
+}
const char * const bch2_sb_fields[] = {
#define x(name, nr) #name,
@@ -21,14 +76,12 @@ const char * const bch2_sb_fields[] = {
NULL
};
-static const char *bch2_sb_field_validate(struct bch_sb *,
- struct bch_sb_field *);
+static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *,
+ enum bch_validate_flags, struct printbuf *);
-struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb,
+struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb,
enum bch_sb_field_type type)
{
- struct bch_sb_field *f;
-
/* XXX: need locking around superblock to access optional fields */
vstruct_for_each(sb, f)
@@ -44,10 +97,11 @@ static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb,
unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;
unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s;
- BUG_ON(get_order(__vstruct_bytes(struct bch_sb, sb_u64s)) >
- sb->page_order);
+ BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size);
- if (!f) {
+ if (!f && !u64s) {
+ /* nothing to do: */
+ } else if (!f) {
f = vstruct_last(sb->sb);
memset(f, 0, sizeof(u64) * u64s);
f->u64s = cpu_to_le32(u64s);
@@ -56,8 +110,13 @@ static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb,
void *src, *dst;
src = vstruct_end(f);
- f->u64s = cpu_to_le32(u64s);
- dst = vstruct_end(f);
+
+ if (u64s) {
+ f->u64s = cpu_to_le32(u64s);
+ dst = vstruct_end(f);
+ } else {
+ dst = f;
+ }
memmove(dst, src, vstruct_end(sb->sb) - src);
@@ -67,80 +126,96 @@ static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb,
sb->sb->u64s = cpu_to_le32(sb_u64s);
- return f;
+ return u64s ? f : NULL;
+}
+
+void bch2_sb_field_delete(struct bch_sb_handle *sb,
+ enum bch_sb_field_type type)
+{
+ struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
+
+ if (f)
+ __bch2_sb_field_resize(sb, f, 0);
}
/* Superblock realloc/free: */
void bch2_free_super(struct bch_sb_handle *sb)
{
- if (sb->bio)
- bio_put(sb->bio);
- if (!IS_ERR_OR_NULL(sb->bdev))
- blkdev_put(sb->bdev, sb->mode);
+ kfree(sb->bio);
+ if (!IS_ERR_OR_NULL(sb->s_bdev_file))
+ bdev_fput(sb->s_bdev_file);
+ kfree(sb->holder);
+ kfree(sb->sb_name);
- free_pages((unsigned long) sb->sb, sb->page_order);
+ kfree(sb->sb);
memset(sb, 0, sizeof(*sb));
}
int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
{
size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s);
- unsigned order = get_order(new_bytes);
+ size_t new_buffer_size;
struct bch_sb *new_sb;
struct bio *bio;
- if (sb->sb && sb->page_order >= order)
+ if (sb->bdev)
+ new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev));
+
+ new_buffer_size = roundup_pow_of_two(new_bytes);
+
+ if (sb->sb && sb->buffer_size >= new_buffer_size)
return 0;
- if (sb->have_layout) {
+ if (sb->sb && sb->have_layout) {
u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
if (new_bytes > max_bytes) {
- char buf[BDEVNAME_SIZE];
+ struct printbuf buf = PRINTBUF;
- pr_err("%s: superblock too big: want %zu but have %llu",
- bdevname(sb->bdev, buf), new_bytes, max_bytes);
- return -ENOSPC;
+ prt_bdevname(&buf, sb->bdev);
+ prt_printf(&buf, ": superblock too big: want %zu but have %llu", new_bytes, max_bytes);
+ pr_err("%s", buf.buf);
+ printbuf_exit(&buf);
+ return -BCH_ERR_ENOSPC_sb;
}
}
- if (sb->page_order >= order && sb->sb)
+ if (sb->buffer_size >= new_buffer_size && sb->sb)
return 0;
if (dynamic_fault("bcachefs:add:super_realloc"))
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_sb_realloc_injected;
+
+ new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO);
+ if (!new_sb)
+ return -BCH_ERR_ENOMEM_sb_buf_realloc;
+
+ sb->sb = new_sb;
if (sb->have_bio) {
- bio = bio_kmalloc(GFP_KERNEL, 1 << order);
+ unsigned nr_bvecs = buf_pages(sb->sb, new_buffer_size);
+
+ bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
if (!bio)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_sb_bio_realloc;
- if (sb->bio)
- bio_put(sb->bio);
+ bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0);
+
+ kfree(sb->bio);
sb->bio = bio;
}
- new_sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
- if (!new_sb)
- return -ENOMEM;
-
- if (sb->sb)
- memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order);
-
- free_pages((unsigned long) sb->sb, sb->page_order);
- sb->sb = new_sb;
-
- sb->page_order = order;
+ sb->buffer_size = new_buffer_size;
return 0;
}
-struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
+struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
enum bch_sb_field_type type,
unsigned u64s)
{
- struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type);
+ struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
ssize_t d = -old_u64s + u64s;
@@ -149,51 +224,74 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
if (sb->fs_sb) {
struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb);
- struct bch_dev *ca;
- unsigned i;
lockdep_assert_held(&c->sb_lock);
 /* XXX: we're not checking that offline devices have enough space */
- for_each_online_member(ca, c, i) {
- struct bch_sb_handle *sb = &ca->disk_sb;
+ for_each_online_member(c, ca) {
+ struct bch_sb_handle *dev_sb = &ca->disk_sb;
- if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
- percpu_ref_put(&ca->ref);
+ if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
+ percpu_ref_put(&ca->io_ref);
return NULL;
}
}
}
+ f = bch2_sb_field_get_id(sb->sb, type);
f = __bch2_sb_field_resize(sb, f, u64s);
- f->type = cpu_to_le32(type);
+ if (f)
+ f->type = cpu_to_le32(type);
return f;
}
-/* Superblock validate: */
-
-static inline void __bch2_sb_layout_size_assert(void)
+struct bch_sb_field *bch2_sb_field_get_minsize_id(struct bch_sb_handle *sb,
+ enum bch_sb_field_type type,
+ unsigned u64s)
{
- BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
+ struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
+
+ if (!f || le32_to_cpu(f->u64s) < u64s)
+ f = bch2_sb_field_resize_id(sb, type, u64s);
+ return f;
}
-static const char *validate_sb_layout(struct bch_sb_layout *layout)
+/* Superblock validate: */
+
+static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out)
{
u64 offset, prev_offset, max_sectors;
unsigned i;
- if (uuid_le_cmp(layout->magic, BCACHE_MAGIC))
- return "Not a bcachefs superblock layout";
+ BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
+
+ if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) &&
+ !uuid_equal(&layout->magic, &BCHFS_MAGIC)) {
+ prt_printf(out, "Not a bcachefs superblock layout");
+ return -BCH_ERR_invalid_sb_layout;
+ }
- if (layout->layout_type != 0)
- return "Invalid superblock layout type";
+ if (layout->layout_type != 0) {
+ prt_printf(out, "Invalid superblock layout type %u",
+ layout->layout_type);
+ return -BCH_ERR_invalid_sb_layout_type;
+ }
+
+ if (!layout->nr_superblocks) {
+ prt_printf(out, "Invalid superblock layout: no superblocks");
+ return -BCH_ERR_invalid_sb_layout_nr_superblocks;
+ }
- if (!layout->nr_superblocks)
- return "Invalid superblock layout: no superblocks";
+ if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) {
+ prt_printf(out, "Invalid superblock layout: too many superblocks");
+ return -BCH_ERR_invalid_sb_layout_nr_superblocks;
+ }
- if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset))
- return "Invalid superblock layout: too many superblocks";
+ if (layout->sb_max_size_bits > BCH_SB_LAYOUT_SIZE_BITS_MAX) {
+ prt_printf(out, "Invalid superblock layout: max_size_bits too high");
+ return -BCH_ERR_invalid_sb_layout_sb_max_size_bits;
+ }
max_sectors = 1 << layout->sb_max_size_bits;
@@ -202,160 +300,266 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout)
for (i = 1; i < layout->nr_superblocks; i++) {
offset = le64_to_cpu(layout->sb_offset[i]);
- if (offset < prev_offset + max_sectors)
- return "Invalid superblock layout: superblocks overlap";
+ if (offset < prev_offset + max_sectors) {
+ prt_printf(out, "Invalid superblock layout: superblocks overlap\n"
+ " (sb %u ends at %llu next starts at %llu",
+ i - 1, prev_offset + max_sectors, offset);
+ return -BCH_ERR_invalid_sb_layout_superblocks_overlap;
+ }
prev_offset = offset;
}
- return NULL;
+ return 0;
}
-const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
+static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
{
- struct bch_sb *sb = disk_sb->sb;
- struct bch_sb_field *f;
- struct bch_sb_field_members *mi;
- const char *err;
- u16 block_size;
+ u16 version = le16_to_cpu(sb->version);
+ u16 version_min = le16_to_cpu(sb->version_min);
+
+ if (!bch2_version_compatible(version)) {
+ prt_str(out, "Unsupported superblock version ");
+ bch2_version_to_text(out, version);
+ prt_str(out, " (min ");
+ bch2_version_to_text(out, bcachefs_metadata_version_min);
+ prt_str(out, ", max ");
+ bch2_version_to_text(out, bcachefs_metadata_version_current);
+ prt_str(out, ")");
+ return -BCH_ERR_invalid_sb_version;
+ }
- if (le64_to_cpu(sb->version) < BCH_SB_VERSION_MIN ||
- le64_to_cpu(sb->version) > BCH_SB_VERSION_MAX)
- return"Unsupported superblock version";
+ if (!bch2_version_compatible(version_min)) {
+ prt_str(out, "Unsupported superblock version_min ");
+ bch2_version_to_text(out, version_min);
+ prt_str(out, " (min ");
+ bch2_version_to_text(out, bcachefs_metadata_version_min);
+ prt_str(out, ", max ");
+ bch2_version_to_text(out, bcachefs_metadata_version_current);
+ prt_str(out, ")");
+ return -BCH_ERR_invalid_sb_version;
+ }
- if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX) {
- SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb, 7);
- SET_BCH_SB_POSIX_ACL(sb, 1);
+ if (version_min > version) {
+ prt_str(out, "Bad minimum version ");
+ bch2_version_to_text(out, version_min);
+ prt_str(out, ", greater than version field ");
+ bch2_version_to_text(out, version);
+ return -BCH_ERR_invalid_sb_version;
}
- block_size = le16_to_cpu(sb->block_size);
+ return 0;
+}
- if (!is_power_of_2(block_size) ||
- block_size > PAGE_SECTORS)
- return "Bad block size";
+static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
+ enum bch_validate_flags flags, struct printbuf *out)
+{
+ struct bch_sb *sb = disk_sb->sb;
+ struct bch_sb_field_members_v1 *mi;
+ enum bch_opt_id opt_id;
+ u16 block_size;
+ int ret;
- if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le)))
- return "Bad user UUID";
+ ret = bch2_sb_compatible(sb, out);
+ if (ret)
+ return ret;
- if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le)))
- return "Bad internal UUID";
+ if (sb->features[1] ||
+ (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) {
+ prt_printf(out, "Filesystem has incompatible features");
+ return -BCH_ERR_invalid_sb_features;
+ }
- if (!sb->nr_devices ||
- sb->nr_devices <= sb->dev_idx ||
- sb->nr_devices > BCH_SB_MEMBERS_MAX)
- return "Bad number of member devices";
+ block_size = le16_to_cpu(sb->block_size);
- if (!BCH_SB_META_REPLICAS_WANT(sb) ||
- BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
- return "Invalid number of metadata replicas";
+ if (block_size > PAGE_SECTORS) {
+ prt_printf(out, "Block size too big (got %u, max %u)",
+ block_size, PAGE_SECTORS);
+ return -BCH_ERR_invalid_sb_block_size;
+ }
- if (!BCH_SB_META_REPLICAS_REQ(sb) ||
- BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
- return "Invalid number of metadata replicas";
+ if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) {
+ prt_printf(out, "Bad user UUID (got zeroes)");
+ return -BCH_ERR_invalid_sb_uuid;
+ }
- if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
- BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
- return "Invalid number of data replicas";
+ if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) {
+ prt_printf(out, "Bad internal UUID (got zeroes)");
+ return -BCH_ERR_invalid_sb_uuid;
+ }
- if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
- BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
- return "Invalid number of data replicas";
+ if (!sb->nr_devices ||
+ sb->nr_devices > BCH_SB_MEMBERS_MAX) {
+ prt_printf(out, "Bad number of member devices %u (max %u)",
+ sb->nr_devices, BCH_SB_MEMBERS_MAX);
+ return -BCH_ERR_invalid_sb_too_many_members;
+ }
- if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
- return "Invalid metadata checksum type";
+ if (sb->dev_idx >= sb->nr_devices) {
+ prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)",
+ sb->dev_idx, sb->nr_devices);
+ return -BCH_ERR_invalid_sb_dev_idx;
+ }
- if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
- return "Invalid metadata checksum type";
+ if (!sb->time_precision ||
+ le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) {
+ prt_printf(out, "Invalid time precision: %u (min 1, max %lu)",
+ le32_to_cpu(sb->time_precision), NSEC_PER_SEC);
+ return -BCH_ERR_invalid_sb_time_precision;
+ }
- if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR)
- return "Invalid compression type";
+ if (!flags) {
+ /*
+ * Been seeing a bug where these are getting inexplicably
+ * zeroed, so we're now validating them, but we have to be
+ * careful not to prevent people's filesystems from mounting:
+ */
+ if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb))
+ SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);
+ if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
+ SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000);
+
+ if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb))
+ SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version));
+
+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2 &&
+ !BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb))
+ SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, 30);
+
+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2)
+ SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
+ }
- if (!BCH_SB_BTREE_NODE_SIZE(sb))
- return "Btree node size not set";
+ for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
+ const struct bch_option *opt = bch2_opt_table + opt_id;
- if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb)))
- return "Btree node size not a power of two";
+ if (opt->get_sb != BCH2_NO_SB_OPT) {
+ u64 v = bch2_opt_from_sb(sb, opt_id);
- if (BCH_SB_GC_RESERVE(sb) < 5)
- return "gc reserve percentage too small";
+ prt_printf(out, "Invalid option ");
+ ret = bch2_opt_validate(opt, v, out);
+ if (ret)
+ return ret;
- if (!sb->time_precision ||
- le32_to_cpu(sb->time_precision) > NSEC_PER_SEC)
- return "invalid time precision";
+ printbuf_reset(out);
+ }
+ }
/* validate layout */
- err = validate_sb_layout(&sb->layout);
- if (err)
- return err;
+ ret = validate_sb_layout(&sb->layout, out);
+ if (ret)
+ return ret;
vstruct_for_each(sb, f) {
- if (!f->u64s)
- return "Invalid superblock: invalid optional field";
+ if (!f->u64s) {
+ prt_printf(out, "Invalid superblock: optional field with size 0 (type %u)",
+ le32_to_cpu(f->type));
+ return -BCH_ERR_invalid_sb_field_size;
+ }
- if (vstruct_next(f) > vstruct_last(sb))
- return "Invalid superblock: invalid optional field";
+ if (vstruct_next(f) > vstruct_last(sb)) {
+ prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)",
+ le32_to_cpu(f->type));
+ return -BCH_ERR_invalid_sb_field_size;
+ }
}
/* members must be validated first: */
- mi = bch2_sb_get_members(sb);
- if (!mi)
- return "Invalid superblock: member info area missing";
+ mi = bch2_sb_field_get(sb, members_v1);
+ if (!mi) {
+ prt_printf(out, "Invalid superblock: member info area missing");
+ return -BCH_ERR_invalid_sb_members_missing;
+ }
- err = bch2_sb_field_validate(sb, &mi->field);
- if (err)
- return err;
+ ret = bch2_sb_field_validate(sb, &mi->field, flags, out);
+ if (ret)
+ return ret;
vstruct_for_each(sb, f) {
- if (le32_to_cpu(f->type) == BCH_SB_FIELD_members)
+ if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1)
continue;
- err = bch2_sb_field_validate(sb, f);
- if (err)
- return err;
+ ret = bch2_sb_field_validate(sb, f, flags, out);
+ if (ret)
+ return ret;
}
- if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_NONCE_V1 &&
- bch2_sb_get_crypt(sb) &&
- BCH_SB_INITIALIZED(sb))
- return "Incompatible extent nonces";
-
- sb->version = cpu_to_le64(BCH_SB_VERSION_MAX);
+ if ((flags & BCH_VALIDATE_write) &&
+ bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) {
+ prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu",
+ le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq),
+ le64_to_cpu(sb->seq));
+ return -BCH_ERR_invalid_sb_members_missing;
+ }
- return NULL;
+ return 0;
}
/* device open: */
+static unsigned long le_ulong_to_cpu(unsigned long v)
+{
+ return sizeof(unsigned long) == 8
+ ? le64_to_cpu(v)
+ : le32_to_cpu(v);
+}
+
+static void le_bitvector_to_cpu(unsigned long *dst, unsigned long *src, unsigned nr)
+{
+ BUG_ON(nr & (BITS_PER_TYPE(long) - 1));
+
+ for (unsigned i = 0; i < BITS_TO_LONGS(nr); i++)
+ dst[i] = le_ulong_to_cpu(src[i]);
+}
+
static void bch2_sb_update(struct bch_fs *c)
{
struct bch_sb *src = c->disk_sb.sb;
- struct bch_sb_field_members *mi = bch2_sb_get_members(src);
- struct bch_dev *ca;
- unsigned i;
lockdep_assert_held(&c->sb_lock);
c->sb.uuid = src->uuid;
c->sb.user_uuid = src->user_uuid;
+ c->sb.version = le16_to_cpu(src->version);
+ c->sb.version_min = le16_to_cpu(src->version_min);
+ c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src);
c->sb.nr_devices = src->nr_devices;
c->sb.clean = BCH_SB_CLEAN(src);
c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src);
- c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src);
- c->sb.time_base_lo = le64_to_cpu(src->time_base_lo);
+
+ c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision);
+ c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit;
+
+ /* XXX this is wrong, we need a 96 or 128 bit integer type */
+ c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo),
+ c->sb.nsec_per_time_unit);
c->sb.time_base_hi = le32_to_cpu(src->time_base_hi);
- c->sb.time_precision = le32_to_cpu(src->time_precision);
+
c->sb.features = le64_to_cpu(src->features[0]);
+ c->sb.compat = le64_to_cpu(src->compat[0]);
+
+ memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent));
- for_each_member_device(ca, c, i)
- ca->mi = bch2_mi_to_cpu(mi->members + i);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext);
+ if (ext) {
+ le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
+ sizeof(c->sb.errors_silent) * 8);
+ c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data);
+ }
+
+ for_each_member_device(c, ca) {
+ struct bch_member m = bch2_sb_member_get(src, ca->dev_idx);
+ ca->mi = bch2_mi_to_cpu(&m);
+ }
}
-/* doesn't copy member info */
-static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
+static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
{
struct bch_sb_field *src_f, *dst_f;
struct bch_sb *dst = dst_handle->sb;
+ unsigned i;
dst->version = src->version;
+ dst->version_min = src->version_min;
dst->seq = src->seq;
dst->uuid = src->uuid;
dst->user_uuid = src->user_uuid;
@@ -367,46 +571,54 @@ static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
dst->time_base_lo = src->time_base_lo;
dst->time_base_hi = src->time_base_hi;
dst->time_precision = src->time_precision;
+ dst->write_time = src->write_time;
memcpy(dst->flags, src->flags, sizeof(dst->flags));
memcpy(dst->features, src->features, sizeof(dst->features));
memcpy(dst->compat, src->compat, sizeof(dst->compat));
- vstruct_for_each(src, src_f) {
- if (src_f->type == BCH_SB_FIELD_journal)
+ for (i = 0; i < BCH_SB_FIELD_NR; i++) {
+ int d;
+
+ if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS)
continue;
- dst_f = bch2_sb_field_get(dst, le32_to_cpu(src_f->type));
+ src_f = bch2_sb_field_get_id(src, i);
+ dst_f = bch2_sb_field_get_id(dst, i);
+
+ d = (src_f ? le32_to_cpu(src_f->u64s) : 0) -
+ (dst_f ? le32_to_cpu(dst_f->u64s) : 0);
+ if (d > 0) {
+ int ret = bch2_sb_realloc(dst_handle,
+ le32_to_cpu(dst_handle->sb->u64s) + d);
+
+ if (ret)
+ return ret;
+
+ dst = dst_handle->sb;
+ dst_f = bch2_sb_field_get_id(dst, i);
+ }
+
dst_f = __bch2_sb_field_resize(dst_handle, dst_f,
- le32_to_cpu(src_f->u64s));
+ src_f ? le32_to_cpu(src_f->u64s) : 0);
- memcpy(dst_f, src_f, vstruct_bytes(src_f));
+ if (src_f)
+ memcpy(dst_f, src_f, vstruct_bytes(src_f));
}
+
+ return 0;
}
int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
{
- struct bch_sb_field_journal *journal_buckets =
- bch2_sb_get_journal(src);
- unsigned journal_u64s = journal_buckets
- ? le32_to_cpu(journal_buckets->field.u64s)
- : 0;
int ret;
lockdep_assert_held(&c->sb_lock);
- ret = bch2_sb_realloc(&c->disk_sb,
- le32_to_cpu(src->u64s) - journal_u64s);
- if (ret)
- return ret;
-
- __copy_super(&c->disk_sb, src);
-
- ret = bch2_sb_replicas_to_cpu_replicas(c);
- if (ret)
- return ret;
-
- ret = bch2_sb_disk_groups_to_cpu(c);
+ ret = bch2_sb_realloc(&c->disk_sb, 0) ?:
+ __copy_super(&c->disk_sb, src) ?:
+ bch2_sb_replicas_to_cpu_replicas(c) ?:
+ bch2_sb_disk_groups_to_cpu(c);
if (ret)
return ret;
@@ -416,188 +628,245 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
{
- struct bch_sb *src = c->disk_sb.sb, *dst = ca->disk_sb.sb;
- struct bch_sb_field_journal *journal_buckets =
- bch2_sb_get_journal(dst);
- unsigned journal_u64s = journal_buckets
- ? le32_to_cpu(journal_buckets->field.u64s)
- : 0;
- unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s;
- int ret;
-
- ret = bch2_sb_realloc(&ca->disk_sb, u64s);
- if (ret)
- return ret;
-
- __copy_super(&ca->disk_sb, src);
- return 0;
+ return __copy_super(&ca->disk_sb, c->disk_sb.sb);
}
/* read superblock: */
-static const char *read_one_super(struct bch_sb_handle *sb, u64 offset)
+static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err)
{
- struct bch_csum csum;
size_t bytes;
+ int ret;
reread:
- bio_reset(sb->bio);
- bio_set_dev(sb->bio, sb->bdev);
+ bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
sb->bio->bi_iter.bi_sector = offset;
- sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
- bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
- bch2_bio_map(sb->bio, sb->sb);
+ bch2_bio_map(sb->bio, sb->sb, sb->buffer_size);
- if (submit_bio_wait(sb->bio))
- return "IO error";
+ ret = submit_bio_wait(sb->bio);
+ if (ret) {
+ prt_printf(err, "IO error: %i", ret);
+ return ret;
+ }
- if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC))
- return "Not a bcachefs superblock";
+ if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) &&
+ !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) {
+ prt_str(err, "Not a bcachefs superblock (got magic ");
+ pr_uuid(err, sb->sb->magic.b);
+ prt_str(err, ")");
+ return -BCH_ERR_invalid_sb_magic;
+ }
- if (le64_to_cpu(sb->sb->version) < BCH_SB_VERSION_MIN ||
- le64_to_cpu(sb->sb->version) > BCH_SB_VERSION_MAX)
- return"Unsupported superblock version";
+ ret = bch2_sb_compatible(sb->sb, err);
+ if (ret)
+ return ret;
bytes = vstruct_bytes(sb->sb);
- if (bytes > 512 << sb->sb->layout.sb_max_size_bits)
- return "Bad superblock: too big";
+ u64 sb_size = 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits);
+ if (bytes > sb_size) {
+ prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %llu)",
+ bytes, sb_size);
+ return -BCH_ERR_invalid_sb_too_big;
+ }
- if (get_order(bytes) > sb->page_order) {
- if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s)))
- return "cannot allocate memory";
+ if (bytes > sb->buffer_size) {
+ ret = bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s));
+ if (ret)
+ return ret;
goto reread;
}
- if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR)
- return "unknown csum type";
+ enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb);
+ if (csum_type >= BCH_CSUM_NR ||
+ bch2_csum_type_is_encryption(csum_type)) {
+ prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
+ return -BCH_ERR_invalid_sb_csum_type;
+ }
/* XXX: verify MACs */
- csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
- null_nonce(), sb->sb);
+ struct bch_csum csum = csum_vstruct(NULL, csum_type, null_nonce(), sb->sb);
+ if (bch2_crc_cmp(csum, sb->sb->csum)) {
+ bch2_csum_err_msg(err, csum_type, sb->sb->csum, csum);
+ return -BCH_ERR_invalid_sb_csum;
+ }
- if (bch2_crc_cmp(csum, sb->sb->csum))
- return "bad checksum reading superblock";
+ sb->seq = le64_to_cpu(sb->sb->seq);
- return NULL;
+ return 0;
}
-int bch2_read_super(const char *path, struct bch_opts *opts,
- struct bch_sb_handle *sb)
+static int __bch2_read_super(const char *path, struct bch_opts *opts,
+ struct bch_sb_handle *sb, bool ignore_notbchfs_msg)
{
u64 offset = opt_get(*opts, sb);
struct bch_sb_layout layout;
- const char *err;
+ struct printbuf err = PRINTBUF;
+ struct printbuf err2 = PRINTBUF;
__le64 *i;
int ret;
-
- pr_verbose_init(*opts, "");
-
+#ifndef __KERNEL__
+retry:
+#endif
memset(sb, 0, sizeof(*sb));
- sb->mode = FMODE_READ;
+ sb->mode = BLK_OPEN_READ;
sb->have_bio = true;
+ sb->holder = kmalloc(1, GFP_KERNEL);
+ if (!sb->holder)
+ return -ENOMEM;
+
+ sb->sb_name = kstrdup(path, GFP_KERNEL);
+ if (!sb->sb_name) {
+ ret = -ENOMEM;
+ prt_printf(&err, "error allocating memory for sb_name");
+ goto err;
+ }
+
+#ifndef __KERNEL__
+ if (opt_get(*opts, direct_io) == false)
+ sb->mode |= BLK_OPEN_BUFFERED;
+#endif
if (!opt_get(*opts, noexcl))
- sb->mode |= FMODE_EXCL;
+ sb->mode |= BLK_OPEN_EXCL;
if (!opt_get(*opts, nochanges))
- sb->mode |= FMODE_WRITE;
+ sb->mode |= BLK_OPEN_WRITE;
- sb->bdev = blkdev_get_by_path(path, sb->mode, sb);
- if (IS_ERR(sb->bdev) &&
- PTR_ERR(sb->bdev) == -EACCES &&
+ sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+ if (IS_ERR(sb->s_bdev_file) &&
+ PTR_ERR(sb->s_bdev_file) == -EACCES &&
opt_get(*opts, read_only)) {
- sb->mode &= ~FMODE_WRITE;
+ sb->mode &= ~BLK_OPEN_WRITE;
- sb->bdev = blkdev_get_by_path(path, sb->mode, sb);
- if (!IS_ERR(sb->bdev))
+ sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+ if (!IS_ERR(sb->s_bdev_file))
opt_set(*opts, nochanges, true);
}
- if (IS_ERR(sb->bdev)) {
- ret = PTR_ERR(sb->bdev);
- goto out;
+ if (IS_ERR(sb->s_bdev_file)) {
+ ret = PTR_ERR(sb->s_bdev_file);
+ prt_printf(&err, "error opening %s: %s", path, bch2_err_str(ret));
+ goto err;
}
+ sb->bdev = file_bdev(sb->s_bdev_file);
- err = "cannot allocate memory";
ret = bch2_sb_realloc(sb, 0);
- if (ret)
+ if (ret) {
+ prt_printf(&err, "error allocating memory for superblock");
goto err;
+ }
- ret = -EFAULT;
- err = "dynamic fault";
- if (bch2_fs_init_fault("read_super"))
+ if (bch2_fs_init_fault("read_super")) {
+ prt_printf(&err, "dynamic fault");
+ ret = -EFAULT;
goto err;
+ }
- ret = -EINVAL;
- err = read_one_super(sb, offset);
- if (!err)
+ ret = read_one_super(sb, offset, &err);
+ if (!ret)
goto got_super;
if (opt_defined(*opts, sb))
goto err;
- pr_err("error reading default superblock: %s", err);
+ prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n",
+ path, err.buf);
+ if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg)
+ bch2_print_opts(opts, KERN_INFO "%s", err2.buf);
+ else
+ bch2_print_opts(opts, KERN_ERR "%s", err2.buf);
+
+ printbuf_exit(&err2);
+ printbuf_reset(&err);
/*
* Error reading primary superblock - read location of backup
* superblocks:
*/
- bio_reset(sb->bio);
- bio_set_dev(sb->bio, sb->bdev);
+ bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
- sb->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout);
- bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
/*
* use sb buffer to read layout, since sb buffer is page aligned but
* layout won't be:
*/
- bch2_bio_map(sb->bio, sb->sb);
+ bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout));
- err = "IO error";
- if (submit_bio_wait(sb->bio))
+ ret = submit_bio_wait(sb->bio);
+ if (ret) {
+ prt_printf(&err, "IO error: %i", ret);
goto err;
+ }
memcpy(&layout, sb->sb, sizeof(layout));
- err = validate_sb_layout(&layout);
- if (err)
+ ret = validate_sb_layout(&layout, &err);
+ if (ret)
goto err;
for (i = layout.sb_offset;
i < layout.sb_offset + layout.nr_superblocks; i++) {
offset = le64_to_cpu(*i);
- if (offset == opt_get(*opts, sb))
+ if (offset == opt_get(*opts, sb)) {
+ ret = -BCH_ERR_invalid;
continue;
+ }
- err = read_one_super(sb, offset);
- if (!err)
+ ret = read_one_super(sb, offset, &err);
+ if (!ret)
goto got_super;
}
- ret = -EINVAL;
goto err;
got_super:
- err = "Superblock block size smaller than device block size";
- ret = -EINVAL;
if (le16_to_cpu(sb->sb->block_size) << 9 <
- bdev_logical_block_size(sb->bdev))
+ bdev_logical_block_size(sb->bdev) &&
+ opt_get(*opts, direct_io)) {
+#ifndef __KERNEL__
+ opt_set(*opts, direct_io, false);
+ bch2_free_super(sb);
+ goto retry;
+#endif
+ prt_printf(&err, "block size (%u) smaller than device block size (%u)",
+ le16_to_cpu(sb->sb->block_size) << 9,
+ bdev_logical_block_size(sb->bdev));
+ ret = -BCH_ERR_block_size_too_small;
goto err;
+ }
- if (sb->mode & FMODE_WRITE)
- bdev_get_queue(sb->bdev)->backing_dev_info->capabilities
- |= BDI_CAP_STABLE_WRITES;
- ret = 0;
sb->have_layout = true;
+
+ ret = bch2_sb_validate(sb, 0, &err);
+ if (ret) {
+ bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
+ path, err.buf);
+ goto err_no_print;
+ }
out:
- pr_verbose_init(*opts, "ret %i", ret);
+ printbuf_exit(&err);
return ret;
err:
+ bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
+ path, err.buf);
+err_no_print:
bch2_free_super(sb);
- pr_err("error reading superblock: %s", err);
goto out;
}
+int bch2_read_super(const char *path, struct bch_opts *opts,
+ struct bch_sb_handle *sb)
+{
+ return __bch2_read_super(path, opts, sb, false);
+}
+
+/* provide a silenced version for mount.bcachefs */
+
+int bch2_read_super_silent(const char *path, struct bch_opts *opts,
+ struct bch_sb_handle *sb)
+{
+ return __bch2_read_super(path, opts, sb, true);
+}
+
/* write superblock: */
static void write_super_endio(struct bio *bio)
@@ -606,13 +875,38 @@ static void write_super_endio(struct bio *bio)
/* XXX: return errors directly */
- if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write"))
+ if (bch2_dev_io_err_on(bio->bi_status, ca,
+ bio_data_dir(bio)
+ ? BCH_MEMBER_ERROR_write
+ : BCH_MEMBER_ERROR_read,
+ "superblock %s error: %s",
+ str_write_read(bio_data_dir(bio)),
+ bch2_blk_status_to_str(bio->bi_status)))
ca->sb_write_error = 1;
closure_put(&ca->fs->sb_write);
percpu_ref_put(&ca->io_ref);
}
+static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct bch_sb *sb = ca->disk_sb.sb;
+ struct bio *bio = ca->disk_sb.bio;
+
+ memset(ca->sb_read_scratch, 0, BCH_SB_READ_SCRATCH_BUF_SIZE);
+
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
+ bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]);
+ bio->bi_end_io = write_super_endio;
+ bio->bi_private = ca;
+ bch2_bio_map(bio, ca->sb_read_scratch, BCH_SB_READ_SCRATCH_BUF_SIZE);
+
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio));
+
+ percpu_ref_get(&ca->io_ref);
+ closure_bio_submit(bio, &c->sb_write);
+}
+
static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
{
struct bch_sb *sb = ca->disk_sb.sb;
@@ -620,91 +914,191 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
sb->offset = sb->layout.sb_offset[idx];
- SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
+ SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false));
sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
null_nonce(), sb);
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = le64_to_cpu(sb->offset);
- bio->bi_iter.bi_size =
- roundup((size_t) vstruct_bytes(sb),
- bdev_logical_block_size(ca->disk_sb.bdev));
bio->bi_end_io = write_super_endio;
bio->bi_private = ca;
- bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
- bch2_bio_map(bio, sb);
+ bch2_bio_map(bio, sb,
+ roundup((size_t) vstruct_bytes(sb),
+ bdev_logical_block_size(ca->disk_sb.bdev)));
- this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_SB],
+ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb],
bio_sectors(bio));
percpu_ref_get(&ca->io_ref);
closure_bio_submit(bio, &c->sb_write);
}
-void bch2_write_super(struct bch_fs *c)
+int bch2_write_super(struct bch_fs *c)
{
struct closure *cl = &c->sb_write;
- struct bch_dev *ca;
- unsigned i, sb = 0, nr_wrote;
- const char *err;
+ struct printbuf err = PRINTBUF;
+ unsigned sb = 0, nr_wrote;
struct bch_devs_mask sb_written;
bool wrote, can_mount_without_written, can_mount_with_written;
+ unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
+ DARRAY(struct bch_dev *) online_devices = {};
+ int ret = 0;
+
+ trace_and_count(c, write_super, c, _RET_IP_);
+
+ if (c->opts.very_degraded)
+ degraded_flags |= BCH_FORCE_IF_LOST;
lockdep_assert_held(&c->sb_lock);
closure_init_stack(cl);
memset(&sb_written, 0, sizeof(sb_written));
+ for_each_online_member(c, ca) {
+ ret = darray_push(&online_devices, ca);
+ if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) {
+ percpu_ref_put(&ca->io_ref);
+ goto out;
+ }
+ percpu_ref_get(&ca->io_ref);
+ }
+
+ /* Make sure we're using the new magic numbers: */
+ c->disk_sb.sb->magic = BCHFS_MAGIC;
+ c->disk_sb.sb->layout.magic = BCHFS_MAGIC;
+
le64_add_cpu(&c->disk_sb.sb->seq, 1);
- for_each_online_member(ca, c, i)
- bch2_sb_from_fs(c, ca);
+ struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ darray_for_each(online_devices, ca)
+ __bch2_members_v2_get_mut(mi, (*ca)->dev_idx)->seq = c->disk_sb.sb->seq;
+ c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds());
+
+ if (test_bit(BCH_FS_error, &c->flags))
+ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1);
+ if (test_bit(BCH_FS_topology_error, &c->flags))
+ SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1);
+
+ SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
+
+ bch2_sb_counters_from_cpu(c);
+ bch2_sb_members_from_cpu(c);
+ bch2_sb_members_cpy_v2_v1(&c->disk_sb);
+ bch2_sb_errors_from_cpu(c);
+ bch2_sb_downgrade_update(c);
- for_each_online_member(ca, c, i) {
- err = bch2_sb_validate(&ca->disk_sb);
- if (err) {
- bch2_fs_inconsistent(c, "sb invalid before write: %s", err);
+ darray_for_each(online_devices, ca)
+ bch2_sb_from_fs(c, (*ca));
+
+ darray_for_each(online_devices, ca) {
+ printbuf_reset(&err);
+
+ ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err);
+ if (ret) {
+ bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
goto out;
}
}
- if (c->opts.nochanges ||
- test_bit(BCH_FS_ERROR, &c->flags))
+ if (c->opts.nochanges)
+ goto out;
+
+ /*
+ * Defer writing the superblock until filesystem initialization is
+ * complete - don't write out a partly initialized superblock:
+ */
+ if (!BCH_SB_INITIALIZED(c->disk_sb.sb))
goto out;
- for_each_online_member(ca, c, i) {
- __set_bit(ca->dev_idx, sb_written.d);
- ca->sb_write_error = 0;
+ if (le16_to_cpu(c->disk_sb.sb->version) > bcachefs_metadata_version_current) {
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "attempting to write superblock that wasn't version downgraded (");
+ bch2_version_to_text(&buf, le16_to_cpu(c->disk_sb.sb->version));
+ prt_str(&buf, " > ");
+ bch2_version_to_text(&buf, bcachefs_metadata_version_current);
+ prt_str(&buf, ")");
+ bch2_fs_fatal_error(c, ": %s", buf.buf);
+ printbuf_exit(&buf);
+ return -BCH_ERR_sb_not_downgraded;
+ }
+
+ darray_for_each(online_devices, ca) {
+ __set_bit((*ca)->dev_idx, sb_written.d);
+ (*ca)->sb_write_error = 0;
}
+ darray_for_each(online_devices, ca)
+ read_back_super(c, *ca);
+ closure_sync(cl);
+
+ darray_for_each(online_devices, cap) {
+ struct bch_dev *ca = *cap;
+
+ if (ca->sb_write_error)
+ continue;
+
+ if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) {
+ struct printbuf buf = PRINTBUF;
+ prt_char(&buf, ' ');
+ prt_bdevname(&buf, ca->disk_sb.bdev);
+ prt_printf(&buf,
+ ": Superblock write was silently dropped! (seq %llu expected %llu)",
+ le64_to_cpu(ca->sb_read_scratch->seq),
+ ca->disk_sb.seq);
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ ret = -BCH_ERR_erofs_sb_err;
+ }
+
+ if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
+ struct printbuf buf = PRINTBUF;
+ prt_char(&buf, ' ');
+ prt_bdevname(&buf, ca->disk_sb.bdev);
+ prt_printf(&buf,
+ ": Superblock modified by another process (seq %llu expected %llu)",
+ le64_to_cpu(ca->sb_read_scratch->seq),
+ ca->disk_sb.seq);
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ ret = -BCH_ERR_erofs_sb_err;
+ }
+ }
+
+ if (ret)
+ goto out;
+
do {
wrote = false;
- for_each_online_member(ca, c, i)
- if (sb < ca->disk_sb.sb->layout.nr_superblocks) {
+ darray_for_each(online_devices, cap) {
+ struct bch_dev *ca = *cap;
+ if (!ca->sb_write_error &&
+ sb < ca->disk_sb.sb->layout.nr_superblocks) {
write_one_super(c, ca, sb);
wrote = true;
}
+ }
closure_sync(cl);
sb++;
} while (wrote);
- for_each_online_member(ca, c, i)
+ darray_for_each(online_devices, cap) {
+ struct bch_dev *ca = *cap;
if (ca->sb_write_error)
__clear_bit(ca->dev_idx, sb_written.d);
+ else
+ ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq);
+ }
nr_wrote = dev_mask_nr(&sb_written);
can_mount_with_written =
- bch2_have_enough_devs(__bch2_replicas_status(c, sb_written),
- BCH_FORCE_IF_DEGRADED);
+ bch2_have_enough_devs(c, sb_written, degraded_flags, false);
- for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(sb_written.d); i++)
sb_written.d[i] = ~sb_written.d[i];
can_mount_without_written =
- bch2_have_enough_devs(__bch2_replicas_status(c, sb_written),
- BCH_FORCE_IF_DEGRADED);
+ bch2_have_enough_devs(c, sb_written, degraded_flags, false);
/*
* If we would be able to mount _without_ the devices we successfully
@@ -714,255 +1108,314 @@ void bch2_write_super(struct bch_fs *c)
* written anything (new filesystem), we continue if we'd be able to
* mount with the devices we did successfully write to:
*/
- bch2_fs_fatal_err_on(!nr_wrote ||
- (can_mount_without_written &&
- !can_mount_with_written), c,
- "Unable to write superblock to sufficient devices");
+ if (bch2_fs_fatal_err_on(!nr_wrote ||
+ !can_mount_with_written ||
+ (can_mount_without_written &&
+ !can_mount_with_written), c,
+ ": Unable to write superblock to sufficient devices (from %ps)",
+ (void *) _RET_IP_))
+ ret = -1;
out:
/* Make new options visible after they're persistent: */
bch2_sb_update(c);
+ darray_for_each(online_devices, ca)
+ percpu_ref_put(&(*ca)->io_ref);
+ darray_exit(&online_devices);
+ printbuf_exit(&err);
+ return ret;
}
-/* BCH_SB_FIELD_journal: */
-
-static int u64_cmp(const void *_l, const void *_r)
+void __bch2_check_set_feature(struct bch_fs *c, unsigned feat)
{
- u64 l = *((const u64 *) _l), r = *((const u64 *) _r);
+ mutex_lock(&c->sb_lock);
+ if (!(c->sb.features & (1ULL << feat))) {
+ c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat);
- return l < r ? -1 : l > r ? 1 : 0;
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
}
-static const char *bch2_sb_validate_journal(struct bch_sb *sb,
- struct bch_sb_field *f)
+/* Downgrade if superblock is at a higher version than currently supported: */
+bool bch2_check_version_downgrade(struct bch_fs *c)
{
- struct bch_sb_field_journal *journal = field_to_type(f, journal);
- struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
- const char *err;
- unsigned nr;
- unsigned i;
- u64 *b;
-
- journal = bch2_sb_get_journal(sb);
- if (!journal)
- return NULL;
-
- nr = bch2_nr_journal_buckets(journal);
- if (!nr)
- return NULL;
-
- b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL);
- if (!b)
- return "cannot allocate memory";
+ bool ret = bcachefs_metadata_version_current < c->sb.version;
- for (i = 0; i < nr; i++)
- b[i] = le64_to_cpu(journal->buckets[i]);
-
- sort(b, nr, sizeof(u64), u64_cmp, NULL);
-
- err = "journal bucket at sector 0";
- if (!b[0])
- goto err;
+ lockdep_assert_held(&c->sb_lock);
- err = "journal bucket before first bucket";
- if (m && b[0] < le16_to_cpu(m->first_bucket))
- goto err;
+ /*
+ * Downgrade, if superblock is at a higher version than currently
+ * supported:
+ *
+ * c->sb will be checked before we write the superblock, so update it as
+ * well:
+ */
+ if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current)
+ SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
+ if (c->sb.version > bcachefs_metadata_version_current)
+ c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
+ if (c->sb.version_min > bcachefs_metadata_version_current)
+ c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current);
+ c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1);
+ return ret;
+}
- err = "journal bucket past end of device";
- if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets))
- goto err;
+void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version)
+{
+ lockdep_assert_held(&c->sb_lock);
- err = "duplicate journal buckets";
- for (i = 0; i + 1 < nr; i++)
- if (b[i] == b[i + 1])
- goto err;
+ if (BCH_VERSION_MAJOR(new_version) >
+ BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
+ bch2_sb_field_resize(&c->disk_sb, downgrade, 0);
- err = NULL;
-err:
- kfree(b);
- return err;
+ c->disk_sb.sb->version = cpu_to_le16(new_version);
+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
}
-static const struct bch_sb_field_ops bch_sb_field_ops_journal = {
- .validate = bch2_sb_validate_journal,
-};
-
-/* BCH_SB_FIELD_members: */
-
-static const char *bch2_sb_validate_members(struct bch_sb *sb,
- struct bch_sb_field *f)
+static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
- struct bch_sb_field_members *mi = field_to_type(f, members);
- struct bch_member *m;
+ if (vstruct_bytes(f) < 88) {
+ prt_printf(err, "field too small (%zu < %u)", vstruct_bytes(f), 88);
+ return -BCH_ERR_invalid_sb_ext;
+ }
- if ((void *) (mi->members + sb->nr_devices) >
- vstruct_end(&mi->field))
- return "Invalid superblock: bad member info";
+ return 0;
+}
- for (m = mi->members;
- m < mi->members + sb->nr_devices;
- m++) {
- if (!bch2_member_exists(m))
- continue;
+static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_ext *e = field_to_type(f, ext);
- if (le64_to_cpu(m->nbuckets) > LONG_MAX)
- return "Too many buckets";
+ prt_printf(out, "Recovery passes required:\t");
+ prt_bitflags(out, bch2_recovery_passes,
+ bch2_recovery_passes_from_stable(le64_to_cpu(e->recovery_passes_required[0])));
+ prt_newline(out);
- if (le64_to_cpu(m->nbuckets) -
- le16_to_cpu(m->first_bucket) < 1 << 10)
- return "Not enough buckets";
+ unsigned long *errors_silent = kmalloc(sizeof(e->errors_silent), GFP_KERNEL);
+ if (errors_silent) {
+ le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8);
- if (le16_to_cpu(m->bucket_size) <
- le16_to_cpu(sb->block_size))
- return "bucket size smaller than block size";
+ prt_printf(out, "Errors to silently fix:\t");
+ prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent,
+ min(BCH_FSCK_ERR_MAX, sizeof(e->errors_silent) * 8));
+ prt_newline(out);
- if (le16_to_cpu(m->bucket_size) <
- BCH_SB_BTREE_NODE_SIZE(sb))
- return "bucket size smaller than btree node size";
+ kfree(errors_silent);
}
- if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX)
- for (m = mi->members;
- m < mi->members + sb->nr_devices;
- m++)
- SET_BCH_MEMBER_DATA_ALLOWED(m, ~0);
-
- return NULL;
+ prt_printf(out, "Btrees with missing data:\t");
+ prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data));
+ prt_newline(out);
}
-static const struct bch_sb_field_ops bch_sb_field_ops_members = {
- .validate = bch2_sb_validate_members,
+static const struct bch_sb_field_ops bch_sb_field_ops_ext = {
+ .validate = bch2_sb_ext_validate,
+ .to_text = bch2_sb_ext_to_text,
};
-/* BCH_SB_FIELD_crypt: */
-
-static const char *bch2_sb_validate_crypt(struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
-
- if (vstruct_bytes(&crypt->field) != sizeof(*crypt))
- return "invalid field crypt: wrong size";
+static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
+#define x(f, nr) \
+ [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
+ BCH_SB_FIELDS()
+#undef x
+};
- if (BCH_CRYPT_KDF_TYPE(crypt))
- return "invalid field crypt: bad kdf type";
+static const struct bch_sb_field_ops bch2_sb_field_null_ops;
- return NULL;
+static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type)
+{
+ return likely(type < ARRAY_SIZE(bch2_sb_field_ops))
+ ? bch2_sb_field_ops[type]
+ : &bch2_sb_field_null_ops;
}
-static const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
- .validate = bch2_sb_validate_crypt,
-};
-
-/* BCH_SB_FIELD_clean: */
-
-void bch2_fs_mark_clean(struct bch_fs *c, bool clean)
+static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
{
- struct bch_sb_field_clean *sb_clean;
- unsigned u64s = sizeof(*sb_clean) / sizeof(u64);
- struct jset_entry *entry;
- struct btree_root *r;
+ unsigned type = le32_to_cpu(f->type);
+ struct printbuf field_err = PRINTBUF;
+ const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type);
+ int ret;
- mutex_lock(&c->sb_lock);
- if (clean == BCH_SB_CLEAN(c->disk_sb.sb))
- goto out;
+ ret = ops->validate ? ops->validate(sb, f, flags, &field_err) : 0;
+ if (ret) {
+ prt_printf(err, "Invalid superblock section %s: %s",
+ bch2_sb_fields[type], field_err.buf);
+ prt_newline(err);
+ bch2_sb_field_to_text(err, sb, f);
+ }
- SET_BCH_SB_CLEAN(c->disk_sb.sb, clean);
+ printbuf_exit(&field_err);
+ return ret;
+}
- if (!clean)
- goto write_super;
+void __bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ unsigned type = le32_to_cpu(f->type);
+ const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type);
- mutex_lock(&c->btree_root_lock);
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);
- for (r = c->btree_roots;
- r < c->btree_roots + BTREE_ID_NR;
- r++)
- if (r->alive)
- u64s += jset_u64s(r->key.u64s);
+ if (ops->to_text)
+ ops->to_text(out, sb, f);
+}
- sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s);
- if (!sb_clean) {
- bch_err(c, "error resizing superblock while setting filesystem clean");
- goto out;
- }
+void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ unsigned type = le32_to_cpu(f->type);
- sb_clean->flags = 0;
- sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
- sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
- sb_clean->journal_seq = journal_cur_seq(&c->journal) - 1;
-
- entry = sb_clean->start;
- memset(entry, 0,
- vstruct_end(&sb_clean->field) - (void *) entry);
-
- for (r = c->btree_roots;
- r < c->btree_roots + BTREE_ID_NR;
- r++)
- if (r->alive) {
- entry->u64s = r->key.u64s;
- entry->btree_id = r - c->btree_roots;
- entry->level = r->level;
- entry->type = BCH_JSET_ENTRY_btree_root;
- bkey_copy(&entry->start[0], &r->key);
- entry = vstruct_next(entry);
- BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
- }
+ if (type < BCH_SB_FIELD_NR)
+ prt_printf(out, "%s", bch2_sb_fields[type]);
+ else
+ prt_printf(out, "(unknown field %u)", type);
- BUG_ON(entry != vstruct_end(&sb_clean->field));
+ prt_printf(out, " (size %zu):", vstruct_bytes(f));
+ prt_newline(out);
- mutex_unlock(&c->btree_root_lock);
-write_super:
- bch2_write_super(c);
-out:
- mutex_unlock(&c->sb_lock);
+ __bch2_sb_field_to_text(out, sb, f);
}
-static const char *bch2_sb_validate_clean(struct bch_sb *sb,
- struct bch_sb_field *f)
+void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
{
- struct bch_sb_field_clean *clean = field_to_type(f, clean);
+ unsigned i;
- if (vstruct_bytes(&clean->field) < sizeof(*clean))
- return "invalid field crypt: wrong size";
+ prt_printf(out, "Type: %u", l->layout_type);
+ prt_newline(out);
- return NULL;
-}
+ prt_str(out, "Superblock max size: ");
+ prt_units_u64(out, 512 << l->sb_max_size_bits);
+ prt_newline(out);
-static const struct bch_sb_field_ops bch_sb_field_ops_clean = {
- .validate = bch2_sb_validate_clean,
-};
+ prt_printf(out, "Nr superblocks: %u", l->nr_superblocks);
+ prt_newline(out);
-static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
-#define x(f, nr) \
- [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
- BCH_SB_FIELDS()
-#undef x
-};
+ prt_str(out, "Offsets: ");
+ for (i = 0; i < l->nr_superblocks; i++) {
+ if (i)
+ prt_str(out, ", ");
+ prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i]));
+ }
+ prt_newline(out);
+}
-static const char *bch2_sb_field_validate(struct bch_sb *sb,
- struct bch_sb_field *f)
+void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
+ bool print_layout, unsigned fields)
{
- unsigned type = le32_to_cpu(f->type);
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 44);
+
+ prt_printf(out, "External UUID:\t");
+ pr_uuid(out, sb->user_uuid.b);
+ prt_newline(out);
+
+ prt_printf(out, "Internal UUID:\t");
+ pr_uuid(out, sb->uuid.b);
+ prt_newline(out);
+
+ prt_printf(out, "Magic number:\t");
+ pr_uuid(out, sb->magic.b);
+ prt_newline(out);
+
+ prt_printf(out, "Device index:\t%u\n", sb->dev_idx);
+
+ prt_printf(out, "Label:\t");
+ if (!strlen(sb->label))
+ prt_printf(out, "(none)");
+ else
+ prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label);
+ prt_newline(out);
+
+ prt_printf(out, "Version:\t");
+ bch2_version_to_text(out, le16_to_cpu(sb->version));
+ prt_newline(out);
+
+ prt_printf(out, "Version upgrade complete:\t");
+ bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb));
+ prt_newline(out);
+
+ prt_printf(out, "Oldest version on disk:\t");
+ bch2_version_to_text(out, le16_to_cpu(sb->version_min));
+ prt_newline(out);
+
+ prt_printf(out, "Created:\t");
+ if (sb->time_base_lo)
+ bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
+ else
+ prt_printf(out, "(not set)");
+ prt_newline(out);
+
+ prt_printf(out, "Sequence number:\t");
+ prt_printf(out, "%llu", le64_to_cpu(sb->seq));
+ prt_newline(out);
+
+ prt_printf(out, "Time of last write:\t");
+ bch2_prt_datetime(out, le64_to_cpu(sb->write_time));
+ prt_newline(out);
+
+ prt_printf(out, "Superblock size:\t");
+ prt_units_u64(out, vstruct_bytes(sb));
+ prt_str(out, "/");
+ prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits);
+ prt_newline(out);
+
+ prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb));
+ prt_printf(out, "Devices:\t%u\n", bch2_sb_nr_devices(sb));
+
+ prt_printf(out, "Sections:\t");
+ u64 fields_have = 0;
+ vstruct_for_each(sb, f)
+ fields_have |= 1 << le32_to_cpu(f->type);
+ prt_bitflags(out, bch2_sb_fields, fields_have);
+ prt_newline(out);
+
+ prt_printf(out, "Features:\t");
+ prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0]));
+ prt_newline(out);
+
+ prt_printf(out, "Compat features:\t");
+ prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0]));
+ prt_newline(out);
+
+ prt_newline(out);
+ prt_printf(out, "Options:");
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+ {
+ enum bch_opt_id id;
+
+ for (id = 0; id < bch2_opts_nr; id++) {
+ const struct bch_option *opt = bch2_opt_table + id;
+
+ if (opt->get_sb != BCH2_NO_SB_OPT) {
+ u64 v = bch2_opt_from_sb(sb, id);
+
+ prt_printf(out, "%s:\t", opt->attr.name);
+ bch2_opt_to_text(out, NULL, sb, opt, v,
+ OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST);
+ prt_newline(out);
+ }
+ }
+ }
- return type < BCH_SB_FIELD_NR
- ? bch2_sb_field_ops[type]->validate(sb, f)
- : NULL;
-}
+ printbuf_indent_sub(out, 2);
-size_t bch2_sb_field_to_text(char *buf, size_t size,
- struct bch_sb *sb, struct bch_sb_field *f)
-{
- unsigned type = le32_to_cpu(f->type);
- size_t (*to_text)(char *, size_t, struct bch_sb *,
- struct bch_sb_field *) =
- type < BCH_SB_FIELD_NR
- ? bch2_sb_field_ops[type]->to_text
- : NULL;
-
- if (!to_text) {
- if (size)
- buf[0] = '\0';
- return 0;
+ if (print_layout) {
+ prt_newline(out);
+ prt_printf(out, "layout:");
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+ bch2_sb_layout_to_text(out, &sb->layout);
+ printbuf_indent_sub(out, 2);
}
- return to_text(buf, size, sb, f);
+ vstruct_for_each(sb, f)
+ if (fields & (1 << le32_to_cpu(f->type))) {
+ prt_newline(out);
+ bch2_sb_field_to_text(out, sb, f);
+ }
}
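The rewritten super-io.c dispatches every superblock field through a single ops table: the field type indexes bch2_sb_field_ops[], and unknown or future field types fall back to the empty bch2_sb_field_null_ops, so callers test the individual callbacks instead of bounds-checking the type at every call site. A minimal standalone sketch of that pattern, using hypothetical demo_* names rather than the real bcachefs types:

#include <stdio.h>

struct demo_ops {
	int  (*validate)(const void *field);
	void (*to_text)(const void *field);
};

static int members_validate(const void *f)  { (void) f; return 0; }
static void members_to_text(const void *f)  { (void) f; printf("members\n"); }

static const struct demo_ops demo_field_ops[] = {
	[0] = { .validate = members_validate, .to_text = members_to_text },
};

static const struct demo_ops demo_null_ops;	/* all callbacks NULL */

static const struct demo_ops *demo_type_ops(unsigned type)
{
	/* unknown/future field types get the null ops, never an out-of-bounds read */
	return type < sizeof(demo_field_ops) / sizeof(demo_field_ops[0])
		? &demo_field_ops[type]
		: &demo_null_ops;
}

int main(void)
{
	const struct demo_ops *ops = demo_type_ops(7);	/* unknown type */

	if (ops->to_text)	/* NULL for unknown types, so nothing is printed */
		ops->to_text(NULL);
	return 0;
}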
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index 7d09d8e4..90e7b176 100644
--- a/libbcachefs/super-io.h
+++ b/libbcachefs/super-io.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_SUPER_IO_H
#define _BCACHEFS_SUPER_IO_H
@@ -5,65 +6,62 @@
#include "eytzinger.h"
#include "super_types.h"
#include "super.h"
+#include "sb-members.h"
#include <asm/byteorder.h>
-struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type);
-struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *,
- enum bch_sb_field_type, unsigned);
+#define BCH_SB_READ_SCRATCH_BUF_SIZE 4096
-#define field_to_type(_f, _name) \
- container_of_or_null(_f, struct bch_sb_field_##_name, field)
+static inline bool bch2_version_compatible(u16 version)
+{
+ return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) &&
+ version >= bcachefs_metadata_version_min;
+}
+
+void bch2_version_to_text(struct printbuf *, unsigned);
+unsigned bch2_latest_compatible_version(unsigned);
-#define x(_name, _nr) \
-static inline struct bch_sb_field_##_name * \
-bch2_sb_get_##_name(struct bch_sb *sb) \
-{ \
- return field_to_type(bch2_sb_field_get(sb, \
- BCH_SB_FIELD_##_name), _name); \
-} \
- \
-static inline struct bch_sb_field_##_name * \
-bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s) \
-{ \
- return field_to_type(bch2_sb_field_resize(sb, \
- BCH_SB_FIELD_##_name, u64s), _name); \
+static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f)
+{
+ return le32_to_cpu(f->u64s) * sizeof(u64);
}
-BCH_SB_FIELDS()
-#undef x
+#define field_to_type(_f, _name) \
+ container_of_or_null(_f, struct bch_sb_field_##_name, field)
-extern const char * const bch2_sb_fields[];
+struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *, enum bch_sb_field_type);
+#define bch2_sb_field_get(_sb, _name) \
+ field_to_type(bch2_sb_field_get_id(_sb, BCH_SB_FIELD_##_name), _name)
-struct bch_sb_field_ops {
- const char * (*validate)(struct bch_sb *, struct bch_sb_field *);
- size_t (*to_text)(char *, size_t, struct bch_sb *,
- struct bch_sb_field *);
-};
+struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *,
+ enum bch_sb_field_type, unsigned);
+#define bch2_sb_field_resize(_sb, _name, _u64s) \
+ field_to_type(bch2_sb_field_resize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name)
-static inline bool bch2_sb_test_feature(struct bch_sb *sb,
- enum bch_sb_features f)
-{
- unsigned w = f / 64;
- unsigned b = f % 64;
+struct bch_sb_field *bch2_sb_field_get_minsize_id(struct bch_sb_handle *,
+ enum bch_sb_field_type, unsigned);
+#define bch2_sb_field_get_minsize(_sb, _name, _u64s) \
+ field_to_type(bch2_sb_field_get_minsize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name)
- return le64_to_cpu(sb->features[w]) & (1ULL << b);
-}
+#define bch2_sb_field_nr_entries(_f) \
+ (_f ? ((bch2_sb_field_bytes(&_f->field) - sizeof(*_f)) / \
+ sizeof(_f->entries[0])) \
+ : 0)
-static inline void bch2_sb_set_feature(struct bch_sb *sb,
- enum bch_sb_features f)
-{
- if (!bch2_sb_test_feature(sb, f)) {
- unsigned w = f / 64;
- unsigned b = f % 64;
+void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type);
- le64_add_cpu(&sb->features[w], 1ULL << b);
- }
-}
+extern const char * const bch2_sb_fields[];
+
+struct bch_sb_field_ops {
+ int (*validate)(struct bch_sb *, struct bch_sb_field *,
+ enum bch_validate_flags, struct printbuf *);
+ void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *);
+};
static inline __le64 bch2_sb_magic(struct bch_fs *c)
{
__le64 ret;
+
memcpy(&ret, &c->sb.uuid, sizeof(ret));
return ret;
}
@@ -84,58 +82,25 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
void bch2_free_super(struct bch_sb_handle *);
int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
-const char *bch2_sb_validate(struct bch_sb_handle *);
-
int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
-void bch2_write_super(struct bch_fs *);
-
-/* BCH_SB_FIELD_journal: */
-
-static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
-{
- return j
- ? (__le64 *) vstruct_end(&j->field) - j->buckets
- : 0;
-}
+int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
+int bch2_write_super(struct bch_fs *);
+void __bch2_check_set_feature(struct bch_fs *, unsigned);
-/* BCH_SB_FIELD_members: */
-
-static inline bool bch2_member_exists(struct bch_member *m)
-{
- return !bch2_is_zero(m->uuid.b, sizeof(uuid_le));
-}
-
-static inline bool bch2_dev_exists(struct bch_sb *sb,
- struct bch_sb_field_members *mi,
- unsigned dev)
+static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
{
- return dev < sb->nr_devices &&
- bch2_member_exists(&mi->members[dev]);
+ if (!(c->sb.features & (1ULL << feat)))
+ __bch2_check_set_feature(c, feat);
}
-static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
-{
- return (struct bch_member_cpu) {
- .nbuckets = le64_to_cpu(mi->nbuckets),
- .first_bucket = le16_to_cpu(mi->first_bucket),
- .bucket_size = le16_to_cpu(mi->bucket_size),
- .group = BCH_MEMBER_GROUP(mi),
- .state = BCH_MEMBER_STATE(mi),
- .replacement = BCH_MEMBER_REPLACEMENT(mi),
- .discard = BCH_MEMBER_DISCARD(mi),
- .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi),
- .durability = BCH_MEMBER_DURABILITY(mi)
- ? BCH_MEMBER_DURABILITY(mi) - 1
- : 1,
- .valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)),
- };
-}
-
-/* BCH_SB_FIELD_clean: */
-
-void bch2_fs_mark_clean(struct bch_fs *, bool);
+bool bch2_check_version_downgrade(struct bch_fs *);
+void bch2_sb_upgrade(struct bch_fs *, unsigned);
-size_t bch2_sb_field_to_text(char *, size_t, struct bch_sb *,
+void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
struct bch_sb_field *);
+void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
+ struct bch_sb_field *);
+void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *);
+void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned);
#endif /* _BCACHEFS_SUPER_IO_H */
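The header now exposes one generic lookup per operation (bch2_sb_field_get_id(), bch2_sb_field_resize_id(), bch2_sb_field_get_minsize_id()) and wraps each in a typed macro built on field_to_type(), replacing the x-macro-generated bch2_sb_get_##name()/bch2_sb_resize_##name() helpers. A self-contained sketch of that idiom, with hypothetical field_* names standing in for the bcachefs types (note the real container_of_or_null() also avoids evaluating its argument twice):

#include <stddef.h>
#include <stdio.h>

struct field_hdr { unsigned type; };
struct field_foo { struct field_hdr hdr; int foo_val; };

static struct field_foo foo_storage = { .hdr = { .type = 1 }, .foo_val = 42 };

/* generic, untyped lookup keyed by the field type id */
static struct field_hdr *field_get_id(unsigned type)
{
	return type == 1 ? &foo_storage.hdr : NULL;
}

/* NULL-safe cast from the embedded header to the per-field struct */
#define field_to_type(_f, _name)					\
	((_f) ? (struct field_##_name *)((char *)(_f) -		\
		offsetof(struct field_##_name, hdr)) : NULL)

/* typed wrapper a caller would use, mirroring bch2_sb_field_get(_sb, _name) */
#define field_get(_name, _type_id)					\
	field_to_type(field_get_id(_type_id), _name)

int main(void)
{
	struct field_foo *foo = field_get(foo, 1);

	printf("%d\n", foo ? foo->foo_val : -1);	/* prints 42 */
	return 0;
}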
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index a2a32b92..14157820 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* bcachefs setup/teardown code, and some metadata io - read a superblock and
* figure out what to do with it.
@@ -7,63 +8,156 @@
*/
#include "bcachefs.h"
-#include "alloc.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "bkey_sort.h"
#include "btree_cache.h"
#include "btree_gc.h"
+#include "btree_journal_iter.h"
+#include "btree_key_cache.h"
+#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "btree_io.h"
+#include "btree_write_buffer.h"
+#include "buckets_waiting_for_journal.h"
#include "chardev.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "debug.h"
+#include "disk_accounting.h"
#include "disk_groups.h"
+#include "ec.h"
+#include "errcode.h"
#include "error.h"
#include "fs.h"
#include "fs-io.h"
+#include "fs-io-buffered.h"
+#include "fs-io-direct.h"
#include "fsck.h"
#include "inode.h"
-#include "io.h"
+#include "io_read.h"
+#include "io_write.h"
#include "journal.h"
#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
#include "move.h"
#include "migrate.h"
#include "movinggc.h"
+#include "nocow_locking.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
#include "replicas.h"
+#include "sb-clean.h"
+#include "sb-counters.h"
+#include "sb-errors.h"
+#include "sb-members.h"
+#include "snapshot.h"
+#include "subvolume.h"
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
+#include "thread_with_file.h"
+#include "trace.h"
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/debugfs.h>
#include <linux/device.h>
-#include <linux/genhd.h>
#include <linux/idr.h>
-#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/random.h>
#include <linux/sysfs.h>
#include <crypto/hash.h>
-#include <trace/events/bcachefs.h>
-
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+MODULE_DESCRIPTION("bcachefs filesystem");
+MODULE_SOFTDEP("pre: crc32c");
+MODULE_SOFTDEP("pre: crc64");
+MODULE_SOFTDEP("pre: sha256");
+MODULE_SOFTDEP("pre: chacha20");
+MODULE_SOFTDEP("pre: poly1305");
+MODULE_SOFTDEP("pre: xxhash");
+
+const char * const bch2_fs_flag_strs[] = {
+#define x(n) #n,
+ BCH_FS_FLAGS()
+#undef x
+ NULL
+};
+
+void bch2_print_str(struct bch_fs *c, const char *str)
+{
+#ifdef __KERNEL__
+ struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
+
+ if (unlikely(stdio)) {
+ bch2_stdio_redirect_printf(stdio, true, "%s", str);
+ return;
+ }
+#endif
+ bch2_print_string_as_lines(KERN_ERR, str);
+}
+
+__printf(2, 0)
+static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
+{
+#ifdef __KERNEL__
+ if (unlikely(stdio)) {
+ if (fmt[0] == KERN_SOH[0])
+ fmt += 2;
+
+ bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
+ return;
+ }
+#endif
+ vprintk(fmt, args);
+}
+
+void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)
+{
+ struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio;
+
+ va_list args;
+ va_start(args, fmt);
+ bch2_print_maybe_redirect(stdio, fmt, args);
+ va_end(args);
+}
+
+void __bch2_print(struct bch_fs *c, const char *fmt, ...)
+{
+ struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
+
+ va_list args;
+ va_start(args, fmt);
+ bch2_print_maybe_redirect(stdio, fmt, args);
+ va_end(args);
+}
#define KTYPE(type) \
-struct kobj_type type ## _ktype = { \
+static const struct attribute_group type ## _group = { \
+ .attrs = type ## _files \
+}; \
+ \
+static const struct attribute_group *type ## _groups[] = { \
+ &type ## _group, \
+ NULL \
+}; \
+ \
+static const struct kobj_type type ## _ktype = { \
.release = type ## _release, \
.sysfs_ops = &type ## _sysfs_ops, \
- .default_attrs = type ## _files \
+ .default_groups = type ## _groups \
}
static void bch2_fs_release(struct kobject *);
static void bch2_dev_release(struct kobject *);
+static void bch2_fs_counters_release(struct kobject *k)
+{
+}
static void bch2_fs_internal_release(struct kobject *k)
{
@@ -77,35 +171,35 @@ static void bch2_fs_time_stats_release(struct kobject *k)
{
}
-static KTYPE(bch2_fs);
-static KTYPE(bch2_fs_internal);
-static KTYPE(bch2_fs_opts_dir);
-static KTYPE(bch2_fs_time_stats);
-static KTYPE(bch2_dev);
+KTYPE(bch2_fs);
+KTYPE(bch2_fs_counters);
+KTYPE(bch2_fs_internal);
+KTYPE(bch2_fs_opts_dir);
+KTYPE(bch2_fs_time_stats);
+KTYPE(bch2_dev);
static struct kset *bcachefs_kset;
static LIST_HEAD(bch_fs_list);
static DEFINE_MUTEX(bch_fs_list_lock);
-static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
+DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait);
+static void bch2_dev_unlink(struct bch_dev *);
static void bch2_dev_free(struct bch_dev *);
static int bch2_dev_alloc(struct bch_fs *, unsigned);
static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
-struct bch_fs *bch2_bdev_to_fs(struct block_device *bdev)
+struct bch_fs *bch2_dev_to_fs(dev_t dev)
{
struct bch_fs *c;
- struct bch_dev *ca;
- unsigned i;
mutex_lock(&bch_fs_list_lock);
rcu_read_lock();
list_for_each_entry(c, &bch_fs_list, list)
- for_each_member_device_rcu(ca, c, i, NULL)
- if (ca->disk_sb.bdev == bdev) {
+ for_each_member_device_rcu(c, ca, NULL)
+ if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {
closure_get(&c->cl);
goto found;
}
@@ -117,20 +211,20 @@ found:
return c;
}
-static struct bch_fs *__bch2_uuid_to_fs(uuid_le uuid)
+static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)
{
struct bch_fs *c;
lockdep_assert_held(&bch_fs_list_lock);
list_for_each_entry(c, &bch_fs_list, list)
- if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid_le)))
+ if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid)))
return c;
return NULL;
}
-struct bch_fs *bch2_uuid_to_fs(uuid_le uuid)
+struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
{
struct bch_fs *c;
@@ -143,45 +237,6 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid)
return c;
}
-int bch2_congested(void *data, int bdi_bits)
-{
- struct bch_fs *c = data;
- struct backing_dev_info *bdi;
- struct bch_dev *ca;
- unsigned i;
- int ret = 0;
-
- rcu_read_lock();
- if (bdi_bits & (1 << WB_sync_congested)) {
- /* Reads - check all devices: */
- for_each_readable_member(ca, c, i) {
- bdi = ca->disk_sb.bdev->bd_bdi;
-
- if (bdi_congested(bdi, bdi_bits)) {
- ret = 1;
- break;
- }
- }
- } else {
- unsigned target = READ_ONCE(c->opts.foreground_target);
- const struct bch_devs_mask *devs = target
- ? bch2_target_to_mask(c, target)
- : &c->rw_devs[BCH_DATA_USER];
-
- for_each_member_device_rcu(ca, c, i, devs) {
- bdi = ca->disk_sb.bdev->bd_bdi;
-
- if (bdi_congested(bdi, bdi_bits)) {
- ret = 1;
- break;
- }
- }
- }
- rcu_read_unlock();
-
- return ret;
-}
-
/* Filesystem RO/RW: */
/*
@@ -201,81 +256,83 @@ int bch2_congested(void *data, int bdi_bits)
static void __bch2_fs_read_only(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
+ unsigned clean_passes = 0;
+ u64 seq = 0;
+ bch2_fs_ec_stop(c);
+ bch2_open_buckets_stop(c, NULL, true);
bch2_rebalance_stop(c);
+ bch2_copygc_stop(c);
+ bch2_fs_ec_flush(c);
+
+ bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu",
+ journal_cur_seq(&c->journal));
+
+ do {
+ clean_passes++;
+
+ if (bch2_btree_interior_updates_flush(c) ||
+ bch2_btree_write_buffer_flush_going_ro(c) ||
+ bch2_journal_flush_all_pins(&c->journal) ||
+ bch2_btree_flush_all_writes(c) ||
+ seq != atomic64_read(&c->journal.seq)) {
+ seq = atomic64_read(&c->journal.seq);
+ clean_passes = 0;
+ }
+ } while (clean_passes < 2);
- for_each_member_device(ca, c, i)
- bch2_copygc_stop(ca);
-
- bch2_gc_thread_stop(c);
-
- /*
- * Flush journal before stopping allocators, because flushing journal
- * blacklist entries involves allocating new btree nodes:
- */
- bch2_journal_flush_all_pins(&c->journal);
-
- for_each_member_device(ca, c, i)
- bch2_dev_allocator_stop(ca);
-
- bch2_journal_flush_all_pins(&c->journal);
+ bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu",
+ journal_cur_seq(&c->journal));
- /*
- * We need to explicitly wait on btree interior updates to complete
- * before stopping the journal, flushing all journal pins isn't
- * sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree
- * interior updates have to drop their journal pin before they're
- * fully complete:
- */
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_nr_pending(c));
+ if (test_bit(JOURNAL_replay_done, &c->journal.flags) &&
+ !test_bit(BCH_FS_emergency_ro, &c->flags))
+ set_bit(BCH_FS_clean_shutdown, &c->flags);
bch2_fs_journal_stop(&c->journal);
- /*
- * the journal kicks off btree writes via reclaim - wait for in flight
- * writes after stopping journal:
- */
- if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
- bch2_btree_flush_all_writes(c);
- else
- bch2_btree_verify_flushed(c);
+ bch_info(c, "%sclean shutdown complete, journal seq %llu",
+ test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un",
+ c->journal.seq_ondisk);
/*
* After stopping journal:
*/
- for_each_member_device(ca, c, i)
+ for_each_member_device(c, ca)
bch2_dev_allocator_remove(c, ca);
}
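/*
 * Editor's sketch (demo_* names, not bcachefs code): the rewritten
 * __bch2_fs_read_only() above replaces a fixed shutdown order with a
 * "flush until two consecutive clean passes" loop -- keep flushing while
 * any flusher did work or the journal sequence moved, and only stop once
 * two passes in a row found nothing to do.
 */
#include <stdbool.h>
#include <stdio.h>

struct demo_fs { int pending; unsigned long long journal_seq; };

static bool demo_flush_some_work(struct demo_fs *fs)
{
	if (!fs->pending)
		return false;
	fs->pending--;
	fs->journal_seq++;		/* flushing generated another journal write */
	return true;
}

static void flush_until_quiescent(struct demo_fs *fs)
{
	unsigned clean_passes = 0;
	unsigned long long seq = 0;

	do {
		clean_passes++;

		/* any work done, or journal seq moved?  then start counting again */
		if (demo_flush_some_work(fs) ||
		    seq != fs->journal_seq) {
			seq = fs->journal_seq;
			clean_passes = 0;
		}
	} while (clean_passes < 2);
}

int main(void)
{
	struct demo_fs fs = { .pending = 3, .journal_seq = 0 };

	flush_until_quiescent(&fs);
	printf("quiescent at seq %llu\n", fs.journal_seq);
	return 0;
}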
+#ifndef BCH_WRITE_REF_DEBUG
static void bch2_writes_disabled(struct percpu_ref *writes)
{
struct bch_fs *c = container_of(writes, struct bch_fs, writes);
- set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
- wake_up(&bch_read_only_wait);
+ set_bit(BCH_FS_write_disable_complete, &c->flags);
+ wake_up(&bch2_read_only_wait);
}
+#endif
void bch2_fs_read_only(struct bch_fs *c)
{
- if (c->state == BCH_FS_RO)
+ if (!test_bit(BCH_FS_rw, &c->flags)) {
+ bch2_journal_reclaim_stop(&c->journal);
return;
+ }
+
+ BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags));
- BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+ bch_verbose(c, "going read-only");
/*
* Block new foreground-end write operations from starting - any new
* writes will return -EROFS:
- *
- * (This is really blocking new _allocations_, writes to previously
- * allocated space can still happen until stopping the allocator in
- * bch2_dev_allocator_stop()).
*/
+ set_bit(BCH_FS_going_ro, &c->flags);
+#ifndef BCH_WRITE_REF_DEBUG
percpu_ref_kill(&c->writes);
-
- cancel_delayed_work(&c->pd_controllers_update);
+#else
+ for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
+ bch2_write_ref_put(c, i);
+#endif
/*
* If we're not doing an emergency shutdown, we want to wait on
@@ -288,24 +345,44 @@ void bch2_fs_read_only(struct bch_fs *c)
* we do need to wait on them before returning and signalling
* that going RO is complete:
*/
- wait_event(bch_read_only_wait,
- test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
- test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
+ wait_event(bch2_read_only_wait,
+ test_bit(BCH_FS_write_disable_complete, &c->flags) ||
+ test_bit(BCH_FS_emergency_ro, &c->flags));
+
+ bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags);
+ if (writes_disabled)
+ bch_verbose(c, "finished waiting for writes to stop");
__bch2_fs_read_only(c);
- wait_event(bch_read_only_wait,
- test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+ wait_event(bch2_read_only_wait,
+ test_bit(BCH_FS_write_disable_complete, &c->flags));
- clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+ if (!writes_disabled)
+ bch_verbose(c, "finished waiting for writes to stop");
- if (!bch2_journal_error(&c->journal) &&
- !test_bit(BCH_FS_ERROR, &c->flags) &&
- !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
- bch2_fs_mark_clean(c, true);
+ clear_bit(BCH_FS_write_disable_complete, &c->flags);
+ clear_bit(BCH_FS_going_ro, &c->flags);
+ clear_bit(BCH_FS_rw, &c->flags);
- if (c->state != BCH_FS_STOPPING)
- c->state = BCH_FS_RO;
+ if (!bch2_journal_error(&c->journal) &&
+ !test_bit(BCH_FS_error, &c->flags) &&
+ !test_bit(BCH_FS_emergency_ro, &c->flags) &&
+ test_bit(BCH_FS_started, &c->flags) &&
+ test_bit(BCH_FS_clean_shutdown, &c->flags) &&
+ c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) {
+ BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
+ BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty));
+ BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
+ BUG_ON(c->btree_write_buffer.inc.keys.nr);
+ BUG_ON(c->btree_write_buffer.flushing.keys.nr);
+ bch2_verify_accounting_clean(c);
+
+ bch_verbose(c, "marking filesystem clean");
+ bch2_fs_mark_clean(c);
+ } else {
+ bch_verbose(c, "done going read-only, filesystem not clean");
+ }
}
static void bch2_fs_read_only_work(struct work_struct *work)
@@ -313,9 +390,9 @@ static void bch2_fs_read_only_work(struct work_struct *work)
struct bch_fs *c =
container_of(work, struct bch_fs, read_only_work);
- mutex_lock(&c->state_lock);
+ down_write(&c->state_lock);
bch2_fs_read_only(c);
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
}
static void bch2_fs_read_only_async(struct bch_fs *c)
@@ -325,101 +402,207 @@ static void bch2_fs_read_only_async(struct bch_fs *c)
bool bch2_fs_emergency_read_only(struct bch_fs *c)
{
- bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);
+ bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
- bch2_fs_read_only_async(c);
bch2_journal_halt(&c->journal);
+ bch2_fs_read_only_async(c);
- wake_up(&bch_read_only_wait);
+ wake_up(&bch2_read_only_wait);
return ret;
}
-const char *bch2_fs_read_write(struct bch_fs *c)
+static int bch2_fs_read_write_late(struct bch_fs *c)
{
- struct bch_dev *ca;
- const char *err = NULL;
- unsigned i;
+ int ret;
- if (c->state == BCH_FS_RW)
- return NULL;
+ /*
+ * Data move operations can't run until after check_snapshots has
+ * completed, and bch2_snapshot_is_ancestor() is available.
+ *
+ * Ideally we'd start copygc/rebalance earlier instead of waiting for
+ * all of recovery/fsck to complete:
+ */
+ ret = bch2_copygc_start(c);
+ if (ret) {
+ bch_err(c, "error starting copygc thread");
+ return ret;
+ }
+
+ ret = bch2_rebalance_start(c);
+ if (ret) {
+ bch_err(c, "error starting rebalance thread");
+ return ret;
+ }
+
+ return 0;
+}
+
+static int __bch2_fs_read_write(struct bch_fs *c, bool early)
+{
+ int ret;
+
+ BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
+
+ if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {
+ bch_err(c, "cannot go rw, unfixed btree errors");
+ return -BCH_ERR_erofs_unfixed_errors;
+ }
+
+ if (test_bit(BCH_FS_rw, &c->flags))
+ return 0;
+
+ bch_info(c, "going read-write");
+
+ ret = bch2_sb_members_v2_init(c);
+ if (ret)
+ goto err;
+
+ ret = bch2_fs_mark_dirty(c);
+ if (ret)
+ goto err;
+
+ clear_bit(BCH_FS_clean_shutdown, &c->flags);
- bch2_fs_mark_clean(c, false);
+ /*
+ * First journal write must be a flush write: after a clean shutdown we
+ * don't read the journal, so the first journal write may end up
+ * overwriting whatever was there previously, and there must always be
+ * at least one non-flush write in the journal or recovery will fail:
+ */
+ set_bit(JOURNAL_need_flush_write, &c->journal.flags);
+ set_bit(JOURNAL_running, &c->journal.flags);
- for_each_rw_member(ca, c, i)
+ for_each_rw_member(c, ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- err = "error starting allocator thread";
- for_each_rw_member(ca, c, i)
- if (bch2_dev_allocator_start(ca)) {
- percpu_ref_put(&ca->io_ref);
- goto err;
- }
+ set_bit(BCH_FS_rw, &c->flags);
+ set_bit(BCH_FS_was_rw, &c->flags);
+
+#ifndef BCH_WRITE_REF_DEBUG
+ percpu_ref_reinit(&c->writes);
+#else
+ for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) {
+ BUG_ON(atomic_long_read(&c->writes[i]));
+ atomic_long_inc(&c->writes[i]);
+ }
+#endif
- err = "error starting btree GC thread";
- if (bch2_gc_thread_start(c))
+ ret = bch2_journal_reclaim_start(&c->journal);
+ if (ret)
goto err;
- err = "error starting copygc thread";
- for_each_rw_member(ca, c, i)
- if (bch2_copygc_start(c, ca)) {
- percpu_ref_put(&ca->io_ref);
+ if (!early) {
+ ret = bch2_fs_read_write_late(c);
+ if (ret)
goto err;
- }
+ }
- err = "error starting rebalance thread";
- if (bch2_rebalance_start(c))
- goto err;
+ bch2_do_discards(c);
+ bch2_do_invalidates(c);
+ bch2_do_stripe_deletes(c);
+ bch2_do_pending_node_rewrites(c);
+ return 0;
+err:
+ if (test_bit(BCH_FS_rw, &c->flags))
+ bch2_fs_read_only(c);
+ else
+ __bch2_fs_read_only(c);
+ return ret;
+}
- schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
+int bch2_fs_read_write(struct bch_fs *c)
+{
+ if (c->opts.recovery_pass_last &&
+ c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)
+ return -BCH_ERR_erofs_norecovery;
- if (c->state != BCH_FS_STARTING)
- percpu_ref_reinit(&c->writes);
+ if (c->opts.nochanges)
+ return -BCH_ERR_erofs_nochanges;
- c->state = BCH_FS_RW;
- return NULL;
-err:
- __bch2_fs_read_only(c);
- return err;
+ return __bch2_fs_read_write(c, false);
+}
+
+int bch2_fs_read_write_early(struct bch_fs *c)
+{
+ lockdep_assert_held(&c->state_lock);
+
+ return __bch2_fs_read_write(c, true);
}
/* Filesystem startup/shutdown: */
-static void bch2_fs_free(struct bch_fs *c)
+static void __bch2_fs_free(struct bch_fs *c)
{
- unsigned i;
-
- for (i = 0; i < BCH_TIME_STAT_NR; i++)
+ for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]);
+ bch2_find_btree_nodes_exit(&c->found_btree_nodes);
+ bch2_free_pending_node_rewrites(c);
+ bch2_fs_accounting_exit(c);
+ bch2_fs_sb_errors_exit(c);
+ bch2_fs_counters_exit(c);
+ bch2_fs_snapshots_exit(c);
bch2_fs_quota_exit(c);
+ bch2_fs_fs_io_direct_exit(c);
+ bch2_fs_fs_io_buffered_exit(c);
bch2_fs_fsio_exit(c);
+ bch2_fs_vfs_exit(c);
+ bch2_fs_ec_exit(c);
bch2_fs_encryption_exit(c);
- bch2_fs_io_exit(c);
+ bch2_fs_nocow_locking_exit(c);
+ bch2_fs_io_write_exit(c);
+ bch2_fs_io_read_exit(c);
+ bch2_fs_buckets_waiting_for_journal_exit(c);
+ bch2_fs_btree_interior_update_exit(c);
+ bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
bch2_fs_btree_cache_exit(c);
+ bch2_fs_btree_iter_exit(c);
+ bch2_fs_replicas_exit(c);
bch2_fs_journal_exit(&c->journal);
bch2_io_clock_exit(&c->io_clock[WRITE]);
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
- percpu_free_rwsem(&c->usage_lock);
- free_percpu(c->usage_percpu);
+ bch2_journal_keys_put_initial(c);
+ bch2_find_btree_nodes_exit(&c->found_btree_nodes);
+ BUG_ON(atomic_read(&c->journal_keys.ref));
+ bch2_fs_btree_write_buffer_exit(c);
+ percpu_free_rwsem(&c->mark_lock);
+ if (c->online_reserved) {
+ u64 v = percpu_u64_get(c->online_reserved);
+ WARN(v, "online_reserved not 0 at shutdown: %lli", v);
+ free_percpu(c->online_reserved);
+ }
+
+ darray_exit(&c->btree_roots_extra);
+ free_percpu(c->pcpu);
+ free_percpu(c->usage);
+ mempool_exit(&c->large_bkey_pool);
mempool_exit(&c->btree_bounce_pool);
bioset_exit(&c->btree_bio);
- mempool_exit(&c->btree_interior_update_pool);
- mempool_exit(&c->btree_reserve_pool);
mempool_exit(&c->fill_iter);
+#ifndef BCH_WRITE_REF_DEBUG
percpu_ref_exit(&c->writes);
- kfree(rcu_dereference_protected(c->replicas, 1));
+#endif
kfree(rcu_dereference_protected(c->disk_groups, 1));
-
+ kfree(c->journal_seq_blacklist_table);
+ kfree(c->unused_inode_hints);
+
+ if (c->write_ref_wq)
+ destroy_workqueue(c->write_ref_wq);
+ if (c->btree_write_submit_wq)
+ destroy_workqueue(c->btree_write_submit_wq);
+ if (c->btree_read_complete_wq)
+ destroy_workqueue(c->btree_read_complete_wq);
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
- if (c->wq)
- destroy_workqueue(c->wq);
+ if (c->btree_io_complete_wq)
+ destroy_workqueue(c->btree_io_complete_wq);
+ if (c->btree_update_wq)
+ destroy_workqueue(c->btree_update_wq);
- free_pages((unsigned long) c->disk_sb.sb,
- c->disk_sb.page_order);
- kvpfree(c, sizeof(*c));
+ bch2_free_super(&c->disk_sb);
+ kvfree(c);
module_put(THIS_MODULE);
}
@@ -427,19 +610,21 @@ static void bch2_fs_release(struct kobject *kobj)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
- bch2_fs_free(c);
+ __bch2_fs_free(c);
}
-void bch2_fs_stop(struct bch_fs *c)
+void __bch2_fs_stop(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
+ bch_verbose(c, "shutting down");
+
+ set_bit(BCH_FS_stopping, &c->flags);
+
+ down_write(&c->state_lock);
+ bch2_fs_read_only(c);
+ up_write(&c->state_lock);
- for_each_member_device(ca, c, i)
- if (ca->kobj.state_in_sysfs &&
- ca->disk_sb.bdev)
- sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
- "bcachefs");
+ for_each_member_device(c, ca)
+ bch2_dev_unlink(ca);
if (c->kobj.state_in_sysfs)
kobject_del(&c->kobj);
@@ -447,10 +632,27 @@ void bch2_fs_stop(struct bch_fs *c)
bch2_fs_debug_exit(c);
bch2_fs_chardev_exit(c);
+ bch2_ro_ref_put(c);
+ wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref));
+
+ kobject_put(&c->counters_kobj);
kobject_put(&c->time_stats);
kobject_put(&c->opts_dir);
kobject_put(&c->internal);
+ /* btree prefetch might have kicked off reads in the background: */
+ bch2_btree_flush_all_reads(c);
+
+ for_each_member_device(c, ca)
+ cancel_work_sync(&ca->io_error_work);
+
+ cancel_work_sync(&c->read_only_work);
+}
+
+void bch2_fs_free(struct bch_fs *c)
+{
+ unsigned i;
+
mutex_lock(&bch_fs_list_lock);
list_del(&c->list);
mutex_unlock(&bch_fs_list_lock);
@@ -458,338 +660,520 @@ void bch2_fs_stop(struct bch_fs *c)
closure_sync(&c->cl);
closure_debug_destroy(&c->cl);
- mutex_lock(&c->state_lock);
- bch2_fs_read_only(c);
- mutex_unlock(&c->state_lock);
+ for (i = 0; i < c->sb.nr_devices; i++) {
+ struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
- /* btree prefetch might have kicked off reads in the background: */
- bch2_btree_flush_all_reads(c);
-
- for_each_member_device(ca, c, i)
- cancel_work_sync(&ca->io_error_work);
-
- cancel_work_sync(&c->btree_write_error_work);
- cancel_delayed_work_sync(&c->pd_controllers_update);
- cancel_work_sync(&c->read_only_work);
+ if (ca) {
+ EBUG_ON(atomic_long_read(&ca->ref) != 1);
+ bch2_free_super(&ca->disk_sb);
+ bch2_dev_free(ca);
+ }
+ }
- for (i = 0; i < c->sb.nr_devices; i++)
- if (c->devs[i])
- bch2_dev_free(rcu_dereference_protected(c->devs[i], 1));
+ bch_verbose(c, "shutdown complete");
kobject_put(&c->kobj);
}
-static const char *bch2_fs_online(struct bch_fs *c)
+void bch2_fs_stop(struct bch_fs *c)
{
- struct bch_dev *ca;
- const char *err = NULL;
- unsigned i;
- int ret;
+ __bch2_fs_stop(c);
+ bch2_fs_free(c);
+}
- lockdep_assert_held(&bch_fs_list_lock);
+static int bch2_fs_online(struct bch_fs *c)
+{
+ int ret = 0;
- if (!list_empty(&c->list))
- return NULL;
+ lockdep_assert_held(&bch_fs_list_lock);
- if (__bch2_uuid_to_fs(c->sb.uuid))
- return "filesystem UUID already open";
+ if (__bch2_uuid_to_fs(c->sb.uuid)) {
+ bch_err(c, "filesystem UUID already open");
+ return -EINVAL;
+ }
ret = bch2_fs_chardev_init(c);
- if (ret)
- return "error creating character device";
+ if (ret) {
+ bch_err(c, "error creating character device");
+ return ret;
+ }
bch2_fs_debug_init(c);
- if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
- kobject_add(&c->internal, &c->kobj, "internal") ||
- kobject_add(&c->opts_dir, &c->kobj, "options") ||
- kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
- bch2_opts_create_sysfs_files(&c->opts_dir))
- return "error creating sysfs objects";
+ ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:
+ kobject_add(&c->internal, &c->kobj, "internal") ?:
+ kobject_add(&c->opts_dir, &c->kobj, "options") ?:
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+ kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
+#endif
+ kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
+ bch2_opts_create_sysfs_files(&c->opts_dir);
+ if (ret) {
+ bch_err(c, "error creating sysfs objects");
+ return ret;
+ }
- mutex_lock(&c->state_lock);
+ down_write(&c->state_lock);
- err = "error creating sysfs objects";
- __for_each_member_device(ca, c, i, NULL)
- if (bch2_dev_sysfs_online(c, ca))
+ for_each_member_device(c, ca) {
+ ret = bch2_dev_sysfs_online(c, ca);
+ if (ret) {
+ bch_err(c, "error creating sysfs objects");
+ bch2_dev_put(ca);
goto err;
+ }
+ }
+ BUG_ON(!list_empty(&c->list));
list_add(&c->list, &bch_fs_list);
- err = NULL;
err:
- mutex_unlock(&c->state_lock);
- return err;
+ up_write(&c->state_lock);
+ return ret;
}
static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
{
- struct bch_sb_field_members *mi;
struct bch_fs *c;
+ struct printbuf name = PRINTBUF;
unsigned i, iter_size;
- const char *err;
-
- pr_verbose_init(opts, "");
+ int ret = 0;
- c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
- if (!c)
+ c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
+ if (!c) {
+ c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
goto out;
+ }
+
+ c->stdio = (void *)(unsigned long) opts.stdio;
__module_get(THIS_MODULE);
+ closure_init(&c->cl, NULL);
+
+ c->kobj.kset = bcachefs_kset;
+ kobject_init(&c->kobj, &bch2_fs_ktype);
+ kobject_init(&c->internal, &bch2_fs_internal_ktype);
+ kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
+ kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
+ kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype);
+
c->minor = -1;
c->disk_sb.fs_sb = true;
- mutex_init(&c->state_lock);
+ init_rwsem(&c->state_lock);
mutex_init(&c->sb_lock);
mutex_init(&c->replicas_gc_lock);
mutex_init(&c->btree_root_lock);
INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
+ refcount_set(&c->ro_ref, 1);
+ init_waitqueue_head(&c->ro_ref_wait);
+ spin_lock_init(&c->recovery_pass_lock);
+ sema_init(&c->online_fsck_mutex, 1);
+
init_rwsem(&c->gc_lock);
+ mutex_init(&c->gc_gens_lock);
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_init(&c->times[i]);
- bch2_fs_allocator_init(c);
+ bch2_fs_gc_init(c);
+ bch2_fs_copygc_init(c);
+ bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
+ bch2_fs_btree_iter_init_early(c);
+ bch2_fs_btree_interior_update_init_early(c);
+ bch2_fs_journal_keys_init(c);
+ bch2_fs_allocator_background_init(c);
+ bch2_fs_allocator_foreground_init(c);
bch2_fs_rebalance_init(c);
bch2_fs_quota_init(c);
+ bch2_fs_ec_init_early(c);
+ bch2_fs_move_init(c);
+ bch2_fs_sb_errors_init_early(c);
INIT_LIST_HEAD(&c->list);
- INIT_LIST_HEAD(&c->btree_interior_update_list);
- mutex_init(&c->btree_reserve_cache_lock);
- mutex_init(&c->btree_interior_update_lock);
-
mutex_init(&c->bio_bounce_pages_lock);
+ mutex_init(&c->snapshot_table_lock);
+ init_rwsem(&c->snapshot_create_lock);
- bio_list_init(&c->btree_write_error_list);
spin_lock_init(&c->btree_write_error_lock);
- INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work);
- INIT_LIST_HEAD(&c->fsck_errors);
- mutex_init(&c->fsck_error_lock);
+ INIT_LIST_HEAD(&c->journal_iters);
- seqcount_init(&c->gc_pos_lock);
+ INIT_LIST_HEAD(&c->fsck_error_msgs);
+ mutex_init(&c->fsck_error_msgs_lock);
- c->copy_gc_enabled = 1;
- c->rebalance.enabled = 1;
- c->promote_whole_extents = true;
+ seqcount_init(&c->usage_lock);
- c->journal.write_time = &c->times[BCH_TIME_journal_write];
- c->journal.delay_time = &c->times[BCH_TIME_journal_delay];
- c->journal.blocked_time = &c->times[BCH_TIME_journal_blocked];
- c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
+ sema_init(&c->io_in_flight, 128);
+
+ INIT_LIST_HEAD(&c->vfs_inodes_list);
+ mutex_init(&c->vfs_inodes_lock);
+
+ c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write];
+ c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
+ c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
bch2_fs_btree_cache_init_early(&c->btree_cache);
- mutex_lock(&c->sb_lock);
+ mutex_init(&c->sectors_available_lock);
- if (bch2_sb_to_fs(c, sb)) {
- mutex_unlock(&c->sb_lock);
+ ret = percpu_init_rwsem(&c->mark_lock);
+ if (ret)
goto err;
- }
+ mutex_lock(&c->sb_lock);
+ ret = bch2_sb_to_fs(c, sb);
mutex_unlock(&c->sb_lock);
- scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid);
+ if (ret)
+ goto err;
+
+ pr_uuid(&name, c->sb.user_uuid.b);
+ ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
+ if (ret)
+ goto err;
+
+ strscpy(c->name, name.buf, sizeof(c->name));
+ printbuf_exit(&name);
+
+ /* Compat: */
+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
+ !BCH_SB_JOURNAL_FLUSH_DELAY(sb))
+ SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);
+
+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
+ !BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
+ SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100);
c->opts = bch2_opts_default;
- bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb));
+ ret = bch2_opts_from_sb(&c->opts, sb);
+ if (ret)
+ goto err;
+
bch2_opts_apply(&c->opts, opts);
- c->block_bits = ilog2(c->opts.block_size);
- c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
+ c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
+ if (c->opts.inodes_use_key_cache)
+ c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
+ c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops;
- c->opts.nochanges |= c->opts.noreplay;
- c->opts.read_only |= c->opts.nochanges;
+ c->block_bits = ilog2(block_sectors(c));
+ c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
- if (bch2_fs_init_fault("fs_alloc"))
+ if (bch2_fs_init_fault("fs_alloc")) {
+ bch_err(c, "fs_alloc fault injected");
+ ret = -EFAULT;
goto err;
+ }
- iter_size = sizeof(struct btree_node_iter_large) +
+ iter_size = sizeof(struct sort_iter) +
(btree_blocks(c) + 1) * 2 *
- sizeof(struct btree_node_iter_set);
-
- if (!(c->wq = alloc_workqueue("bcachefs",
- WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
- !(c->copygc_wq = alloc_workqueue("bcache_copygc",
- WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
- percpu_ref_init(&c->writes, bch2_writes_disabled, 0, GFP_KERNEL) ||
- mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
- sizeof(struct btree_reserve)) ||
- mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
- sizeof(struct btree_update)) ||
+ sizeof(struct sort_iter_set);
+
+ c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
+
+ if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
+ !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
+ !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+ !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
+ !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_submit",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
+ !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
+ WQ_FREEZABLE, 0)) ||
+#ifndef BCH_WRITE_REF_DEBUG
+ percpu_ref_init(&c->writes, bch2_writes_disabled,
+ PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
+#endif
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
bioset_init(&c->btree_bio, 1,
max(offsetof(struct btree_read_bio, bio),
offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
- !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
- percpu_init_rwsem(&c->usage_lock) ||
- mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
- btree_bytes(c)) ||
- bch2_io_clock_init(&c->io_clock[READ]) ||
- bch2_io_clock_init(&c->io_clock[WRITE]) ||
- bch2_fs_journal_init(&c->journal) ||
- bch2_fs_btree_cache_init(c) ||
- bch2_fs_io_init(c) ||
- bch2_fs_encryption_init(c) ||
- bch2_fs_compress_init(c) ||
- bch2_fs_fsio_init(c))
+ !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
+ !(c->usage = alloc_percpu(struct bch_fs_usage_base)) ||
+ !(c->online_reserved = alloc_percpu(u64)) ||
+ mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
+ c->opts.btree_node_size) ||
+ mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
+ !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
+ sizeof(u64), GFP_KERNEL))) {
+ ret = -BCH_ERR_ENOMEM_fs_other_alloc;
goto err;
+ }
- mi = bch2_sb_get_members(c->disk_sb.sb);
- for (i = 0; i < c->sb.nr_devices; i++)
- if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
- bch2_dev_alloc(c, i))
- goto err;
+ ret = bch2_fs_counters_init(c) ?:
+ bch2_fs_sb_errors_init(c) ?:
+ bch2_io_clock_init(&c->io_clock[READ]) ?:
+ bch2_io_clock_init(&c->io_clock[WRITE]) ?:
+ bch2_fs_journal_init(&c->journal) ?:
+ bch2_fs_btree_iter_init(c) ?:
+ bch2_fs_btree_cache_init(c) ?:
+ bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
+ bch2_fs_btree_interior_update_init(c) ?:
+ bch2_fs_buckets_waiting_for_journal_init(c) ?:
+ bch2_fs_btree_write_buffer_init(c) ?:
+ bch2_fs_subvolumes_init(c) ?:
+ bch2_fs_io_read_init(c) ?:
+ bch2_fs_io_write_init(c) ?:
+ bch2_fs_nocow_locking_init(c) ?:
+ bch2_fs_encryption_init(c) ?:
+ bch2_fs_compress_init(c) ?:
+ bch2_fs_ec_init(c) ?:
+ bch2_fs_vfs_init(c) ?:
+ bch2_fs_fsio_init(c) ?:
+ bch2_fs_fs_io_buffered_init(c) ?:
+ bch2_fs_fs_io_direct_init(c);
+ if (ret)
+ goto err;
- /*
- * Now that all allocations have succeeded, init various refcounty
- * things that let us shutdown:
- */
- closure_init(&c->cl, NULL);
+ for (i = 0; i < c->sb.nr_devices; i++) {
+ if (!bch2_member_exists(c->disk_sb.sb, i))
+ continue;
+ ret = bch2_dev_alloc(c, i);
+ if (ret)
+ goto err;
+ }
- c->kobj.kset = bcachefs_kset;
- kobject_init(&c->kobj, &bch2_fs_ktype);
- kobject_init(&c->internal, &bch2_fs_internal_ktype);
- kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
- kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
+ bch2_journal_entry_res_resize(&c->journal,
+ &c->btree_root_journal_res,
+ BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX));
+ bch2_journal_entry_res_resize(&c->journal,
+ &c->clock_journal_res,
+ (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);
mutex_lock(&bch_fs_list_lock);
- err = bch2_fs_online(c);
+ ret = bch2_fs_online(c);
mutex_unlock(&bch_fs_list_lock);
- if (err) {
- bch_err(c, "bch2_fs_online() error: %s", err);
+
+ if (ret)
goto err;
- }
out:
- pr_verbose_init(opts, "ret %i", c ? 0 : -ENOMEM);
return c;
err:
bch2_fs_free(c);
- c = NULL;
+ c = ERR_PTR(ret);
goto out;
}
-const char *bch2_fs_start(struct bch_fs *c)
+noinline_for_stack
+static void print_mount_opts(struct bch_fs *c)
{
- const char *err = "cannot allocate memory";
- struct bch_sb_field_members *mi;
- struct bch_dev *ca;
- time64_t now = ktime_get_seconds();
- unsigned i;
- int ret = -EINVAL;
+ enum bch_opt_id i;
+ struct printbuf p = PRINTBUF;
+ bool first = true;
+
+ prt_str(&p, "starting version ");
+ bch2_version_to_text(&p, c->sb.version);
+
+ if (c->opts.read_only) {
+ prt_str(&p, " opts=");
+ first = false;
+ prt_printf(&p, "ro");
+ }
- mutex_lock(&c->state_lock);
+ for (i = 0; i < bch2_opts_nr; i++) {
+ const struct bch_option *opt = &bch2_opt_table[i];
+ u64 v = bch2_opt_get_by_id(&c->opts, i);
- BUG_ON(c->state != BCH_FS_STARTING);
+ if (!(opt->flags & OPT_MOUNT))
+ continue;
+
+ if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
+ continue;
+
+ prt_str(&p, first ? " opts=" : ",");
+ first = false;
+ bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
+ }
+
+ bch_info(c, "%s", p.buf);
+ printbuf_exit(&p);
+}
+
+int bch2_fs_start(struct bch_fs *c)
+{
+ time64_t now = ktime_get_real_seconds();
+ int ret;
+
+ print_mount_opts(c);
+
+ down_write(&c->state_lock);
+
+ BUG_ON(test_bit(BCH_FS_started, &c->flags));
mutex_lock(&c->sb_lock);
- for_each_online_member(ca, c, i)
- bch2_sb_from_fs(c, ca);
+ ret = bch2_sb_members_v2_init(c);
+ if (ret) {
+ mutex_unlock(&c->sb_lock);
+ goto err;
+ }
- mi = bch2_sb_get_members(c->disk_sb.sb);
- for_each_online_member(ca, c, i)
- mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
+ for_each_online_member(c, ca)
+ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now);
+ struct bch_sb_field_ext *ext =
+ bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
mutex_unlock(&c->sb_lock);
- for_each_rw_member(ca, c, i)
+ if (!ext) {
+ bch_err(c, "insufficient space in superblock for sb_field_ext");
+ ret = -BCH_ERR_ENOSPC_sb;
+ goto err;
+ }
+
+ for_each_rw_member(c, ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
+ c->recovery_task = current;
ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
? bch2_fs_recovery(c)
: bch2_fs_initialize(c);
+ c->recovery_task = NULL;
+
if (ret)
goto err;
- err = "dynamic fault";
- if (bch2_fs_init_fault("fs_start"))
+ ret = bch2_opts_check_may_set(c);
+ if (ret)
goto err;
+ if (bch2_fs_init_fault("fs_start")) {
+ bch_err(c, "fs_start fault injected");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ set_bit(BCH_FS_started, &c->flags);
+
if (c->opts.read_only) {
bch2_fs_read_only(c);
} else {
- err = bch2_fs_read_write(c);
- if (err)
+ ret = !test_bit(BCH_FS_rw, &c->flags)
+ ? bch2_fs_read_write(c)
+ : bch2_fs_read_write_late(c);
+ if (ret)
goto err;
}
- set_bit(BCH_FS_STARTED, &c->flags);
-
- err = NULL;
-out:
- mutex_unlock(&c->state_lock);
- return err;
+ ret = 0;
err:
- switch (ret) {
- case BCH_FSCK_ERRORS_NOT_FIXED:
- bch_err(c, "filesystem contains errors: please report this to the developers");
- pr_cont("mount with -o fix_errors to repair\n");
- err = "fsck error";
- break;
- case BCH_FSCK_REPAIR_UNIMPLEMENTED:
- bch_err(c, "filesystem contains errors: please report this to the developers");
- pr_cont("repair unimplemented: inform the developers so that it can be added\n");
- err = "fsck error";
- break;
- case BCH_FSCK_REPAIR_IMPOSSIBLE:
- bch_err(c, "filesystem contains errors, but repair impossible");
- err = "fsck error";
- break;
- case BCH_FSCK_UNKNOWN_VERSION:
- err = "unknown metadata version";;
- break;
- case -ENOMEM:
- err = "cannot allocate memory";
- break;
- case -EIO:
- err = "IO error";
- break;
- }
-
- BUG_ON(!err);
- set_bit(BCH_FS_ERROR, &c->flags);
- goto out;
+ if (ret)
+ bch_err_msg(c, ret, "starting filesystem");
+ else
+ bch_verbose(c, "done starting filesystem");
+ up_write(&c->state_lock);
+ return ret;
}
-static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
+static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
{
- struct bch_sb_field_members *sb_mi;
-
- sb_mi = bch2_sb_get_members(sb);
- if (!sb_mi)
- return "Invalid superblock: member info area missing";
+ struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
- if (le16_to_cpu(sb->block_size) != c->opts.block_size)
- return "mismatched block size";
+ if (le16_to_cpu(sb->block_size) != block_sectors(c))
+ return -BCH_ERR_mismatched_block_size;
- if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
+ if (le16_to_cpu(m.bucket_size) <
BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
- return "new cache bucket size is too small";
+ return -BCH_ERR_bucket_size_too_small;
- return NULL;
+ return 0;
}
-static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
+static int bch2_dev_in_fs(struct bch_sb_handle *fs,
+ struct bch_sb_handle *sb,
+ struct bch_opts *opts)
{
- struct bch_sb *newest =
- le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb;
- struct bch_sb_field_members *mi = bch2_sb_get_members(newest);
+ if (fs == sb)
+ return 0;
- if (uuid_le_cmp(fs->uuid, sb->uuid))
- return "device not a member of filesystem";
+ if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid))
+ return -BCH_ERR_device_not_a_member_of_filesystem;
- if (!bch2_dev_exists(newest, mi, sb->dev_idx))
- return "device has been removed";
+ if (!bch2_member_exists(fs->sb, sb->sb->dev_idx))
+ return -BCH_ERR_device_has_been_removed;
- if (fs->block_size != sb->block_size)
- return "mismatched block size";
+ if (fs->sb->block_size != sb->sb->block_size)
+ return -BCH_ERR_mismatched_block_size;
- return NULL;
+ if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq ||
+ le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq)
+ return 0;
+
+ if (fs->sb->seq == sb->sb->seq &&
+ fs->sb->write_time != sb->sb->write_time) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "Split brain detected between ");
+ prt_bdevname(&buf, sb->bdev);
+ prt_str(&buf, " and ");
+ prt_bdevname(&buf, fs->bdev);
+ prt_char(&buf, ':');
+ prt_newline(&buf);
+ prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq));
+ prt_newline(&buf);
+
+ prt_bdevname(&buf, fs->bdev);
+ prt_char(&buf, ' ');
+ bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));
+ prt_newline(&buf);
+
+ prt_bdevname(&buf, sb->bdev);
+ prt_char(&buf, ' ');
+ bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));
+ prt_newline(&buf);
+
+ if (!opts->no_splitbrain_check)
+ prt_printf(&buf, "Not using older sb");
+
+ pr_err("%s", buf.buf);
+ printbuf_exit(&buf);
+
+ if (!opts->no_splitbrain_check)
+ return -BCH_ERR_device_splitbrain;
+ }
+
+ struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
+ u64 seq_from_fs = le64_to_cpu(m.seq);
+ u64 seq_from_member = le64_to_cpu(sb->sb->seq);
+
+ if (seq_from_fs && seq_from_fs < seq_from_member) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "Split brain detected between ");
+ prt_bdevname(&buf, sb->bdev);
+ prt_str(&buf, " and ");
+ prt_bdevname(&buf, fs->bdev);
+ prt_char(&buf, ':');
+ prt_newline(&buf);
+
+ prt_bdevname(&buf, fs->bdev);
+ prt_str(&buf, " believes seq of ");
+ prt_bdevname(&buf, sb->bdev);
+ prt_printf(&buf, " to be %llu, but ", seq_from_fs);
+ prt_bdevname(&buf, sb->bdev);
+ prt_printf(&buf, " has %llu\n", seq_from_member);
+
+ if (!opts->no_splitbrain_check) {
+ prt_str(&buf, "Not using ");
+ prt_bdevname(&buf, sb->bdev);
+ }
+
+ pr_err("%s", buf.buf);
+ printbuf_exit(&buf);
+
+ if (!opts->no_splitbrain_check)
+ return -BCH_ERR_device_splitbrain;
+ }
+
+ return 0;
}
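/*
 * Editor's sketch (demo_* names, not bcachefs code) of the second
 * split-brain check added in bch2_dev_in_fs() above: the filesystem's copy
 * of the member record remembers the last superblock seq it saw from that
 * device, and if the device's own superblock claims a newer seq, the device
 * was written to while detached from the rest of the filesystem.
 */
#include <stdio.h>

static int demo_split_brain(unsigned long long seq_from_fs,
			    unsigned long long seq_from_member)
{
	/* 0 means "never recorded", so it cannot prove divergence */
	return seq_from_fs && seq_from_fs < seq_from_member;
}

int main(void)
{
	printf("%d\n", demo_split_brain(10, 12));	/* 1: member wrote after fs last saw it */
	printf("%d\n", demo_split_brain(12, 12));	/* 0: in agreement */
	printf("%d\n", demo_split_brain(0, 12));	/* 0: no recorded seq, cannot tell */
	return 0;
}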
/* Device startup/shutdown: */
@@ -805,26 +1189,26 @@ static void bch2_dev_free(struct bch_dev *ca)
{
cancel_work_sync(&ca->io_error_work);
- if (ca->kobj.state_in_sysfs &&
- ca->disk_sb.bdev)
- sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
- "bcachefs");
+ bch2_dev_unlink(ca);
if (ca->kobj.state_in_sysfs)
kobject_del(&ca->kobj);
bch2_free_super(&ca->disk_sb);
+ bch2_dev_allocator_background_exit(ca);
bch2_dev_journal_exit(ca);
free_percpu(ca->io_done);
- bioset_exit(&ca->replica_set);
bch2_dev_buckets_free(ca);
+ kfree(ca->sb_read_scratch);
- bch2_time_stats_exit(&ca->io_latency[WRITE]);
- bch2_time_stats_exit(&ca->io_latency[READ]);
+ bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
+ bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);
percpu_ref_exit(&ca->io_ref);
+#ifndef CONFIG_BCACHEFS_DEBUG
percpu_ref_exit(&ca->ref);
+#endif
kobject_put(&ca->kobj);
}
@@ -842,24 +1226,20 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
percpu_ref_kill(&ca->io_ref);
wait_for_completion(&ca->io_ref_completion);
- if (ca->kobj.state_in_sysfs) {
- struct kobject *block =
- &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj;
-
- sysfs_remove_link(block, "bcachefs");
- sysfs_remove_link(&ca->kobj, "block");
- }
+ bch2_dev_unlink(ca);
bch2_free_super(&ca->disk_sb);
bch2_dev_journal_exit(ca);
}
+#ifndef CONFIG_BCACHEFS_DEBUG
static void bch2_dev_ref_complete(struct percpu_ref *ref)
{
struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
complete(&ca->ref_completion);
}
+#endif
static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
{
@@ -868,6 +1248,26 @@ static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
complete(&ca->io_ref_completion);
}
+static void bch2_dev_unlink(struct bch_dev *ca)
+{
+ struct kobject *b;
+
+ /*
+ * This is racy w.r.t. the underlying block device being hot-removed,
+ * which removes it from sysfs.
+ *
+ * It'd be lovely if we had a way to handle this race, but the sysfs
+ * code doesn't appear to provide a good method and block/holder.c is
+ * susceptible as well:
+ */
+ if (ca->kobj.state_in_sysfs &&
+ ca->disk_sb.bdev &&
+ (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) {
+ sysfs_remove_link(b, "bcachefs");
+ sysfs_remove_link(&ca->kobj, "block");
+ }
+}
+
static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
{
int ret;
@@ -883,12 +1283,12 @@ static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
}
if (ca->disk_sb.bdev) {
- struct kobject *block =
- &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj;
+ struct kobject *block = bdev_kobj(ca->disk_sb.bdev);
ret = sysfs_create_link(block, &ca->kobj, "bcachefs");
if (ret)
return ret;
+
ret = sysfs_create_link(&ca->kobj, block, "block");
if (ret)
return ret;
@@ -901,6 +1301,7 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
struct bch_member *member)
{
struct bch_dev *ca;
+ unsigned i;
ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
@@ -912,29 +1313,34 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
init_rwsem(&ca->bucket_lock);
- writepoint_init(&ca->copygc_write_point, BCH_DATA_USER);
-
- spin_lock_init(&ca->freelist_lock);
- bch2_dev_copygc_init(ca);
-
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
- bch2_time_stats_init(&ca->io_latency[READ]);
- bch2_time_stats_init(&ca->io_latency[WRITE]);
+ bch2_time_stats_quantiles_init(&ca->io_latency[READ]);
+ bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]);
ca->mi = bch2_mi_to_cpu(member);
+
+ for (i = 0; i < ARRAY_SIZE(member->errors); i++)
+ atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i]));
+
ca->uuid = member->uuid;
- if (opt_defined(c->opts, discard))
- ca->mi.discard = opt_get(c->opts, discard);
+ ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
+ ca->mi.bucket_size / btree_sectors(c));
- if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
- 0, GFP_KERNEL) ||
- percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
+#ifndef CONFIG_BCACHEFS_DEBUG
+ if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL))
+ goto err;
+#else
+ atomic_long_set(&ca->ref, 1);
+#endif
+
+ bch2_dev_allocator_background_init(ca);
+
+ if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
+ !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) ||
bch2_dev_buckets_alloc(c, ca) ||
- bioset_init(&ca->replica_set, 4,
- offsetof(struct bch_write_bio, bio), 0) ||
!(ca->io_done = alloc_percpu(*ca->io_done)))
goto err;
@@ -960,29 +1366,22 @@ static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
{
- struct bch_member *member =
- bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx;
+ struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
struct bch_dev *ca = NULL;
- int ret = 0;
-
- pr_verbose_init(c->opts, "");
if (bch2_fs_init_fault("dev_alloc"))
goto err;
- ca = __bch2_dev_alloc(c, member);
+ ca = __bch2_dev_alloc(c, &member);
if (!ca)
goto err;
+ ca->fs = c;
+
bch2_dev_attach(c, ca, dev_idx);
-out:
- pr_verbose_init(c->opts, "ret %i", ret);
- return ret;
+ return 0;
err:
- if (ca)
- bch2_dev_free(ca);
- ret = -ENOMEM;
- goto out;
+ return -BCH_ERR_ENOMEM_dev_alloc;
}
static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
@@ -992,40 +1391,26 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
if (bch2_dev_is_online(ca)) {
bch_err(ca, "already have device online in slot %u",
sb->sb->dev_idx);
- return -EINVAL;
+ return -BCH_ERR_device_already_online;
}
if (get_capacity(sb->bdev->bd_disk) <
ca->mi.bucket_size * ca->mi.nbuckets) {
bch_err(ca, "cannot online: device too small");
- return -EINVAL;
+ return -BCH_ERR_device_size_too_small;
}
BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
- if (get_capacity(sb->bdev->bd_disk) <
- ca->mi.bucket_size * ca->mi.nbuckets) {
- bch_err(ca, "device too small");
- return -EINVAL;
- }
-
ret = bch2_dev_journal_init(ca, sb->sb);
if (ret)
return ret;
/* Commit: */
ca->disk_sb = *sb;
- if (sb->mode & FMODE_EXCL)
- ca->disk_sb.bdev->bd_holder = ca;
memset(sb, 0, sizeof(*sb));
- if (ca->fs)
- mutex_lock(&ca->fs->sb_lock);
-
- bch2_mark_dev_superblock(ca->fs, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
-
- if (ca->fs)
- mutex_unlock(&ca->fs->sb_lock);
+ ca->dev = ca->disk_sb.bdev->bd_dev;
percpu_ref_reinit(&ca->io_ref);
@@ -1043,10 +1428,9 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
le64_to_cpu(c->disk_sb.sb->seq))
bch2_sb_to_fs(c, sb->sb);
- BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
- !c->devs[sb->sb->dev_idx]);
+ BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx));
- ca = bch_dev_locked(c, sb->sb->dev_idx);
+ ca = bch2_dev_locked(c, sb->sb->dev_idx);
ret = __bch2_dev_attach_bdev(ca, sb);
if (ret)
@@ -1054,9 +1438,14 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
bch2_dev_sysfs_online(c, ca);
+ struct printbuf name = PRINTBUF;
+ prt_bdevname(&name, ca->disk_sb.bdev);
+
if (c->sb.nr_devices == 1)
- bdevname(ca->disk_sb.bdev, c->name);
- bdevname(ca->disk_sb.bdev, ca->name);
+ strscpy(c->name, name.buf, sizeof(c->name));
+ strscpy(ca->name, name.buf, sizeof(ca->name));
+
+ printbuf_exit(&name);
rebalance_wakeup(c);
return 0;
@@ -1077,45 +1466,41 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
enum bch_member_state new_state, int flags)
{
struct bch_devs_mask new_online_devs;
- struct replicas_status s;
- struct bch_dev *ca2;
- int i, nr_rw = 0, required;
+ int nr_rw = 0, required;
lockdep_assert_held(&c->state_lock);
switch (new_state) {
- case BCH_MEMBER_STATE_RW:
+ case BCH_MEMBER_STATE_rw:
return true;
- case BCH_MEMBER_STATE_RO:
- if (ca->mi.state != BCH_MEMBER_STATE_RW)
+ case BCH_MEMBER_STATE_ro:
+ if (ca->mi.state != BCH_MEMBER_STATE_rw)
return true;
/* do we have enough devices to write to? */
- for_each_member_device(ca2, c, i)
+ for_each_member_device(c, ca2)
if (ca2 != ca)
- nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW;
+ nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw;
required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
? c->opts.metadata_replicas
- : c->opts.metadata_replicas_required,
+ : metadata_replicas_required(c),
!(flags & BCH_FORCE_IF_DATA_DEGRADED)
? c->opts.data_replicas
- : c->opts.data_replicas_required);
+ : data_replicas_required(c));
return nr_rw >= required;
- case BCH_MEMBER_STATE_FAILED:
- case BCH_MEMBER_STATE_SPARE:
- if (ca->mi.state != BCH_MEMBER_STATE_RW &&
- ca->mi.state != BCH_MEMBER_STATE_RO)
+ case BCH_MEMBER_STATE_failed:
+ case BCH_MEMBER_STATE_spare:
+ if (ca->mi.state != BCH_MEMBER_STATE_rw &&
+ ca->mi.state != BCH_MEMBER_STATE_ro)
return true;
/* do we have enough devices to read from? */
new_online_devs = bch2_online_devs(c);
__clear_bit(ca->dev_idx, new_online_devs.d);
- s = __bch2_replicas_status(c, new_online_devs);
-
- return bch2_have_enough_devs(s, flags);
+ return bch2_have_enough_devs(c, new_online_devs, flags, false);
default:
BUG();
}
@@ -1123,26 +1508,28 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
static bool bch2_fs_may_start(struct bch_fs *c)
{
- struct replicas_status s;
- struct bch_sb_field_members *mi;
struct bch_dev *ca;
- unsigned i, flags = c->opts.degraded
- ? BCH_FORCE_IF_DEGRADED
- : 0;
+ unsigned i, flags = 0;
- if (!c->opts.degraded) {
+ if (c->opts.very_degraded)
+ flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
+
+ if (c->opts.degraded)
+ flags |= BCH_FORCE_IF_DEGRADED;
+
+ if (!c->opts.degraded &&
+ !c->opts.very_degraded) {
mutex_lock(&c->sb_lock);
- mi = bch2_sb_get_members(c->disk_sb.sb);
for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
- if (!bch2_dev_exists(c->disk_sb.sb, mi, i))
+ if (!bch2_member_exists(c->disk_sb.sb, i))
continue;
- ca = bch_dev_locked(c, i);
+ ca = bch2_dev_locked(c, i);
if (!bch2_dev_is_online(ca) &&
- (ca->mi.state == BCH_MEMBER_STATE_RW ||
- ca->mi.state == BCH_MEMBER_STATE_RO)) {
+ (ca->mi.state == BCH_MEMBER_STATE_rw ||
+ ca->mi.state == BCH_MEMBER_STATE_ro)) {
mutex_unlock(&c->sb_lock);
return false;
}
@@ -1150,67 +1537,55 @@ static bool bch2_fs_may_start(struct bch_fs *c)
mutex_unlock(&c->sb_lock);
}
- s = bch2_replicas_status(c);
-
- return bch2_have_enough_devs(s, flags);
+ return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true);
}
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
{
- bch2_copygc_stop(ca);
-
/*
* The allocator thread itself allocates btree nodes, so stop it first:
*/
- bch2_dev_allocator_stop(ca);
bch2_dev_allocator_remove(c, ca);
+ bch2_recalc_capacity(c);
bch2_dev_journal_stop(&c->journal, ca);
}
-static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
+static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
{
lockdep_assert_held(&c->state_lock);
- BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW);
+ BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw);
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
-
- if (bch2_dev_allocator_start(ca))
- return "error starting allocator thread";
-
- if (bch2_copygc_start(c, ca))
- return "error starting copygc thread";
-
- return NULL;
+ bch2_dev_do_discards(ca);
}
int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
enum bch_member_state new_state, int flags)
{
- struct bch_sb_field_members *mi;
+ struct bch_member *m;
int ret = 0;
if (ca->mi.state == new_state)
return 0;
if (!bch2_dev_state_allowed(c, ca, new_state, flags))
- return -EINVAL;
+ return -BCH_ERR_device_state_not_allowed;
- if (new_state != BCH_MEMBER_STATE_RW)
+ if (new_state != BCH_MEMBER_STATE_rw)
__bch2_dev_read_only(c, ca);
- bch_notice(ca, "%s", bch2_dev_state[new_state]);
+ bch_notice(ca, "%s", bch2_member_states[new_state]);
mutex_lock(&c->sb_lock);
- mi = bch2_sb_get_members(c->disk_sb.sb);
- SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state);
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ SET_BCH_MEMBER_STATE(m, new_state);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- if (new_state == BCH_MEMBER_STATE_RW &&
- __bch2_dev_read_write(c, ca))
- ret = -ENOMEM;
+ if (new_state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
rebalance_wakeup(c);
@@ -1222,9 +1597,9 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
{
int ret;
- mutex_lock(&c->state_lock);
+ down_write(&c->state_lock);
ret = __bch2_dev_set_state(c, ca, new_state, flags);
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
return ret;
}
@@ -1233,68 +1608,69 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
- struct bch_sb_field_members *mi;
+ struct bch_member *m;
unsigned dev_idx = ca->dev_idx, data;
- int ret = -EINVAL;
+ int ret;
- mutex_lock(&c->state_lock);
+ down_write(&c->state_lock);
- percpu_ref_put(&ca->ref); /* XXX */
+ /*
+ * We consume a reference to ca->ref, regardless of whether we succeed
+ * or fail:
+ */
+ bch2_dev_put(ca);
- if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
+ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
bch_err(ca, "Cannot remove without losing data");
+ ret = -BCH_ERR_device_state_not_allowed;
goto err;
}
__bch2_dev_read_only(c, ca);
- /*
- * XXX: verify that dev_idx is really not in use anymore, anywhere
- *
- * flag_data_bad() does not check btree pointers
- */
ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
- if (ret) {
- bch_err(ca, "Remove failed: error %i dropping data", ret);
+ bch_err_msg(ca, ret, "bch2_dev_data_drop()");
+ if (ret)
+ goto err;
+
+ ret = bch2_dev_remove_alloc(c, ca);
+ bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
+ if (ret)
goto err;
- }
+ /*
+ * We need to flush the entire journal to get rid of keys that reference
+ * the device being removed before removing the superblock entry
+ */
+ bch2_journal_flush_all_pins(&c->journal);
+
+ /*
+ * this is really just needed for the bch2_replicas_gc_(start|end)
+ * calls, and could be cleaned up:
+ */
ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
- if (ret) {
- bch_err(ca, "Remove failed: error %i flushing journal", ret);
+ bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()");
+ if (ret)
goto err;
- }
- data = bch2_dev_has_data(c, ca);
- if (data) {
- char data_has_str[100];
- bch2_scnprint_flag_list(data_has_str,
- sizeof(data_has_str),
- bch2_data_types,
- data);
- bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
- ret = -EBUSY;
+ ret = bch2_journal_flush(&c->journal);
+ bch_err_msg(ca, ret, "bch2_journal_flush()");
+ if (ret)
goto err;
- }
- ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC,
- POS(ca->dev_idx, 0),
- POS(ca->dev_idx + 1, 0),
- ZERO_VERSION,
- NULL, NULL, NULL);
- if (ret) {
- bch_err(ca, "Remove failed, error deleting alloc info");
+ ret = bch2_replicas_gc2(c);
+ bch_err_msg(ca, ret, "bch2_replicas_gc2()");
+ if (ret)
goto err;
- }
- /*
- * must flush all existing journal entries, they might have
- * (overwritten) keys that point to the device we're removing:
- */
- bch2_journal_flush_all_pins(&c->journal);
- ret = bch2_journal_error(&c->journal);
- if (ret) {
- bch_err(ca, "Remove failed, journal error");
+ data = bch2_dev_has_data(c, ca);
+ if (data) {
+ struct printbuf data_has = PRINTBUF;
+
+ prt_bitflags(&data_has, __bch2_data_types, data);
+ bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
+ printbuf_exit(&data_has);
+ ret = -EBUSY;
goto err;
}
@@ -1304,7 +1680,12 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
mutex_unlock(&c->sb_lock);
+#ifndef CONFIG_BCACHEFS_DEBUG
percpu_ref_kill(&ca->ref);
+#else
+ ca->dying = true;
+ bch2_dev_put(ca);
+#endif
wait_for_completion(&ca->ref_completion);
bch2_dev_free(ca);
@@ -1314,18 +1695,19 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
* this device must be gone:
*/
mutex_lock(&c->sb_lock);
- mi = bch2_sb_get_members(c->disk_sb.sb);
- memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
+ memset(&m->uuid, 0, sizeof(m->uuid));
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
return 0;
err:
- if (ca->mi.state == BCH_MEMBER_STATE_RW)
+ if (ca->mi.state == BCH_MEMBER_STATE_rw &&
+ !percpu_ref_is_zero(&ca->io_ref))
__bch2_dev_read_write(c, ca);
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
return ret;
}
@@ -1334,118 +1716,116 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
{
struct bch_opts opts = bch2_opts_empty();
struct bch_sb_handle sb;
- const char *err;
struct bch_dev *ca = NULL;
- struct bch_sb_field_members *mi;
- struct bch_member dev_mi;
- unsigned dev_idx, nr_devices, u64s;
+ struct printbuf errbuf = PRINTBUF;
+ struct printbuf label = PRINTBUF;
int ret;
ret = bch2_read_super(path, &opts, &sb);
+ bch_err_msg(c, ret, "reading super");
if (ret)
- return ret;
+ goto err;
- err = bch2_sb_validate(&sb);
- if (err)
- return -EINVAL;
+ struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
- dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx];
+ if (BCH_MEMBER_GROUP(&dev_mi)) {
+ bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
+ if (label.allocation_failure) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ }
- err = bch2_dev_may_add(sb.sb, c);
- if (err)
- return -EINVAL;
+ ret = bch2_dev_may_add(sb.sb, c);
+ if (ret)
+ goto err;
ca = __bch2_dev_alloc(c, &dev_mi);
if (!ca) {
- bch2_free_super(&sb);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto err;
}
ret = __bch2_dev_attach_bdev(ca, &sb);
- if (ret) {
- bch2_dev_free(ca);
- return ret;
- }
-
- err = "journal alloc failed";
- ret = bch2_dev_journal_alloc(ca);
if (ret)
goto err;
- mutex_lock(&c->state_lock);
+ down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
- err = "insufficient space in new superblock";
ret = bch2_sb_from_fs(c, ca);
+ bch_err_msg(c, ret, "setting up new superblock");
if (ret)
goto err_unlock;
- mi = bch2_sb_get_members(ca->disk_sb.sb);
-
- if (!bch2_sb_resize_members(&ca->disk_sb,
- le32_to_cpu(mi->field.u64s) +
- sizeof(dev_mi) / sizeof(u64))) {
- ret = -ENOSPC;
+ if (dynamic_fault("bcachefs:add:no_slot"))
goto err_unlock;
- }
- if (dynamic_fault("bcachefs:add:no_slot"))
- goto no_slot;
-
- mi = bch2_sb_get_members(c->disk_sb.sb);
- for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
- if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx))
- goto have_slot;
-no_slot:
- err = "no slots available in superblock";
- ret = -ENOSPC;
- goto err_unlock;
-
-have_slot:
- nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
- u64s = (sizeof(struct bch_sb_field_members) +
- sizeof(struct bch_member) * nr_devices) / sizeof(u64);
-
- err = "no space in superblock for member info";
- ret = -ENOSPC;
-
- mi = bch2_sb_resize_members(&c->disk_sb, u64s);
- if (!mi)
+ ret = bch2_sb_member_alloc(c);
+ if (ret < 0) {
+ bch_err_msg(c, ret, "setting up new superblock");
goto err_unlock;
+ }
+ unsigned dev_idx = ret;
/* success: */
- mi->members[dev_idx] = dev_mi;
- mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_seconds());
- c->disk_sb.sb->nr_devices = nr_devices;
+ dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds());
+ *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi;
ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx);
+ if (BCH_MEMBER_GROUP(&dev_mi)) {
+ ret = __bch2_dev_group_set(c, ca, label.buf);
+ bch_err_msg(c, ret, "creating new label");
+ if (ret)
+ goto err_unlock;
+ }
+
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- if (ca->mi.state == BCH_MEMBER_STATE_RW) {
- err = __bch2_dev_read_write(c, ca);
- if (err)
- goto err_late;
- }
+ ret = bch2_dev_usage_init(ca, false);
+ if (ret)
+ goto err_late;
+
+ ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
+ bch_err_msg(ca, ret, "marking new superblock");
+ if (ret)
+ goto err_late;
+
+ ret = bch2_fs_freespace_init(c);
+ bch_err_msg(ca, ret, "initializing free space");
+ if (ret)
+ goto err_late;
+
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
+
+ ret = bch2_dev_journal_alloc(ca, false);
+ bch_err_msg(c, ret, "allocating journal");
+ if (ret)
+ goto err_late;
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
return 0;
err_unlock:
mutex_unlock(&c->sb_lock);
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
err:
if (ca)
bch2_dev_free(ca);
bch2_free_super(&sb);
- bch_err(c, "Unable to add device: %s", err);
+ printbuf_exit(&label);
+ printbuf_exit(&errbuf);
+ bch_err_fn(c, ret);
return ret;
err_late:
- bch_err(c, "Error going rw after adding device: %s", err);
- return -EINVAL;
+ up_write(&c->state_lock);
+ ca = NULL;
+ goto err;
}
/* Hot add existing device to running filesystem: */
@@ -1453,84 +1833,97 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
{
struct bch_opts opts = bch2_opts_empty();
struct bch_sb_handle sb = { NULL };
- struct bch_sb_field_members *mi;
struct bch_dev *ca;
unsigned dev_idx;
- const char *err;
int ret;
- mutex_lock(&c->state_lock);
+ down_write(&c->state_lock);
ret = bch2_read_super(path, &opts, &sb);
if (ret) {
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
return ret;
}
dev_idx = sb.sb->dev_idx;
- err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
- if (err)
+ ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);
+ bch_err_msg(c, ret, "bringing %s online", path);
+ if (ret)
+ goto err;
+
+ ret = bch2_dev_attach_bdev(c, &sb);
+ if (ret)
goto err;
- if (bch2_dev_attach_bdev(c, &sb)) {
- err = "bch2_dev_attach_bdev() error";
+ ca = bch2_dev_locked(c, dev_idx);
+
+ ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
+ bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
+ if (ret)
goto err;
+
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
+
+ if (!ca->mi.freespace_initialized) {
+ ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
+ bch_err_msg(ca, ret, "initializing free space");
+ if (ret)
+ goto err;
}
- ca = bch_dev_locked(c, dev_idx);
- if (ca->mi.state == BCH_MEMBER_STATE_RW) {
- err = __bch2_dev_read_write(c, ca);
- if (err)
+ if (!ca->journal.nr) {
+ ret = bch2_dev_journal_alloc(ca, false);
+ bch_err_msg(ca, ret, "allocating journal");
+ if (ret)
goto err;
}
mutex_lock(&c->sb_lock);
- mi = bch2_sb_get_members(c->disk_sb.sb);
-
- mi->members[ca->dev_idx].last_mount =
- cpu_to_le64(ktime_get_seconds());
-
+ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
+ cpu_to_le64(ktime_get_real_seconds());
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
return 0;
err:
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
bch2_free_super(&sb);
- bch_err(c, "error bringing %s online: %s", path, err);
- return -EINVAL;
+ return ret;
}
int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
{
- mutex_lock(&c->state_lock);
+ down_write(&c->state_lock);
if (!bch2_dev_is_online(ca)) {
bch_err(ca, "Already offline");
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
return 0;
}
- if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
+ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
bch_err(ca, "Cannot offline required disk");
- mutex_unlock(&c->state_lock);
- return -EINVAL;
+ up_write(&c->state_lock);
+ return -BCH_ERR_device_state_not_allowed;
}
__bch2_dev_offline(c, ca);
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
return 0;
}
int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
- struct bch_member *mi;
+ struct bch_member *m;
+ u64 old_nbuckets;
int ret = 0;
- mutex_lock(&c->state_lock);
+ down_write(&c->state_lock);
+ old_nbuckets = ca->mi.nbuckets;
if (nbuckets < ca->mi.nbuckets) {
bch_err(ca, "Cannot shrink yet");
@@ -1538,217 +1931,176 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
goto err;
}
+ if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
+ bch_err(ca, "New device size too big (%llu greater than max %u)",
+ nbuckets, BCH_MEMBER_NBUCKETS_MAX);
+ ret = -BCH_ERR_device_size_too_big;
+ goto err;
+ }
+
if (bch2_dev_is_online(ca) &&
get_capacity(ca->disk_sb.bdev->bd_disk) <
ca->mi.bucket_size * nbuckets) {
bch_err(ca, "New size larger than device");
- ret = -EINVAL;
+ ret = -BCH_ERR_device_size_too_small;
goto err;
}
ret = bch2_dev_buckets_resize(c, ca, nbuckets);
- if (ret) {
- bch_err(ca, "Resize error: %i", ret);
+ bch_err_msg(ca, ret, "resizing buckets");
+ if (ret)
+ goto err;
+
+ ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
+ if (ret)
goto err;
- }
mutex_lock(&c->sb_lock);
- mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
- mi->nbuckets = cpu_to_le64(nbuckets);
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ m->nbuckets = cpu_to_le64(nbuckets);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
+ if (ca->mi.freespace_initialized) {
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_dev_data_type,
+ .dev_data_type.dev = ca->dev_idx,
+ .dev_data_type.data_type = BCH_DATA_free,
+ };
+ u64 v[3] = { nbuckets - old_nbuckets, 0, 0 };
+
+ ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0,
+ bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?:
+ bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
+ if (ret)
+ goto err;
+ }
+
bch2_recalc_capacity(c);
err:
- mutex_unlock(&c->state_lock);
+ up_write(&c->state_lock);
return ret;
}
/* return with ref on ca->ref: */
-struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path)
+struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
{
+ if (!strncmp(name, "/dev/", strlen("/dev/")))
+ name += strlen("/dev/");
- struct block_device *bdev = lookup_bdev(path);
- struct bch_dev *ca;
- unsigned i;
-
- if (IS_ERR(bdev))
- return ERR_CAST(bdev);
-
- for_each_member_device(ca, c, i)
- if (ca->disk_sb.bdev == bdev)
- goto found;
-
- ca = ERR_PTR(-ENOENT);
-found:
- bdput(bdev);
- return ca;
+ for_each_member_device(c, ca)
+ if (!strcmp(name, ca->name))
+ return ca;
+ return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
}
/* Filesystem open: */
+static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
+{
+ return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?:
+ cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time));
+}
+
struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
struct bch_opts opts)
{
- struct bch_sb_handle *sb = NULL;
+ DARRAY(struct bch_sb_handle) sbs = { 0 };
struct bch_fs *c = NULL;
- unsigned i, best_sb = 0;
- const char *err;
- int ret = -ENOMEM;
+ struct bch_sb_handle *best = NULL;
+ struct printbuf errbuf = PRINTBUF;
+ int ret = 0;
- pr_verbose_init(opts, "");
+ if (!try_module_get(THIS_MODULE))
+ return ERR_PTR(-ENODEV);
if (!nr_devices) {
- c = ERR_PTR(-EINVAL);
- goto out2;
- }
-
- if (!try_module_get(THIS_MODULE)) {
- c = ERR_PTR(-ENODEV);
- goto out2;
+ ret = -EINVAL;
+ goto err;
}
- sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
- if (!sb)
+ ret = darray_make_room(&sbs, nr_devices);
+ if (ret)
goto err;
- for (i = 0; i < nr_devices; i++) {
- ret = bch2_read_super(devices[i], &opts, &sb[i]);
+ for (unsigned i = 0; i < nr_devices; i++) {
+ struct bch_sb_handle sb = { NULL };
+
+ ret = bch2_read_super(devices[i], &opts, &sb);
if (ret)
goto err;
- err = bch2_sb_validate(&sb[i]);
- if (err)
- goto err_print;
+ BUG_ON(darray_push(&sbs, sb));
}
- for (i = 1; i < nr_devices; i++)
- if (le64_to_cpu(sb[i].sb->seq) >
- le64_to_cpu(sb[best_sb].sb->seq))
- best_sb = i;
+ if (opts.nochanges && !opts.read_only) {
+ ret = -BCH_ERR_erofs_nochanges;
+ goto err_print;
+ }
- for (i = 0; i < nr_devices; i++) {
- err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb);
- if (err)
+ darray_for_each(sbs, sb)
+ if (!best || sb_cmp(sb->sb, best->sb) > 0)
+ best = sb;
+
+ darray_for_each_reverse(sbs, sb) {
+ ret = bch2_dev_in_fs(best, sb, &opts);
+
+ if (ret == -BCH_ERR_device_has_been_removed ||
+ ret == -BCH_ERR_device_splitbrain) {
+ bch2_free_super(sb);
+ darray_remove_item(&sbs, sb);
+ best -= best > sb;
+ ret = 0;
+ continue;
+ }
+
+ if (ret)
goto err_print;
}
- ret = -ENOMEM;
- c = bch2_fs_alloc(sb[best_sb].sb, opts);
- if (!c)
+ c = bch2_fs_alloc(best->sb, opts);
+ ret = PTR_ERR_OR_ZERO(c);
+ if (ret)
goto err;
- err = "bch2_dev_online() error";
- mutex_lock(&c->state_lock);
- for (i = 0; i < nr_devices; i++)
- if (bch2_dev_attach_bdev(c, &sb[i])) {
- mutex_unlock(&c->state_lock);
- goto err_print;
+ down_write(&c->state_lock);
+ darray_for_each(sbs, sb) {
+ ret = bch2_dev_attach_bdev(c, sb);
+ if (ret) {
+ up_write(&c->state_lock);
+ goto err;
}
- mutex_unlock(&c->state_lock);
+ }
+ up_write(&c->state_lock);
- err = "insufficient devices";
- if (!bch2_fs_may_start(c))
+ if (!bch2_fs_may_start(c)) {
+ ret = -BCH_ERR_insufficient_devices_to_start;
goto err_print;
+ }
if (!c->opts.nostart) {
- err = bch2_fs_start(c);
- if (err)
- goto err_print;
+ ret = bch2_fs_start(c);
+ if (ret)
+ goto err;
}
out:
- kfree(sb);
+ darray_for_each(sbs, sb)
+ bch2_free_super(sb);
+ darray_exit(&sbs);
+ printbuf_exit(&errbuf);
module_put(THIS_MODULE);
-out2:
- pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c));
return c;
err_print:
pr_err("bch_fs_open err opening %s: %s",
- devices[0], err);
- ret = -EINVAL;
+ devices[0], bch2_err_str(ret));
err:
- if (c)
+ if (!IS_ERR_OR_NULL(c))
bch2_fs_stop(c);
- for (i = 0; i < nr_devices; i++)
- bch2_free_super(&sb[i]);
c = ERR_PTR(ret);
goto out;
}
-static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
- struct bch_opts opts)
-{
- const char *err;
- struct bch_fs *c;
- bool allocated_fs = false;
-
- err = bch2_sb_validate(sb);
- if (err)
- return err;
-
- mutex_lock(&bch_fs_list_lock);
- c = __bch2_uuid_to_fs(sb->sb->uuid);
- if (c) {
- closure_get(&c->cl);
-
- err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb);
- if (err)
- goto err;
- } else {
- c = bch2_fs_alloc(sb->sb, opts);
- err = "cannot allocate memory";
- if (!c)
- goto err;
-
- allocated_fs = true;
- }
-
- err = "bch2_dev_online() error";
-
- mutex_lock(&c->sb_lock);
- if (bch2_dev_attach_bdev(c, sb)) {
- mutex_unlock(&c->sb_lock);
- goto err;
- }
- mutex_unlock(&c->sb_lock);
-
- if (!c->opts.nostart && bch2_fs_may_start(c)) {
- err = bch2_fs_start(c);
- if (err)
- goto err;
- }
-
- closure_put(&c->cl);
- mutex_unlock(&bch_fs_list_lock);
-
- return NULL;
-err:
- mutex_unlock(&bch_fs_list_lock);
-
- if (allocated_fs)
- bch2_fs_stop(c);
- else if (c)
- closure_put(&c->cl);
-
- return err;
-}
-
-const char *bch2_fs_open_incremental(const char *path)
-{
- struct bch_sb_handle sb;
- struct bch_opts opts = bch2_opts_empty();
- const char *err;
-
- if (bch2_read_super(path, &opts, &sb))
- return "error reading superblock";
-
- err = __bch2_fs_open_incremental(&sb, opts);
- bch2_free_super(&sb);
-
- return err;
-}
-
/* Global interfaces/init */
static void bcachefs_exit(void)
@@ -1756,6 +2108,7 @@ static void bcachefs_exit(void)
bch2_debug_exit();
bch2_vfs_exit();
bch2_chardev_exit();
+ bch2_btree_key_cache_exit();
if (bcachefs_kset)
kset_unregister(bcachefs_kset);
}
@@ -1763,9 +2116,9 @@ static void bcachefs_exit(void)
static int __init bcachefs_init(void)
{
bch2_bkey_pack_test();
- bch2_inode_pack_test();
if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
+ bch2_btree_key_cache_init() ||
bch2_chardev_init() ||
bch2_vfs_init() ||
bch2_debug_init())
@@ -1784,5 +2137,9 @@ err:
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
+__maybe_unused
+static unsigned bch2_metadata_version = bcachefs_metadata_version_current;
+module_param_named(version, bch2_metadata_version, uint, 0400);
+
module_exit(bcachefs_exit);
module_init(bcachefs_init);
diff --git a/libbcachefs/super.h b/libbcachefs/super.h
index 231bc529..fa6d5221 100644
--- a/libbcachefs/super.h
+++ b/libbcachefs/super.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_SUPER_H
#define _BCACHEFS_SUPER_H
@@ -7,198 +8,10 @@
#include <linux/math64.h>
-static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
-{
- return div_u64(s, ca->mi.bucket_size);
-}
+extern const char * const bch2_fs_flag_strs[];
-static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
-{
- return ((sector_t) b) * ca->mi.bucket_size;
-}
-
-static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
-{
- u32 remainder;
-
- div_u64_rem(s, ca->mi.bucket_size, &remainder);
- return remainder;
-}
-
-static inline bool bch2_dev_is_online(struct bch_dev *ca)
-{
- return !percpu_ref_is_zero(&ca->io_ref);
-}
-
-static inline bool bch2_dev_is_readable(struct bch_dev *ca)
-{
- return bch2_dev_is_online(ca) &&
- ca->mi.state != BCH_MEMBER_STATE_FAILED;
-}
-
-static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
-{
- if (!percpu_ref_tryget(&ca->io_ref))
- return false;
-
- if (ca->mi.state == BCH_MEMBER_STATE_RW ||
- (ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ))
- return true;
-
- percpu_ref_put(&ca->io_ref);
- return false;
-}
-
-static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
-{
- return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
-}
-
-static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
- unsigned dev)
-{
- unsigned i;
-
- for (i = 0; i < devs.nr; i++)
- if (devs.devs[i] == dev)
- return true;
-
- return false;
-}
-
-static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
- unsigned dev)
-{
- unsigned i;
-
- for (i = 0; i < devs->nr; i++)
- if (devs->devs[i] == dev) {
- array_remove_item(devs->devs, devs->nr, i);
- return;
- }
-}
-
-static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
- unsigned dev)
-{
- BUG_ON(bch2_dev_list_has_dev(*devs, dev));
- BUG_ON(devs->nr >= BCH_REPLICAS_MAX);
- devs->devs[devs->nr++] = dev;
-}
-
-static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
-{
- return (struct bch_devs_list) { .nr = 1, .devs[0] = dev };
-}
-
-static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
- const struct bch_devs_mask *mask)
-{
- struct bch_dev *ca = NULL;
-
- while ((*iter = mask
- ? find_next_bit(mask->d, c->sb.nr_devices, *iter)
- : *iter) < c->sb.nr_devices &&
- !(ca = rcu_dereference_check(c->devs[*iter],
- lockdep_is_held(&c->state_lock))))
- (*iter)++;
-
- return ca;
-}
-
-#define __for_each_member_device(ca, c, iter, mask) \
- for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++)
-
-#define for_each_member_device_rcu(ca, c, iter, mask) \
- __for_each_member_device(ca, c, iter, mask)
-
-static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter)
-{
- struct bch_dev *ca;
-
- rcu_read_lock();
- if ((ca = __bch2_next_dev(c, iter, NULL)))
- percpu_ref_get(&ca->ref);
- rcu_read_unlock();
-
- return ca;
-}
-
-/*
- * If you break early, you must drop your ref on the current device
- */
-#define for_each_member_device(ca, c, iter) \
- for ((iter) = 0; \
- (ca = bch2_get_next_dev(c, &(iter))); \
- percpu_ref_put(&ca->ref), (iter)++)
-
-static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
- unsigned *iter,
- int state_mask)
-{
- struct bch_dev *ca;
-
- rcu_read_lock();
- while ((ca = __bch2_next_dev(c, iter, NULL)) &&
- (!((1 << ca->mi.state) & state_mask) ||
- !percpu_ref_tryget(&ca->io_ref)))
- (*iter)++;
- rcu_read_unlock();
-
- return ca;
-}
-
-#define __for_each_online_member(ca, c, iter, state_mask) \
- for ((iter) = 0; \
- (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \
- percpu_ref_put(&ca->io_ref), (iter)++)
-
-#define for_each_online_member(ca, c, iter) \
- __for_each_online_member(ca, c, iter, ~0)
-
-#define for_each_rw_member(ca, c, iter) \
- __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_RW)
-
-#define for_each_readable_member(ca, c, iter) \
- __for_each_online_member(ca, c, iter, \
- (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO))
-
-/*
- * If a key exists that references a device, the device won't be going away and
- * we can omit rcu_read_lock():
- */
-static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
-{
- EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
-
- return rcu_dereference_check(c->devs[idx], 1);
-}
-
-static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
-{
- EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
-
- return rcu_dereference_protected(c->devs[idx],
- lockdep_is_held(&c->sb_lock) ||
- lockdep_is_held(&c->state_lock));
-}
-
-/* XXX kill, move to struct bch_fs */
-static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
-{
- struct bch_devs_mask devs;
- struct bch_dev *ca;
- unsigned i;
-
- memset(&devs, 0, sizeof(devs));
- for_each_online_member(ca, c, i)
- __set_bit(ca->dev_idx, devs.d);
- return devs;
-}
-
-struct bch_fs *bch2_bdev_to_fs(struct block_device *);
-struct bch_fs *bch2_uuid_to_fs(uuid_le);
-int bch2_congested(void *, int);
+struct bch_fs *bch2_dev_to_fs(dev_t);
+struct bch_fs *bch2_uuid_to_fs(__uuid_t);
bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
enum bch_member_state, int);
@@ -217,12 +30,15 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
bool bch2_fs_emergency_read_only(struct bch_fs *);
void bch2_fs_read_only(struct bch_fs *);
-const char *bch2_fs_read_write(struct bch_fs *);
+int bch2_fs_read_write(struct bch_fs *);
+int bch2_fs_read_write_early(struct bch_fs *);
+
+void __bch2_fs_stop(struct bch_fs *);
+void bch2_fs_free(struct bch_fs *);
void bch2_fs_stop(struct bch_fs *);
-const char *bch2_fs_start(struct bch_fs *);
+int bch2_fs_start(struct bch_fs *);
struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);
-const char *bch2_fs_open_incremental(const char *path);
#endif /* _BCACHEFS_SUPER_H */
diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h
index ab83ade9..368a63d9 100644
--- a/libbcachefs/super_types.h
+++ b/libbcachefs/super_types.h
@@ -1,15 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_SUPER_TYPES_H
#define _BCACHEFS_SUPER_TYPES_H
struct bch_sb_handle {
struct bch_sb *sb;
+ struct file *s_bdev_file;
struct block_device *bdev;
+ char *sb_name;
struct bio *bio;
- unsigned page_order;
- fmode_t mode;
+ void *holder;
+ size_t buffer_size;
+ blk_mode_t mode;
unsigned have_layout:1;
unsigned have_bio:1;
unsigned fs_sb:1;
+ u64 seq;
};
struct bch_devs_mask {
@@ -18,44 +23,7 @@ struct bch_devs_mask {
struct bch_devs_list {
u8 nr;
- u8 devs[BCH_REPLICAS_MAX + 1];
-};
-
-struct bch_member_cpu {
- u64 nbuckets; /* device size */
- u16 first_bucket; /* index of first bucket used */
- u16 bucket_size; /* sectors */
- u16 group;
- u8 state;
- u8 replacement;
- u8 discard;
- u8 data_allowed;
- u8 durability;
- u8 valid;
-};
-
-struct bch_replicas_cpu_entry {
- u8 data_type;
- u8 devs[BCH_SB_MEMBERS_MAX / 8];
-};
-
-struct bch_replicas_cpu {
- struct rcu_head rcu;
- unsigned nr;
- unsigned entry_size;
- struct bch_replicas_cpu_entry entries[];
-};
-
-struct bch_disk_group_cpu {
- bool deleted;
- u16 parent;
- struct bch_devs_mask devs;
-};
-
-struct bch_disk_groups_cpu {
- struct rcu_head rcu;
- unsigned nr;
- struct bch_disk_group_cpu entries[];
+ u8 data[BCH_BKEY_PTRS_MAX];
};
#endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 4987ee76..97733c76 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* bcache sysfs interfaces
*
@@ -8,21 +9,29 @@
#ifndef NO_BCACHEFS_SYSFS
#include "bcachefs.h"
-#include "alloc.h"
-#include "compress.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
#include "sysfs.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
+#include "btree_key_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
+#include "clock.h"
+#include "compress.h"
+#include "disk_accounting.h"
#include "disk_groups.h"
+#include "ec.h"
#include "inode.h"
#include "journal.h"
+#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
+#include "movinggc.h"
+#include "nocow_locking.h"
#include "opts.h"
#include "rebalance.h"
#include "replicas.h"
@@ -36,52 +45,77 @@
#include "util.h"
#define SYSFS_OPS(type) \
-struct sysfs_ops type ## _sysfs_ops = { \
+const struct sysfs_ops type ## _sysfs_ops = { \
.show = type ## _show, \
.store = type ## _store \
}
#define SHOW(fn) \
+static ssize_t fn ## _to_text(struct printbuf *, \
+ struct kobject *, struct attribute *); \
+ \
static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
char *buf) \
+{ \
+ struct printbuf out = PRINTBUF; \
+ ssize_t ret = fn ## _to_text(&out, kobj, attr); \
+ \
+ if (out.pos && out.buf[out.pos - 1] != '\n') \
+ prt_newline(&out); \
+ \
+ if (!ret && out.allocation_failure) \
+ ret = -ENOMEM; \
+ \
+ if (!ret) { \
+ ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \
+ memcpy(buf, out.buf, ret); \
+ } \
+ printbuf_exit(&out); \
+ return bch2_err_class(ret); \
+} \
+ \
+static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\
+ struct attribute *attr)
#define STORE(fn) \
+static ssize_t fn ## _store_inner(struct kobject *, struct attribute *,\
+ const char *, size_t); \
+ \
static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
const char *buf, size_t size) \
+{ \
+ return bch2_err_class(fn##_store_inner(kobj, attr, buf, size)); \
+} \
+ \
+static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\
+ const char *buf, size_t size)
#define __sysfs_attribute(_name, _mode) \
static struct attribute sysfs_##_name = \
{ .name = #_name, .mode = _mode }
-#define write_attribute(n) __sysfs_attribute(n, S_IWUSR)
-#define read_attribute(n) __sysfs_attribute(n, S_IRUGO)
-#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR)
+#define write_attribute(n) __sysfs_attribute(n, 0200)
+#define read_attribute(n) __sysfs_attribute(n, 0444)
+#define rw_attribute(n) __sysfs_attribute(n, 0644)
#define sysfs_printf(file, fmt, ...) \
do { \
if (attr == &sysfs_ ## file) \
- return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\
+ prt_printf(out, fmt "\n", __VA_ARGS__); \
} while (0)
#define sysfs_print(file, var) \
do { \
if (attr == &sysfs_ ## file) \
- return snprint(buf, PAGE_SIZE, var); \
+ snprint(out, var); \
} while (0)
#define sysfs_hprint(file, val) \
do { \
- if (attr == &sysfs_ ## file) { \
- ssize_t ret = bch2_hprint(buf, val); \
- strcat(buf, "\n"); \
- return ret + 1; \
- } \
+ if (attr == &sysfs_ ## file) \
+ prt_human_readable_s64(out, val); \
} while (0)
-#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var))
-#define var_print(_var) sysfs_print(_var, var(_var))
-#define var_hprint(_var) sysfs_hprint(_var, var(_var))
-
#define sysfs_strtoul(file, var) \
do { \
if (attr == &sysfs_ ## file) \
@@ -104,45 +138,26 @@ do { \
_v; \
})
-#define strtoul_restrict_or_return(cp, min, max) \
-({ \
- unsigned long __v = 0; \
- int _r = strtoul_safe_restrict(cp, __v, min, max); \
- if (_r) \
- return _r; \
- __v; \
-})
-
-#define strtoi_h_or_return(cp) \
-({ \
- u64 _v; \
- int _r = strtoi_h(cp, &_v); \
- if (_r) \
- return _r; \
- _v; \
-})
-
-#define sysfs_hatoi(file, var) \
-do { \
- if (attr == &sysfs_ ## file) \
- return strtoi_h(buf, &var) ?: (ssize_t) size; \
-} while (0)
-
-write_attribute(trigger_journal_flush);
-write_attribute(trigger_btree_coalesce);
write_attribute(trigger_gc);
-write_attribute(prune_cache);
-rw_attribute(btree_gc_periodic);
+write_attribute(trigger_discards);
+write_attribute(trigger_invalidates);
+write_attribute(trigger_journal_flush);
+write_attribute(trigger_journal_writes);
+write_attribute(trigger_btree_cache_shrink);
+write_attribute(trigger_btree_key_cache_shrink);
+write_attribute(trigger_freelist_wakeup);
+read_attribute(gc_gens_pos);
read_attribute(uuid);
read_attribute(minor);
+read_attribute(flags);
read_attribute(bucket_size);
-read_attribute(block_size);
-read_attribute(btree_node_size);
read_attribute(first_bucket);
read_attribute(nbuckets);
-read_attribute(durability);
-read_attribute(iodone);
+rw_attribute(durability);
+read_attribute(io_done);
+read_attribute(io_errors);
+write_attribute(io_errors_reset);
read_attribute(io_latency_read);
read_attribute(io_latency_write);
@@ -150,171 +165,155 @@ read_attribute(io_latency_stats_read);
read_attribute(io_latency_stats_write);
read_attribute(congested);
-read_attribute(bucket_quantiles_last_read);
-read_attribute(bucket_quantiles_last_write);
-read_attribute(bucket_quantiles_fragmentation);
-read_attribute(bucket_quantiles_oldest_gen);
+read_attribute(btree_write_stats);
-read_attribute(reserve_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
read_attribute(journal_debug);
-read_attribute(journal_pins);
-read_attribute(btree_updates);
-read_attribute(dirty_btree_nodes);
+read_attribute(btree_cache);
+read_attribute(btree_key_cache);
+read_attribute(btree_reserve_cache);
+read_attribute(stripes_heap);
+read_attribute(open_buckets);
+read_attribute(open_buckets_partial);
+read_attribute(write_points);
+read_attribute(nocow_lock_table);
+
+#ifdef BCH_WRITE_REF_DEBUG
+read_attribute(write_refs);
+
+static const char * const bch2_write_refs[] = {
+#define x(n) #n,
+ BCH_WRITE_REFS()
+#undef x
+ NULL
+};
+
+static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ bch2_printbuf_tabstop_push(out, 24);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++)
+ prt_printf(out, "%s\t%li\n", bch2_write_refs[i], atomic_long_read(&c->writes[i]));
+}
+#endif
read_attribute(internal_uuid);
+read_attribute(disk_groups);
read_attribute(has_data);
read_attribute(alloc_debug);
-write_attribute(wake_allocator);
-
-read_attribute(read_realloc_races);
-read_attribute(extent_migrate_done);
-read_attribute(extent_migrate_raced);
+read_attribute(accounting);
+read_attribute(usage_base);
-rw_attribute(journal_write_delay_ms);
-rw_attribute(journal_reclaim_delay_ms);
+#define x(t, n, ...) read_attribute(t);
+BCH_PERSISTENT_COUNTERS()
+#undef x
rw_attribute(discard);
-rw_attribute(cache_replacement_policy);
+read_attribute(state);
rw_attribute(label);
-rw_attribute(copy_gc_enabled);
-sysfs_pd_controller_attribute(copy_gc);
+read_attribute(copy_gc_wait);
-rw_attribute(rebalance_enabled);
sysfs_pd_controller_attribute(rebalance);
-read_attribute(rebalance_work);
-rw_attribute(promote_whole_extents);
+read_attribute(rebalance_status);
-rw_attribute(pd_controllers_update_seconds);
+read_attribute(new_stripes);
-read_attribute(meta_replicas_have);
-read_attribute(data_replicas_have);
+read_attribute(io_timers_read);
+read_attribute(io_timers_write);
+
+read_attribute(moving_ctxts);
#ifdef CONFIG_BCACHEFS_TESTS
write_attribute(perf_test);
#endif /* CONFIG_BCACHEFS_TESTS */
-#define BCH_DEBUG_PARAM(name, description) \
- rw_attribute(name);
-
- BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
#define x(_name) \
static struct attribute sysfs_time_stat_##_name = \
- { .name = #_name, .mode = S_IRUGO };
+ { .name = #_name, .mode = 0644 };
BCH_TIME_STATS()
#undef x
-static struct attribute sysfs_state_rw = {
- .name = "state",
- .mode = S_IRUGO
-};
-
static size_t bch2_btree_cache_size(struct bch_fs *c)
{
+ struct btree_cache *bc = &c->btree_cache;
size_t ret = 0;
struct btree *b;
- mutex_lock(&c->btree_cache.lock);
- list_for_each_entry(b, &c->btree_cache.live, list)
- ret += btree_bytes(c);
-
- mutex_unlock(&c->btree_cache.lock);
+ mutex_lock(&bc->lock);
+ list_for_each_entry(b, &bc->live[0].list, list)
+ ret += btree_buf_bytes(b);
+ list_for_each_entry(b, &bc->live[1].list, list)
+ ret += btree_buf_bytes(b);
+ list_for_each_entry(b, &bc->freeable, list)
+ ret += btree_buf_bytes(b);
+ mutex_unlock(&bc->lock);
return ret;
}
-static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
+static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
{
- struct bch_fs_usage stats = bch2_fs_usage_read(c);
-
- return scnprintf(buf, PAGE_SIZE,
- "capacity:\t\t%llu\n"
- "1 replicas:\n"
- "\tmeta:\t\t%llu\n"
- "\tdirty:\t\t%llu\n"
- "\treserved:\t%llu\n"
- "2 replicas:\n"
- "\tmeta:\t\t%llu\n"
- "\tdirty:\t\t%llu\n"
- "\treserved:\t%llu\n"
- "3 replicas:\n"
- "\tmeta:\t\t%llu\n"
- "\tdirty:\t\t%llu\n"
- "\treserved:\t%llu\n"
- "4 replicas:\n"
- "\tmeta:\t\t%llu\n"
- "\tdirty:\t\t%llu\n"
- "\treserved:\t%llu\n"
- "online reserved:\t%llu\n",
- c->capacity,
- stats.s[0].data[S_META],
- stats.s[0].data[S_DIRTY],
- stats.s[0].persistent_reserved,
- stats.s[1].data[S_META],
- stats.s[1].data[S_DIRTY],
- stats.s[1].persistent_reserved,
- stats.s[2].data[S_META],
- stats.s[2].data[S_DIRTY],
- stats.s[2].persistent_reserved,
- stats.s[3].data[S_META],
- stats.s[3].data[S_DIRTY],
- stats.s[3].persistent_reserved,
- stats.online_reserved);
+ prt_str(out, "type");
+ printbuf_tabstop_push(out, 12);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 24);
+ prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n");
+
+ for (unsigned i = 1; i < BCH_COMPRESSION_TYPE_NR; i++) {
+ struct disk_accounting_pos a = {
+ .type = BCH_DISK_ACCOUNTING_compression,
+ .compression.type = i,
+ };
+ struct bpos p = disk_accounting_pos_to_bpos(&a);
+ u64 v[3];
+ bch2_accounting_mem_read(c, p, v, ARRAY_SIZE(v));
+
+ u64 nr_extents = v[0];
+ u64 sectors_uncompressed = v[1];
+ u64 sectors_compressed = v[2];
+
+ bch2_prt_compression_type(out, i);
+ prt_tab(out);
+
+ prt_human_readable_u64(out, sectors_compressed << 9);
+ prt_tab_rjust(out);
+
+ prt_human_readable_u64(out, sectors_uncompressed << 9);
+ prt_tab_rjust(out);
+
+ prt_human_readable_u64(out, nr_extents
+ ? div64_u64(sectors_uncompressed << 9, nr_extents)
+ : 0);
+ prt_tab_rjust(out);
+ prt_newline(out);
+ }
+
+ return 0;
}
-static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
+static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0,
- nr_compressed_extents = 0,
- compressed_sectors_compressed = 0,
- compressed_sectors_uncompressed = 0;
-
- if (!bch2_fs_running(c))
- return -EPERM;
+ bch2_btree_id_to_text(out, c->gc_gens_btree);
+ prt_printf(out, ": ");
+ bch2_bpos_to_text(out, c->gc_gens_pos);
+ prt_printf(out, "\n");
+}
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0, k)
- if (k.k->type == BCH_EXTENT) {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const struct bch_extent_ptr *ptr;
- struct bch_extent_crc_unpacked crc;
-
- extent_for_each_ptr_crc(e, ptr, crc) {
- if (crc.compression_type == BCH_COMPRESSION_NONE) {
- nr_uncompressed_extents++;
- uncompressed_sectors += e.k->size;
- } else {
- nr_compressed_extents++;
- compressed_sectors_compressed +=
- crc.compressed_size;
- compressed_sectors_uncompressed +=
- crc.uncompressed_size;
- }
-
- /* only looking at the first ptr */
- break;
- }
- }
- bch2_btree_iter_unlock(&iter);
-
- return scnprintf(buf, PAGE_SIZE,
- "uncompressed data:\n"
- " nr extents: %llu\n"
- " size (bytes): %llu\n"
- "compressed data:\n"
- " nr extents: %llu\n"
- " compressed size (bytes): %llu\n"
- " uncompressed size (bytes): %llu\n",
- nr_uncompressed_extents,
- uncompressed_sectors << 9,
- nr_compressed_extents,
- compressed_sectors_compressed << 9,
- compressed_sectors_uncompressed << 9);
+static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct bch_fs_usage_base b = {};
+
+ acc_u64s_percpu(&b.hidden, &c->usage->hidden, sizeof(b) / sizeof(u64));
+
+ prt_printf(out, "hidden:\t\t%llu\n", b.hidden);
+ prt_printf(out, "btree:\t\t%llu\n", b.btree);
+ prt_printf(out, "data:\t\t%llu\n", b.data);
+ prt_printf(out, "cached:\t%llu\n", b.cached);
+ prt_printf(out, "reserved:\t\t%llu\n", b.reserved);
+ prt_printf(out, "nr_inodes:\t%llu\n", b.nr_inodes);
}
SHOW(bch2_fs)
@@ -324,133 +323,142 @@ SHOW(bch2_fs)
sysfs_print(minor, c->minor);
sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
- sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms);
- sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
+ if (attr == &sysfs_flags)
+ prt_bitflags(out, bch2_fs_flag_strs, c->flags);
- sysfs_print(block_size, block_bytes(c));
- sysfs_print(btree_node_size, btree_bytes(c));
sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
- sysfs_print(read_realloc_races,
- atomic_long_read(&c->read_realloc_races));
- sysfs_print(extent_migrate_done,
- atomic_long_read(&c->extent_migrate_done));
- sysfs_print(extent_migrate_raced,
- atomic_long_read(&c->extent_migrate_raced));
+ if (attr == &sysfs_btree_write_stats)
+ bch2_btree_write_stats_to_text(out, c);
- sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
+ if (attr == &sysfs_gc_gens_pos)
+ bch2_gc_gens_pos_to_text(out, c);
- sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
+ sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */
- sysfs_print(pd_controllers_update_seconds,
- c->pd_controllers_update_seconds);
+ if (attr == &sysfs_copy_gc_wait)
+ bch2_copygc_wait_to_text(out, c);
- sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled);
- sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */
+ if (attr == &sysfs_rebalance_status)
+ bch2_rebalance_status_to_text(out, c);
- if (attr == &sysfs_rebalance_work)
- return bch2_rebalance_work_show(c, buf);
+ /* Debugging: */
- sysfs_print(promote_whole_extents, c->promote_whole_extents);
+ if (attr == &sysfs_journal_debug)
+ bch2_journal_debug_to_text(out, &c->journal);
- sysfs_printf(meta_replicas_have, "%u", bch2_replicas_online(c, true));
- sysfs_printf(data_replicas_have, "%u", bch2_replicas_online(c, false));
+ if (attr == &sysfs_btree_cache)
+ bch2_btree_cache_to_text(out, &c->btree_cache);
- /* Debugging: */
+ if (attr == &sysfs_btree_key_cache)
+ bch2_btree_key_cache_to_text(out, &c->btree_key_cache);
- if (attr == &sysfs_alloc_debug)
- return show_fs_alloc_debug(c, buf);
+ if (attr == &sysfs_btree_reserve_cache)
+ bch2_btree_reserve_cache_to_text(out, c);
- if (attr == &sysfs_journal_debug)
- return bch2_journal_print_debug(&c->journal, buf);
+ if (attr == &sysfs_stripes_heap)
+ bch2_stripes_heap_to_text(out, c);
- if (attr == &sysfs_journal_pins)
- return bch2_journal_print_pins(&c->journal, buf);
+ if (attr == &sysfs_open_buckets)
+ bch2_open_buckets_to_text(out, c, NULL);
- if (attr == &sysfs_btree_updates)
- return bch2_btree_updates_print(c, buf);
+ if (attr == &sysfs_open_buckets_partial)
+ bch2_open_buckets_partial_to_text(out, c);
- if (attr == &sysfs_dirty_btree_nodes)
- return bch2_dirty_btree_nodes_print(c, buf);
+ if (attr == &sysfs_write_points)
+ bch2_write_points_to_text(out, c);
if (attr == &sysfs_compression_stats)
- return bch2_compression_stats(c, buf);
+ bch2_compression_stats_to_text(out, c);
-#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
- BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
+ if (attr == &sysfs_new_stripes)
+ bch2_new_stripes_to_text(out, c);
- return 0;
-}
+ if (attr == &sysfs_io_timers_read)
+ bch2_io_timers_to_text(out, &c->io_clock[READ]);
-STORE(__bch2_fs)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
+ if (attr == &sysfs_io_timers_write)
+ bch2_io_timers_to_text(out, &c->io_clock[WRITE]);
- sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms);
- sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
+ if (attr == &sysfs_moving_ctxts)
+ bch2_fs_moving_ctxts_to_text(out, c);
- if (attr == &sysfs_btree_gc_periodic) {
- ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
- ?: (ssize_t) size;
+#ifdef BCH_WRITE_REF_DEBUG
+ if (attr == &sysfs_write_refs)
+ bch2_write_refs_to_text(out, c);
+#endif
- wake_up_process(c->gc_thread);
- return ret;
- }
+ if (attr == &sysfs_nocow_lock_table)
+ bch2_nocow_locks_to_text(out, &c->nocow_locks);
- if (attr == &sysfs_copy_gc_enabled) {
- struct bch_dev *ca;
- unsigned i;
- ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled)
- ?: (ssize_t) size;
+ if (attr == &sysfs_disk_groups)
+ bch2_disk_groups_to_text(out, c);
- for_each_member_device(ca, c, i)
- if (ca->copygc_thread)
- wake_up_process(ca->copygc_thread);
- return ret;
- }
+ if (attr == &sysfs_alloc_debug)
+ bch2_fs_alloc_debug_to_text(out, c);
- if (attr == &sysfs_rebalance_enabled) {
- ssize_t ret = strtoul_safe(buf, c->rebalance.enabled)
- ?: (ssize_t) size;
+ if (attr == &sysfs_accounting)
+ bch2_fs_accounting_to_text(out, c);
- rebalance_wakeup(c);
- return ret;
- }
+ if (attr == &sysfs_usage_base)
+ bch2_fs_usage_base_to_text(out, c);
- sysfs_strtoul(pd_controllers_update_seconds,
- c->pd_controllers_update_seconds);
- sysfs_pd_controller_store(rebalance, &c->rebalance.pd);
+ return 0;
+}
- sysfs_strtoul(promote_whole_extents, c->promote_whole_extents);
+STORE(bch2_fs)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
- /* Debugging: */
+ sysfs_pd_controller_store(rebalance, &c->rebalance.pd);
-#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name);
- BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
+ /* Debugging: */
- if (!bch2_fs_running(c))
+ if (!test_bit(BCH_FS_started, &c->flags))
return -EPERM;
/* Debugging: */
- if (attr == &sysfs_trigger_journal_flush)
- bch2_journal_meta_async(&c->journal, NULL);
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))
+ return -EROFS;
- if (attr == &sysfs_trigger_btree_coalesce)
- bch2_coalesce(c);
+ if (attr == &sysfs_trigger_btree_cache_shrink) {
+ struct btree_cache *bc = &c->btree_cache;
+ struct shrink_control sc;
- if (attr == &sysfs_trigger_gc)
- bch2_gc(c);
+ sc.gfp_mask = GFP_KERNEL;
+ sc.nr_to_scan = strtoul_or_return(buf);
+ bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc);
+ }
- if (attr == &sysfs_prune_cache) {
+ if (attr == &sysfs_trigger_btree_key_cache_shrink) {
struct shrink_control sc;
sc.gfp_mask = GFP_KERNEL;
sc.nr_to_scan = strtoul_or_return(buf);
- c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
+ c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc);
+ }
+
+ if (attr == &sysfs_trigger_gc)
+ bch2_gc_gens(c);
+
+ if (attr == &sysfs_trigger_discards)
+ bch2_do_discards(c);
+
+ if (attr == &sysfs_trigger_invalidates)
+ bch2_do_invalidates(c);
+
+ if (attr == &sysfs_trigger_journal_flush) {
+ bch2_journal_flush_all_pins(&c->journal);
+ bch2_journal_meta(&c->journal);
}
+
+ if (attr == &sysfs_trigger_journal_writes)
+ bch2_journal_do_writes(&c->journal);
+
+ if (attr == &sysfs_trigger_freelist_wakeup)
+ closure_wake_up(&c->freelist_wait);
+
#ifdef CONFIG_BCACHEFS_TESTS
if (attr == &sysfs_perf_test) {
char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
@@ -464,40 +472,24 @@ STORE(__bch2_fs)
if (threads_str &&
!(ret = kstrtouint(threads_str, 10, &threads)) &&
!(ret = bch2_strtoull_h(nr_str, &nr)))
- bch2_btree_perf_test(c, test, nr, threads);
- else
- size = ret;
+ ret = bch2_btree_perf_test(c, test, nr, threads);
kfree(tmp);
+
+ if (ret)
+ size = ret;
}
#endif
- return size;
-}
-
-STORE(bch2_fs)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
-
- mutex_lock(&c->state_lock);
- size = __bch2_fs_store(kobj, attr, buf, size);
- mutex_unlock(&c->state_lock);
-
+ bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
return size;
}
SYSFS_OPS(bch2_fs);
struct attribute *bch2_fs_files[] = {
&sysfs_minor,
- &sysfs_block_size,
- &sysfs_btree_node_size,
&sysfs_btree_cache_size,
+ &sysfs_btree_write_stats,
- &sysfs_meta_replicas_have,
- &sysfs_data_replicas_have,
-
- &sysfs_journal_write_delay_ms,
- &sysfs_journal_reclaim_delay_ms,
-
- &sysfs_promote_whole_extents,
+ &sysfs_rebalance_status,
&sysfs_compression_stats,
@@ -507,49 +499,104 @@ struct attribute *bch2_fs_files[] = {
NULL
};
+/* counters dir */
+
+SHOW(bch2_fs_counters)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj);
+ u64 counter = 0;
+ u64 counter_since_mount = 0;
+
+ printbuf_tabstop_push(out, 32);
+
+ #define x(t, ...) \
+ if (attr == &sysfs_##t) { \
+ counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\
+ counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\
+ prt_printf(out, "since mount:\t"); \
+ prt_human_readable_u64(out, counter_since_mount); \
+ prt_newline(out); \
+ \
+ prt_printf(out, "since filesystem creation:\t"); \
+ prt_human_readable_u64(out, counter); \
+ prt_newline(out); \
+ }
+ BCH_PERSISTENT_COUNTERS()
+ #undef x
+ return 0;
+}
+
+STORE(bch2_fs_counters) {
+ return 0;
+}
+
+SYSFS_OPS(bch2_fs_counters);
+
+struct attribute *bch2_fs_counters_files[] = {
+#define x(t, ...) \
+ &sysfs_##t,
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ NULL
+};
/* internal dir - just a wrapper */
SHOW(bch2_fs_internal)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
- return bch2_fs_show(&c->kobj, attr, buf);
+
+ return bch2_fs_to_text(out, &c->kobj, attr);
}
STORE(bch2_fs_internal)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
+
return bch2_fs_store(&c->kobj, attr, buf, size);
}
SYSFS_OPS(bch2_fs_internal);
struct attribute *bch2_fs_internal_files[] = {
- &sysfs_alloc_debug,
+ &sysfs_flags,
&sysfs_journal_debug,
- &sysfs_journal_pins,
- &sysfs_btree_updates,
- &sysfs_dirty_btree_nodes,
-
- &sysfs_read_realloc_races,
- &sysfs_extent_migrate_done,
- &sysfs_extent_migrate_raced,
+ &sysfs_btree_cache,
+ &sysfs_btree_key_cache,
+ &sysfs_btree_reserve_cache,
+ &sysfs_new_stripes,
+ &sysfs_stripes_heap,
+ &sysfs_open_buckets,
+ &sysfs_open_buckets_partial,
+ &sysfs_write_points,
+#ifdef BCH_WRITE_REF_DEBUG
+ &sysfs_write_refs,
+#endif
+ &sysfs_nocow_lock_table,
+ &sysfs_io_timers_read,
+ &sysfs_io_timers_write,
- &sysfs_trigger_journal_flush,
- &sysfs_trigger_btree_coalesce,
&sysfs_trigger_gc,
- &sysfs_prune_cache,
+ &sysfs_trigger_discards,
+ &sysfs_trigger_invalidates,
+ &sysfs_trigger_journal_flush,
+ &sysfs_trigger_journal_writes,
+ &sysfs_trigger_btree_cache_shrink,
+ &sysfs_trigger_btree_key_cache_shrink,
+ &sysfs_trigger_freelist_wakeup,
+
+ &sysfs_gc_gens_pos,
- &sysfs_copy_gc_enabled,
+ &sysfs_copy_gc_wait,
- &sysfs_rebalance_enabled,
- &sysfs_rebalance_work,
sysfs_pd_controller_files(rebalance),
- &sysfs_internal_uuid,
+ &sysfs_moving_ctxts,
-#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
- BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
+ &sysfs_internal_uuid,
+ &sysfs_disk_groups,
+ &sysfs_alloc_debug,
+ &sysfs_accounting,
+ &sysfs_usage_base,
NULL
};
@@ -557,16 +604,15 @@ struct attribute *bch2_fs_internal_files[] = {
SHOW(bch2_fs_opts_dir)
{
- char *out = buf, *end = buf + PAGE_SIZE;
struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
const struct bch_option *opt = container_of(attr, struct bch_option, attr);
int id = opt - bch2_opt_table;
u64 v = bch2_opt_get_by_id(&c->opts, id);
- out += bch2_opt_to_text(c, out, end - out, opt, v, OPT_SHOW_FULL_LIST);
- out += scnprintf(out, end - out, "\n");
+ bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST);
+ prt_char(out, '\n');
- return out - buf;
+ return 0;
}
STORE(bch2_fs_opts_dir)
@@ -577,41 +623,49 @@ STORE(bch2_fs_opts_dir)
char *tmp;
u64 v;
+ /*
+ * We don't need to take c->writes for correctness, but it eliminates an
+ * unsightly error message in the dmesg log when we're RO:
+ */
+ if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)))
+ return -EROFS;
+
tmp = kstrdup(buf, GFP_KERNEL);
- if (!tmp)
- return -ENOMEM;
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto err;
+ }
- ret = bch2_opt_parse(c, opt, strim(tmp), &v);
+ ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL);
kfree(tmp);
if (ret < 0)
- return ret;
+ goto err;
- if (id == Opt_compression ||
- id == Opt_background_compression) {
- int ret = bch2_check_set_has_compressed_data(c, v);
- if (ret) {
- mutex_unlock(&c->sb_lock);
- return ret;
- }
- }
-
- if (opt->set_sb != SET_NO_SB_OPT) {
- mutex_lock(&c->sb_lock);
- opt->set_sb(c->disk_sb.sb, v);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
+ ret = bch2_opt_check_may_set(c, id, v);
+ if (ret < 0)
+ goto err;
+ bch2_opt_set_sb(c, NULL, opt, v);
bch2_opt_set_by_id(&c->opts, id, v);
- if ((id == Opt_background_target ||
- id == Opt_background_compression) && v) {
- bch2_rebalance_add_work(c, S64_MAX);
+ if (v &&
+ (id == Opt_background_target ||
+ id == Opt_background_compression ||
+ (id == Opt_compression && !c->opts.background_compression)))
+ bch2_set_rebalance_needs_scan(c, 0);
+
+ if (v && id == Opt_rebalance_enabled)
rebalance_wakeup(c);
- }
- return size;
+ if (v && id == Opt_copygc_enabled &&
+ c->copygc_thread)
+ wake_up_process(c->copygc_thread);
+
+ ret = size;
+err:
+ bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
+ return ret;
}
SYSFS_OPS(bch2_fs_opts_dir);
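The rewritten store path above keeps one shape throughout this file: take the BCH_WRITE_REF_sysfs ref before touching anything, route every failure through a single err label, and drop the ref on the way out. A stripped-down sketch of that shape, with the option parsing reduced to a hypothetical parse_and_apply() placeholder:

static ssize_t example_sysfs_store(struct bch_fs *c, const char *buf, size_t size)
{
	ssize_t ret;

	/* quietly refuse writes once the filesystem has gone read-only */
	if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)))
		return -EROFS;

	ret = parse_and_apply(c, buf);	/* stands in for bch2_opt_parse() etc. */
	if (ret < 0)
		goto err;

	ret = size;			/* success: report the whole write consumed */
err:
	bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
	return ret;
}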
@@ -625,7 +679,7 @@ int bch2_opts_create_sysfs_files(struct kobject *kobj)
for (i = bch2_opt_table;
i < bch2_opt_table + bch2_opts_nr;
i++) {
- if (i->mode == OPT_INTERNAL)
+ if (!(i->flags & OPT_FS))
continue;
ret = sysfs_create_file(kobj, &i->attr);
@@ -642,10 +696,9 @@ SHOW(bch2_fs_time_stats)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
-#define x(name) \
+#define x(name) \
if (attr == &sysfs_time_stat_##name) \
- return bch2_time_stats_print(&c->times[BCH_TIME_##name],\
- buf, PAGE_SIZE);
+ bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]);
BCH_TIME_STATS()
#undef x
@@ -654,6 +707,13 @@ SHOW(bch2_fs_time_stats)
STORE(bch2_fs_time_stats)
{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
+
+#define x(name) \
+ if (attr == &sysfs_time_stat_##name) \
+ bch2_time_stats_reset(&c->times[BCH_TIME_##name]);
+ BCH_TIME_STATS()
+#undef x
return size;
}
SYSFS_OPS(bch2_fs_time_stats);
@@ -666,262 +726,79 @@ struct attribute *bch2_fs_time_stats_files[] = {
NULL
};
-typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *,
- size_t, void *);
-
-static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
- size_t b, void *private)
-{
- int rw = (private ? 1 : 0);
-
- return bucket_last_io(c, bucket(ca, b), rw);
-}
-
-static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
- size_t b, void *private)
-{
- struct bucket *g = bucket(ca, b);
- return bucket_sectors_used(g->mark);
-}
-
-static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
- size_t b, void *private)
-{
- return bucket_gc_gen(ca, b);
-}
-
-static int unsigned_cmp(const void *_l, const void *_r)
-{
- unsigned l = *((unsigned *) _l);
- unsigned r = *((unsigned *) _r);
-
- return (l > r) - (l < r);
-}
-
-static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca,
- char *buf, bucket_map_fn *fn, void *private)
-{
- size_t i, n;
- /* Compute 31 quantiles */
- unsigned q[31], *p;
- ssize_t ret = 0;
-
- down_read(&ca->bucket_lock);
- n = ca->mi.nbuckets;
-
- p = vzalloc(n * sizeof(unsigned));
- if (!p) {
- up_read(&ca->bucket_lock);
- return -ENOMEM;
- }
-
- for (i = ca->mi.first_bucket; i < n; i++)
- p[i] = fn(c, ca, i, private);
-
- sort(p, n, sizeof(unsigned), unsigned_cmp, NULL);
- up_read(&ca->bucket_lock);
-
- while (n &&
- !p[n - 1])
- --n;
-
- for (i = 0; i < ARRAY_SIZE(q); i++)
- q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)];
-
- vfree(p);
-
- for (i = 0; i < ARRAY_SIZE(q); i++)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret,
- "%u ", q[i]);
- buf[ret - 1] = '\n';
-
- return ret;
-}
-
-static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf)
-{
- enum alloc_reserve i;
- ssize_t ret;
-
- spin_lock(&ca->freelist_lock);
-
- ret = scnprintf(buf, PAGE_SIZE,
- "free_inc:\t%zu\t%zu\n",
- fifo_used(&ca->free_inc),
- ca->free_inc.size);
-
- for (i = 0; i < RESERVE_NR; i++)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret,
- "free[%u]:\t%zu\t%zu\n", i,
- fifo_used(&ca->free[i]),
- ca->free[i].size);
-
- spin_unlock(&ca->freelist_lock);
-
- return ret;
-}
-
-static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
-{
- struct bch_fs *c = ca->fs;
- struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
-
- return scnprintf(buf, PAGE_SIZE,
- "free_inc: %zu/%zu\n"
- "free[RESERVE_BTREE]: %zu/%zu\n"
- "free[RESERVE_MOVINGGC]: %zu/%zu\n"
- "free[RESERVE_NONE]: %zu/%zu\n"
- "buckets:\n"
- " capacity: %llu\n"
- " alloc: %llu\n"
- " sb: %llu\n"
- " journal: %llu\n"
- " meta: %llu\n"
- " user: %llu\n"
- " cached: %llu\n"
- " available: %llu\n"
- "sectors:\n"
- " sb: %llu\n"
- " journal: %llu\n"
- " meta: %llu\n"
- " user: %llu\n"
- " cached: %llu\n"
- "freelist_wait: %s\n"
- "open buckets: %u/%u (reserved %u)\n"
- "open_buckets_wait: %s\n",
- fifo_used(&ca->free_inc), ca->free_inc.size,
- fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size,
- fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
- fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
- ca->mi.nbuckets - ca->mi.first_bucket,
- stats.buckets_alloc,
- stats.buckets[BCH_DATA_SB],
- stats.buckets[BCH_DATA_JOURNAL],
- stats.buckets[BCH_DATA_BTREE],
- stats.buckets[BCH_DATA_USER],
- stats.buckets[BCH_DATA_CACHED],
- __dev_buckets_available(ca, stats),
- stats.sectors[BCH_DATA_SB],
- stats.sectors[BCH_DATA_JOURNAL],
- stats.sectors[BCH_DATA_BTREE],
- stats.sectors[BCH_DATA_USER],
- stats.sectors[BCH_DATA_CACHED],
- c->freelist_wait.list.first ? "waiting" : "empty",
- c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE,
- c->open_buckets_wait.list.first ? "waiting" : "empty");
-}
-
static const char * const bch2_rw[] = {
"read",
"write",
NULL
};
-static ssize_t show_dev_iodone(struct bch_dev *ca, char *buf)
+static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca)
{
- char *out = buf, *end = buf + PAGE_SIZE;
- int rw, i, cpu;
+ int rw, i;
for (rw = 0; rw < 2; rw++) {
- out += scnprintf(out, end - out, "%s:\n", bch2_rw[rw]);
-
- for (i = 1; i < BCH_DATA_NR; i++) {
- u64 n = 0;
+ prt_printf(out, "%s:\n", bch2_rw[rw]);
- for_each_possible_cpu(cpu)
- n += per_cpu_ptr(ca->io_done, cpu)->sectors[rw][i];
-
- out += scnprintf(out, end - out, "%-12s:%12llu\n",
- bch2_data_types[i], n << 9);
- }
+ for (i = 1; i < BCH_DATA_NR; i++)
+ prt_printf(out, "%-12s:%12llu\n",
+ bch2_data_type_str(i),
+ percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9);
}
-
- return out - buf;
}
SHOW(bch2_dev)
{
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
struct bch_fs *c = ca->fs;
- char *out = buf, *end = buf + PAGE_SIZE;
sysfs_printf(uuid, "%pU\n", ca->uuid.b);
sysfs_print(bucket_size, bucket_bytes(ca));
- sysfs_print(block_size, block_bytes(c));
sysfs_print(first_bucket, ca->mi.first_bucket);
sysfs_print(nbuckets, ca->mi.nbuckets);
sysfs_print(durability, ca->mi.durability);
sysfs_print(discard, ca->mi.discard);
if (attr == &sysfs_label) {
- if (ca->mi.group) {
- mutex_lock(&c->sb_lock);
- out += bch2_disk_path_print(&c->disk_sb, out, end - out,
- ca->mi.group - 1);
- mutex_unlock(&c->sb_lock);
- } else {
- out += scnprintf(out, end - out, "none");
- }
-
- out += scnprintf(out, end - out, "\n");
- return out - buf;
+ if (ca->mi.group)
+ bch2_disk_path_to_text(out, c, ca->mi.group - 1);
+ prt_char(out, '\n');
}
if (attr == &sysfs_has_data) {
- out += bch2_scnprint_flag_list(out, end - out,
- bch2_data_types,
- bch2_dev_has_data(c, ca));
- out += scnprintf(out, end - out, "\n");
- return out - buf;
+ prt_bitflags(out, __bch2_data_types, bch2_dev_has_data(c, ca));
+ prt_char(out, '\n');
}
- sysfs_pd_controller_show(copy_gc, &ca->copygc_pd);
-
- if (attr == &sysfs_cache_replacement_policy) {
- out += bch2_scnprint_string_list(out, end - out,
- bch2_cache_replacement_policies,
- ca->mi.replacement);
- out += scnprintf(out, end - out, "\n");
- return out - buf;
+ if (attr == &sysfs_state) {
+ prt_string_option(out, bch2_member_states, ca->mi.state);
+ prt_char(out, '\n');
}
- if (attr == &sysfs_state_rw) {
- out += bch2_scnprint_string_list(out, end - out,
- bch2_dev_state,
- ca->mi.state);
- out += scnprintf(out, end - out, "\n");
- return out - buf;
- }
+ if (attr == &sysfs_io_done)
+ dev_io_done_to_text(out, ca);
- if (attr == &sysfs_iodone)
- return show_dev_iodone(ca, buf);
+ if (attr == &sysfs_io_errors)
+ bch2_dev_io_errors_to_text(out, ca);
sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ]));
sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
if (attr == &sysfs_io_latency_stats_read)
- return bch2_time_stats_print(&ca->io_latency[READ], buf, PAGE_SIZE);
+ bch2_time_stats_to_text(out, &ca->io_latency[READ].stats);
+
if (attr == &sysfs_io_latency_stats_write)
- return bch2_time_stats_print(&ca->io_latency[WRITE], buf, PAGE_SIZE);
+ bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats);
sysfs_printf(congested, "%u%%",
clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
* 100 / CONGESTED_MAX);
- if (attr == &sysfs_bucket_quantiles_last_read)
- return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0);
- if (attr == &sysfs_bucket_quantiles_last_write)
- return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1);
- if (attr == &sysfs_bucket_quantiles_fragmentation)
- return show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL);
- if (attr == &sysfs_bucket_quantiles_oldest_gen)
- return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL);
-
- if (attr == &sysfs_reserve_stats)
- return show_reserve_stats(ca, buf);
if (attr == &sysfs_alloc_debug)
- return show_dev_alloc_debug(ca, buf);
+ bch2_dev_alloc_debug_to_text(out, ca);
+
+ if (attr == &sysfs_open_buckets)
+ bch2_open_buckets_to_text(out, c, ca);
return 0;
}
@@ -930,37 +807,17 @@ STORE(bch2_dev)
{
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
struct bch_fs *c = ca->fs;
- struct bch_member *mi;
-
- sysfs_pd_controller_store(copy_gc, &ca->copygc_pd);
if (attr == &sysfs_discard) {
bool v = strtoul_or_return(buf);
- mutex_lock(&c->sb_lock);
- mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
-
- if (v != BCH_MEMBER_DISCARD(mi)) {
- SET_BCH_MEMBER_DISCARD(mi, v);
- bch2_write_super(c);
- }
- mutex_unlock(&c->sb_lock);
+ bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_discard, v);
}
- if (attr == &sysfs_cache_replacement_policy) {
- ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf);
+ if (attr == &sysfs_durability) {
+ u64 v = strtoul_or_return(buf);
- if (v < 0)
- return v;
-
- mutex_lock(&c->sb_lock);
- mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
-
- if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) {
- SET_BCH_MEMBER_REPLACEMENT(mi, v);
- bch2_write_super(c);
- }
- mutex_unlock(&c->sb_lock);
+ bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_durability, v);
}
if (attr == &sysfs_label) {
@@ -977,8 +834,8 @@ STORE(bch2_dev)
return ret;
}
- if (attr == &sysfs_wake_allocator)
- bch2_wake_allocator(ca);
+ if (attr == &sysfs_io_errors_reset)
+ bch2_dev_errors_reset(ca);
return size;
}
@@ -987,19 +844,19 @@ SYSFS_OPS(bch2_dev);
struct attribute *bch2_dev_files[] = {
&sysfs_uuid,
&sysfs_bucket_size,
- &sysfs_block_size,
&sysfs_first_bucket,
&sysfs_nbuckets,
&sysfs_durability,
/* settings: */
&sysfs_discard,
- &sysfs_cache_replacement_policy,
- &sysfs_state_rw,
+ &sysfs_state,
&sysfs_label,
&sysfs_has_data,
- &sysfs_iodone,
+ &sysfs_io_done,
+ &sysfs_io_errors,
+ &sysfs_io_errors_reset,
&sysfs_io_latency_read,
&sysfs_io_latency_write,
@@ -1007,19 +864,9 @@ struct attribute *bch2_dev_files[] = {
&sysfs_io_latency_stats_write,
&sysfs_congested,
- /* alloc info - other stats: */
- &sysfs_bucket_quantiles_last_read,
- &sysfs_bucket_quantiles_last_write,
- &sysfs_bucket_quantiles_fragmentation,
- &sysfs_bucket_quantiles_oldest_gen,
-
- &sysfs_reserve_stats,
-
/* debug: */
&sysfs_alloc_debug,
- &sysfs_wake_allocator,
-
- sysfs_pd_controller_files(copy_gc),
+ &sysfs_open_buckets,
NULL
};
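The trigger_* attributes registered above act on any write, so poking them from userspace needs nothing beyond open() and write(). A hypothetical example; the /sys/fs/bcachefs/<uuid>/internal path is an assumption based on the attribute names, and <uuid> stays a placeholder:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* path is assumed from the attribute layout above; substitute the fs UUID */
#define TRIGGER_GC "/sys/fs/bcachefs/<uuid>/internal/trigger_gc"

int main(void)
{
	int fd = open(TRIGGER_GC, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* any write reaches the STORE handler above and runs bch2_gc_gens() */
	if (write(fd, "1\n", 2) != 2)
		perror("write");
	close(fd);
	return 0;
}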
diff --git a/libbcachefs/sysfs.h b/libbcachefs/sysfs.h
index 1ba759fd..222cd506 100644
--- a/libbcachefs/sysfs.h
+++ b/libbcachefs/sysfs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_SYSFS_H_
#define _BCACHEFS_SYSFS_H_
@@ -9,28 +10,32 @@ struct attribute;
struct sysfs_ops;
extern struct attribute *bch2_fs_files[];
+extern struct attribute *bch2_fs_counters_files[];
extern struct attribute *bch2_fs_internal_files[];
extern struct attribute *bch2_fs_opts_dir_files[];
extern struct attribute *bch2_fs_time_stats_files[];
extern struct attribute *bch2_dev_files[];
-extern struct sysfs_ops bch2_fs_sysfs_ops;
-extern struct sysfs_ops bch2_fs_internal_sysfs_ops;
-extern struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
-extern struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
-extern struct sysfs_ops bch2_dev_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_counters_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_internal_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
+extern const struct sysfs_ops bch2_dev_sysfs_ops;
int bch2_opts_create_sysfs_files(struct kobject *);
#else
static struct attribute *bch2_fs_files[] = {};
+static struct attribute *bch2_fs_counters_files[] = {};
static struct attribute *bch2_fs_internal_files[] = {};
static struct attribute *bch2_fs_opts_dir_files[] = {};
static struct attribute *bch2_fs_time_stats_files[] = {};
static struct attribute *bch2_dev_files[] = {};
static const struct sysfs_ops bch2_fs_sysfs_ops;
+static const struct sysfs_ops bch2_fs_counters_sysfs_ops;
static const struct sysfs_ops bch2_fs_internal_sysfs_ops;
static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
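With this change the show handlers write into a struct printbuf and return 0; the SHOW()/STORE() macros (defined in sysfs.c, not visible in this hunk) are what translate that into the kernel's char-buffer sysfs contract. A rough sketch of what such a generated wrapper plausibly does, assuming printbuf tracks its length in .pos; this is not the actual macro body:

static ssize_t example_show(struct kobject *kobj, struct attribute *attr, char *buf)
{
	struct printbuf out = PRINTBUF;
	ssize_t ret = bch2_fs_to_text(&out, kobj, attr);	/* the _to_text handler above */

	if (!ret)
		ret = min_t(size_t, out.pos, PAGE_SIZE - 1);
	if (ret > 0)
		memcpy(buf, out.buf, ret);
	printbuf_exit(&out);
	return ret;
}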
diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c
index 31847a94..6c646981 100644
--- a/libbcachefs/tests.c
+++ b/libbcachefs/tests.c
@@ -1,8 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
#ifdef CONFIG_BCACHEFS_TESTS
#include "bcachefs.h"
#include "btree_update.h"
#include "journal_reclaim.h"
+#include "snapshot.h"
#include "tests.h"
#include "linux/kthread.h"
@@ -12,419 +14,747 @@ static void delete_test_keys(struct bch_fs *c)
{
int ret;
- ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
- POS(0, 0), POS(0, U64_MAX),
- ZERO_VERSION, NULL, NULL, NULL);
+ ret = bch2_btree_delete_range(c, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX),
+ POS(0, U64_MAX),
+ 0, NULL);
BUG_ON(ret);
- ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
- POS(0, 0), POS(0, U64_MAX),
- ZERO_VERSION, NULL, NULL, NULL);
+ ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX),
+ POS(0, U64_MAX),
+ 0, NULL);
BUG_ON(ret);
}
/* unit tests */
-static void test_delete(struct bch_fs *c, u64 nr)
+static int test_delete(struct bch_fs *c, u64 nr)
{
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_i_cookie k;
int ret;
bkey_cookie_init(&k.k_i);
+ k.k.p.snapshot = U32_MAX;
- bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS, k.k.p,
- BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
+ BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(&iter);
- BUG_ON(ret);
-
- ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
- BTREE_INSERT_ENTRY(&iter, &k.k_i));
- BUG_ON(ret);
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, &k.k_i, 0));
+ bch_err_msg(c, ret, "update error");
+ if (ret)
+ goto err;
pr_info("deleting once");
- ret = bch2_btree_delete_at(&iter, 0);
- BUG_ON(ret);
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_delete_at(trans, &iter, 0));
+ bch_err_msg(c, ret, "delete error (first)");
+ if (ret)
+ goto err;
pr_info("deleting twice");
- ret = bch2_btree_delete_at(&iter, 0);
- BUG_ON(ret);
-
- bch2_btree_iter_unlock(&iter);
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_delete_at(trans, &iter, 0));
+ bch_err_msg(c, ret, "delete error (second)");
+ if (ret)
+ goto err;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
}
-static void test_delete_written(struct bch_fs *c, u64 nr)
+static int test_delete_written(struct bch_fs *c, u64 nr)
{
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_i_cookie k;
int ret;
bkey_cookie_init(&k.k_i);
+ k.k.p.snapshot = U32_MAX;
- bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS, k.k.p,
- BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
+ BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(&iter);
- BUG_ON(ret);
-
- ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
- BTREE_INSERT_ENTRY(&iter, &k.k_i));
- BUG_ON(ret);
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, &k.k_i, 0));
+ bch_err_msg(c, ret, "update error");
+ if (ret)
+ goto err;
+ bch2_trans_unlock(trans);
bch2_journal_flush_all_pins(&c->journal);
- ret = bch2_btree_delete_at(&iter, 0);
- BUG_ON(ret);
-
- bch2_btree_iter_unlock(&iter);
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_delete_at(trans, &iter, 0));
+ bch_err_msg(c, ret, "delete error");
+ if (ret)
+ goto err;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
}
-static void test_iterate(struct bch_fs *c, u64 nr)
+static int test_iterate(struct bch_fs *c, u64 nr)
{
- struct btree_iter iter;
- struct bkey_s_c k;
u64 i;
- int ret;
+ int ret = 0;
delete_test_keys(c);
pr_info("inserting test keys");
for (i = 0; i < nr; i++) {
- struct bkey_i_cookie k;
+ struct bkey_i_cookie ck;
- bkey_cookie_init(&k.k_i);
- k.k.p.offset = i;
+ bkey_cookie_init(&ck.k_i);
+ ck.k.p.offset = i;
+ ck.k.p.snapshot = U32_MAX;
- ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i,
- NULL, NULL, NULL, 0);
- BUG_ON(ret);
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0);
+ bch_err_msg(c, ret, "insert error");
+ if (ret)
+ return ret;
}
pr_info("iterating forwards");
-
i = 0;
- for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(0, 0), 0, k)
- BUG_ON(k.k->p.offset != i++);
- bch2_btree_iter_unlock(&iter);
+ ret = bch2_trans_run(c,
+ for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(k.k->p.offset != i++);
+ 0;
+ })));
+ bch_err_msg(c, ret, "error iterating forwards");
+ if (ret)
+ return ret;
BUG_ON(i != nr);
pr_info("iterating backwards");
- while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k))
- BUG_ON(k.k->p.offset != --i);
- bch2_btree_iter_unlock(&iter);
+ ret = bch2_trans_run(c,
+ for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, U64_MAX, U32_MAX), 0, k, ({
+ BUG_ON(k.k->p.offset != --i);
+ 0;
+ })));
+ bch_err_msg(c, ret, "error iterating backwards");
+ if (ret)
+ return ret;
BUG_ON(i);
+ return 0;
}
-static void test_iterate_extents(struct bch_fs *c, u64 nr)
+static int test_iterate_extents(struct bch_fs *c, u64 nr)
{
- struct btree_iter iter;
- struct bkey_s_c k;
u64 i;
- int ret;
+ int ret = 0;
delete_test_keys(c);
pr_info("inserting test extents");
for (i = 0; i < nr; i += 8) {
- struct bkey_i_cookie k;
+ struct bkey_i_cookie ck;
- bkey_cookie_init(&k.k_i);
- k.k.p.offset = i + 8;
- k.k.size = 8;
+ bkey_cookie_init(&ck.k_i);
+ ck.k.p.offset = i + 8;
+ ck.k.p.snapshot = U32_MAX;
+ ck.k.size = 8;
- ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i,
- NULL, NULL, NULL, 0);
- BUG_ON(ret);
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0);
+ bch_err_msg(c, ret, "insert error");
+ if (ret)
+ return ret;
}
pr_info("iterating forwards");
-
i = 0;
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(0, 0), 0, k) {
- BUG_ON(bkey_start_offset(k.k) != i);
- i = k.k->p.offset;
- }
- bch2_btree_iter_unlock(&iter);
+ ret = bch2_trans_run(c,
+ for_each_btree_key_max(trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(bkey_start_offset(k.k) != i);
+ i = k.k->p.offset;
+ 0;
+ })));
+ bch_err_msg(c, ret, "error iterating forwards");
+ if (ret)
+ return ret;
BUG_ON(i != nr);
pr_info("iterating backwards");
- while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) {
- BUG_ON(k.k->p.offset != i);
- i = bkey_start_offset(k.k);
- }
- bch2_btree_iter_unlock(&iter);
+ ret = bch2_trans_run(c,
+ for_each_btree_key_reverse(trans, iter, BTREE_ID_extents,
+ SPOS(0, U64_MAX, U32_MAX), 0, k, ({
+ BUG_ON(k.k->p.offset != i);
+ i = bkey_start_offset(k.k);
+ 0;
+ })));
+ bch_err_msg(c, ret, "error iterating backwards");
+ if (ret)
+ return ret;
BUG_ON(i);
+ return 0;
}
-static void test_iterate_slots(struct bch_fs *c, u64 nr)
+static int test_iterate_slots(struct bch_fs *c, u64 nr)
{
- struct btree_iter iter;
- struct bkey_s_c k;
u64 i;
- int ret;
+ int ret = 0;
delete_test_keys(c);
pr_info("inserting test keys");
for (i = 0; i < nr; i++) {
- struct bkey_i_cookie k;
+ struct bkey_i_cookie ck;
- bkey_cookie_init(&k.k_i);
- k.k.p.offset = i * 2;
+ bkey_cookie_init(&ck.k_i);
+ ck.k.p.offset = i * 2;
+ ck.k.p.snapshot = U32_MAX;
- ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i,
- NULL, NULL, NULL, 0);
- BUG_ON(ret);
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0);
+ bch_err_msg(c, ret, "insert error");
+ if (ret)
+ return ret;
}
pr_info("iterating forwards");
-
i = 0;
- for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(0, 0), 0, k) {
- BUG_ON(k.k->p.offset != i);
- i += 2;
- }
- bch2_btree_iter_unlock(&iter);
+ ret = bch2_trans_run(c,
+ for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(k.k->p.offset != i);
+ i += 2;
+ 0;
+ })));
+ bch_err_msg(c, ret, "error iterating forwards");
+ if (ret)
+ return ret;
BUG_ON(i != nr * 2);
pr_info("iterating forwards by slots");
-
i = 0;
- for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(0, 0),
- BTREE_ITER_SLOTS, k) {
- BUG_ON(bkey_deleted(k.k) != (i & 1));
- BUG_ON(k.k->p.offset != i++);
-
- if (i == nr * 2)
- break;
- }
- bch2_btree_iter_unlock(&iter);
+ ret = bch2_trans_run(c,
+ for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ BTREE_ITER_slots, k, ({
+ if (i >= nr * 2)
+ break;
+
+ BUG_ON(k.k->p.offset != i);
+ BUG_ON(bkey_deleted(k.k) != (i & 1));
+
+ i++;
+ 0;
+ })));
+ bch_err_msg(c, ret, "error iterating forwards by slots");
+ return ret;
}
-static void test_iterate_slots_extents(struct bch_fs *c, u64 nr)
+static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
{
- struct btree_iter iter;
- struct bkey_s_c k;
u64 i;
- int ret;
+ int ret = 0;
delete_test_keys(c);
pr_info("inserting test keys");
for (i = 0; i < nr; i += 16) {
- struct bkey_i_cookie k;
+ struct bkey_i_cookie ck;
- bkey_cookie_init(&k.k_i);
- k.k.p.offset = i + 16;
- k.k.size = 8;
+ bkey_cookie_init(&ck.k_i);
+ ck.k.p.offset = i + 16;
+ ck.k.p.snapshot = U32_MAX;
+ ck.k.size = 8;
- ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i,
- NULL, NULL, NULL, 0);
- BUG_ON(ret);
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0);
+ bch_err_msg(c, ret, "insert error");
+ if (ret)
+ return ret;
}
pr_info("iterating forwards");
-
i = 0;
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(0, 0), 0, k) {
- BUG_ON(bkey_start_offset(k.k) != i + 8);
- BUG_ON(k.k->size != 8);
- i += 16;
- }
- bch2_btree_iter_unlock(&iter);
+ ret = bch2_trans_run(c,
+ for_each_btree_key_max(trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(bkey_start_offset(k.k) != i + 8);
+ BUG_ON(k.k->size != 8);
+ i += 16;
+ 0;
+ })));
+ bch_err_msg(c, ret, "error iterating forwards");
+ if (ret)
+ return ret;
BUG_ON(i != nr);
pr_info("iterating forwards by slots");
-
i = 0;
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(0, 0),
- BTREE_ITER_SLOTS, k) {
- BUG_ON(bkey_deleted(k.k) != !(i % 16));
+ ret = bch2_trans_run(c,
+ for_each_btree_key_max(trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ BTREE_ITER_slots, k, ({
+ if (i == nr)
+ break;
+ BUG_ON(bkey_deleted(k.k) != !(i % 16));
+
+ BUG_ON(bkey_start_offset(k.k) != i);
+ BUG_ON(k.k->size != 8);
+ i = k.k->p.offset;
+ 0;
+ })));
+ bch_err_msg(c, ret, "error iterating forwards by slots");
+ return ret;
+}
+
+/*
+ * XXX: we really want to make sure we've got a btree with depth > 0 for these
+ * tests
+ */
+static int test_peek_end(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
- BUG_ON(bkey_start_offset(k.k) != i);
- BUG_ON(k.k->size != 8);
- i = k.k->p.offset;
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), 0);
- if (i == nr)
- break;
- }
- bch2_btree_iter_unlock(&iter);
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+ BUG_ON(k.k);
+
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+ BUG_ON(k.k);
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return 0;
}
-/* perf tests */
+static int test_peek_end_extents(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
-static u64 test_rand(void)
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), 0);
+
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+ BUG_ON(k.k);
+
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+ BUG_ON(k.k);
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return 0;
+}
+
+/* extent unit tests */
+
+static u64 test_version;
+
+static int insert_test_extent(struct bch_fs *c,
+ u64 start, u64 end)
{
- u64 v;
-#if 0
- v = prandom_u32();
-#else
- prandom_bytes(&v, sizeof(v));
-#endif
- return v;
+ struct bkey_i_cookie k;
+ int ret;
+
+ bkey_cookie_init(&k.k_i);
+ k.k_i.k.p.offset = end;
+ k.k_i.k.p.snapshot = U32_MAX;
+ k.k_i.k.size = end - start;
+ k.k_i.k.bversion.lo = test_version++;
+
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int __test_extent_overwrite(struct bch_fs *c,
+ u64 e1_start, u64 e1_end,
+ u64 e2_start, u64 e2_end)
+{
+ int ret;
+
+ ret = insert_test_extent(c, e1_start, e1_end) ?:
+ insert_test_extent(c, e2_start, e2_end);
+
+ delete_test_keys(c);
+ return ret;
+}
+
+static int test_extent_overwrite_front(struct bch_fs *c, u64 nr)
+{
+ return __test_extent_overwrite(c, 0, 64, 0, 32) ?:
+ __test_extent_overwrite(c, 8, 64, 0, 32);
+}
+
+static int test_extent_overwrite_back(struct bch_fs *c, u64 nr)
+{
+ return __test_extent_overwrite(c, 0, 64, 32, 64) ?:
+ __test_extent_overwrite(c, 0, 64, 32, 72);
+}
+
+static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr)
+{
+ return __test_extent_overwrite(c, 0, 64, 32, 40);
+}
+
+static int test_extent_overwrite_all(struct bch_fs *c, u64 nr)
+{
+ return __test_extent_overwrite(c, 32, 64, 0, 64) ?:
+ __test_extent_overwrite(c, 32, 64, 0, 128) ?:
+ __test_extent_overwrite(c, 32, 64, 32, 64) ?:
+ __test_extent_overwrite(c, 32, 64, 32, 128);
}
-static void rand_insert(struct bch_fs *c, u64 nr)
+static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, u32 len, u32 snapid)
{
struct bkey_i_cookie k;
int ret;
- u64 i;
- for (i = 0; i < nr; i++) {
- bkey_cookie_init(&k.k_i);
- k.k.p.offset = test_rand();
+ bkey_cookie_init(&k.k_i);
+ k.k_i.k.p.inode = inum;
+ k.k_i.k.p.offset = start + len;
+ k.k_i.k.p.snapshot = snapid;
+ k.k_i.k.size = len;
+
+ ret = bch2_trans_commit_do(c, NULL, NULL, 0,
+ bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i,
+ BTREE_UPDATE_internal_snapshot_node));
+ bch_err_fn(c, ret);
+ return ret;
+}
- ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i,
- NULL, NULL, NULL, 0);
- BUG_ON(ret);
- }
+static int test_extent_create_overlapping(struct bch_fs *c, u64 inum)
+{
+ return insert_test_overlapping_extent(c, inum, 0, 16, U32_MAX - 2) ?: /* overwrite entire */
+ insert_test_overlapping_extent(c, inum, 2, 8, U32_MAX - 2) ?:
+ insert_test_overlapping_extent(c, inum, 4, 4, U32_MAX) ?:
+ insert_test_overlapping_extent(c, inum, 32, 8, U32_MAX - 2) ?: /* overwrite front/back */
+ insert_test_overlapping_extent(c, inum, 36, 8, U32_MAX) ?:
+ insert_test_overlapping_extent(c, inum, 60, 8, U32_MAX - 2) ?:
+ insert_test_overlapping_extent(c, inum, 64, 8, U32_MAX);
}
-static void rand_lookup(struct bch_fs *c, u64 nr)
+/* snapshot unit tests */
+
+/* Test skipping over keys in unrelated snapshots: */
+static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
{
- u64 i;
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_cookie cookie;
+ int ret;
- for (i = 0; i < nr; i++) {
- struct btree_iter iter;
- struct bkey_s_c k;
+ bkey_cookie_init(&cookie.k_i);
+ cookie.k.p.snapshot = snapid_hi;
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0);
+ if (ret)
+ return ret;
- bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS,
- POS(0, test_rand()), 0);
+ trans = bch2_trans_get(c);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
+ SPOS(0, 0, snapid_lo), 0);
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
- k = bch2_btree_iter_peek(&iter);
- bch2_btree_iter_unlock(&iter);
- }
+ BUG_ON(k.k->p.snapshot != U32_MAX);
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
}
-static void rand_mixed(struct bch_fs *c, u64 nr)
+static int test_snapshots(struct bch_fs *c, u64 nr)
{
+ struct bkey_i_cookie cookie;
+ u32 snapids[2];
+ u32 snapid_subvols[2] = { 1, 1 };
int ret;
+
+ bkey_cookie_init(&cookie.k_i);
+ cookie.k.p.snapshot = U32_MAX;
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0);
+ if (ret)
+ return ret;
+
+ ret = bch2_trans_commit_do(c, NULL, NULL, 0,
+ bch2_snapshot_node_create(trans, U32_MAX,
+ snapids,
+ snapid_subvols,
+ 2));
+ if (ret)
+ return ret;
+
+ if (snapids[0] > snapids[1])
+ swap(snapids[0], snapids[1]);
+
+ ret = test_snapshot_filter(c, snapids[0], snapids[1]);
+ bch_err_msg(c, ret, "from test_snapshot_filter");
+ return ret;
+}
+
+/* perf tests */
+
+static u64 test_rand(void)
+{
+ u64 v;
+
+ get_random_bytes(&v, sizeof(v));
+ return v;
+}
+
+static int rand_insert(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct bkey_i_cookie k;
+ int ret = 0;
u64 i;
for (i = 0; i < nr; i++) {
- struct btree_iter iter;
- struct bkey_s_c k;
-
- bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS,
- POS(0, test_rand()), 0);
+ bkey_cookie_init(&k.k_i);
+ k.k.p.offset = test_rand();
+ k.k.p.snapshot = U32_MAX;
- k = bch2_btree_iter_peek(&iter);
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k.k_i, 0));
+ if (ret)
+ break;
+ }
- if (!(i & 3) && k.k) {
- struct bkey_i_cookie k;
+ bch2_trans_put(trans);
+ return ret;
+}
- bkey_cookie_init(&k.k_i);
- k.k.p = iter.pos;
+static int rand_insert_multi(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct bkey_i_cookie k[8];
+ int ret = 0;
+ unsigned j;
+ u64 i;
- ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
- BTREE_INSERT_ENTRY(&iter, &k.k_i));
- BUG_ON(ret);
+ for (i = 0; i < nr; i += ARRAY_SIZE(k)) {
+ for (j = 0; j < ARRAY_SIZE(k); j++) {
+ bkey_cookie_init(&k[j].k_i);
+ k[j].k.p.offset = test_rand();
+ k[j].k.p.snapshot = U32_MAX;
}
- bch2_btree_iter_unlock(&iter);
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[7].k_i, 0));
+ if (ret)
+ break;
}
+ bch2_trans_put(trans);
+ return ret;
}
-static void rand_delete(struct bch_fs *c, u64 nr)
+static int rand_lookup(struct bch_fs *c, u64 nr)
{
- struct bkey_i k;
- int ret;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
u64 i;
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), 0);
+
for (i = 0; i < nr; i++) {
- bkey_init(&k.k);
- k.k.p.offset = test_rand();
+ bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX));
- ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k,
- NULL, NULL, NULL, 0);
- BUG_ON(ret);
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
+ ret = bkey_err(k);
+ if (ret)
+ break;
}
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
}
-static void seq_insert(struct bch_fs *c, u64 nr)
+static int rand_mixed_trans(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i_cookie *cookie,
+ u64 i, u64 pos)
{
- struct btree_iter iter;
struct bkey_s_c k;
- struct bkey_i_cookie insert;
int ret;
- u64 i = 0;
- bkey_cookie_init(&insert.k_i);
-
- for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) {
- insert.k.p = iter.pos;
+ bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX));
- ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
- BTREE_INSERT_ENTRY(&iter, &insert.k_i));
- BUG_ON(ret);
+ k = bch2_btree_iter_peek(iter);
+ ret = bkey_err(k);
+ bch_err_msg(trans->c, ret, "lookup error");
+ if (ret)
+ return ret;
- if (++i == nr)
- break;
+ if (!(i & 3) && k.k) {
+ bkey_cookie_init(&cookie->k_i);
+ cookie->k.p = iter->pos;
+ ret = bch2_trans_update(trans, iter, &cookie->k_i, 0);
}
- bch2_btree_iter_unlock(&iter);
+
+ return ret;
}
-static void seq_lookup(struct bch_fs *c, u64 nr)
+static int rand_mixed(struct bch_fs *c, u64 nr)
{
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
- struct bkey_s_c k;
+ struct bkey_i_cookie cookie;
+ int ret = 0;
+ u64 i, rand;
- for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, 0, k)
- ;
- bch2_btree_iter_unlock(&iter);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), 0);
+
+ for (i = 0; i < nr; i++) {
+ rand = test_rand();
+ ret = commit_do(trans, NULL, NULL, 0,
+ rand_mixed_trans(trans, &iter, &cookie, i, rand));
+ if (ret)
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
}
-static void seq_overwrite(struct bch_fs *c, u64 nr)
+static int __do_delete(struct btree_trans *trans, struct bpos pos)
{
struct btree_iter iter;
struct bkey_s_c k;
- int ret;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
+ BTREE_ITER_intent);
+ k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX));
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k)
+ goto err;
+
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
- for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN,
- BTREE_ITER_INTENT, k) {
- struct bkey_i_cookie u;
+static int rand_delete(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ int ret = 0;
+ u64 i;
- bkey_reassemble(&u.k_i, k);
+ for (i = 0; i < nr; i++) {
+ struct bpos pos = SPOS(0, test_rand(), U32_MAX);
- ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
- BTREE_INSERT_ENTRY(&iter, &u.k_i));
- BUG_ON(ret);
+ ret = commit_do(trans, NULL, NULL, 0,
+ __do_delete(trans, pos));
+ if (ret)
+ break;
}
- bch2_btree_iter_unlock(&iter);
+
+ bch2_trans_put(trans);
+ return ret;
}
-static void seq_delete(struct bch_fs *c, u64 nr)
+static int seq_insert(struct bch_fs *c, u64 nr)
{
- int ret;
+ struct bkey_i_cookie insert;
- ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
- POS(0, 0), POS(0, U64_MAX),
- ZERO_VERSION, NULL, NULL, NULL);
- BUG_ON(ret);
+ bkey_cookie_init(&insert.k_i);
+
+ return bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX),
+ BTREE_ITER_slots|BTREE_ITER_intent, k,
+ NULL, NULL, 0, ({
+ if (iter.pos.offset >= nr)
+ break;
+ insert.k.p = iter.pos;
+ bch2_trans_update(trans, &iter, &insert.k_i, 0);
+ })));
+}
+
+static int seq_lookup(struct bch_fs *c, u64 nr)
+{
+ return bch2_trans_run(c,
+ for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k,
+ 0));
+}
+
+static int seq_overwrite(struct bch_fs *c, u64 nr)
+{
+ return bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX),
+ BTREE_ITER_intent, k,
+ NULL, NULL, 0, ({
+ struct bkey_i_cookie u;
+
+ bkey_reassemble(&u.k_i, k);
+ bch2_trans_update(trans, &iter, &u.k_i, 0);
+ })));
+}
+
+static int seq_delete(struct bch_fs *c, u64 nr)
+{
+ return bch2_btree_delete_range(c, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX),
+ POS(0, U64_MAX),
+ 0, NULL);
}
-typedef void (*perf_test_fn)(struct bch_fs *, u64);
+typedef int (*perf_test_fn)(struct bch_fs *, u64);
struct test_job {
struct bch_fs *c;
@@ -440,11 +770,13 @@ struct test_job {
u64 start;
u64 finish;
+ int ret;
};
static int btree_perf_test_thread(void *data)
{
struct test_job *j = data;
+ int ret;
if (atomic_dec_and_test(&j->ready)) {
wake_up(&j->ready_wait);
@@ -453,7 +785,11 @@ static int btree_perf_test_thread(void *data)
wait_event(j->ready_wait, !atomic_read(&j->ready));
}
- j->fn(j->c, j->nr / j->nr_threads);
+ ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads));
+ if (ret) {
+ bch_err(j->c, "%ps: error %s", j->fn, bch2_err_str(ret));
+ j->ret = ret;
+ }
if (atomic_dec_and_test(&j->done)) {
j->finish = sched_clock();
@@ -463,14 +799,21 @@ static int btree_perf_test_thread(void *data)
return 0;
}
-void bch2_btree_perf_test(struct bch_fs *c, const char *testname,
- u64 nr, unsigned nr_threads)
+int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
+ u64 nr, unsigned nr_threads)
{
struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads };
- char name_buf[20], nr_buf[20], per_sec_buf[20];
+ char name_buf[20];
+ struct printbuf nr_buf = PRINTBUF;
+ struct printbuf per_sec_buf = PRINTBUF;
unsigned i;
u64 time;
+ if (nr == 0 || nr_threads == 0) {
+ pr_err("nr of iterations or threads is not allowed to be 0");
+ return -EINVAL;
+ }
+
atomic_set(&j.ready, nr_threads);
init_waitqueue_head(&j.ready_wait);
@@ -481,6 +824,7 @@ void bch2_btree_perf_test(struct bch_fs *c, const char *testname,
if (!strcmp(testname, #_test)) j.fn = _test
perf_test(rand_insert);
+ perf_test(rand_insert_multi);
perf_test(rand_lookup);
perf_test(rand_mixed);
perf_test(rand_delete);
@@ -497,10 +841,20 @@ void bch2_btree_perf_test(struct bch_fs *c, const char *testname,
perf_test(test_iterate_extents);
perf_test(test_iterate_slots);
perf_test(test_iterate_slots_extents);
+ perf_test(test_peek_end);
+ perf_test(test_peek_end_extents);
+
+ perf_test(test_extent_overwrite_front);
+ perf_test(test_extent_overwrite_back);
+ perf_test(test_extent_overwrite_middle);
+ perf_test(test_extent_overwrite_all);
+ perf_test(test_extent_create_overlapping);
+
+ perf_test(test_snapshots);
if (!j.fn) {
pr_err("unknown test %s", testname);
- return;
+ return -EINVAL;
}
//pr_info("running test %s:", testname);
@@ -518,13 +872,16 @@ void bch2_btree_perf_test(struct bch_fs *c, const char *testname,
time = j.finish - j.start;
scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
- bch2_hprint(nr_buf, nr);
- bch2_hprint(per_sec_buf, nr * NSEC_PER_SEC / time);
+ prt_human_readable_u64(&nr_buf, nr);
+ prt_human_readable_u64(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time));
printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
- name_buf, nr_buf, nr_threads,
- time / NSEC_PER_SEC,
- time * nr_threads / nr,
- per_sec_buf);
+ name_buf, nr_buf.buf, nr_threads,
+ div_u64(time, NSEC_PER_SEC),
+ div_u64(time * nr_threads, nr),
+ per_sec_buf.buf);
+ printbuf_exit(&per_sec_buf);
+ printbuf_exit(&nr_buf);
+ return j.ret;
}
#endif /* CONFIG_BCACHEFS_TESTS */
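Every unit and perf test above now follows the same int-returning transaction shape: get a transaction, retry the update inside commit_do(), report failures with bch_err_msg(), and release the iterator and transaction on every exit path. Reduced to its skeleton, with the key setup elided:

static int example_test(struct bch_fs *c, u64 nr)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_i_cookie k;
	int ret;

	bkey_cookie_init(&k.k_i);
	k.k.p.snapshot = U32_MAX;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, BTREE_ITER_intent);

	/* commit_do() retries the whole expression on lock restarts */
	ret = commit_do(trans, NULL, NULL, 0,
			bch2_btree_iter_traverse(&iter) ?:
			bch2_trans_update(trans, &iter, &k.k_i, 0));
	bch_err_msg(c, ret, "update error");

	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);
	return ret;
}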
diff --git a/libbcachefs/tests.h b/libbcachefs/tests.h
index 3f1b8d1f..c73b18ae 100644
--- a/libbcachefs/tests.h
+++ b/libbcachefs/tests.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_TEST_H
#define _BCACHEFS_TEST_H
@@ -5,7 +6,7 @@ struct bch_fs;
#ifdef CONFIG_BCACHEFS_TESTS
-void bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
+int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
#else
diff --git a/libbcachefs/thread_with_file.c b/libbcachefs/thread_with_file.c
new file mode 100644
index 00000000..dea73bc1
--- /dev/null
+++ b/libbcachefs/thread_with_file.c
@@ -0,0 +1,492 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "thread_with_file.h"
+
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <linux/poll.h>
+#include <linux/sched/sysctl.h>
+
+void bch2_thread_with_file_exit(struct thread_with_file *thr)
+{
+ if (thr->task) {
+ kthread_stop(thr->task);
+ put_task_struct(thr->task);
+ }
+}
+
+int bch2_run_thread_with_file(struct thread_with_file *thr,
+ const struct file_operations *fops,
+ int (*fn)(void *))
+{
+ struct file *file = NULL;
+ int ret, fd = -1;
+ unsigned fd_flags = O_CLOEXEC;
+
+ if (fops->read && fops->write)
+ fd_flags |= O_RDWR;
+ else if (fops->read)
+ fd_flags |= O_RDONLY;
+ else if (fops->write)
+ fd_flags |= O_WRONLY;
+
+ char name[TASK_COMM_LEN];
+ get_task_comm(name, current);
+
+ thr->ret = 0;
+ thr->task = kthread_create(fn, thr, "%s", name);
+ ret = PTR_ERR_OR_ZERO(thr->task);
+ if (ret)
+ return ret;
+
+ ret = get_unused_fd_flags(fd_flags);
+ if (ret < 0)
+ goto err;
+ fd = ret;
+
+ file = anon_inode_getfile(name, fops, thr, fd_flags);
+ ret = PTR_ERR_OR_ZERO(file);
+ if (ret)
+ goto err;
+
+ get_task_struct(thr->task);
+ wake_up_process(thr->task);
+ fd_install(fd, file);
+ return fd;
+err:
+ if (fd >= 0)
+ put_unused_fd(fd);
+ if (thr->task)
+ kthread_stop(thr->task);
+ return ret;
+}
+
+/* stdio_redirect */
+
+static bool stdio_redirect_has_more_input(struct stdio_redirect *stdio, size_t seen)
+{
+ return stdio->input.buf.nr > seen || stdio->done;
+}
+
+static bool stdio_redirect_has_input(struct stdio_redirect *stdio)
+{
+ return stdio_redirect_has_more_input(stdio, 0);
+}
+
+static bool stdio_redirect_has_output(struct stdio_redirect *stdio)
+{
+ return stdio->output.buf.nr || stdio->done;
+}
+
+#define STDIO_REDIRECT_BUFSIZE 4096
+
+static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio)
+{
+ return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
+}
+
+static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio)
+{
+ return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
+}
+
+static void stdio_buf_init(struct stdio_buf *buf)
+{
+ spin_lock_init(&buf->lock);
+ init_waitqueue_head(&buf->wait);
+ darray_init(&buf->buf);
+}
+
+/* thread_with_stdio */
+
+static void thread_with_stdio_done(struct thread_with_stdio *thr)
+{
+ thr->thr.done = true;
+ thr->stdio.done = true;
+ wake_up(&thr->stdio.input.wait);
+ wake_up(&thr->stdio.output.wait);
+}
+
+static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf,
+ size_t len, loff_t *ppos)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+ struct stdio_buf *buf = &thr->stdio.output;
+ size_t copied = 0, b;
+ int ret = 0;
+
+ if (!(file->f_flags & O_NONBLOCK)) {
+ ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio));
+ if (ret)
+ return ret;
+ } else if (!stdio_redirect_has_output(&thr->stdio))
+ return -EAGAIN;
+
+ while (len && buf->buf.nr) {
+ if (fault_in_writeable(ubuf, len) == len) {
+ ret = -EFAULT;
+ break;
+ }
+
+ spin_lock_irq(&buf->lock);
+ b = min_t(size_t, len, buf->buf.nr);
+
+ if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) {
+ ubuf += b;
+ len -= b;
+ copied += b;
+ buf->buf.nr -= b;
+ memmove(buf->buf.data,
+ buf->buf.data + b,
+ buf->buf.nr);
+ }
+ spin_unlock_irq(&buf->lock);
+ }
+
+ return copied ?: ret;
+}
+
+static int thread_with_stdio_release(struct inode *inode, struct file *file)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ thread_with_stdio_done(thr);
+ bch2_thread_with_file_exit(&thr->thr);
+ darray_exit(&thr->stdio.input.buf);
+ darray_exit(&thr->stdio.output.buf);
+ thr->ops->exit(thr);
+ return 0;
+}
+
+static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf,
+ size_t len, loff_t *ppos)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+ struct stdio_buf *buf = &thr->stdio.input;
+ size_t copied = 0;
+ ssize_t ret = 0;
+
+ while (len) {
+ if (thr->thr.done) {
+ ret = -EPIPE;
+ break;
+ }
+
+ size_t b = len - fault_in_readable(ubuf, len);
+ if (!b) {
+ ret = -EFAULT;
+ break;
+ }
+
+ spin_lock(&buf->lock);
+ size_t makeroom = b;
+ if (!buf->waiting_for_line || memchr(buf->buf.data, '\n', buf->buf.nr))
+ makeroom = min_t(ssize_t, makeroom,
+ max_t(ssize_t, STDIO_REDIRECT_BUFSIZE - buf->buf.nr,
+ 0));
+ darray_make_room_gfp(&buf->buf, makeroom, GFP_NOWAIT);
+
+ b = min(len, darray_room(buf->buf));
+
+ if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) {
+ buf->buf.nr += b;
+ ubuf += b;
+ len -= b;
+ copied += b;
+ }
+ spin_unlock(&buf->lock);
+
+ if (b) {
+ wake_up(&buf->wait);
+ } else {
+ if ((file->f_flags & O_NONBLOCK)) {
+ ret = -EAGAIN;
+ break;
+ }
+
+ ret = wait_event_interruptible(buf->wait,
+ stdio_redirect_has_input_space(&thr->stdio));
+ if (ret)
+ break;
+ }
+ }
+
+ return copied ?: ret;
+}
+
+static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_struct *wait)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ poll_wait(file, &thr->stdio.output.wait, wait);
+ poll_wait(file, &thr->stdio.input.wait, wait);
+
+ __poll_t mask = 0;
+
+ if (stdio_redirect_has_output(&thr->stdio))
+ mask |= EPOLLIN;
+ if (stdio_redirect_has_input_space(&thr->stdio))
+ mask |= EPOLLOUT;
+ if (thr->thr.done)
+ mask |= EPOLLHUP|EPOLLERR;
+ return mask;
+}
+
+static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ poll_wait(file, &thr->stdio.output.wait, wait);
+
+ __poll_t mask = 0;
+
+ if (stdio_redirect_has_output(&thr->stdio))
+ mask |= EPOLLIN;
+ if (thr->thr.done)
+ mask |= EPOLLHUP|EPOLLERR;
+ return mask;
+}
+
+static int thread_with_stdio_flush(struct file *file, fl_owner_t id)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ return thr->thr.ret;
+}
+
+static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ if (thr->ops->unlocked_ioctl)
+ return thr->ops->unlocked_ioctl(thr, cmd, p);
+ return -ENOTTY;
+}
+
+static const struct file_operations thread_with_stdio_fops = {
+ .read = thread_with_stdio_read,
+ .write = thread_with_stdio_write,
+ .poll = thread_with_stdio_poll,
+ .flush = thread_with_stdio_flush,
+ .release = thread_with_stdio_release,
+ .unlocked_ioctl = thread_with_stdio_ioctl,
+};
+
+static const struct file_operations thread_with_stdout_fops = {
+ .read = thread_with_stdio_read,
+ .poll = thread_with_stdout_poll,
+ .flush = thread_with_stdio_flush,
+ .release = thread_with_stdio_release,
+ .unlocked_ioctl = thread_with_stdio_ioctl,
+};
+
+static int thread_with_stdio_fn(void *arg)
+{
+ struct thread_with_stdio *thr = arg;
+
+ thr->thr.ret = thr->ops->fn(thr);
+
+ thread_with_stdio_done(thr);
+ return 0;
+}
+
+void bch2_thread_with_stdio_init(struct thread_with_stdio *thr,
+ const struct thread_with_stdio_ops *ops)
+{
+ stdio_buf_init(&thr->stdio.input);
+ stdio_buf_init(&thr->stdio.output);
+ thr->ops = ops;
+}
+
+int __bch2_run_thread_with_stdio(struct thread_with_stdio *thr)
+{
+ return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn);
+}
+
+int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
+ const struct thread_with_stdio_ops *ops)
+{
+ bch2_thread_with_stdio_init(thr, ops);
+
+ return __bch2_run_thread_with_stdio(thr);
+}
+
+int bch2_run_thread_with_stdout(struct thread_with_stdio *thr,
+ const struct thread_with_stdio_ops *ops)
+{
+ stdio_buf_init(&thr->stdio.input);
+ stdio_buf_init(&thr->stdio.output);
+ thr->ops = ops;
+
+ return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn);
+}
+EXPORT_SYMBOL_GPL(bch2_run_thread_with_stdout);
+
+int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len)
+{
+ struct stdio_buf *buf = &stdio->input;
+
+ /*
+	 * we're waiting on user input (or for the file descriptor to be
+	 * closed); we don't want a hung task warning:
+ */
+ do {
+ wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
+ sysctl_hung_task_timeout_secs * HZ / 2);
+ } while (!stdio_redirect_has_input(stdio));
+
+ if (stdio->done)
+ return -1;
+
+ spin_lock(&buf->lock);
+ int ret = min(len, buf->buf.nr);
+ buf->buf.nr -= ret;
+ memcpy(ubuf, buf->buf.data, ret);
+ memmove(buf->buf.data,
+ buf->buf.data + ret,
+ buf->buf.nr);
+ spin_unlock(&buf->lock);
+
+ wake_up(&buf->wait);
+ return ret;
+}
+
+int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *stdio,
+ darray_char *line,
+ unsigned long timeout)
+{
+ unsigned long until = jiffies + timeout, t;
+ struct stdio_buf *buf = &stdio->input;
+ size_t seen = 0;
+again:
+ t = timeout != MAX_SCHEDULE_TIMEOUT
+ ? max_t(long, until - jiffies, 0)
+ : timeout;
+
+ t = min(t, sysctl_hung_task_timeout_secs * HZ / 2);
+
+ wait_event_timeout(buf->wait, stdio_redirect_has_more_input(stdio, seen), t);
+
+ if (stdio->done)
+ return -1;
+
+ spin_lock(&buf->lock);
+ seen = buf->buf.nr;
+ char *n = memchr(buf->buf.data, '\n', seen);
+
+ if (!n && timeout != MAX_SCHEDULE_TIMEOUT && time_after_eq(jiffies, until)) {
+ spin_unlock(&buf->lock);
+ return -ETIME;
+ }
+
+ if (!n) {
+ buf->waiting_for_line = true;
+ spin_unlock(&buf->lock);
+ goto again;
+ }
+
+ size_t b = n + 1 - buf->buf.data;
+ if (b > line->size) {
+ spin_unlock(&buf->lock);
+ int ret = darray_resize(line, b);
+ if (ret)
+ return ret;
+ seen = 0;
+ goto again;
+ }
+
+ buf->buf.nr -= b;
+ memcpy(line->data, buf->buf.data, b);
+ memmove(buf->buf.data,
+ buf->buf.data + b,
+ buf->buf.nr);
+ line->nr = b;
+
+ buf->waiting_for_line = false;
+ spin_unlock(&buf->lock);
+
+ wake_up(&buf->wait);
+ return 0;
+}
+
+int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, darray_char *line)
+{
+ return bch2_stdio_redirect_readline_timeout(stdio, line, MAX_SCHEDULE_TIMEOUT);
+}
+
+__printf(3, 0)
+static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args)
+{
+ ssize_t ret;
+
+ do {
+ va_list args2;
+ size_t len;
+
+ va_copy(args2, args);
+ len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2);
+ va_end(args2);
+
+ if (len + 1 <= darray_room(*out)) {
+ out->nr += len;
+ return len;
+ }
+
+ ret = darray_make_room_gfp(out, len + 1, gfp);
+ } while (ret == 0);
+
+ return ret;
+}
+
+ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking,
+ const char *fmt, va_list args)
+{
+ struct stdio_buf *buf = &stdio->output;
+ unsigned long flags;
+ ssize_t ret;
+
+again:
+ spin_lock_irqsave(&buf->lock, flags);
+ ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args);
+ spin_unlock_irqrestore(&buf->lock, flags);
+
+ if (ret < 0) {
+ if (nonblocking)
+ return -EAGAIN;
+
+ ret = wait_event_interruptible(buf->wait,
+ stdio_redirect_has_output_space(stdio));
+ if (ret)
+ return ret;
+ goto again;
+ }
+
+ wake_up(&buf->wait);
+ return ret;
+}
+
+ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking,
+ const char *fmt, ...)
+{
+ va_list args;
+ ssize_t ret;
+
+ va_start(args, fmt);
+ ret = bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args);
+ va_end(args);
+
+ return ret;
+}
+
+#endif /* NO_BCACHEFS_FS */
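Inside the kthread, an ops->fn implementation only sees the struct stdio_redirect: bch2_stdio_redirect_printf() fills the output buffer that the fd's read() drains, and bch2_stdio_redirect_readline() blocks until the write() side has supplied a full line. A hypothetical fn built on the helpers above, echoing a byte count per line:

static int example_stdio_fn(struct thread_with_stdio *thr)
{
	darray_char line = {};
	ssize_t ret = 0;

	/* readline returns 0 per complete line, nonzero once the fd is closed */
	while (!bch2_stdio_redirect_readline(&thr->stdio, &line)) {
		ret = bch2_stdio_redirect_printf(&thr->stdio, false,
						 "got %zu bytes\n", line.nr);
		if (ret < 0)
			break;
	}

	darray_exit(&line);
	return ret < 0 ? ret : 0;
}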
diff --git a/libbcachefs/thread_with_file.h b/libbcachefs/thread_with_file.h
new file mode 100644
index 00000000..72497b92
--- /dev/null
+++ b/libbcachefs/thread_with_file.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_THREAD_WITH_FILE_H
+#define _BCACHEFS_THREAD_WITH_FILE_H
+
+#include "thread_with_file_types.h"
+
+/*
+ * Thread with file: Run a kthread and connect it to a file descriptor, so that
+ * it can be interacted with via fd read/write methods and closing the file
+ * descriptor stops the kthread.
+ *
+ * We have two different APIs:
+ *
+ * thread_with_file, the low level version.
+ * You get to define the full file_operations, including your release function,
+ * which means that you must call bch2_thread_with_file_exit() from your
+ * .release method
+ *
+ * thread_with_stdio, the higher level version
+ * This implements full piping of input and output, including .poll.
+ *
+ * Notes on behaviour:
+ * - kthread shutdown behaves like writing or reading from a pipe that has been
+ * closed
+ * - Input and output buffers are 4096 bytes, although buffers may in some
+ * situations slightly exceed that limit so as to avoid chopping off a
+ * message in the middle in nonblocking mode.
+ * - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations -
+ * should be fine but might change in future revisions.
+ * - Output buffer may grow past 4096 bytes to deal with messages that are
+ * bigger than 4096 bytes
+ * - Writing may be done blocking or nonblocking; in nonblocking mode, we only
+ * drop entire messages.
+ *
+ * To write, use stdio_redirect_printf()
+ * To read, use stdio_redirect_read() or stdio_redirect_readline()
+ */
+
+struct task_struct;
+
+struct thread_with_file {
+ struct task_struct *task;
+ int ret;
+ bool done;
+};
+
+void bch2_thread_with_file_exit(struct thread_with_file *);
+int bch2_run_thread_with_file(struct thread_with_file *,
+ const struct file_operations *,
+ int (*fn)(void *));
+
+struct thread_with_stdio;
+
+struct thread_with_stdio_ops {
+ void (*exit)(struct thread_with_stdio *);
+ int (*fn)(struct thread_with_stdio *);
+ long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long);
+};
+
+struct thread_with_stdio {
+ struct thread_with_file thr;
+ struct stdio_redirect stdio;
+ const struct thread_with_stdio_ops *ops;
+};
+
+void bch2_thread_with_stdio_init(struct thread_with_stdio *,
+ const struct thread_with_stdio_ops *);
+int __bch2_run_thread_with_stdio(struct thread_with_stdio *);
+int bch2_run_thread_with_stdio(struct thread_with_stdio *,
+ const struct thread_with_stdio_ops *);
+int bch2_run_thread_with_stdout(struct thread_with_stdio *,
+ const struct thread_with_stdio_ops *);
+int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t);
+
+int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *, darray_char *, unsigned long);
+int bch2_stdio_redirect_readline(struct stdio_redirect *, darray_char *);
+
+__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list);
+__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...);
+
+#endif /* _BCACHEFS_THREAD_WITH_FILE_H */
diff --git a/libbcachefs/thread_with_file_types.h b/libbcachefs/thread_with_file_types.h
new file mode 100644
index 00000000..f4d484d4
--- /dev/null
+++ b/libbcachefs/thread_with_file_types.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H
+#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H
+
+#include "darray.h"
+
+struct stdio_buf {
+ spinlock_t lock;
+ wait_queue_head_t wait;
+ darray_char buf;
+ bool waiting_for_line;
+};
+
+struct stdio_redirect {
+ struct stdio_buf input;
+ struct stdio_buf output;
+ bool done;
+};
+
+#endif /* _BCACHEFS_THREAD_WITH_FILE_TYPES_H */
diff --git a/libbcachefs/time_stats.c b/libbcachefs/time_stats.c
new file mode 100644
index 00000000..3fe82757
--- /dev/null
+++ b/libbcachefs/time_stats.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+
+#include "eytzinger.h"
+#include "time_stats.h"
+
+static const struct time_unit time_units[] = {
+ { "ns", 1 },
+ { "us", NSEC_PER_USEC },
+ { "ms", NSEC_PER_MSEC },
+ { "s", NSEC_PER_SEC },
+ { "m", (u64) NSEC_PER_SEC * 60},
+ { "h", (u64) NSEC_PER_SEC * 3600},
+ { "d", (u64) NSEC_PER_SEC * 3600 * 24},
+ { "w", (u64) NSEC_PER_SEC * 3600 * 24 * 7},
+ { "y", (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */
+ { "eon", U64_MAX },
+};
+
+const struct time_unit *bch2_pick_time_units(u64 ns)
+{
+ const struct time_unit *u;
+
+ for (u = time_units;
+ u + 1 < time_units + ARRAY_SIZE(time_units) &&
+ ns >= u[1].nsecs << 1;
+ u++)
+ ;
+
+ return u;
+}
+
+static void quantiles_update(struct quantiles *q, u64 v)
+{
+ unsigned i = 0;
+
+ while (i < ARRAY_SIZE(q->entries)) {
+ struct quantile_entry *e = q->entries + i;
+
+ if (unlikely(!e->step)) {
+ e->m = v;
+ e->step = max_t(unsigned, v / 2, 1024);
+ } else if (e->m > v) {
+ e->m = e->m >= e->step
+ ? e->m - e->step
+ : 0;
+ } else if (e->m < v) {
+ e->m = e->m + e->step > e->m
+ ? e->m + e->step
+ : U32_MAX;
+ }
+
+ if ((e->m > v ? e->m - v : v - e->m) < e->step)
+ e->step = max_t(unsigned, e->step / 2, 1);
+
+ if (v >= e->m)
+ break;
+
+ i = eytzinger0_child(i, v > e->m);
+ }
+}
+
+static inline void time_stats_update_one(struct bch2_time_stats *stats,
+ u64 start, u64 end)
+{
+ u64 duration, freq;
+ bool initted = stats->last_event != 0;
+
+ if (time_after64(end, start)) {
+ struct quantiles *quantiles = time_stats_to_quantiles(stats);
+
+ duration = end - start;
+ mean_and_variance_update(&stats->duration_stats, duration);
+ mean_and_variance_weighted_update(&stats->duration_stats_weighted,
+ duration, initted, TIME_STATS_MV_WEIGHT);
+ stats->max_duration = max(stats->max_duration, duration);
+ stats->min_duration = min(stats->min_duration, duration);
+ stats->total_duration += duration;
+
+ if (quantiles)
+ quantiles_update(quantiles, duration);
+ }
+
+ if (stats->last_event && time_after64(end, stats->last_event)) {
+ freq = end - stats->last_event;
+ mean_and_variance_update(&stats->freq_stats, freq);
+ mean_and_variance_weighted_update(&stats->freq_stats_weighted,
+ freq, initted, TIME_STATS_MV_WEIGHT);
+ stats->max_freq = max(stats->max_freq, freq);
+ stats->min_freq = min(stats->min_freq, freq);
+ }
+
+ stats->last_event = end;
+}
+
+void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
+ struct time_stat_buffer *b)
+{
+ for (struct time_stat_buffer_entry *i = b->entries;
+ i < b->entries + ARRAY_SIZE(b->entries);
+ i++)
+ time_stats_update_one(stats, i->start, i->end);
+ b->nr = 0;
+}
+
+static noinline void time_stats_clear_buffer(struct bch2_time_stats *stats,
+ struct time_stat_buffer *b)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&stats->lock, flags);
+ __bch2_time_stats_clear_buffer(stats, b);
+ spin_unlock_irqrestore(&stats->lock, flags);
+}
+
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
+{
+ unsigned long flags;
+
+ if (!stats->buffer) {
+ spin_lock_irqsave(&stats->lock, flags);
+ time_stats_update_one(stats, start, end);
+
+ if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
+ stats->duration_stats.n > 1024)
+ stats->buffer =
+ alloc_percpu_gfp(struct time_stat_buffer,
+ GFP_ATOMIC);
+ spin_unlock_irqrestore(&stats->lock, flags);
+ } else {
+ struct time_stat_buffer *b;
+
+ preempt_disable();
+ b = this_cpu_ptr(stats->buffer);
+
+ BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
+ b->entries[b->nr++] = (struct time_stat_buffer_entry) {
+ .start = start,
+ .end = end
+ };
+
+ if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
+ time_stats_clear_buffer(stats, b);
+ preempt_enable();
+ }
+}
+
+void bch2_time_stats_reset(struct bch2_time_stats *stats)
+{
+ spin_lock_irq(&stats->lock);
+ unsigned offset = offsetof(struct bch2_time_stats, min_duration);
+ memset((void *) stats + offset, 0, sizeof(*stats) - offset);
+
+ if (stats->buffer) {
+ int cpu;
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(stats->buffer, cpu)->nr = 0;
+ }
+ spin_unlock_irq(&stats->lock);
+}
+
+void bch2_time_stats_exit(struct bch2_time_stats *stats)
+{
+ free_percpu(stats->buffer);
+}
+
+void bch2_time_stats_init(struct bch2_time_stats *stats)
+{
+ memset(stats, 0, sizeof(*stats));
+ stats->min_duration = U64_MAX;
+ stats->min_freq = U64_MAX;
+ spin_lock_init(&stats->lock);
+}
diff --git a/libbcachefs/time_stats.h b/libbcachefs/time_stats.h
new file mode 100644
index 00000000..dc6493f7
--- /dev/null
+++ b/libbcachefs/time_stats.h
@@ -0,0 +1,160 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * bch2_time_stats - collect statistics on events that have a duration, with nicely
+ * formatted textual output on demand
+ *
+ * - percpu buffering of event collection: cheap enough to shotgun
+ * everywhere without worrying about overhead
+ *
+ * tracks:
+ * - number of events
+ * - maximum event duration ever seen
+ * - sum of all event durations
+ * - average event duration, standard and weighted
+ * - standard deviation of event durations, standard and weighted
+ * and analogous statistics for the frequency of events
+ *
+ * We provide both mean and weighted mean (exponentially weighted), and standard
+ * deviation and weighted standard deviation, to give an efficient-to-compute
+ * view of current behaviour versus average behaviour - "did this event source
+ * just become wonky, or is this typical?".
+ *
+ * Particularly useful for tracking down latency issues.
+ */
+#ifndef _BCACHEFS_TIME_STATS_H
+#define _BCACHEFS_TIME_STATS_H
+
+#include <linux/sched/clock.h>
+#include <linux/spinlock_types.h>
+#include <linux/string.h>
+
+#include "mean_and_variance.h"
+
+struct time_unit {
+ const char *name;
+ u64 nsecs;
+};
+
+/*
+ * given a nanosecond value, pick the preferred time units for printing:
+ */
+const struct time_unit *bch2_pick_time_units(u64 ns);
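+
+/*
+ * Sketch of pretty-printing a duration in the chosen unit; assumes a
+ * printbuf *out is in scope (prt_printf/div64_u64 come from elsewhere):
+ *
+ *	const struct time_unit *u = bch2_pick_time_units(ns);
+ *
+ *	prt_printf(out, "%llu %s", div64_u64(ns, u->nsecs), u->name);
+ */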
+
+/*
+ * quantiles - do not use:
+ *
+ * Only enabled if bch2_time_stats->have_quantiles has been set (via
+ * bch2_time_stats_quantiles_init()) - don't use in new code.
+ */
+
+#define NR_QUANTILES 15
+#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
+#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
+#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
+
+struct quantiles {
+ struct quantile_entry {
+ u64 m;
+ u64 step;
+ } entries[NR_QUANTILES];
+};
+
+struct time_stat_buffer {
+ unsigned nr;
+ struct time_stat_buffer_entry {
+ u64 start;
+ u64 end;
+ } entries[31];
+};
+
+struct bch2_time_stats {
+ spinlock_t lock;
+ bool have_quantiles;
+ struct time_stat_buffer __percpu *buffer;
+ /* all fields are in nanoseconds */
+ u64 min_duration;
+ u64 max_duration;
+ u64 total_duration;
+ u64 max_freq;
+ u64 min_freq;
+ u64 last_event;
+ u64 last_event_start;
+
+ struct mean_and_variance duration_stats;
+ struct mean_and_variance freq_stats;
+
+/* default weight for weighted mean and variance calculations */
+#define TIME_STATS_MV_WEIGHT 8
+
+ struct mean_and_variance_weighted duration_stats_weighted;
+ struct mean_and_variance_weighted freq_stats_weighted;
+};
+
+struct bch2_time_stats_quantiles {
+ struct bch2_time_stats stats;
+ struct quantiles quantiles;
+};
+
+static inline struct quantiles *time_stats_to_quantiles(struct bch2_time_stats *stats)
+{
+ return stats->have_quantiles
+ ? &container_of(stats, struct bch2_time_stats_quantiles, stats)->quantiles
+ : NULL;
+}
+
+void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *);
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
+
+/**
+ * bch2_time_stats_update - collect a new event being tracked
+ *
+ * @stats - bch2_time_stats to update
+ * @start - start time of event, recorded with local_clock()
+ *
+ * The end time of the event is taken to be the current time
+ */
+static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
+{
+ __bch2_time_stats_update(stats, start, local_clock());
+}
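+
+/*
+ * A minimal usage sketch, assuming a caller-owned struct (my_stats and
+ * do_work() are placeholders):
+ *
+ *	struct bch2_time_stats my_stats;
+ *
+ *	bch2_time_stats_init(&my_stats);
+ *
+ *	u64 start = local_clock();
+ *	do_work();
+ *	bch2_time_stats_update(&my_stats, start);
+ *
+ *	bch2_time_stats_exit(&my_stats);
+ */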
+
+/**
+ * track_event_change - track state change events
+ *
+ * @stats - bch2_time_stats to update
+ * @v - new state, true or false
+ *
+ * Use this when tracking time stats for state changes, e.g. resource X becoming
+ * blocked/unblocked.
+ */
+static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
+{
+ if (v != !!stats->last_event_start) {
+ if (!v) {
+ bch2_time_stats_update(stats, stats->last_event_start);
+ stats->last_event_start = 0;
+ } else {
+ stats->last_event_start = local_clock() ?: 1;
+ return true;
+ }
+ }
+
+ return false;
+}
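+
+/*
+ * Sketch of state-change tracking (blocked_stats is a placeholder): each
+ * false -> true edge records the start time, and the matching true -> false
+ * edge accounts one event whose duration is the time spent blocked:
+ *
+ *	track_event_change(&blocked_stats, true);	// resource became blocked
+ *	...
+ *	track_event_change(&blocked_stats, false);	// unblocked: duration recorded
+ */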
+
+void bch2_time_stats_reset(struct bch2_time_stats *);
+void bch2_time_stats_exit(struct bch2_time_stats *);
+void bch2_time_stats_init(struct bch2_time_stats *);
+
+static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq)
+{
+ bch2_time_stats_exit(&statq->stats);
+}
+static inline void bch2_time_stats_quantiles_init(struct bch2_time_stats_quantiles *statq)
+{
+ bch2_time_stats_init(&statq->stats);
+ statq->stats.have_quantiles = true;
+ memset(&statq->quantiles, 0, sizeof(statq->quantiles));
+}
+
+#endif /* _BCACHEFS_TIME_STATS_H */
diff --git a/libbcachefs/trace.c b/libbcachefs/trace.c
index 13f0fc24..dfad1d06 100644
--- a/libbcachefs/trace.c
+++ b/libbcachefs/trace.c
@@ -1,11 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_types.h"
#include "buckets.h"
-#include "btree_types.h"
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "btree_key_cache.h"
+#include "btree_locking.h"
+#include "btree_update_interior.h"
#include "keylist.h"
+#include "move_types.h"
+#include "opts.h"
+#include "six.h"
#include <linux/blktrace_api.h>
-#include "keylist.h"
#define CREATE_TRACE_POINTS
-#include <trace/events/bcachefs.h>
+#include "trace.h"
diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h
new file mode 100644
index 00000000..2d5932d2
--- /dev/null
+++ b/libbcachefs/trace.h
@@ -0,0 +1,1903 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM bcachefs
+
+#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ)
+
+#include <linux/tracepoint.h>
+
+#define TRACE_BPOS_entries(name) \
+ __field(u64, name##_inode ) \
+ __field(u64, name##_offset ) \
+ __field(u32, name##_snapshot )
+
+#define TRACE_BPOS_assign(dst, src) \
+ __entry->dst##_inode = (src).inode; \
+ __entry->dst##_offset = (src).offset; \
+ __entry->dst##_snapshot = (src).snapshot
+
+DECLARE_EVENT_CLASS(bpos,
+ TP_PROTO(const struct bpos *p),
+ TP_ARGS(p),
+
+ TP_STRUCT__entry(
+ TRACE_BPOS_entries(p)
+ ),
+
+ TP_fast_assign(
+ TRACE_BPOS_assign(p, *p);
+ ),
+
+ TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot)
+);
+
+DECLARE_EVENT_CLASS(fs_str,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __string(str, str )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __assign_str(str);
+ ),
+
+ TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
+);
+
+DECLARE_EVENT_CLASS(trans_str,
+ TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
+ TP_ARGS(trans, caller_ip, str),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __string(str, str )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = trans->c->dev;
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __assign_str(str);
+ ),
+
+ TP_printk("%d,%d %s %pS %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->trans_fn, (void *) __entry->caller_ip, __get_str(str))
+);
+
+DECLARE_EVENT_CLASS(trans_str_nocaller,
+ TP_PROTO(struct btree_trans *trans, const char *str),
+ TP_ARGS(trans, str),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, trans_fn, 32 )
+ __string(str, str )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = trans->c->dev;
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __assign_str(str);
+ ),
+
+ TP_printk("%d,%d %s %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->trans_fn, __get_str(str))
+);
+
+DECLARE_EVENT_CLASS(btree_node_nofs,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u8, level )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->level = b->c.level;
+ __entry->btree_id = b->c.btree_id;
+ TRACE_BPOS_assign(pos, b->key.k.p);
+ ),
+
+ TP_printk("%d,%d %u %s %llu:%llu:%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->level,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
+);
+
+DECLARE_EVENT_CLASS(btree_node,
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, trans_fn, 32 )
+ __field(u8, level )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = trans->c->dev;
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->level = b->c.level;
+ __entry->btree_id = b->c.btree_id;
+ TRACE_BPOS_assign(pos, b->key.k.p);
+ ),
+
+ TP_printk("%d,%d %s %u %s %llu:%llu:%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn,
+ __entry->level,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
+);
+
+DECLARE_EVENT_CLASS(bch_fs,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ ),
+
+ TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
+);
+
+DECLARE_EVENT_CLASS(btree_trans,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, trans_fn, 32 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = trans->c->dev;
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ ),
+
+ TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn)
+);
+
+DECLARE_EVENT_CLASS(bio,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(sector_t, sector )
+ __field(unsigned int, nr_sector )
+ __array(char, rwbs, 6 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev ? bio_dev(bio) : 0;
+ __entry->sector = bio->bi_iter.bi_sector;
+ __entry->nr_sector = bio->bi_iter.bi_size >> 9;
+ blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+ (unsigned long long)__entry->sector, __entry->nr_sector)
+);
+
+/* fs.c: */
+TRACE_EVENT(bch2_sync_fs,
+ TP_PROTO(struct super_block *sb, int wait),
+
+ TP_ARGS(sb, wait),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( int, wait )
+
+ ),
+
+ TP_fast_assign(
+ __entry->dev = sb->s_dev;
+ __entry->wait = wait;
+ ),
+
+ TP_printk("dev %d,%d wait %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->wait)
+);
+
+/* fs-io.c: */
+TRACE_EVENT(bch2_fsync,
+ TP_PROTO(struct file *file, int datasync),
+
+ TP_ARGS(file, datasync),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( ino_t, ino )
+ __field( ino_t, parent )
+ __field( int, datasync )
+ ),
+
+ TP_fast_assign(
+ struct dentry *dentry = file->f_path.dentry;
+
+ __entry->dev = dentry->d_sb->s_dev;
+ __entry->ino = d_inode(dentry)->i_ino;
+ __entry->parent = d_inode(dentry->d_parent)->i_ino;
+ __entry->datasync = datasync;
+ ),
+
+ TP_printk("dev %d,%d ino %lu parent %lu datasync %d ",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long) __entry->ino,
+ (unsigned long) __entry->parent, __entry->datasync)
+);
+
+/* super-io.c: */
+TRACE_EVENT(write_super,
+ TP_PROTO(struct bch_fs *c, unsigned long ip),
+ TP_ARGS(c, ip),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(unsigned long, ip )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->ip = ip;
+ ),
+
+ TP_printk("%d,%d for %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (void *) __entry->ip)
+);
+
+/* io.c: */
+
+DEFINE_EVENT(bio, read_promote,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+TRACE_EVENT(read_nopromote,
+ TP_PROTO(struct bch_fs *c, int ret),
+ TP_ARGS(c, ret),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, ret, 32 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
+ ),
+
+ TP_printk("%d,%d ret %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ret)
+);
+
+DEFINE_EVENT(bio, read_bounce,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bio, read_split,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bio, read_retry,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bio, read_reuse_race,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+/* Journal */
+
+DEFINE_EVENT(bch_fs, journal_full,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(fs_str, journal_entry_full,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+DEFINE_EVENT(fs_str, journal_entry_close,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+DEFINE_EVENT(bio, journal_write,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+TRACE_EVENT(journal_reclaim_start,
+ TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
+ u64 min_nr, u64 min_key_cache,
+ u64 btree_cache_dirty, u64 btree_cache_total,
+ u64 btree_key_cache_dirty, u64 btree_key_cache_total),
+ TP_ARGS(c, direct, kicked, min_nr, min_key_cache,
+ btree_cache_dirty, btree_cache_total,
+ btree_key_cache_dirty, btree_key_cache_total),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(bool, direct )
+ __field(bool, kicked )
+ __field(u64, min_nr )
+ __field(u64, min_key_cache )
+ __field(u64, btree_cache_dirty )
+ __field(u64, btree_cache_total )
+ __field(u64, btree_key_cache_dirty )
+ __field(u64, btree_key_cache_total )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->direct = direct;
+ __entry->kicked = kicked;
+ __entry->min_nr = min_nr;
+ __entry->min_key_cache = min_key_cache;
+ __entry->btree_cache_dirty = btree_cache_dirty;
+ __entry->btree_cache_total = btree_cache_total;
+ __entry->btree_key_cache_dirty = btree_key_cache_dirty;
+ __entry->btree_key_cache_total = btree_key_cache_total;
+ ),
+
+ TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->direct,
+ __entry->kicked,
+ __entry->min_nr,
+ __entry->min_key_cache,
+ __entry->btree_cache_dirty,
+ __entry->btree_cache_total,
+ __entry->btree_key_cache_dirty,
+ __entry->btree_key_cache_total)
+);
+
+TRACE_EVENT(journal_reclaim_finish,
+ TP_PROTO(struct bch_fs *c, u64 nr_flushed),
+ TP_ARGS(c, nr_flushed),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, nr_flushed )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->nr_flushed = nr_flushed;
+ ),
+
+ TP_printk("%d,%d flushed %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->nr_flushed)
+);
+
+/* bset.c: */
+
+DEFINE_EVENT(bpos, bkey_pack_pos_fail,
+ TP_PROTO(const struct bpos *p),
+ TP_ARGS(p)
+);
+
+/* Btree cache: */
+
+TRACE_EVENT(btree_cache_scan,
+ TP_PROTO(long nr_to_scan, long can_free, long ret),
+ TP_ARGS(nr_to_scan, can_free, ret),
+
+ TP_STRUCT__entry(
+ __field(long, nr_to_scan )
+ __field(long, can_free )
+ __field(long, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->nr_to_scan = nr_to_scan;
+ __entry->can_free = can_free;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("scanned for %li nodes, can free %li, ret %li",
+ __entry->nr_to_scan, __entry->can_free, __entry->ret)
+);
+
+DEFINE_EVENT(btree_node_nofs, btree_cache_reap,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock_fail,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans)
+);
+
+DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans)
+);
+
+DEFINE_EVENT(btree_trans, btree_cache_cannibalize,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans)
+);
+
+DEFINE_EVENT(btree_trans, btree_cache_cannibalize_unlock,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans)
+);
+
+/* Btree */
+
+DEFINE_EVENT(btree_node, btree_node_read,
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
+);
+
+TRACE_EVENT(btree_node_write,
+ TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors),
+ TP_ARGS(b, bytes, sectors),
+
+ TP_STRUCT__entry(
+ __field(enum btree_node_type, type)
+ __field(unsigned, bytes )
+ __field(unsigned, sectors )
+ ),
+
+ TP_fast_assign(
+ __entry->type = btree_node_type(b);
+ __entry->bytes = bytes;
+ __entry->sectors = sectors;
+ ),
+
+ TP_printk("bkey type %u bytes %u sectors %u",
+ __entry->type, __entry->bytes, __entry->sectors)
+);
+
+DEFINE_EVENT(btree_node, btree_node_alloc,
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
+);
+
+DEFINE_EVENT(btree_node, btree_node_free,
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
+);
+
+TRACE_EVENT(btree_reserve_get_fail,
+ TP_PROTO(const char *trans_fn,
+ unsigned long caller_ip,
+ size_t required,
+ int ret),
+ TP_ARGS(trans_fn, caller_ip, required, ret),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(size_t, required )
+ __array(char, ret, 32 )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->required = required;
+ strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
+ ),
+
+ TP_printk("%s %pS required %zu ret %s",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ __entry->required,
+ __entry->ret)
+);
+
+DEFINE_EVENT(btree_node, btree_node_compact,
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
+);
+
+DEFINE_EVENT(btree_node, btree_node_merge,
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
+);
+
+DEFINE_EVENT(btree_node, btree_node_split,
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
+);
+
+DEFINE_EVENT(btree_node, btree_node_rewrite,
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
+);
+
+DEFINE_EVENT(btree_node, btree_node_set_root,
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
+);
+
+TRACE_EVENT(btree_path_relock_fail,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path,
+ unsigned level),
+ TP_ARGS(trans, caller_ip, path, level),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, btree_id )
+ __field(u8, level )
+ __field(u8, path_idx)
+ TRACE_BPOS_entries(pos)
+ __array(char, node, 24 )
+ __field(u8, self_read_count )
+ __field(u8, self_intent_count)
+ __field(u8, read_count )
+ __field(u8, intent_count )
+ __field(u32, iter_lock_seq )
+ __field(u32, node_lock_seq )
+ ),
+
+ TP_fast_assign(
+ struct btree *b = btree_path_node(path, level);
+ struct six_lock_count c;
+
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = path->btree_id;
+ __entry->level = level;
+ __entry->path_idx = path - trans->paths;
+ TRACE_BPOS_assign(pos, path->pos);
+
+ c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
+ __entry->self_read_count = c.n[SIX_LOCK_read];
+ __entry->self_intent_count = c.n[SIX_LOCK_intent];
+
+ if (IS_ERR(b)) {
+ strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node));
+ } else {
+ c = six_lock_counts(&path->l[level].b->c.lock);
+ __entry->read_count = c.n[SIX_LOCK_read];
+ __entry->intent_count = c.n[SIX_LOCK_intent];
+ scnprintf(__entry->node, sizeof(__entry->node), "%px", &b->c);
+ }
+ __entry->iter_lock_seq = path->l[level].lock_seq;
+ __entry->node_lock_seq = is_btree_node(path, level)
+ ? six_lock_seq(&path->l[level].b->c.lock)
+ : 0;
+ ),
+
+ TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ __entry->path_idx,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->level,
+ __entry->node,
+ __entry->self_read_count,
+ __entry->self_intent_count,
+ __entry->read_count,
+ __entry->intent_count,
+ __entry->iter_lock_seq,
+ __entry->node_lock_seq)
+);
+
+TRACE_EVENT(btree_path_upgrade_fail,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path,
+ unsigned level),
+ TP_ARGS(trans, caller_ip, path, level),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, btree_id )
+ __field(u8, level )
+ __field(u8, path_idx)
+ TRACE_BPOS_entries(pos)
+ __field(u8, locked )
+ __field(u8, self_read_count )
+ __field(u8, self_intent_count)
+ __field(u8, read_count )
+ __field(u8, intent_count )
+ __field(u32, iter_lock_seq )
+ __field(u32, node_lock_seq )
+ ),
+
+ TP_fast_assign(
+ struct six_lock_count c;
+
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = path->btree_id;
+ __entry->level = level;
+ __entry->path_idx = path - trans->paths;
+ TRACE_BPOS_assign(pos, path->pos);
+ __entry->locked = btree_node_locked(path, level);
+
+ c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
+ __entry->self_read_count = c.n[SIX_LOCK_read];
+ __entry->self_intent_count = c.n[SIX_LOCK_intent];
+ c = six_lock_counts(&path->l[level].b->c.lock);
+ __entry->read_count = c.n[SIX_LOCK_read];
+ __entry->intent_count = c.n[SIX_LOCK_intent];
+ __entry->iter_lock_seq = path->l[level].lock_seq;
+ __entry->node_lock_seq = is_btree_node(path, level)
+ ? six_lock_seq(&path->l[level].b->c.lock)
+ : 0;
+ ),
+
+ TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ __entry->path_idx,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->level,
+ __entry->locked,
+ __entry->self_read_count,
+ __entry->self_intent_count,
+ __entry->read_count,
+ __entry->intent_count,
+ __entry->iter_lock_seq,
+ __entry->node_lock_seq)
+);
+
+/* Garbage collection */
+
+DEFINE_EVENT(bch_fs, gc_gens_start,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, gc_gens_end,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+/* Allocator */
+
+DEFINE_EVENT(fs_str, bucket_alloc,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+DEFINE_EVENT(fs_str, bucket_alloc_fail,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+TRACE_EVENT(discard_buckets,
+ TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
+ u64 need_journal_commit, u64 discarded, const char *err),
+ TP_ARGS(c, seen, open, need_journal_commit, discarded, err),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, seen )
+ __field(u64, open )
+ __field(u64, need_journal_commit )
+ __field(u64, discarded )
+ __array(char, err, 16 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->seen = seen;
+ __entry->open = open;
+ __entry->need_journal_commit = need_journal_commit;
+ __entry->discarded = discarded;
+ strscpy(__entry->err, err, sizeof(__entry->err));
+ ),
+
+ TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->seen,
+ __entry->open,
+ __entry->need_journal_commit,
+ __entry->discarded,
+ __entry->err)
+);
+
+TRACE_EVENT(bucket_invalidate,
+ TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors),
+ TP_ARGS(c, dev, bucket, sectors),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u32, dev_idx )
+ __field(u32, sectors )
+ __field(u64, bucket )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->dev_idx = dev;
+ __entry->sectors = sectors;
+ __entry->bucket = bucket;
+ ),
+
+ TP_printk("%d:%d invalidated %u:%llu cached sectors %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dev_idx, __entry->bucket,
+ __entry->sectors)
+);
+
+/* Moving IO */
+
+TRACE_EVENT(bucket_evacuate,
+ TP_PROTO(struct bch_fs *c, struct bpos *bucket),
+ TP_ARGS(c, bucket),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u32, dev_idx )
+ __field(u64, bucket )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->dev_idx = bucket->inode;
+ __entry->bucket = bucket->offset;
+ ),
+
+ TP_printk("%d:%d %u:%llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dev_idx, __entry->bucket)
+);
+
+DEFINE_EVENT(fs_str, move_extent,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+DEFINE_EVENT(fs_str, move_extent_read,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+DEFINE_EVENT(fs_str, move_extent_write,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+DEFINE_EVENT(fs_str, move_extent_finish,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+DEFINE_EVENT(fs_str, move_extent_fail,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+DEFINE_EVENT(fs_str, move_extent_start_fail,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+TRACE_EVENT(move_data,
+ TP_PROTO(struct bch_fs *c,
+ struct bch_move_stats *stats),
+ TP_ARGS(c, stats),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, keys_moved )
+ __field(u64, keys_raced )
+ __field(u64, sectors_seen )
+ __field(u64, sectors_moved )
+ __field(u64, sectors_raced )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->keys_moved = atomic64_read(&stats->keys_moved);
+ __entry->keys_raced = atomic64_read(&stats->keys_raced);
+ __entry->sectors_seen = atomic64_read(&stats->sectors_seen);
+ __entry->sectors_moved = atomic64_read(&stats->sectors_moved);
+ __entry->sectors_raced = atomic64_read(&stats->sectors_raced);
+ ),
+
+ TP_printk("%d,%d keys moved %llu raced %llu"
+ "sectors seen %llu moved %llu raced %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->keys_moved,
+ __entry->keys_raced,
+ __entry->sectors_seen,
+ __entry->sectors_moved,
+ __entry->sectors_raced)
+);
+
+TRACE_EVENT(evacuate_bucket,
+ TP_PROTO(struct bch_fs *c, struct bpos *bucket,
+ unsigned sectors, unsigned bucket_size,
+ int ret),
+ TP_ARGS(c, bucket, sectors, bucket_size, ret),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, member )
+ __field(u64, bucket )
+ __field(u32, sectors )
+ __field(u32, bucket_size )
+ __field(int, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->member = bucket->inode;
+ __entry->bucket = bucket->offset;
+ __entry->sectors = sectors;
+ __entry->bucket_size = bucket_size;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->member, __entry->bucket,
+ __entry->sectors, __entry->bucket_size,
+ __entry->ret)
+);
+
+TRACE_EVENT(copygc,
+ TP_PROTO(struct bch_fs *c,
+ u64 sectors_moved, u64 sectors_not_moved,
+ u64 buckets_moved, u64 buckets_not_moved),
+ TP_ARGS(c,
+ sectors_moved, sectors_not_moved,
+ buckets_moved, buckets_not_moved),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, sectors_moved )
+ __field(u64, sectors_not_moved )
+ __field(u64, buckets_moved )
+ __field(u64, buckets_not_moved )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->sectors_moved = sectors_moved;
+ __entry->sectors_not_moved = sectors_not_moved;
+ __entry->buckets_moved = buckets_moved;
+ __entry->buckets_not_moved = buckets_not_moved;
+ ),
+
+ TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->sectors_moved, __entry->sectors_not_moved,
+ __entry->buckets_moved, __entry->buckets_not_moved)
+);
+
+TRACE_EVENT(copygc_wait,
+ TP_PROTO(struct bch_fs *c,
+ u64 wait_amount, u64 until),
+ TP_ARGS(c, wait_amount, until),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, wait_amount )
+ __field(u64, until )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->wait_amount = wait_amount;
+ __entry->until = until;
+ ),
+
+ TP_printk("%d,%u waiting for %llu sectors until %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->wait_amount, __entry->until)
+);
+
+/* btree transactions: */
+
+DECLARE_EVENT_CLASS(transaction_event,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ ),
+
+ TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
+);
+
+DEFINE_EVENT(transaction_event, transaction_commit,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_injected,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+TRACE_EVENT(trans_restart_split_race,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree *b),
+ TP_ARGS(trans, caller_ip, b),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, level )
+ __field(u16, written )
+ __field(u16, blocks )
+ __field(u16, u64s_remaining )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->level = b->c.level;
+ __entry->written = b->written;
+ __entry->blocks = btree_blocks(trans->c);
+ __entry->u64s_remaining = bch2_btree_keys_u64s_remaining(b);
+ ),
+
+ TP_printk("%s %pS l=%u written %u/%u u64s remaining %u",
+ __entry->trans_fn, (void *) __entry->caller_ip,
+ __entry->level,
+ __entry->written, __entry->blocks,
+ __entry->u64s_remaining)
+);
+
+TRACE_EVENT(trans_blocked_journal_reclaim,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+
+ __field(unsigned long, key_cache_nr_keys )
+ __field(unsigned long, key_cache_nr_dirty )
+ __field(long, must_wait )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->key_cache_nr_keys = atomic_long_read(&trans->c->btree_key_cache.nr_keys);
+ __entry->key_cache_nr_dirty = atomic_long_read(&trans->c->btree_key_cache.nr_dirty);
+ __entry->must_wait = __bch2_btree_key_cache_must_wait(trans->c);
+ ),
+
+ TP_printk("%s %pS key cache keys %lu dirty %lu must_wait %li",
+ __entry->trans_fn, (void *) __entry->caller_ip,
+ __entry->key_cache_nr_keys,
+ __entry->key_cache_nr_dirty,
+ __entry->must_wait)
+);
+
+TRACE_EVENT(trans_restart_journal_preres_get,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ unsigned flags),
+ TP_ARGS(trans, caller_ip, flags),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(unsigned, flags )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->flags = flags;
+ ),
+
+ TP_printk("%s %pS %x", __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ __entry->flags)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_fault_inject,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event, trans_traverse_all,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(trans_str, trans_restart_too_many_iters,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ const char *paths),
+ TP_ARGS(trans, caller_ip, paths)
+);
+
+DECLARE_EVENT_CLASS(transaction_restart_iter,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos)
+ ),
+
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+TRACE_EVENT(trans_restart_upgrade,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path,
+ unsigned old_locks_want,
+ unsigned new_locks_want,
+ struct get_locks_fail *f),
+ TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, btree_id )
+ __field(u8, old_locks_want )
+ __field(u8, new_locks_want )
+ __field(u8, level )
+ __field(u32, path_seq )
+ __field(u32, node_seq )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = path->btree_id;
+ __entry->old_locks_want = old_locks_want;
+ __entry->new_locks_want = new_locks_want;
+ __entry->level = f->l;
+ __entry->path_seq = path->l[f->l].lock_seq;
+ __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq;
+ TRACE_BPOS_assign(pos, path->pos)
+ ),
+
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->old_locks_want,
+ __entry->new_locks_want,
+ __entry->level,
+ __entry->path_seq,
+ __entry->node_seq)
+);
+
+DEFINE_EVENT(trans_str, trans_restart_relock,
+ TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
+ TP_ARGS(trans, caller_ip, str)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_key_cache_upgrade,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(trans_str_nocaller, trans_restart_would_deadlock,
+ TP_PROTO(struct btree_trans *trans,
+ const char *cycle),
+ TP_ARGS(trans, cycle)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+TRACE_EVENT(trans_restart_would_deadlock_write,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ ),
+
+ TP_printk("%s", __entry->trans_fn)
+);
+
+TRACE_EVENT(trans_restart_mem_realloced,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ unsigned long bytes),
+ TP_ARGS(trans, caller_ip, bytes),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(unsigned long, bytes )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->bytes = bytes;
+ ),
+
+ TP_printk("%s %pS bytes %lu",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ __entry->bytes)
+);
+
+TRACE_EVENT(trans_restart_key_cache_key_realloced,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path,
+ unsigned old_u64s,
+ unsigned new_u64s),
+ TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(enum btree_id, btree_id )
+ TRACE_BPOS_entries(pos)
+ __field(u32, old_u64s )
+ __field(u32, new_u64s )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos);
+ __entry->old_u64s = old_u64s;
+ __entry->new_u64s = new_u64s;
+ ),
+
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->old_u64s,
+ __entry->new_u64s)
+);
+
+TRACE_EVENT(path_downgrade,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path,
+ unsigned old_locks_want),
+ TP_ARGS(trans, caller_ip, path, old_locks_want),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(unsigned, old_locks_want )
+ __field(unsigned, new_locks_want )
+ __field(unsigned, btree )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->old_locks_want = old_locks_want;
+ __entry->new_locks_want = path->locks_want;
+ __entry->btree = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos);
+ ),
+
+ TP_printk("%s %pS locks_want %u -> %u %s %llu:%llu:%u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ __entry->old_locks_want,
+ __entry->new_locks_want,
+ bch2_btree_id_str(__entry->btree),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+TRACE_EVENT(write_buffer_flush,
+ TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast, size_t size),
+ TP_ARGS(trans, nr, skipped, fast, size),
+
+ TP_STRUCT__entry(
+ __field(size_t, nr )
+ __field(size_t, skipped )
+ __field(size_t, fast )
+ __field(size_t, size )
+ ),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->skipped = skipped;
+ __entry->fast = fast;
+ __entry->size = size;
+ ),
+
+ TP_printk("%zu/%zu skipped %zu fast %zu",
+ __entry->nr, __entry->size, __entry->skipped, __entry->fast)
+);
+
+TRACE_EVENT(write_buffer_flush_sync,
+ TP_PROTO(struct btree_trans *trans, unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ ),
+
+ TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
+);
+
+TRACE_EVENT(write_buffer_flush_slowpath,
+ TP_PROTO(struct btree_trans *trans, size_t slowpath, size_t total),
+ TP_ARGS(trans, slowpath, total),
+
+ TP_STRUCT__entry(
+ __field(size_t, slowpath )
+ __field(size_t, total )
+ ),
+
+ TP_fast_assign(
+ __entry->slowpath = slowpath;
+ __entry->total = total;
+ ),
+
+ TP_printk("%zu/%zu", __entry->slowpath, __entry->total)
+);
+
+DEFINE_EVENT(fs_str, rebalance_extent,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+DEFINE_EVENT(fs_str, data_update,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+TRACE_EVENT(error_downcast,
+ TP_PROTO(int bch_err, int std_err, unsigned long ip),
+ TP_ARGS(bch_err, std_err, ip),
+
+ TP_STRUCT__entry(
+ __array(char, bch_err, 32 )
+ __array(char, std_err, 32 )
+ __array(char, ip, 32 )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err));
+ strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err));
+ snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
+ ),
+
+ TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip)
+);
+
+#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS
+
+TRACE_EVENT(update_by_path,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path,
+ struct btree_insert_entry *i, bool overwrite),
+ TP_ARGS(trans, path, i, overwrite),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(btree_path_idx_t, path_idx )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ __field(u8, overwrite )
+ __field(btree_path_idx_t, update_idx )
+ __field(btree_path_idx_t, nr_updates )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->path_idx = path - trans->paths;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos);
+ __entry->overwrite = overwrite;
+ __entry->update_idx = i - trans->updates;
+ __entry->nr_updates = trans->nr_updates;
+ ),
+
+ TP_printk("%s path %3u btree %s pos %llu:%llu:%u overwrite %u update %u/%u",
+ __entry->trans_fn,
+ __entry->path_idx,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->overwrite,
+ __entry->update_idx,
+ __entry->nr_updates)
+);
+
+TRACE_EVENT(btree_path_lock,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_bkey_cached_common *b),
+ TP_ARGS(trans, caller_ip, b),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, btree_id )
+ __field(u8, level )
+ __array(char, node, 24 )
+ __field(u32, lock_seq )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = b->btree_id;
+ __entry->level = b->level;
+
+ scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
+ __entry->lock_seq = six_lock_seq(&b->lock);
+ ),
+
+ TP_printk("%s %pS\nbtree %s level %u node %s lock seq %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->level,
+ __entry->node,
+ __entry->lock_seq)
+);
+
+DECLARE_EVENT_CLASS(btree_path_ev,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path),
+
+ TP_STRUCT__entry(
+ __field(u16, idx )
+ __field(u8, ref )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path - trans->paths;
+ __entry->ref = path->ref;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos);
+ ),
+
+ TP_printk("path %3u ref %u btree %s pos %llu:%llu:%u",
+ __entry->idx, __entry->ref,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot)
+);
+
+DEFINE_EVENT(btree_path_ev, btree_path_get_ll,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path)
+);
+
+DEFINE_EVENT(btree_path_ev, btree_path_put_ll,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path)
+);
+
+DEFINE_EVENT(btree_path_ev, btree_path_should_be_locked,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path)
+);
+
+TRACE_EVENT(btree_path_alloc,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ __field(u8, locks_want )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path - trans->paths;
+ __entry->locks_want = path->locks_want;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos);
+ ),
+
+ TP_printk("path %3u btree %s locks_want %u pos %llu:%llu:%u",
+ __entry->idx,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->locks_want,
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot)
+);
+
+TRACE_EVENT(btree_path_get,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos),
+ TP_ARGS(trans, path, new_pos),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ __field(u8, ref )
+ __field(u8, preserve )
+ __field(u8, locks_want )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(old_pos)
+ TRACE_BPOS_entries(new_pos)
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path - trans->paths;
+ __entry->ref = path->ref;
+ __entry->preserve = path->preserve;
+ __entry->locks_want = path->locks_want;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(old_pos, path->pos);
+ TRACE_BPOS_assign(new_pos, *new_pos);
+ ),
+
+ TP_printk(" path %3u ref %u preserve %u btree %s locks_want %u pos %llu:%llu:%u -> %llu:%llu:%u",
+ __entry->idx,
+ __entry->ref,
+ __entry->preserve,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->locks_want,
+ __entry->old_pos_inode,
+ __entry->old_pos_offset,
+ __entry->old_pos_snapshot,
+ __entry->new_pos_inode,
+ __entry->new_pos_offset,
+ __entry->new_pos_snapshot)
+);
+
+DECLARE_EVENT_CLASS(btree_path_clone,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
+ TP_ARGS(trans, path, new),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ __field(u8, new_idx )
+ __field(u8, btree_id )
+ __field(u8, ref )
+ __field(u8, preserve )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path - trans->paths;
+ __entry->new_idx = new - trans->paths;
+ __entry->btree_id = path->btree_id;
+ __entry->ref = path->ref;
+ __entry->preserve = path->preserve;
+ TRACE_BPOS_assign(pos, path->pos);
+ ),
+
+ TP_printk(" path %3u ref %u preserve %u btree %s %llu:%llu:%u -> %u",
+ __entry->idx,
+ __entry->ref,
+ __entry->preserve,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->new_idx)
+);
+
+DEFINE_EVENT(btree_path_clone, btree_path_clone,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
+ TP_ARGS(trans, path, new)
+);
+
+DEFINE_EVENT(btree_path_clone, btree_path_save_pos,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
+ TP_ARGS(trans, path, new)
+);
+
+DECLARE_EVENT_CLASS(btree_path_traverse,
+ TP_PROTO(struct btree_trans *trans,
+ struct btree_path *path),
+ TP_ARGS(trans, path),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(btree_path_idx_t, idx )
+ __field(u8, ref )
+ __field(u8, preserve )
+ __field(u8, should_be_locked )
+ __field(u8, btree_id )
+ __field(u8, level )
+ TRACE_BPOS_entries(pos)
+ __field(u8, locks_want )
+ __field(u8, nodes_locked )
+ __array(char, node0, 24 )
+ __array(char, node1, 24 )
+ __array(char, node2, 24 )
+ __array(char, node3, 24 )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+
+ __entry->idx = path - trans->paths;
+ __entry->ref = path->ref;
+ __entry->preserve = path->preserve;
+ __entry->btree_id = path->btree_id;
+ __entry->level = path->level;
+ TRACE_BPOS_assign(pos, path->pos);
+
+ __entry->locks_want = path->locks_want;
+ __entry->nodes_locked = path->nodes_locked;
+ struct btree *b = path->l[0].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
+ else
+ scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c);
+ b = path->l[1].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node1));
+ else
+ scnprintf(__entry->node1, sizeof(__entry->node1), "%px", &b->c);
+ b = path->l[2].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node2));
+ else
+ scnprintf(__entry->node2, sizeof(__entry->node2), "%px", &b->c);
+ b = path->l[3].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node3));
+ else
+ scnprintf(__entry->node3, sizeof(__entry->node3), "%px", &b->c);
+ ),
+
+ TP_printk("%s\npath %3u ref %u preserve %u btree %s %llu:%llu:%u level %u locks_want %u\n"
+ "locks %u %u %u %u node %s %s %s %s",
+ __entry->trans_fn,
+ __entry->idx,
+ __entry->ref,
+ __entry->preserve,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->level,
+ __entry->locks_want,
+ (__entry->nodes_locked >> 6) & 3,
+ (__entry->nodes_locked >> 4) & 3,
+ (__entry->nodes_locked >> 2) & 3,
+ (__entry->nodes_locked >> 0) & 3,
+ __entry->node3,
+ __entry->node2,
+ __entry->node1,
+ __entry->node0)
+);
+
+DEFINE_EVENT(btree_path_traverse, btree_path_traverse_start,
+ TP_PROTO(struct btree_trans *trans,
+ struct btree_path *path),
+ TP_ARGS(trans, path)
+);
+
+DEFINE_EVENT(btree_path_traverse, btree_path_traverse_end,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path)
+);
+
+TRACE_EVENT(btree_path_set_pos,
+ TP_PROTO(struct btree_trans *trans,
+ struct btree_path *path,
+ struct bpos *new_pos),
+ TP_ARGS(trans, path, new_pos),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ __field(u8, ref )
+ __field(u8, preserve )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(old_pos)
+ TRACE_BPOS_entries(new_pos)
+ __field(u8, locks_want )
+ __field(u8, nodes_locked )
+ __array(char, node0, 24 )
+ __array(char, node1, 24 )
+ __array(char, node2, 24 )
+ __array(char, node3, 24 )
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path - trans->paths;
+ __entry->ref = path->ref;
+ __entry->preserve = path->preserve;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(old_pos, path->pos);
+ TRACE_BPOS_assign(new_pos, *new_pos);
+
+ __entry->nodes_locked = path->nodes_locked;
+ struct btree *b = path->l[0].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
+ else
+ scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c);
+		b = path->l[1].b;
+		if (IS_ERR(b))
+			strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node1));
+		else
+			scnprintf(__entry->node1, sizeof(__entry->node1), "%px", &b->c);
+		b = path->l[2].b;
+		if (IS_ERR(b))
+			strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node2));
+		else
+			scnprintf(__entry->node2, sizeof(__entry->node2), "%px", &b->c);
+		b = path->l[3].b;
+		if (IS_ERR(b))
+			strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node3));
+		else
+			scnprintf(__entry->node3, sizeof(__entry->node3), "%px", &b->c);
+ ),
+
+ TP_printk("\npath %3u ref %u preserve %u btree %s %llu:%llu:%u -> %llu:%llu:%u\n"
+ "locks %u %u %u %u node %s %s %s %s",
+ __entry->idx,
+ __entry->ref,
+ __entry->preserve,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->old_pos_inode,
+ __entry->old_pos_offset,
+ __entry->old_pos_snapshot,
+ __entry->new_pos_inode,
+ __entry->new_pos_offset,
+ __entry->new_pos_snapshot,
+ (__entry->nodes_locked >> 6) & 3,
+ (__entry->nodes_locked >> 4) & 3,
+ (__entry->nodes_locked >> 2) & 3,
+ (__entry->nodes_locked >> 0) & 3,
+ __entry->node3,
+ __entry->node2,
+ __entry->node1,
+ __entry->node0)
+);
+
+TRACE_EVENT(btree_path_free,
+ TP_PROTO(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup),
+ TP_ARGS(trans, path, dup),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ __field(u8, preserve )
+ __field(u8, should_be_locked)
+ __field(s8, dup )
+ __field(u8, dup_locked )
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path;
+ __entry->preserve = trans->paths[path].preserve;
+ __entry->should_be_locked = trans->paths[path].should_be_locked;
+ __entry->dup = dup ? dup - trans->paths : -1;
+ __entry->dup_locked = dup ? btree_node_locked(dup, dup->level) : 0;
+ ),
+
+ TP_printk(" path %3u %c %c dup %2i locked %u", __entry->idx,
+ __entry->preserve ? 'P' : ' ',
+ __entry->should_be_locked ? 'S' : ' ',
+ __entry->dup,
+ __entry->dup_locked)
+);
+
+TRACE_EVENT(btree_path_free_trans_begin,
+ TP_PROTO(btree_path_idx_t path),
+ TP_ARGS(path),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path;
+ ),
+
+ TP_printk(" path %3u", __entry->idx)
+);
+
+#else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
+#ifndef _TRACE_BCACHEFS_H
+
+static inline void trace_update_by_path(struct btree_trans *trans, struct btree_path *path,
+ struct btree_insert_entry *i, bool overwrite) {}
+static inline void trace_btree_path_lock(struct btree_trans *trans, unsigned long caller_ip, struct btree_bkey_cached_common *b) {}
+static inline void trace_btree_path_get_ll(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_put_ll(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_should_be_locked(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_alloc(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_get(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
+static inline void trace_btree_path_clone(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {}
+static inline void trace_btree_path_save_pos(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {}
+static inline void trace_btree_path_traverse_start(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
+static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {}
+static inline void trace_btree_path_free_trans_begin(btree_path_idx_t path) {}
+
+#endif
+#endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
+
+#define _TRACE_BCACHEFS_H
+#endif /* _TRACE_BCACHEFS_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../fs/bcachefs
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+#include <trace/define_trace.h>
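As an aside, a minimal sketch of decoding the nodes_locked field printed by the
path tracepoints above. The 0/1/2/3 state names are an assumption about the
usual bcachefs btree lock encoding; they are not spelled out in this patch:

	/* nodes_locked packs one 2-bit lock state per btree level, level 0 in
	 * the low bits; TP_printk() prints levels 3..0 left to right. */
	static const char *lock_state_str(u8 nodes_locked, unsigned level)
	{
		static const char * const states[] = {
			"unlocked", "read", "intent", "write",	/* assumed */
		};
		return states[(nodes_locked >> (level * 2)) & 3];
	}

	/* e.g. nodes_locked = 0x09: level 0 read locked, level 1 intent
	 * locked, levels 2 and 3 unlocked */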
diff --git a/libbcachefs/two_state_shared_lock.c b/libbcachefs/two_state_shared_lock.c
new file mode 100644
index 00000000..9764c2e6
--- /dev/null
+++ b/libbcachefs/two_state_shared_lock.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "two_state_shared_lock.h"
+
+void __bch2_two_state_lock(two_state_lock_t *lock, int s)
+{
+ __wait_event(lock->wait, bch2_two_state_trylock(lock, s));
+}
diff --git a/libbcachefs/two_state_shared_lock.h b/libbcachefs/two_state_shared_lock.h
new file mode 100644
index 00000000..7f647846
--- /dev/null
+++ b/libbcachefs/two_state_shared_lock.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_TWO_STATE_LOCK_H
+#define _BCACHEFS_TWO_STATE_LOCK_H
+
+#include <linux/atomic.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+
+#include "util.h"
+
+/*
+ * Two-state lock - can be taken for add or block - both states are shared,
+ * like the read side of a rwsem, but each state conflicts with the other:
+ */
+typedef struct {
+ atomic_long_t v;
+ wait_queue_head_t wait;
+} two_state_lock_t;
+
+static inline void two_state_lock_init(two_state_lock_t *lock)
+{
+ atomic_long_set(&lock->v, 0);
+ init_waitqueue_head(&lock->wait);
+}
+
+static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s)
+{
+ long i = s ? 1 : -1;
+
+ EBUG_ON(atomic_long_read(&lock->v) == 0);
+
+ if (atomic_long_sub_return_release(i, &lock->v) == 0)
+ wake_up_all(&lock->wait);
+}
+
+static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s)
+{
+ long i = s ? 1 : -1;
+ long old;
+
+ old = atomic_long_read(&lock->v);
+ do {
+ if (i > 0 ? old < 0 : old > 0)
+ return false;
+ } while (!atomic_long_try_cmpxchg_acquire(&lock->v, &old, old + i));
+
+ return true;
+}
+
+void __bch2_two_state_lock(two_state_lock_t *, int);
+
+static inline void bch2_two_state_lock(two_state_lock_t *lock, int s)
+{
+ if (!bch2_two_state_trylock(lock, s))
+ __bch2_two_state_lock(lock, s);
+}
+
+#endif /* _BCACHEFS_TWO_STATE_LOCK_H */
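A minimal usage sketch of the two-state lock (illustrative only; the add/block
naming follows how bcachefs uses this pattern for its pagecache lock, and which
state is which is an assumption here):

	two_state_lock_t lock;

	two_state_lock_init(&lock);

	/* any number of "add" holders (s = 0) may run concurrently: */
	bch2_two_state_lock(&lock, 0);
	/* ... */
	bch2_two_state_unlock(&lock, 0);

	/* "block" holders (s = 1) are likewise shared with each other, but
	 * exclude every "add" holder, and vice versa: */
	bch2_two_state_lock(&lock, 1);
	/* ... */
	bch2_two_state_unlock(&lock, 1);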
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 1272ea7a..e0a876cb 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * random utiility code, for bcache but in theory not specific to bcache
+ * random utility code, for bcache but in theory not specific to bcache
*
* Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
* Copyright 2012 Google, Inc.
@@ -7,6 +8,7 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/console.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/freezer.h>
@@ -22,23 +24,24 @@
#include <linux/sched/clock.h>
#include "eytzinger.h"
+#include "mean_and_variance.h"
#include "util.h"
-#define simple_strtoint(c, end, base) simple_strtol(c, end, base)
-#define simple_strtouint(c, end, base) simple_strtoul(c, end, base)
-
static const char si_units[] = "?kMGTPEZY";
-static int __bch2_strtoh(const char *cp, u64 *res,
- u64 t_max, bool t_signed)
+/* string_get_size units: */
+static const char *const units_2[] = {
+ "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"
+};
+static const char *const units_10[] = {
+ "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"
+};
+
+static int parse_u64(const char *cp, u64 *res)
{
- bool positive = *cp != '-';
- unsigned u;
+ const char *start = cp;
u64 v = 0;
- if (*cp == '+' || *cp == '-')
- cp++;
-
if (!isdigit(*cp))
return -EINVAL;
@@ -52,22 +55,122 @@ static int __bch2_strtoh(const char *cp, u64 *res,
cp++;
} while (isdigit(*cp));
+ *res = v;
+ return cp - start;
+}
+
+static int bch2_pow(u64 n, u64 p, u64 *res)
+{
+ *res = 1;
+
+ while (p--) {
+ if (*res > div64_u64(U64_MAX, n))
+ return -ERANGE;
+ *res *= n;
+ }
+ return 0;
+}
+
+static int parse_unit_suffix(const char *cp, u64 *res)
+{
+ const char *start = cp;
+ u64 base = 1024;
+ unsigned u;
+ int ret;
+
+ if (*cp == ' ')
+ cp++;
+
for (u = 1; u < strlen(si_units); u++)
if (*cp == si_units[u]) {
cp++;
goto got_unit;
}
- u = 0;
+
+ for (u = 0; u < ARRAY_SIZE(units_2); u++)
+ if (!strncmp(cp, units_2[u], strlen(units_2[u]))) {
+ cp += strlen(units_2[u]);
+ goto got_unit;
+ }
+
+ for (u = 0; u < ARRAY_SIZE(units_10); u++)
+ if (!strncmp(cp, units_10[u], strlen(units_10[u]))) {
+ cp += strlen(units_10[u]);
+ base = 1000;
+ goto got_unit;
+ }
+
+ *res = 1;
+ return 0;
got_unit:
- if (*cp == '\n')
+ ret = bch2_pow(base, u, res);
+ if (ret)
+ return ret;
+
+ return cp - start;
+}
+
+#define parse_or_ret(cp, _f) \
+do { \
+ int _ret = _f; \
+ if (_ret < 0) \
+ return _ret; \
+ cp += _ret; \
+} while (0)
+
+static int __bch2_strtou64_h(const char *cp, u64 *res)
+{
+ const char *start = cp;
+ u64 v = 0, b, f_n = 0, f_d = 1;
+ int ret;
+
+ parse_or_ret(cp, parse_u64(cp, &v));
+
+ if (*cp == '.') {
cp++;
- if (*cp)
- return -EINVAL;
+ ret = parse_u64(cp, &f_n);
+ if (ret < 0)
+ return ret;
+ cp += ret;
+
+ ret = bch2_pow(10, ret, &f_d);
+ if (ret)
+ return ret;
+ }
- if (fls64(v) + u * 10 > 64)
+ parse_or_ret(cp, parse_unit_suffix(cp, &b));
+
+ if (v > div64_u64(U64_MAX, b))
return -ERANGE;
+ v *= b;
- v <<= u * 10;
+ if (f_n > div64_u64(U64_MAX, b))
+ return -ERANGE;
+
+ f_n = div64_u64(f_n * b, f_d);
+ if (v + f_n < v)
+ return -ERANGE;
+ v += f_n;
+
+ *res = v;
+ return cp - start;
+}
+
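As an aside, a worked example of the human-readable number parsing above
(illustrative, not part of the change):

	/*
	 * __bch2_strtou64_h("1.5M", &v):
	 *   parse_u64()            -> v = 1, then ".5" gives f_n = 5, f_d = 10
	 *   parse_unit_suffix("M") -> b = 1024^2 = 1048576
	 *                             (single-letter suffixes use a 1024 base)
	 *   v = 1 * 1048576 + (5 * 1048576) / 10 = 1572864
	 */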
+static int __bch2_strtoh(const char *cp, u64 *res,
+ u64 t_max, bool t_signed)
+{
+ bool positive = *cp != '-';
+ u64 v = 0;
+
+ if (*cp == '+' || *cp == '-')
+ cp++;
+
+ parse_or_ret(cp, __bch2_strtou64_h(cp, &v));
+
+ if (*cp == '\n')
+ cp++;
+ if (*cp)
+ return -EINVAL;
if (positive) {
if (v > t_max)
@@ -88,7 +191,7 @@ got_unit:
#define STRTO_H(name, type) \
int bch2_ ## name ## _h(const char *cp, type *res) \
{ \
- u64 v; \
+ u64 v = 0; \
int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \
ANYSINT_MAX(type) != ((type) ~0ULL)); \
*res = v; \
@@ -99,91 +202,27 @@ STRTO_H(strtoint, int)
STRTO_H(strtouint, unsigned int)
STRTO_H(strtoll, long long)
STRTO_H(strtoull, unsigned long long)
+STRTO_H(strtou64, u64)
-ssize_t bch2_hprint(char *buf, s64 v)
-{
- char dec[4] = "";
- int u, t = 0;
-
- for (u = 0; v >= 1024 || v <= -1024; u++) {
- t = v & ~(~0U << 10);
- v >>= 10;
- }
-
- if (!u)
- return sprintf(buf, "%lli", v);
-
- /*
- * 103 is magic: t is in the range [-1023, 1023] and we want
- * to turn it into [-9, 9]
- */
- if (v < 100 && v > -100)
- scnprintf(dec, sizeof(dec), ".%i", t / 103);
-
- return sprintf(buf, "%lli%s%c", v, dec, si_units[u]);
-}
-
-ssize_t bch2_scnprint_string_list(char *buf, size_t size,
- const char * const list[],
- size_t selected)
-{
- char *out = buf;
- size_t i;
-
- if (size)
- *out = '\0';
-
- for (i = 0; list[i]; i++)
- out += scnprintf(out, buf + size - out,
- i == selected ? "[%s] " : "%s ", list[i]);
-
- if (out != buf)
- *--out = '\0';
-
- return out - buf;
-}
-
-ssize_t bch2_scnprint_flag_list(char *buf, size_t size,
- const char * const list[], u64 flags)
-{
- char *out = buf, *end = buf + size;
- unsigned bit, nr = 0;
-
- while (list[nr])
- nr++;
-
- if (size)
- *out = '\0';
-
- while (flags && (bit = __ffs(flags)) < nr) {
- out += scnprintf(out, end - out, "%s,", list[bit]);
- flags ^= 1 << bit;
- }
-
- if (out != buf)
- *--out = '\0';
-
- return out - buf;
-}
-
-u64 bch2_read_flag_list(char *opt, const char * const list[])
+u64 bch2_read_flag_list(const char *opt, const char * const list[])
{
u64 ret = 0;
- char *p, *s, *d = kstrndup(opt, PAGE_SIZE - 1, GFP_KERNEL);
+ char *p, *s, *d = kstrdup(opt, GFP_KERNEL);
if (!d)
return -ENOMEM;
s = strim(d);
- while ((p = strsep(&s, ","))) {
+ while ((p = strsep(&s, ",;"))) {
int flag = match_string(list, -1, p);
+
if (flag < 0) {
ret = -1;
break;
}
- ret |= 1 << flag;
+ ret |= BIT_ULL(flag);
}
kfree(d);
@@ -202,196 +241,257 @@ bool bch2_is_zero(const void *_p, size_t n)
return true;
}
-static void bch2_quantiles_update(struct quantiles *q, u64 v)
+void bch2_prt_u64_base2_nbits(struct printbuf *out, u64 v, unsigned nr_bits)
{
- unsigned i = 0;
+ while (nr_bits)
+ prt_char(out, '0' + ((v >> --nr_bits) & 1));
+}
- while (i < ARRAY_SIZE(q->entries)) {
- struct quantile_entry *e = q->entries + i;
+void bch2_prt_u64_base2(struct printbuf *out, u64 v)
+{
+ bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1);
+}
- if (unlikely(!e->step)) {
- e->m = v;
- e->step = max_t(unsigned, v / 2, 1024);
- } else if (e->m > v) {
- e->m = e->m >= e->step
- ? e->m - e->step
- : 0;
- } else if (e->m < v) {
- e->m = e->m + e->step > e->m
- ? e->m + e->step
- : U32_MAX;
- }
+static void __bch2_print_string_as_lines(const char *prefix, const char *lines,
+ bool nonblocking)
+{
+ bool locked = false;
+ const char *p;
- if ((e->m > v ? e->m - v : v - e->m) < e->step)
- e->step = max_t(unsigned, e->step / 2, 1);
+ if (!lines) {
+ printk("%s (null)\n", prefix);
+ return;
+ }
- if (v >= e->m)
- break;
+ if (!nonblocking) {
+ console_lock();
+ locked = true;
+ } else {
+ locked = console_trylock();
+ }
- i = eytzinger0_child(i, v > e->m);
+ while (1) {
+ p = strchrnul(lines, '\n');
+ printk("%s%.*s\n", prefix, (int) (p - lines), lines);
+ if (!*p)
+ break;
+ lines = p + 1;
}
+ if (locked)
+ console_unlock();
}
-/* time stats: */
-
-static void bch2_time_stats_update_one(struct time_stats *stats,
- u64 start, u64 end)
+void bch2_print_string_as_lines(const char *prefix, const char *lines)
{
- u64 duration, freq;
+ return __bch2_print_string_as_lines(prefix, lines, false);
+}
- duration = time_after64(end, start)
- ? end - start : 0;
- freq = time_after64(end, stats->last_event)
- ? end - stats->last_event : 0;
+void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines)
+{
+ return __bch2_print_string_as_lines(prefix, lines, true);
+}
- stats->count++;
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr,
+ gfp_t gfp)
+{
+#ifdef CONFIG_STACKTRACE
+ unsigned nr_entries = 0;
- stats->average_duration = stats->average_duration
- ? ewma_add(stats->average_duration, duration, 6)
- : duration;
+ stack->nr = 0;
+ int ret = darray_make_room_gfp(stack, 32, gfp);
+ if (ret)
+ return ret;
- stats->average_frequency = stats->average_frequency
- ? ewma_add(stats->average_frequency, freq, 6)
- : freq;
+ if (!down_read_trylock(&task->signal->exec_update_lock))
+ return -1;
- stats->max_duration = max(stats->max_duration, duration);
+ do {
+ nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1);
+ } while (nr_entries == stack->size &&
+ !(ret = darray_make_room_gfp(stack, stack->size * 2, gfp)));
- stats->last_event = end;
+ stack->nr = nr_entries;
+ up_read(&task->signal->exec_update_lock);
- bch2_quantiles_update(&stats->quantiles, duration);
+ return ret;
+#else
+ return 0;
+#endif
}
-void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end)
+void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack)
{
- unsigned long flags;
-
- if (!stats->buffer) {
- spin_lock_irqsave(&stats->lock, flags);
- bch2_time_stats_update_one(stats, start, end);
-
- if (stats->average_frequency < 32 &&
- stats->count > 1024)
- stats->buffer =
- alloc_percpu_gfp(struct time_stat_buffer,
- GFP_ATOMIC);
- spin_unlock_irqrestore(&stats->lock, flags);
- } else {
- struct time_stat_buffer_entry *i;
- struct time_stat_buffer *b;
-
- preempt_disable();
- b = this_cpu_ptr(stats->buffer);
-
- BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
- b->entries[b->nr++] = (struct time_stat_buffer_entry) {
- .start = start,
- .end = end
- };
-
- if (b->nr == ARRAY_SIZE(b->entries)) {
- spin_lock_irqsave(&stats->lock, flags);
- for (i = b->entries;
- i < b->entries + ARRAY_SIZE(b->entries);
- i++)
- bch2_time_stats_update_one(stats, i->start, i->end);
- spin_unlock_irqrestore(&stats->lock, flags);
-
- b->nr = 0;
- }
-
- preempt_enable();
+ darray_for_each(*stack, i) {
+ prt_printf(out, "[<0>] %pB", (void *) *i);
+ prt_newline(out);
}
}
-static const struct time_unit {
- const char *name;
- u32 nsecs;
-} time_units[] = {
- { "ns", 1 },
- { "us", NSEC_PER_USEC },
- { "ms", NSEC_PER_MSEC },
- { "sec", NSEC_PER_SEC },
-};
-
-static const struct time_unit *pick_time_units(u64 ns)
+int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp)
{
- const struct time_unit *u;
+ bch_stacktrace stack = { 0 };
+ int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp);
- for (u = time_units;
- u + 1 < time_units + ARRAY_SIZE(time_units) &&
- ns >= u[1].nsecs << 1;
- u++)
- ;
+ bch2_prt_backtrace(out, &stack);
+ darray_exit(&stack);
+ return ret;
+}
- return u;
+#ifndef __KERNEL__
+#include <time.h>
+void bch2_prt_datetime(struct printbuf *out, time64_t sec)
+{
+ time_t t = sec;
+ char buf[64];
+ ctime_r(&t, buf);
+ strim(buf);
+ prt_str(out, buf);
}
+#else
+void bch2_prt_datetime(struct printbuf *out, time64_t sec)
+{
+	char buf[64];
+	snprintf(buf, sizeof(buf), "%ptT", &sec);
+	prt_str(out, buf);
+}
+#endif
-static size_t pr_time_units(char *buf, size_t len, u64 ns)
+void bch2_pr_time_units(struct printbuf *out, u64 ns)
{
- const struct time_unit *u = pick_time_units(ns);
+ const struct time_unit *u = bch2_pick_time_units(ns);
- return scnprintf(buf, len, "%llu %s", div_u64(ns, u->nsecs), u->name);
+ prt_printf(out, "%llu %s", div64_u64(ns, u->nsecs), u->name);
}
-size_t bch2_time_stats_print(struct time_stats *stats, char *buf, size_t len)
+static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
{
- char *out = buf, *end = buf + len;
- const struct time_unit *u;
- u64 freq = READ_ONCE(stats->average_frequency);
- u64 q, last_q = 0;
- int i;
+ const struct time_unit *u = bch2_pick_time_units(ns);
- out += scnprintf(out, end - out, "count:\t\t%llu\n",
- stats->count);
- out += scnprintf(out, end - out, "rate:\t\t%llu/sec\n",
- freq ? div64_u64(NSEC_PER_SEC, freq) : 0);
+ prt_printf(out, "%llu \r%s", div64_u64(ns, u->nsecs), u->name);
+}
- out += scnprintf(out, end - out, "frequency:\t");
- out += pr_time_units(out, end - out, freq);
+static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
+{
+ prt_printf(out, "%s\t", name);
+ bch2_pr_time_units_aligned(out, ns);
+ prt_newline(out);
+}
- out += scnprintf(out, end - out, "\navg duration:\t");
- out += pr_time_units(out, end - out, stats->average_duration);
+#define TABSTOP_SIZE 12
- out += scnprintf(out, end - out, "\nmax duration:\t");
- out += pr_time_units(out, end - out, stats->max_duration);
+void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
+{
+ struct quantiles *quantiles = time_stats_to_quantiles(stats);
+ s64 f_mean = 0, d_mean = 0;
+ u64 f_stddev = 0, d_stddev = 0;
- i = eytzinger0_first(NR_QUANTILES);
- u = pick_time_units(stats->quantiles.entries[i].m);
+ if (stats->buffer) {
+ int cpu;
- out += scnprintf(out, end - out, "\nquantiles (%s):\t", u->name);
- eytzinger0_for_each(i, NR_QUANTILES) {
- bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+ spin_lock_irq(&stats->lock);
+ for_each_possible_cpu(cpu)
+ __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
+ spin_unlock_irq(&stats->lock);
+ }
- q = max(stats->quantiles.entries[i].m, last_q);
- out += scnprintf(out, end - out, "%llu%s",
- div_u64(q, u->nsecs),
- is_last ? "\n" : " ");
- last_q = q;
+ /*
+ * avoid divide by zero
+ */
+ if (stats->freq_stats.n) {
+ f_mean = mean_and_variance_get_mean(stats->freq_stats);
+ f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
+ d_mean = mean_and_variance_get_mean(stats->duration_stats);
+ d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
}
- return out - buf;
-}
+ printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE);
+ prt_printf(out, "count:\t%llu\n", stats->duration_stats.n);
+ printbuf_tabstop_pop(out);
-void bch2_time_stats_exit(struct time_stats *stats)
-{
- free_percpu(stats->buffer);
-}
+ printbuf_tabstops_reset(out);
-void bch2_time_stats_init(struct time_stats *stats)
-{
- memset(stats, 0, sizeof(*stats));
- spin_lock_init(&stats->lock);
+ printbuf_tabstop_push(out, out->indent + 20);
+ printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
+ printbuf_tabstop_push(out, 0);
+ printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
+
+ prt_printf(out, "\tsince mount\r\trecent\r\n");
+
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, out->indent + 20);
+ printbuf_tabstop_push(out, TABSTOP_SIZE);
+ printbuf_tabstop_push(out, 2);
+ printbuf_tabstop_push(out, TABSTOP_SIZE);
+
+ prt_printf(out, "duration of events\n");
+ printbuf_indent_add(out, 2);
+
+ pr_name_and_units(out, "min:", stats->min_duration);
+ pr_name_and_units(out, "max:", stats->max_duration);
+ pr_name_and_units(out, "total:", stats->total_duration);
+
+ prt_printf(out, "mean:\t");
+ bch2_pr_time_units_aligned(out, d_mean);
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
+ prt_newline(out);
+
+ prt_printf(out, "stddev:\t");
+ bch2_pr_time_units_aligned(out, d_stddev);
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
+
+ printbuf_indent_sub(out, 2);
+ prt_newline(out);
+
+ prt_printf(out, "time between events\n");
+ printbuf_indent_add(out, 2);
+
+ pr_name_and_units(out, "min:", stats->min_freq);
+ pr_name_and_units(out, "max:", stats->max_freq);
+
+ prt_printf(out, "mean:\t");
+ bch2_pr_time_units_aligned(out, f_mean);
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
+ prt_newline(out);
+
+ prt_printf(out, "stddev:\t");
+ bch2_pr_time_units_aligned(out, f_stddev);
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
+
+ printbuf_indent_sub(out, 2);
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+
+ if (quantiles) {
+ int i = eytzinger0_first(NR_QUANTILES);
+ const struct time_unit *u =
+ bch2_pick_time_units(quantiles->entries[i].m);
+ u64 last_q = 0;
+
+ prt_printf(out, "quantiles (%s):\t", u->name);
+ eytzinger0_for_each(i, NR_QUANTILES) {
+ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+
+ u64 q = max(quantiles->entries[i].m, last_q);
+ prt_printf(out, "%llu ", div64_u64(q, u->nsecs));
+ if (is_last)
+ prt_newline(out);
+ last_q = q;
+ }
+ }
}
/* ratelimit: */
/**
* bch2_ratelimit_delay() - return how long to delay until the next time to do
- * some work
- *
- * @d - the struct bch_ratelimit to update
- *
- * Returns the amount of time to delay by, in jiffies
+ * some work
+ * @d: the struct bch_ratelimit to update
+ * Returns: the amount of time to delay by, in jiffies
*/
u64 bch2_ratelimit_delay(struct bch_ratelimit *d)
{
@@ -404,9 +504,8 @@ u64 bch2_ratelimit_delay(struct bch_ratelimit *d)
/**
* bch2_ratelimit_increment() - increment @d by the amount of work done
- *
- * @d - the struct bch_ratelimit to update
- * @done - the amount of work done, in arbitrary units
+ * @d: the struct bch_ratelimit to update
+ * @done: the amount of work done, in arbitrary units
*/
void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)
{
@@ -421,27 +520,6 @@ void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)
d->next = now - NSEC_PER_SEC * 2;
}
-int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *d)
-{
- bool kthread = (current->flags & PF_KTHREAD) != 0;
-
- while (1) {
- u64 delay = bch2_ratelimit_delay(d);
-
- if (delay)
- set_current_state(TASK_INTERRUPTIBLE);
-
- if (kthread && kthread_should_stop())
- return 1;
-
- if (!delay)
- return 0;
-
- schedule_timeout(delay);
- try_to_freeze();
- }
-}
-
/* pd controller: */
/*
@@ -506,80 +584,70 @@ void bch2_pd_controller_init(struct bch_pd_controller *pd)
pd->backpressure = 1;
}
-size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf)
+void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd)
{
- /* 2^64 - 1 is 20 digits, plus null byte */
- char rate[21];
- char actual[21];
- char target[21];
- char proportional[21];
- char derivative[21];
- char change[21];
- s64 next_io;
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 20);
- bch2_hprint(rate, pd->rate.rate);
- bch2_hprint(actual, pd->last_actual);
- bch2_hprint(target, pd->last_target);
- bch2_hprint(proportional, pd->last_proportional);
- bch2_hprint(derivative, pd->last_derivative);
- bch2_hprint(change, pd->last_change);
+ prt_printf(out, "rate:\t");
+ prt_human_readable_s64(out, pd->rate.rate);
+ prt_newline(out);
- next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC);
+ prt_printf(out, "target:\t");
+ prt_human_readable_u64(out, pd->last_target);
+ prt_newline(out);
- return sprintf(buf,
- "rate:\t\t%s/sec\n"
- "target:\t\t%s\n"
- "actual:\t\t%s\n"
- "proportional:\t%s\n"
- "derivative:\t%s\n"
- "change:\t\t%s/sec\n"
- "next io:\t%llims\n",
- rate, target, actual, proportional,
- derivative, change, next_io);
-}
+ prt_printf(out, "actual:\t");
+ prt_human_readable_u64(out, pd->last_actual);
+ prt_newline(out);
-/* misc: */
+ prt_printf(out, "proportional:\t");
+ prt_human_readable_s64(out, pd->last_proportional);
+ prt_newline(out);
-void bch2_bio_map(struct bio *bio, void *base)
-{
- size_t size = bio->bi_iter.bi_size;
- struct bio_vec *bv = bio->bi_io_vec;
+ prt_printf(out, "derivative:\t");
+ prt_human_readable_s64(out, pd->last_derivative);
+ prt_newline(out);
- BUG_ON(!bio->bi_iter.bi_size);
- BUG_ON(bio->bi_vcnt);
+ prt_printf(out, "change:\t");
+ prt_human_readable_s64(out, pd->last_change);
+ prt_newline(out);
+
+ prt_printf(out, "next io:\t%llims\n", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC));
+}
- bv->bv_offset = base ? offset_in_page(base) : 0;
- goto start;
+/* misc: */
- for (; size; bio->bi_vcnt++, bv++) {
- bv->bv_offset = 0;
-start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
- size);
- BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs);
- if (base) {
- bv->bv_page = is_vmalloc_addr(base)
+void bch2_bio_map(struct bio *bio, void *base, size_t size)
+{
+ while (size) {
+ struct page *page = is_vmalloc_addr(base)
? vmalloc_to_page(base)
: virt_to_page(base);
+ unsigned offset = offset_in_page(base);
+ unsigned len = min_t(size_t, PAGE_SIZE - offset, size);
- base += bv->bv_len;
- }
-
- size -= bv->bv_len;
+ BUG_ON(!bio_add_page(bio, page, len, offset));
+ size -= len;
+ base += len;
}
}
-int bch2_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
+int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
{
- int i;
- struct bio_vec *bv;
+ while (size) {
+ struct page *page = alloc_pages(gfp_mask, 0);
+ unsigned len = min_t(size_t, PAGE_SIZE, size);
- bio_for_each_segment_all(bv, bio, i) {
- bv->bv_page = alloc_page(gfp_mask);
- if (!bv->bv_page) {
- while (--bv >= bio->bi_io_vec)
- __free_page(bv->bv_page);
+ if (!page)
return -ENOMEM;
+
+ if (unlikely(!bio_add_page(bio, page, len, 0))) {
+ __free_page(page);
+ break;
}
+
+ size -= len;
}
return 0;
@@ -600,15 +668,16 @@ size_t bch2_rand_range(size_t max)
return rand;
}
-void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, void *src)
+void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src)
{
struct bio_vec bv;
struct bvec_iter iter;
__bio_for_each_segment(bv, dst, iter, dst_iter) {
- void *dstp = kmap_atomic(bv.bv_page);
+ void *dstp = kmap_local_page(bv.bv_page);
+
memcpy(dstp + bv.bv_offset, src, bv.bv_len);
- kunmap_atomic(dstp);
+ kunmap_local(dstp);
src += bv.bv_len;
}
@@ -620,195 +689,15 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
struct bvec_iter iter;
__bio_for_each_segment(bv, src, iter, src_iter) {
- void *srcp = kmap_atomic(bv.bv_page);
+ void *srcp = kmap_local_page(bv.bv_page);
+
memcpy(dst, srcp + bv.bv_offset, bv.bv_len);
- kunmap_atomic(srcp);
+ kunmap_local(srcp);
dst += bv.bv_len;
}
}
-size_t bch_scnmemcpy(char *buf, size_t size, const char *src, size_t len)
-{
- size_t n;
-
- if (!size)
- return 0;
-
- n = min(size - 1, len);
- memcpy(buf, src, n);
- buf[n] = '\0';
-
- return n;
-}
-
-#include "eytzinger.h"
-
-static int alignment_ok(const void *base, size_t align)
-{
- return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
- ((unsigned long)base & (align - 1)) == 0;
-}
-
-static void u32_swap(void *a, void *b, size_t size)
-{
- u32 t = *(u32 *)a;
- *(u32 *)a = *(u32 *)b;
- *(u32 *)b = t;
-}
-
-static void u64_swap(void *a, void *b, size_t size)
-{
- u64 t = *(u64 *)a;
- *(u64 *)a = *(u64 *)b;
- *(u64 *)b = t;
-}
-
-static void generic_swap(void *a, void *b, size_t size)
-{
- char t;
-
- do {
- t = *(char *)a;
- *(char *)a++ = *(char *)b;
- *(char *)b++ = t;
- } while (--size > 0);
-}
-
-static inline int do_cmp(void *base, size_t n, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- size_t l, size_t r)
-{
- return cmp_func(base + inorder_to_eytzinger0(l, n) * size,
- base + inorder_to_eytzinger0(r, n) * size,
- size);
-}
-
-static inline void do_swap(void *base, size_t n, size_t size,
- void (*swap_func)(void *, void *, size_t),
- size_t l, size_t r)
-{
- swap_func(base + inorder_to_eytzinger0(l, n) * size,
- base + inorder_to_eytzinger0(r, n) * size,
- size);
-}
-
-void eytzinger0_sort(void *base, size_t n, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t))
-{
- int i, c, r;
-
- if (!swap_func) {
- if (size == 4 && alignment_ok(base, 4))
- swap_func = u32_swap;
- else if (size == 8 && alignment_ok(base, 8))
- swap_func = u64_swap;
- else
- swap_func = generic_swap;
- }
-
- /* heapify */
- for (i = n / 2 - 1; i >= 0; --i) {
- for (r = i; r * 2 + 1 < n; r = c) {
- c = r * 2 + 1;
-
- if (c + 1 < n &&
- do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
- c++;
-
- if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
- break;
-
- do_swap(base, n, size, swap_func, r, c);
- }
- }
-
- /* sort */
- for (i = n - 1; i > 0; --i) {
- do_swap(base, n, size, swap_func, 0, i);
-
- for (r = 0; r * 2 + 1 < i; r = c) {
- c = r * 2 + 1;
-
- if (c + 1 < i &&
- do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
- c++;
-
- if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
- break;
-
- do_swap(base, n, size, swap_func, r, c);
- }
- }
-}
-
-void sort_cmp_size(void *base, size_t num, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t size))
-{
- /* pre-scale counters for performance */
- int i = (num/2 - 1) * size, n = num * size, c, r;
-
- if (!swap_func) {
- if (size == 4 && alignment_ok(base, 4))
- swap_func = u32_swap;
- else if (size == 8 && alignment_ok(base, 8))
- swap_func = u64_swap;
- else
- swap_func = generic_swap;
- }
-
- /* heapify */
- for ( ; i >= 0; i -= size) {
- for (r = i; r * 2 + size < n; r = c) {
- c = r * 2 + size;
- if (c < n - size &&
- cmp_func(base + c, base + c + size, size) < 0)
- c += size;
- if (cmp_func(base + r, base + c, size) >= 0)
- break;
- swap_func(base + r, base + c, size);
- }
- }
-
- /* sort */
- for (i = n - size; i > 0; i -= size) {
- swap_func(base, base + i, size);
- for (r = 0; r * 2 + size < i; r = c) {
- c = r * 2 + size;
- if (c < i - size &&
- cmp_func(base + c, base + c + size, size) < 0)
- c += size;
- if (cmp_func(base + r, base + c, size) >= 0)
- break;
- swap_func(base + r, base + c, size);
- }
- }
-}
-
-static void mempool_free_vp(void *element, void *pool_data)
-{
- size_t size = (size_t) pool_data;
-
- vpfree(element, size);
-}
-
-static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
-{
- size_t size = (size_t) pool_data;
-
- return vpmalloc(size, gfp_mask);
-}
-
-int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
-{
- return size < PAGE_SIZE
- ? mempool_init_kmalloc_pool(pool, min_nr, size)
- : mempool_init(pool, min_nr, mempool_alloc_vp,
- mempool_free_vp, (void *) size);
-}
-
#if 0
void eytzinger1_test(void)
{
@@ -934,3 +823,65 @@ void eytzinger0_find_test(void)
kfree(test_array);
}
#endif
+
+/*
+ * Accumulate percpu counters onto one cpu's copy - only valid when concurrent
+ * access to the percpu counters is guarded against by other locking:
+ */
+u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
+{
+ u64 *ret;
+ int cpu;
+
+ /* access to pcpu vars has to be blocked by other locking */
+ preempt_disable();
+ ret = this_cpu_ptr(p);
+ preempt_enable();
+
+ for_each_possible_cpu(cpu) {
+ u64 *i = per_cpu_ptr(p, cpu);
+
+ if (i != ret) {
+ acc_u64s(ret, i, nr);
+ memset(i, 0, nr * sizeof(u64));
+ }
+ }
+
+ return ret;
+}
+
+void bch2_darray_str_exit(darray_str *d)
+{
+ darray_for_each(*d, i)
+ kfree(*i);
+ darray_exit(d);
+}
+
+int bch2_split_devs(const char *_dev_name, darray_str *ret)
+{
+ darray_init(ret);
+
+ char *dev_name, *s, *orig;
+
+ dev_name = orig = kstrdup(_dev_name, GFP_KERNEL);
+ if (!dev_name)
+ return -ENOMEM;
+
+ while ((s = strsep(&dev_name, ":"))) {
+ char *p = kstrdup(s, GFP_KERNEL);
+ if (!p)
+ goto err;
+
+ if (darray_push(ret, p)) {
+ kfree(p);
+ goto err;
+ }
+ }
+
+ kfree(orig);
+ return 0;
+err:
+ bch2_darray_str_exit(ret);
+ kfree(orig);
+ return -ENOMEM;
+}
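For illustration, splitting a multi-device target string with the helper above
(a sketch, not part of the change):

	darray_str devs;
	int ret = bch2_split_devs("/dev/sda1:/dev/sdb1", &devs);

	/* on success, devs now holds { "/dev/sda1", "/dev/sdb1" } */
	if (!ret)
		bch2_darray_str_exit(&devs);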
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index 487591c4..c292b9ce 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_UTIL_H
#define _BCACHEFS_UTIL_H
@@ -7,65 +8,28 @@
#include <linux/errno.h>
#include <linux/freezer.h>
#include <linux/kernel.h>
+#include <linux/min_heap.h>
#include <linux/sched/clock.h>
#include <linux/llist.h>
#include <linux/log2.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
#include <linux/ratelimit.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
-#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9)
-#define PAGE_SECTORS (1UL << PAGE_SECTOR_SHIFT)
+#include "mean_and_variance.h"
+
+#include "darray.h"
+#include "time_stats.h"
struct closure;
#ifdef CONFIG_BCACHEFS_DEBUG
-
#define EBUG_ON(cond) BUG_ON(cond)
-#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
-#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i)
-#define atomic_sub_bug(i, v) BUG_ON(atomic_sub_return(i, v) < 0)
-#define atomic_add_bug(i, v) BUG_ON(atomic_add_return(i, v) < 0)
-#define atomic_long_dec_bug(v) BUG_ON(atomic_long_dec_return(v) < 0)
-#define atomic_long_sub_bug(i, v) BUG_ON(atomic_long_sub_return(i, v) < 0)
-#define atomic64_dec_bug(v) BUG_ON(atomic64_dec_return(v) < 0)
-#define atomic64_inc_bug(v, i) BUG_ON(atomic64_inc_return(v) <= i)
-#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0)
-#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0)
-
-#define memcpy(dst, src, len) \
-({ \
- void *_dst = (dst); \
- const void *_src = (src); \
- size_t _len = (len); \
- \
- BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \
- (void *) (_dst) + (_len) <= (void *) (_src))); \
- memcpy(_dst, _src, _len); \
-})
-
-#else /* DEBUG */
-
-#define EBUG_ON(cond)
-#define atomic_dec_bug(v) atomic_dec(v)
-#define atomic_inc_bug(v, i) atomic_inc(v)
-#define atomic_sub_bug(i, v) atomic_sub(i, v)
-#define atomic_add_bug(i, v) atomic_add(i, v)
-#define atomic_long_dec_bug(v) atomic_long_dec(v)
-#define atomic_long_sub_bug(i, v) atomic_long_sub(i, v)
-#define atomic64_dec_bug(v) atomic64_dec(v)
-#define atomic64_inc_bug(v, i) atomic64_inc(v)
-#define atomic64_sub_bug(i, v) atomic64_sub(i, v)
-#define atomic64_add_bug(i, v) atomic64_add(i, v)
-
-#endif
-
-#ifndef __CHECKER__
-#define __flatten __attribute__((flatten))
#else
-/* sparse doesn't know about attribute((flatten)) */
-#define __flatten
+#define EBUG_ON(cond)
#endif
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
@@ -83,157 +47,84 @@ struct closure;
(__builtin_types_compatible_p(typeof(_val), _type) || \
__builtin_types_compatible_p(typeof(_val), const _type))
-static inline void vpfree(void *p, size_t size)
-{
- if (is_vmalloc_addr(p))
- vfree(p);
- else
- free_pages((unsigned long) p, get_order(size));
-}
-
-static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
-{
- return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
- get_order(size)) ?:
- __vmalloc(size, gfp_mask, PAGE_KERNEL);
-}
-
-static inline void kvpfree(void *p, size_t size)
-{
- if (size < PAGE_SIZE)
- kfree(p);
- else
- vpfree(p, size);
-}
-
-static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
+/* Userspace doesn't align allocations as nicely as the kernel allocators: */
+static inline size_t buf_pages(void *p, size_t len)
{
- return size < PAGE_SIZE
- ? kmalloc(size, gfp_mask)
- : vpmalloc(size, gfp_mask);
+ return DIV_ROUND_UP(len +
+ ((unsigned long) p & (PAGE_SIZE - 1)),
+ PAGE_SIZE);
}
-int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
-
-#define HEAP(type) \
-struct { \
- size_t size, used; \
- type *data; \
-}
-
-#define DECLARE_HEAP(type, name) HEAP(type) name
-
#define init_heap(heap, _size, gfp) \
({ \
- (heap)->used = 0; \
+ (heap)->nr = 0; \
(heap)->size = (_size); \
- (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\
+ (heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\
(gfp)); \
})
#define free_heap(heap) \
do { \
- kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \
+ kvfree((heap)->data); \
(heap)->data = NULL; \
} while (0)
-#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j])
-
-#define heap_peek(h) \
-({ \
- EBUG_ON(!(h)->used); \
- (h)->data[0]; \
-})
+#define ANYSINT_MAX(t) \
+ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
-#define heap_full(h) ((h)->used == (h)->size)
+#include "printbuf.h"
-#define heap_sift_down(h, i, cmp) \
-do { \
- size_t _c, _j = i; \
- \
- for (; _j * 2 + 1 < (h)->used; _j = _c) { \
- _c = _j * 2 + 1; \
- if (_c + 1 < (h)->used && \
- cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0) \
- _c++; \
- \
- if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \
- break; \
- heap_swap(h, _c, _j); \
- } \
-} while (0)
+#define prt_vprintf(_out, ...) bch2_prt_vprintf(_out, __VA_ARGS__)
+#define prt_printf(_out, ...) bch2_prt_printf(_out, __VA_ARGS__)
+#define printbuf_str(_buf) bch2_printbuf_str(_buf)
+#define printbuf_exit(_buf) bch2_printbuf_exit(_buf)
-#define heap_sift_up(h, i, cmp) \
-do { \
- while (i) { \
- size_t p = (i - 1) / 2; \
- if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \
- break; \
- heap_swap(h, i, p); \
- i = p; \
- } \
-} while (0)
+#define printbuf_tabstops_reset(_buf) bch2_printbuf_tabstops_reset(_buf)
+#define printbuf_tabstop_pop(_buf) bch2_printbuf_tabstop_pop(_buf)
+#define printbuf_tabstop_push(_buf, _n) bch2_printbuf_tabstop_push(_buf, _n)
-#define __heap_add(h, d, cmp) \
-do { \
- size_t _i = (h)->used++; \
- (h)->data[_i] = d; \
- \
- heap_sift_up(h, _i, cmp); \
-} while (0)
+#define printbuf_indent_add(_out, _n) bch2_printbuf_indent_add(_out, _n)
+#define printbuf_indent_sub(_out, _n) bch2_printbuf_indent_sub(_out, _n)
-#define heap_add(h, d, cmp) \
-({ \
- bool _r = !heap_full(h); \
- if (_r) \
- __heap_add(h, d, cmp); \
- _r; \
-})
+#define prt_newline(_out) bch2_prt_newline(_out)
+#define prt_tab(_out) bch2_prt_tab(_out)
+#define prt_tab_rjust(_out) bch2_prt_tab_rjust(_out)
-#define heap_add_or_replace(h, new, cmp) \
-do { \
- if (!heap_add(h, new, cmp) && \
- cmp(h, new, heap_peek(h)) >= 0) { \
- (h)->data[0] = new; \
- heap_sift_down(h, 0, cmp); \
- } \
-} while (0)
+#define prt_bytes_indented(...) bch2_prt_bytes_indented(__VA_ARGS__)
+#define prt_u64(_out, _v) prt_printf(_out, "%llu", (u64) (_v))
+#define prt_human_readable_u64(...) bch2_prt_human_readable_u64(__VA_ARGS__)
+#define prt_human_readable_s64(...) bch2_prt_human_readable_s64(__VA_ARGS__)
+#define prt_units_u64(...) bch2_prt_units_u64(__VA_ARGS__)
+#define prt_units_s64(...) bch2_prt_units_s64(__VA_ARGS__)
+#define prt_string_option(...) bch2_prt_string_option(__VA_ARGS__)
+#define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__)
+#define prt_bitflags_vector(...) bch2_prt_bitflags_vector(__VA_ARGS__)
-#define heap_del(h, i, cmp) \
-do { \
- size_t _i = (i); \
- \
- BUG_ON(_i >= (h)->used); \
- (h)->used--; \
- heap_swap(h, _i, (h)->used); \
- heap_sift_up(h, _i, cmp); \
- heap_sift_down(h, _i, cmp); \
-} while (0)
+void bch2_pr_time_units(struct printbuf *, u64);
+void bch2_prt_datetime(struct printbuf *, time64_t);
-#define heap_pop(h, d, cmp) \
-({ \
- bool _r = (h)->used; \
- if (_r) { \
- (d) = (h)->data[0]; \
- heap_del(h, 0, cmp); \
- } \
- _r; \
-})
+#ifdef __KERNEL__
+static inline void uuid_unparse_lower(u8 *uuid, char *out)
+{
+ sprintf(out, "%pUb", uuid);
+}
+#else
+#include <uuid/uuid.h>
+#endif
-#define heap_resort(heap, cmp) \
-do { \
- ssize_t _i; \
- for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \
- heap_sift_down(heap, _i, cmp); \
-} while (0)
+static inline void pr_uuid(struct printbuf *out, u8 *uuid)
+{
+ char uuid_str[40];
-#define ANYSINT_MAX(t) \
- ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
+ uuid_unparse_lower(uuid, uuid_str);
+ prt_printf(out, "%s", uuid_str);
+}
int bch2_strtoint_h(const char *, int *);
int bch2_strtouint_h(const char *, unsigned int *);
int bch2_strtoll_h(const char *, long long *);
int bch2_strtoull_h(const char *, unsigned long long *);
+int bch2_strtou64_h(const char *, u64 *);
static inline int bch2_strtol_h(const char *cp, long *res)
{
@@ -291,8 +182,8 @@ static inline int bch2_strtoul_h(const char *cp, long *res)
_r; \
})
-#define snprint(buf, size, var) \
- snprintf(buf, size, \
+#define snprint(out, var) \
+ prt_printf(out, \
type_is(var, int) ? "%i\n" \
: type_is(var, unsigned) ? "%u\n" \
: type_is(var, long) ? "%li\n" \
@@ -302,59 +193,31 @@ static inline int bch2_strtoul_h(const char *cp, long *res)
: type_is(var, char *) ? "%s\n" \
: "%i\n", var)
-ssize_t bch2_hprint(char *buf, s64 v);
-
bool bch2_is_zero(const void *, size_t);
-ssize_t bch2_scnprint_string_list(char *, size_t, const char * const[], size_t);
-
-ssize_t bch2_scnprint_flag_list(char *, size_t, const char * const[], u64);
-u64 bch2_read_flag_list(char *, const char * const[]);
+u64 bch2_read_flag_list(const char *, const char * const[]);
-#define NR_QUANTILES 15
-#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
-#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
-#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
-
-struct quantiles {
- struct quantile_entry {
- u64 m;
- u64 step;
- } entries[NR_QUANTILES];
-};
+void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
+void bch2_prt_u64_base2(struct printbuf *, u64);
-struct time_stat_buffer {
- unsigned nr;
- struct time_stat_buffer_entry {
- u64 start;
- u64 end;
- } entries[32];
-};
-
-struct time_stats {
- spinlock_t lock;
- u64 count;
- /* all fields are in nanoseconds */
- u64 average_duration;
- u64 average_frequency;
- u64 max_duration;
- u64 last_event;
- struct quantiles quantiles;
-
- struct time_stat_buffer __percpu *buffer;
-};
+void bch2_print_string_as_lines(const char *prefix, const char *lines);
+void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines);
-void __bch2_time_stats_update(struct time_stats *stats, u64, u64);
+typedef DARRAY(unsigned long) bch_stacktrace;
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t);
+void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *);
+int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned, gfp_t);
-static inline void bch2_time_stats_update(struct time_stats *stats, u64 start)
+static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
{
- __bch2_time_stats_update(stats, start, local_clock());
+#ifdef __KERNEL__
+ prt_printf(out, "%pg", bdev);
+#else
+ prt_str(out, bdev->name);
+#endif
}
-size_t bch2_time_stats_print(struct time_stats *, char *, size_t);
-
-void bch2_time_stats_exit(struct time_stats *);
-void bch2_time_stats_init(struct time_stats *);
+void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
#define ewma_add(ewma, val, weight) \
({ \
@@ -383,7 +246,6 @@ static inline void bch2_ratelimit_reset(struct bch_ratelimit *d)
u64 bch2_ratelimit_delay(struct bch_ratelimit *);
void bch2_ratelimit_increment(struct bch_ratelimit *, u64);
-int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *);
struct bch_pd_controller {
struct bch_ratelimit rate;
@@ -402,14 +264,16 @@ struct bch_pd_controller {
s64 last_change;
s64 last_target;
- /* If true, the rate will not increase if bch2_ratelimit_delay()
- * is not being called often enough. */
+ /*
+ * If true, the rate will not increase if bch2_ratelimit_delay()
+ * is not being called often enough.
+ */
bool backpressure;
};
void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int);
void bch2_pd_controller_init(struct bch_pd_controller *);
-size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *);
+void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *);
#define sysfs_pd_controller_attribute(name) \
rw_attribute(name##_rate); \
@@ -433,7 +297,7 @@ do { \
sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \
\
if (attr == &sysfs_##name##_rate_debug) \
- return bch2_pd_controller_print_debug(var, buf); \
+ bch2_pd_controller_debug_to_text(out, var); \
} while (0)
#define sysfs_pd_controller_store(name, var) \
@@ -447,95 +311,24 @@ do { \
(var)->p_term_inverse, 1, INT_MAX); \
} while (0)
-#define __DIV_SAFE(n, d, zero) \
-({ \
- typeof(n) _n = (n); \
- typeof(d) _d = (d); \
- _d ? _n / _d : zero; \
-})
-
-#define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0)
-
#define container_of_or_null(ptr, type, member) \
({ \
typeof(ptr) _ptr = ptr; \
_ptr ? container_of(_ptr, type, member) : NULL; \
})
-#define RB_INSERT(root, new, member, cmp) \
-({ \
- __label__ dup; \
- struct rb_node **n = &(root)->rb_node, *parent = NULL; \
- typeof(new) this; \
- int res, ret = -1; \
- \
- while (*n) { \
- parent = *n; \
- this = container_of(*n, typeof(*(new)), member); \
- res = cmp(new, this); \
- if (!res) \
- goto dup; \
- n = res < 0 \
- ? &(*n)->rb_left \
- : &(*n)->rb_right; \
- } \
- \
- rb_link_node(&(new)->member, parent, n); \
- rb_insert_color(&(new)->member, root); \
- ret = 0; \
-dup: \
- ret; \
-})
-
-#define RB_SEARCH(root, search, member, cmp) \
-({ \
- struct rb_node *n = (root)->rb_node; \
- typeof(&(search)) this, ret = NULL; \
- int res; \
- \
- while (n) { \
- this = container_of(n, typeof(search), member); \
- res = cmp(&(search), this); \
- if (!res) { \
- ret = this; \
- break; \
- } \
- n = res < 0 \
- ? n->rb_left \
- : n->rb_right; \
- } \
- ret; \
-})
-
-#define RB_GREATER(root, search, member, cmp) \
-({ \
- struct rb_node *n = (root)->rb_node; \
- typeof(&(search)) this, ret = NULL; \
- int res; \
- \
- while (n) { \
- this = container_of(n, typeof(search), member); \
- res = cmp(&(search), this); \
- if (res < 0) { \
- ret = this; \
- n = n->rb_left; \
- } else \
- n = n->rb_right; \
- } \
- ret; \
-})
-
-#define RB_FIRST(root, type, member) \
- container_of_or_null(rb_first(root), type, member)
-
-#define RB_LAST(root, type, member) \
- container_of_or_null(rb_last(root), type, member)
+static inline struct list_head *list_pop(struct list_head *head)
+{
+ if (list_empty(head))
+ return NULL;
-#define RB_NEXT(ptr, member) \
- container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member)
+ struct list_head *ret = head->next;
+ list_del_init(ret);
+ return ret;
+}
-#define RB_PREV(ptr, member) \
- container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member)
+#define list_pop_entry(head, type, member) \
+ container_of_or_null(list_pop(head), type, member)
/* Does linear interpolation between powers of two */
static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
@@ -549,13 +342,8 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
return x;
}
-void bch2_bio_map(struct bio *bio, void *base);
-int bch2_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask);
-
-static inline sector_t bdev_sectors(struct block_device *bdev)
-{
- return bdev->bd_inode->i_size >> 9;
-}
+void bch2_bio_map(struct bio *bio, void *base, size_t);
+int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
#define closure_bio_submit(bio, cl) \
do { \
@@ -563,6 +351,26 @@ do { \
submit_bio(bio); \
} while (0)
+#define kthread_wait(cond) \
+({ \
+ int _ret = 0; \
+ \
+ while (1) { \
+ set_current_state(TASK_INTERRUPTIBLE); \
+ if (kthread_should_stop()) { \
+ _ret = -1; \
+ break; \
+ } \
+ \
+ if (cond) \
+ break; \
+ \
+ schedule(); \
+ } \
+ set_current_state(TASK_RUNNING); \
+ _ret; \
+})
+
#define kthread_wait_freezable(cond) \
({ \
int _ret = 0; \
@@ -585,14 +393,25 @@ do { \
size_t bch2_rand_range(size_t);
-void memcpy_to_bio(struct bio *, struct bvec_iter, void *);
+void memcpy_to_bio(struct bio *, struct bvec_iter, const void *);
void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
+static inline void memcpy_u64s_small(void *dst, const void *src,
+ unsigned u64s)
+{
+ u64 *d = dst;
+ const u64 *s = src;
+
+ while (u64s--)
+ *d++ = *s++;
+}
+
static inline void __memcpy_u64s(void *dst, const void *src,
unsigned u64s)
{
#ifdef CONFIG_X86_64
long d0, d1, d2;
+
asm volatile("rep ; movsq"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
: "0" (u64s), "1" (dst), "2" (src)
@@ -629,6 +448,38 @@ static inline void memmove_u64s_down(void *dst, const void *src,
__memmove_u64s_down(dst, src, u64s);
}
+static inline void __memmove_u64s_down_small(void *dst, const void *src,
+ unsigned u64s)
+{
+ memcpy_u64s_small(dst, src, u64s);
+}
+
+static inline void memmove_u64s_down_small(void *dst, const void *src,
+ unsigned u64s)
+{
+ EBUG_ON(dst > src);
+
+ __memmove_u64s_down_small(dst, src, u64s);
+}
+
+static inline void __memmove_u64s_up_small(void *_dst, const void *_src,
+ unsigned u64s)
+{
+ u64 *dst = (u64 *) _dst + u64s;
+ u64 *src = (u64 *) _src + u64s;
+
+ while (u64s--)
+ *--dst = *--src;
+}
+
+static inline void memmove_u64s_up_small(void *dst, const void *src,
+ unsigned u64s)
+{
+ EBUG_ON(dst < src);
+
+ __memmove_u64s_up_small(dst, src, u64s);
+}
+
static inline void __memmove_u64s_up(void *_dst, const void *_src,
unsigned u64s)
{
@@ -637,6 +488,7 @@ static inline void __memmove_u64s_up(void *_dst, const void *_src,
#ifdef CONFIG_X86_64
long d0, d1, d2;
+
asm volatile("std ;\n"
"rep ; movsq\n"
"cld ;\n"
@@ -666,41 +518,14 @@ static inline void memmove_u64s(void *dst, const void *src,
__memmove_u64s_up(dst, src, u64s);
}
-static inline struct bio_vec next_contig_bvec(struct bio *bio,
- struct bvec_iter *iter)
+/* Set the last few bytes up to a u64 boundary given an offset into a buffer. */
+static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
{
- struct bio_vec bv = bio_iter_iovec(bio, *iter);
+ unsigned rem = round_up(bytes, sizeof(u64)) - bytes;
- bio_advance_iter(bio, iter, bv.bv_len);
-#ifndef CONFIG_HIGHMEM
- while (iter->bi_size) {
- struct bio_vec next = bio_iter_iovec(bio, *iter);
-
- if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len !=
- page_address(next.bv_page) + next.bv_offset)
- break;
-
- bv.bv_len += next.bv_len;
- bio_advance_iter(bio, iter, next.bv_len);
- }
-#endif
- return bv;
+ memset(s + bytes, c, rem);
}
-#define __bio_for_each_contig_segment(bv, bio, iter, start) \
- for (iter = (start); \
- (iter).bi_size && \
- ((bv = next_contig_bvec((bio), &(iter))), 1);)
-
-#define bio_for_each_contig_segment(bv, bio, iter) \
- __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter)
-
-size_t bch_scnmemcpy(char *, size_t, const char *, size_t);
-
-void sort_cmp_size(void *base, size_t num, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t));
-
/* just the memmove, doesn't update @_nr */
#define __array_insert_item(_array, _nr, _pos) \
memmove(&(_array)[(_pos) + 1], \
@@ -725,14 +550,46 @@ do { \
#define array_remove_item(_array, _nr, _pos) \
array_remove_items(_array, _nr, _pos, 1)
+static inline void __move_gap(void *array, size_t element_size,
+ size_t nr, size_t size,
+ size_t old_gap, size_t new_gap)
+{
+ size_t gap_end = old_gap + size - nr;
+
+ if (new_gap < old_gap) {
+ size_t move = old_gap - new_gap;
+
+ memmove(array + element_size * (gap_end - move),
+ array + element_size * (old_gap - move),
+ element_size * move);
+ } else if (new_gap > old_gap) {
+ size_t move = new_gap - old_gap;
+
+ memmove(array + element_size * old_gap,
+ array + element_size * gap_end,
+ element_size * move);
+ }
+}
+
+/* Move the gap in a gap buffer: */
+#define move_gap(_d, _new_gap) \
+do { \
+ BUG_ON(_new_gap > (_d)->nr); \
+ BUG_ON((_d)->gap > (_d)->nr); \
+ \
+ __move_gap((_d)->data, sizeof((_d)->data[0]), \
+ (_d)->nr, (_d)->size, (_d)->gap, _new_gap); \
+ (_d)->gap = _new_gap; \
+} while (0)
+
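One way to picture move_gap() (illustrative): with size = 8, nr = 5 and the gap
starting at index 2, elements A..E are laid out as

	A B . . . C D E		/* gap_end = gap + size - nr = 5 */

and move_gap(d, 4) slides C and D down into the gap, giving

	A B C D . . . E		/* gap now starts at index 4 */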
#define bubble_sort(_base, _nr, _cmp) \
do { \
- ssize_t _i, _end; \
+ ssize_t _i, _last; \
bool _swapped = true; \
\
- for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\
+	for (_last = (ssize_t) (_nr) - 1; _last > 0 && _swapped; --_last) {\
_swapped = false; \
- for (_i = 0; _i < _end; _i++) \
+ for (_i = 0; _i < _last; _i++) \
if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \
swap((_base)[_i], (_base)[_i + 1]); \
_swapped = true; \
@@ -740,4 +597,125 @@ do { \
} \
} while (0)
+#define per_cpu_sum(_p) \
+({ \
+ typeof(*_p) _ret = 0; \
+ \
+ int cpu; \
+ for_each_possible_cpu(cpu) \
+ _ret += *per_cpu_ptr(_p, cpu); \
+ _ret; \
+})
+
+static inline u64 percpu_u64_get(u64 __percpu *src)
+{
+ return per_cpu_sum(src);
+}
+
+static inline void percpu_u64_set(u64 __percpu *dst, u64 src)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ *per_cpu_ptr(dst, cpu) = 0;
+ this_cpu_write(*dst, src);
+}
+
+static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr)
+{
+ for (unsigned i = 0; i < nr; i++)
+ acc[i] += src[i];
+}
+
+static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src,
+ unsigned nr)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ acc_u64s(acc, per_cpu_ptr(src, cpu), nr);
+}
+
+static inline void percpu_memset(void __percpu *p, int c, size_t bytes)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ memset(per_cpu_ptr(p, cpu), c, bytes);
+}
+
+u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
+
+#define cmp_int(l, r) ((l > r) - (l < r))
+
+static inline int u8_cmp(u8 l, u8 r)
+{
+ return cmp_int(l, r);
+}
+
+static inline int cmp_le32(__le32 l, __le32 r)
+{
+ return cmp_int(le32_to_cpu(l), le32_to_cpu(r));
+}
+
+#include <linux/uuid.h>
+
+#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+
+static inline bool qstr_eq(const struct qstr l, const struct qstr r)
+{
+ return l.len == r.len && !memcmp(l.name, r.name, l.len);
+}
+
+void bch2_darray_str_exit(darray_str *);
+int bch2_split_devs(const char *, darray_str *);
+
+#ifdef __KERNEL__
+
+__must_check
+static inline int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
+{
+ return copy_to_user(to, from, n) ? -EFAULT : 0;
+}
+
+__must_check
+static inline int copy_from_user_errcode(void *to, const void __user *from, unsigned long n)
+{
+ return copy_from_user(to, from, n) ? -EFAULT : 0;
+}
+
+#endif
+
+static inline void mod_bit(long nr, volatile unsigned long *addr, bool v)
+{
+ if (v)
+ set_bit(nr, addr);
+ else
+ clear_bit(nr, addr);
+}
+
+static inline void __set_bit_le64(size_t bit, __le64 *addr)
+{
+ addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64));
+}
+
+static inline void __clear_bit_le64(size_t bit, __le64 *addr)
+{
+ addr[bit / 64] &= ~cpu_to_le64(BIT_ULL(bit % 64));
+}
+
+static inline bool test_bit_le64(size_t bit, __le64 *addr)
+{
+ return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0;
+}
+
+static inline void memcpy_swab(void *_dst, void *_src, size_t len)
+{
+ u8 *dst = _dst + len;
+ u8 *src = _src;
+
+ while (len--)
+ *--dst = *src++;
+}
+
#endif /* _BCACHEFS_UTIL_H */
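(Not part of the patch — a minimal sketch of how the new little-endian bitmap helpers above behave, assuming the __set_bit_le64()/__clear_bit_le64()/test_bit_le64() definitions from this header are in scope. The word index is bit / 64 and the bit within the word is bit % 64, so the on-disk layout is the same regardless of host endianness:)

    __le64 bitmap[2] = {};              /* 128 on-disk bits */

    __set_bit_le64(3, bitmap);          /* word 0, bit 3 */
    __set_bit_le64(70, bitmap);         /* word 1, bit 6 */
    BUG_ON(!test_bit_le64(3, bitmap));
    BUG_ON(test_bit_le64(4, bitmap));

    __clear_bit_le64(70, bitmap);
    BUG_ON(test_bit_le64(70, bitmap));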
diff --git a/libbcachefs/varint.c b/libbcachefs/varint.c
new file mode 100644
index 00000000..6a78553d
--- /dev/null
+++ b/libbcachefs/varint.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bitops.h>
+#include <linux/math.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+
+#ifdef CONFIG_VALGRIND
+#include <valgrind/memcheck.h>
+#endif
+
+#include "varint.h"
+
+/**
+ * bch2_varint_encode - encode a variable length integer
+ * @out: destination to encode to
+ * @v: unsigned integer to encode
+ * Returns: size in bytes of the encoded integer - at most 9 bytes
+ */
+int bch2_varint_encode(u8 *out, u64 v)
+{
+ unsigned bits = fls64(v|1);
+ unsigned bytes = DIV_ROUND_UP(bits, 7);
+ __le64 v_le;
+
+ if (likely(bytes < 9)) {
+ v <<= bytes;
+ v |= ~(~0 << (bytes - 1));
+ v_le = cpu_to_le64(v);
+ memcpy(out, &v_le, bytes);
+ } else {
+ *out++ = 255;
+ bytes = 9;
+ put_unaligned_le64(v, out);
+ }
+
+ return bytes;
+}
+
+/**
+ * bch2_varint_decode - decode a variable length integer
+ * @in: varint to decode
+ * @end: end of buffer to decode from
+ * @out: on success, decoded integer
+ * Returns: size in bytes of the decoded integer - or -1 on failure (would
+ * have read past the end of the buffer)
+ */
+int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
+{
+ unsigned bytes = likely(in < end)
+ ? ffz(*in & 255) + 1
+ : 1;
+ u64 v;
+
+ if (unlikely(in + bytes > end))
+ return -1;
+
+ if (likely(bytes < 9)) {
+ __le64 v_le = 0;
+
+ memcpy(&v_le, in, bytes);
+ v = le64_to_cpu(v_le);
+ v >>= bytes;
+ } else {
+ v = get_unaligned_le64(++in);
+ }
+
+ *out = v;
+ return bytes;
+}
+
+/**
+ * bch2_varint_encode_fast - fast version of bch2_varint_encode
+ * @out: destination to encode to
+ * @v: unsigned integer to encode
+ * Returns: size in bytes of the encoded integer - at most 9 bytes
+ *
+ * This version assumes it's always safe to write 8 bytes to @out, even if the
+ * encoded integer would be smaller.
+ */
+int bch2_varint_encode_fast(u8 *out, u64 v)
+{
+ unsigned bits = fls64(v|1);
+ unsigned bytes = DIV_ROUND_UP(bits, 7);
+
+ if (likely(bytes < 9)) {
+ v <<= bytes;
+ v |= ~(~0U << (bytes - 1));
+ } else {
+ *out++ = 255;
+ bytes = 9;
+ }
+
+ put_unaligned_le64(v, out);
+ return bytes;
+}
+
+/**
+ * bch2_varint_decode_fast - fast version of bch2_varint_decode
+ * @in: varint to decode
+ * @end: end of buffer to decode from
+ * @out: on success, decoded integer
+ * Returns: size in bytes of the decoded integer - or -1 on failure (would
+ * have read past the end of the buffer)
+ *
+ * This version assumes that it is safe to read at most 8 bytes past the end of
+ * @end (we still return an error if the varint extends past @end).
+ */
+int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out)
+{
+#ifdef CONFIG_VALGRIND
+ VALGRIND_MAKE_MEM_DEFINED(in, 8);
+#endif
+ u64 v = get_unaligned_le64(in);
+ unsigned bytes = ffz(*in) + 1;
+
+ if (unlikely(in + bytes > end))
+ return -1;
+
+ if (likely(bytes < 9)) {
+ v >>= bytes;
+ v &= ~(~0ULL << (7 * bytes));
+ } else {
+ v = get_unaligned_le64(++in);
+ }
+
+ *out = v;
+ return bytes;
+}
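(Not part of the patch — a rough sketch of the encoding above, assuming the functions in this file: the count of trailing one bits in the first byte, plus one, gives the total length, so values up to 7 bits take a single byte and anything needing more than 63 bits falls back to the 9-byte form.)

    u8 buf[9];
    u64 v;

    int len = bch2_varint_encode(buf, 100); /* 7 bits -> 1 byte */
    BUG_ON(len != 1);
    BUG_ON(bch2_varint_decode(buf, buf + len, &v) != len);
    BUG_ON(v != 100);

    len = bch2_varint_encode(buf, 300);     /* 9 bits -> 2 bytes */
    BUG_ON(len != 2);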
diff --git a/libbcachefs/varint.h b/libbcachefs/varint.h
new file mode 100644
index 00000000..92a182fb
--- /dev/null
+++ b/libbcachefs/varint.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_VARINT_H
+#define _BCACHEFS_VARINT_H
+
+int bch2_varint_encode(u8 *, u64);
+int bch2_varint_decode(const u8 *, const u8 *, u64 *);
+
+int bch2_varint_encode_fast(u8 *, u64);
+int bch2_varint_decode_fast(const u8 *, const u8 *, u64 *);
+
+#endif /* _BCACHEFS_VARINT_H */
diff --git a/libbcachefs/vstructs.h b/libbcachefs/vstructs.h
index 79566442..2ad338e2 100644
--- a/libbcachefs/vstructs.h
+++ b/libbcachefs/vstructs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _VSTRUCTS_H
#define _VSTRUCTS_H
@@ -19,7 +20,7 @@
({ \
BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \
\
- (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \
+ (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \
})
#define vstruct_bytes(_s) \
@@ -40,21 +41,21 @@
(round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9)
#define vstruct_next(_s) \
- ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s)))
+ ((typeof(_s)) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
#define vstruct_last(_s) \
- ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s)))
+ ((typeof(&(_s)->start[0])) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
#define vstruct_end(_s) \
- ((void *) ((_s)->_data + __vstruct_u64s(_s)))
+ ((void *) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
#define vstruct_for_each(_s, _i) \
- for (_i = (_s)->start; \
+ for (typeof(&(_s)->start[0]) _i = (_s)->start; \
_i < vstruct_last(_s); \
_i = vstruct_next(_i))
-#define vstruct_for_each_safe(_s, _i, _t) \
- for (_i = (_s)->start; \
- _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \
- _i = _t)
+#define vstruct_for_each_safe(_s, _i) \
+ for (typeof(&(_s)->start[0]) _next, _i = (_s)->start; \
+ _i < vstruct_last(_s) && (_next = vstruct_next(_i), true); \
+ _i = _next)
#define vstruct_idx(_s, _idx) \
((typeof(&(_s)->start[0])) ((_s)->_data + (_idx)))
diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c
index 7d0fee3a..aed7c698 100644
--- a/libbcachefs/xattr.c
+++ b/libbcachefs/xattr.c
@@ -1,8 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "acl.h"
#include "bkey_methods.h"
#include "btree_update.h"
-#include "compress.h"
#include "extents.h"
#include "fs.h"
#include "rebalance.h"
@@ -61,110 +62,94 @@ static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
}
const struct bch_hash_desc bch2_xattr_hash_desc = {
- .btree_id = BTREE_ID_XATTRS,
- .key_type = BCH_XATTR,
- .whiteout_type = BCH_XATTR_WHITEOUT,
+ .btree_id = BTREE_ID_xattrs,
+ .key_type = KEY_TYPE_xattr,
.hash_key = xattr_hash_key,
.hash_bkey = xattr_hash_bkey,
.cmp_key = xattr_cmp_key,
.cmp_bkey = xattr_cmp_bkey,
};
-const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
{
- const struct xattr_handler *handler;
- struct bkey_s_c_xattr xattr;
-
- switch (k.k->type) {
- case BCH_XATTR:
- if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr))
- return "value too small";
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
+ unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len,
+ le16_to_cpu(xattr.v->x_val_len));
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s,
+ c, xattr_val_size_too_small,
+ "value too small (%zu < %u)",
+ bkey_val_u64s(k.k), val_u64s);
+
+ /* XXX why +4 ? */
+ val_u64s = xattr_val_u64s(xattr.v->x_name_len,
+ le16_to_cpu(xattr.v->x_val_len) + 4);
+
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s,
+ c, xattr_val_size_too_big,
+ "value too big (%zu > %u)",
+ bkey_val_u64s(k.k), val_u64s);
+
+ bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type),
+ c, xattr_invalid_type,
+ "invalid type (%u)", xattr.v->x_type);
+
+ bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len),
+ c, xattr_name_invalid_chars,
+ "xattr name has invalid characters");
+fsck_err:
+ return ret;
+}
- xattr = bkey_s_c_to_xattr(k);
+void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ const struct xattr_handler *handler;
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
- if (bkey_val_u64s(k.k) <
- xattr_val_u64s(xattr.v->x_name_len,
- le16_to_cpu(xattr.v->x_val_len)))
- return "value too small";
+ handler = bch2_xattr_type_to_handler(xattr.v->x_type);
+ if (handler && handler->prefix)
+ prt_printf(out, "%s", handler->prefix);
+ else if (handler)
+ prt_printf(out, "(type %u)", xattr.v->x_type);
+ else
+ prt_printf(out, "(unknown type %u)", xattr.v->x_type);
- if (bkey_val_u64s(k.k) >
- xattr_val_u64s(xattr.v->x_name_len,
- le16_to_cpu(xattr.v->x_val_len) + 4))
- return "value too big";
+ unsigned name_len = xattr.v->x_name_len;
+ unsigned val_len = le16_to_cpu(xattr.v->x_val_len);
+ unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) -
+ offsetof(struct bch_xattr, x_name);
- handler = bch2_xattr_type_to_handler(xattr.v->x_type);
- if (!handler)
- return "invalid type";
+ val_len = min_t(int, val_len, max_name_val_bytes - name_len);
+ name_len = min(name_len, max_name_val_bytes);
- if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len))
- return "xattr name has invalid characters";
+ prt_printf(out, "%.*s:%.*s",
+ name_len, xattr.v->x_name,
+ val_len, (char *) xattr_val(xattr.v));
- return NULL;
- case BCH_XATTR_WHITEOUT:
- return bkey_val_bytes(k.k) != 0
- ? "value size should be zero"
- : NULL;
-
- default:
- return "invalid type";
+ if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS ||
+ xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) {
+ prt_char(out, ' ');
+ bch2_acl_to_text(out, xattr_val(xattr.v),
+ le16_to_cpu(xattr.v->x_val_len));
}
}
-void bch2_xattr_to_text(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c k)
+static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode,
+ const char *name, void *buffer, size_t size, int type)
{
- const struct xattr_handler *handler;
- struct bkey_s_c_xattr xattr;
- size_t n = 0;
-
- switch (k.k->type) {
- case BCH_XATTR:
- xattr = bkey_s_c_to_xattr(k);
-
- handler = bch2_xattr_type_to_handler(xattr.v->x_type);
- if (handler && handler->prefix)
- n += scnprintf(buf + n, size - n, "%s", handler->prefix);
- else if (handler)
- n += scnprintf(buf + n, size - n, "(type %u)",
- xattr.v->x_type);
- else
- n += scnprintf(buf + n, size - n, "(unknown type %u)",
- xattr.v->x_type);
-
- n += bch_scnmemcpy(buf + n, size - n, xattr.v->x_name,
- xattr.v->x_name_len);
- n += scnprintf(buf + n, size - n, ":");
- n += bch_scnmemcpy(buf + n, size - n, xattr_val(xattr.v),
- le16_to_cpu(xattr.v->x_val_len));
- break;
- case BCH_XATTR_WHITEOUT:
- scnprintf(buf, size, "whiteout");
- break;
- }
-}
-
-int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
- const char *name, void *buffer, size_t size, int type)
-{
- struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_s_c_xattr xattr;
- int ret;
-
- bch2_trans_init(&trans, c);
-
- iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
- &inode->ei_str_hash, inode->v.i_ino,
- &X_SEARCH(type, name, strlen(name)),
- 0);
- if (IS_ERR(iter)) {
- bch2_trans_exit(&trans);
- BUG_ON(PTR_ERR(iter) == -EINTR);
-
- return PTR_ERR(iter) == -ENOENT ? -ENODATA : PTR_ERR(iter);
- }
+ struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode);
+ struct xattr_search_key search = X_SEARCH(type, name, strlen(name));
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
+ inode_inum(inode), &search, 0);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
- xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
ret = le16_to_cpu(xattr.v->x_val_len);
if (buffer) {
if (ret > size)
@@ -172,18 +157,33 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
else
memcpy(buffer, xattr_val(xattr.v), ret);
}
-
- bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
-int bch2_xattr_set(struct btree_trans *trans, u64 inum,
+int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
+ struct bch_inode_unpacked *inode_u,
const struct bch_hash_info *hash_info,
const char *name, const void *value, size_t size,
int type, int flags)
{
+ struct bch_fs *c = trans->c;
+ struct btree_iter inode_iter = { NULL };
int ret;
+ ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?:
+ bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent);
+ if (ret)
+ return ret;
+
+ inode_u->bi_ctime = bch2_current_time(c);
+
+ ret = bch2_inode_write(trans, &inode_iter, inode_u);
+ bch2_trans_iter_exit(trans, &inode_iter);
+
+ if (ret)
+ return ret;
+
if (value) {
struct bkey_i_xattr *xattr;
unsigned namelen = strlen(name);
@@ -205,10 +205,10 @@ int bch2_xattr_set(struct btree_trans *trans, u64 inum,
memcpy(xattr->v.x_name, name, namelen);
memcpy(xattr_val(&xattr->v), value, size);
- ret = __bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
+ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
inum, &xattr->k_i,
- (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)|
- (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0));
+ (flags & XATTR_CREATE ? STR_HASH_must_create : 0)|
+ (flags & XATTR_REPLACE ? STR_HASH_must_replace : 0));
} else {
struct xattr_search_key search =
X_SEARCH(type, name, strlen(name));
@@ -217,107 +217,155 @@ int bch2_xattr_set(struct btree_trans *trans, u64 inum,
hash_info, inum, &search);
}
- if (ret == -ENOENT)
+ if (bch2_err_matches(ret, ENOENT))
ret = flags & XATTR_REPLACE ? -ENODATA : 0;
return ret;
}
-static size_t bch2_xattr_emit(struct dentry *dentry,
- const struct bch_xattr *xattr,
- char *buffer, size_t buffer_size)
+struct xattr_buf {
+ char *buf;
+ size_t len;
+ size_t used;
+};
+
+static int __bch2_xattr_emit(const char *prefix,
+ const char *name, size_t name_len,
+ struct xattr_buf *buf)
{
- const struct xattr_handler *handler =
- bch2_xattr_type_to_handler(xattr->x_type);
-
- if (handler && (!handler->list || handler->list(dentry))) {
- const char *prefix = handler->prefix ?: handler->name;
- const size_t prefix_len = strlen(prefix);
- const size_t total_len = prefix_len + xattr->x_name_len + 1;
-
- if (buffer && total_len <= buffer_size) {
- memcpy(buffer, prefix, prefix_len);
- memcpy(buffer + prefix_len,
- xattr->x_name, xattr->x_name_len);
- buffer[prefix_len + xattr->x_name_len] = '\0';
- }
+ const size_t prefix_len = strlen(prefix);
+ const size_t total_len = prefix_len + name_len + 1;
- return total_len;
- } else {
- return 0;
+ if (buf->buf) {
+ if (buf->used + total_len > buf->len)
+ return -ERANGE;
+
+ memcpy(buf->buf + buf->used, prefix, prefix_len);
+ memcpy(buf->buf + buf->used + prefix_len,
+ name, name_len);
+ buf->buf[buf->used + prefix_len + name_len] = '\0';
}
+
+ buf->used += total_len;
+ return 0;
}
-ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+static inline const char *bch2_xattr_prefix(unsigned type, struct dentry *dentry)
{
- struct bch_fs *c = dentry->d_sb->s_fs_info;
- struct btree_iter iter;
- struct bkey_s_c k;
- const struct bch_xattr *xattr;
- u64 inum = dentry->d_inode->i_ino;
- ssize_t ret = 0;
- size_t len;
+ const struct xattr_handler *handler = bch2_xattr_type_to_handler(type);
- for_each_btree_key(&iter, c, BTREE_ID_XATTRS, POS(inum, 0), 0, k) {
- BUG_ON(k.k->p.inode < inum);
+ if (!xattr_handler_can_list(handler, dentry))
+ return NULL;
- if (k.k->p.inode > inum)
- break;
+ return xattr_prefix(handler);
+}
- if (k.k->type != BCH_XATTR)
- continue;
+static int bch2_xattr_emit(struct dentry *dentry,
+ const struct bch_xattr *xattr,
+ struct xattr_buf *buf)
+{
+ const char *prefix;
- xattr = bkey_s_c_to_xattr(k).v;
+ prefix = bch2_xattr_prefix(xattr->x_type, dentry);
+ if (!prefix)
+ return 0;
- len = bch2_xattr_emit(dentry, xattr, buffer, buffer_size);
- if (buffer) {
- if (len > buffer_size) {
- bch2_btree_iter_unlock(&iter);
- return -ERANGE;
- }
+ return __bch2_xattr_emit(prefix, xattr->x_name, xattr->x_name_len, buf);
+}
- buffer += len;
- buffer_size -= len;
- }
+static int bch2_xattr_list_bcachefs(struct bch_fs *c,
+ struct bch_inode_unpacked *inode,
+ struct xattr_buf *buf,
+ bool all)
+{
+ const char *prefix = all ? "bcachefs_effective." : "bcachefs.";
+ unsigned id;
+ int ret = 0;
+ u64 v;
+
+ for (id = 0; id < Inode_opt_nr; id++) {
+ v = bch2_inode_opt_get(inode, id);
+ if (!v)
+ continue;
- ret += len;
+ if (!all &&
+ !(inode->bi_fields_set & (1 << id)))
+ continue;
+ ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id],
+ strlen(bch2_inode_opts[id]), buf);
+ if (ret)
+ break;
}
- bch2_btree_iter_unlock(&iter);
return ret;
}
+ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+ struct bch_fs *c = dentry->d_sb->s_fs_info;
+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+ struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
+ u64 offset = 0, inum = inode->ei_inode.bi_inum;
+
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_xattrs,
+ POS(inum, offset),
+ POS(inum, U64_MAX),
+ inode->ei_inum.subvol, 0, k, ({
+ if (k.k->type != KEY_TYPE_xattr)
+ continue;
+
+ bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
+ }))) ?:
+ bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false) ?:
+ bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
+
+ return ret ? bch2_err_class(ret) : buf.used;
+}
+
static int bch2_xattr_get_handler(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *vinode,
const char *name, void *buffer, size_t size)
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ int ret = bch2_trans_do(c,
+ bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags));
+
+ if (ret < 0 && bch2_err_matches(ret, ENOENT))
+ ret = -ENODATA;
- return bch2_xattr_get(c, inode, name, buffer, size, handler->flags);
+ return bch2_err_class(ret);
}
static int bch2_xattr_set_handler(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
struct dentry *dentry, struct inode *vinode,
const char *name, const void *value,
size_t size, int flags)
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
+ struct bch_inode_unpacked inode_u;
+ int ret;
+
+ ret = bch2_trans_run(c,
+ commit_do(trans, NULL, NULL, 0,
+ bch2_xattr_set(trans, inode_inum(inode), &inode_u,
+ &hash, name, value, size,
+ handler->flags, flags)) ?:
+ (bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0));
- return bch2_trans_do(c, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC,
- bch2_xattr_set(&trans, inode->v.i_ino,
- &inode->ei_str_hash,
- name, value, size,
- handler->flags, flags));
+ return bch2_err_class(ret);
}
static const struct xattr_handler bch_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.get = bch2_xattr_get_handler,
.set = bch2_xattr_set_handler,
- .flags = BCH_XATTR_INDEX_USER,
+ .flags = KEY_TYPE_XATTR_INDEX_USER,
};
static bool bch2_xattr_trusted_list(struct dentry *dentry)
@@ -330,44 +378,86 @@ static const struct xattr_handler bch_xattr_trusted_handler = {
.list = bch2_xattr_trusted_list,
.get = bch2_xattr_get_handler,
.set = bch2_xattr_set_handler,
- .flags = BCH_XATTR_INDEX_TRUSTED,
+ .flags = KEY_TYPE_XATTR_INDEX_TRUSTED,
};
static const struct xattr_handler bch_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.get = bch2_xattr_get_handler,
.set = bch2_xattr_set_handler,
- .flags = BCH_XATTR_INDEX_SECURITY,
+ .flags = KEY_TYPE_XATTR_INDEX_SECURITY,
};
#ifndef NO_BCACHEFS_FS
-static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *vinode,
- const char *name, void *buffer, size_t size)
+static int opt_to_inode_opt(int id)
+{
+ switch (id) {
+#define x(name, ...) \
+ case Opt_##name: return Inode_opt_##name;
+ BCH_INODE_OPTS()
+#undef x
+ default:
+ return -1;
+ }
+}
+
+static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *vinode,
+ const char *name, void *buffer, size_t size,
+ bool all)
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_opts opts =
- bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode));
+ bch2_inode_opts_to_opts(&inode->ei_inode);
const struct bch_option *opt;
- int ret, id;
+ int id, inode_opt_id;
+ struct printbuf out = PRINTBUF;
+ int ret;
u64 v;
id = bch2_opt_lookup(name);
if (id < 0 || !bch2_opt_is_inode_opt(id))
return -EINVAL;
+ inode_opt_id = opt_to_inode_opt(id);
+ if (inode_opt_id < 0)
+ return -EINVAL;
+
opt = bch2_opt_table + id;
if (!bch2_opt_defined_by_id(&opts, id))
return -ENODATA;
+ if (!all &&
+ !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id)))
+ return -ENODATA;
+
v = bch2_opt_get_by_id(&opts, id);
+ bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0);
+
+ ret = out.pos;
+
+ if (out.allocation_failure) {
+ ret = -ENOMEM;
+ } else if (buffer) {
+ if (out.pos > size)
+ ret = -ERANGE;
+ else
+ memcpy(buffer, out.buf, out.pos);
+ }
- ret = bch2_opt_to_text(c, buffer, size, opt, v, 0);
+ printbuf_exit(&out);
+ return ret;
+}
- return ret < size || !buffer ? ret : -ERANGE;
+static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *vinode,
+ const char *name, void *buffer, size_t size)
+{
+ return __bch2_xattr_bcachefs_get(handler, dentry, vinode,
+ name, buffer, size, false);
}
struct inode_opt_set {
@@ -376,20 +466,25 @@ struct inode_opt_set {
bool defined;
};
-static int inode_opt_set_fn(struct bch_inode_info *inode,
+static int inode_opt_set_fn(struct btree_trans *trans,
+ struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
void *p)
{
struct inode_opt_set *s = p;
if (s->defined)
- bch2_inode_opt_set(bi, s->id, s->v);
+ bi->bi_fields_set |= 1U << s->id;
else
- bch2_inode_opt_clear(bi, s->id);
+ bi->bi_fields_set &= ~(1U << s->id);
+
+ bch2_inode_opt_set(bi, s->id, s->v);
+
return 0;
}
static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
struct dentry *dentry, struct inode *vinode,
const char *name, const void *value,
size_t size, int flags)
@@ -399,49 +494,79 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
const struct bch_option *opt;
char *buf;
struct inode_opt_set s;
- int ret;
+ int opt_id, inode_opt_id, ret;
- s.id = bch2_opt_lookup(name);
- if (s.id < 0 || !bch2_opt_is_inode_opt(s.id))
+ opt_id = bch2_opt_lookup(name);
+ if (opt_id < 0)
return -EINVAL;
- opt = bch2_opt_table + s.id;
+ opt = bch2_opt_table + opt_id;
+
+ inode_opt_id = opt_to_inode_opt(opt_id);
+ if (inode_opt_id < 0)
+ return -EINVAL;
+
+ s.id = inode_opt_id;
if (value) {
+ u64 v = 0;
+
buf = kmalloc(size + 1, GFP_KERNEL);
if (!buf)
return -ENOMEM;
memcpy(buf, value, size);
buf[size] = '\0';
- ret = bch2_opt_parse(c, opt, buf, &s.v);
+ ret = bch2_opt_parse(c, opt, buf, &v, NULL);
kfree(buf);
if (ret < 0)
- return ret;
+ goto err_class_exit;
- if (s.id == Opt_compression ||
- s.id == Opt_background_compression) {
- ret = bch2_check_set_has_compressed_data(c, s.v);
- if (ret)
- return ret;
- }
+ ret = bch2_opt_check_may_set(c, opt_id, v);
+ if (ret < 0)
+ goto err_class_exit;
+ s.v = v + 1;
s.defined = true;
} else {
+ /*
+ * Check if this option was set on the parent - if so, switch
+ * back to inheriting from the parent:
+ *
+ * rename() also has to deal with keeping inherited options up
+ * to date - see bch2_reinherit_attrs()
+ */
+ spin_lock(&dentry->d_lock);
+ if (!IS_ROOT(dentry)) {
+ struct bch_inode_info *dir =
+ to_bch_ei(d_inode(dentry->d_parent));
+
+ s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id);
+ } else {
+ s.v = 0;
+ }
+ spin_unlock(&dentry->d_lock);
+
s.defined = false;
}
mutex_lock(&inode->ei_update_lock);
- ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0);
- mutex_unlock(&inode->ei_update_lock);
-
- if (value &&
- (s.id == Opt_background_compression ||
- s.id == Opt_background_target))
- bch2_rebalance_add_work(c, inode->v.i_blocks);
+ if (inode_opt_id == Inode_opt_project) {
+ /*
+ * inode fields accessible via the xattr interface are stored
+ * with a +1 bias, so that 0 means unset:
+ */
+ ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0);
+ if (ret)
+ goto err;
+ }
- return ret;
+ ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0);
+err:
+ mutex_unlock(&inode->ei_update_lock);
+err_class_exit:
+ return bch2_err_class(ret);
}
static const struct xattr_handler bch_xattr_bcachefs_handler = {
@@ -450,28 +575,52 @@ static const struct xattr_handler bch_xattr_bcachefs_handler = {
.set = bch2_xattr_bcachefs_set,
};
+static int bch2_xattr_bcachefs_get_effective(
+ const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *vinode,
+ const char *name, void *buffer, size_t size)
+{
+ return __bch2_xattr_bcachefs_get(handler, dentry, vinode,
+ name, buffer, size, true);
+}
+
+/* Noop - xattrs in the bcachefs_effective namespace are inherited */
+static int bch2_xattr_bcachefs_set_effective(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *dentry, struct inode *vinode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ return 0;
+}
+
+static const struct xattr_handler bch_xattr_bcachefs_effective_handler = {
+ .prefix = "bcachefs_effective.",
+ .get = bch2_xattr_bcachefs_get_effective,
+ .set = bch2_xattr_bcachefs_set_effective,
+};
+
#endif /* NO_BCACHEFS_FS */
-const struct xattr_handler *bch2_xattr_handlers[] = {
+const struct xattr_handler * const bch2_xattr_handlers[] = {
&bch_xattr_user_handler,
- &posix_acl_access_xattr_handler,
- &posix_acl_default_xattr_handler,
&bch_xattr_trusted_handler,
&bch_xattr_security_handler,
#ifndef NO_BCACHEFS_FS
&bch_xattr_bcachefs_handler,
+ &bch_xattr_bcachefs_effective_handler,
#endif
NULL
};
static const struct xattr_handler *bch_xattr_handler_map[] = {
- [BCH_XATTR_INDEX_USER] = &bch_xattr_user_handler,
- [BCH_XATTR_INDEX_POSIX_ACL_ACCESS] =
- &posix_acl_access_xattr_handler,
- [BCH_XATTR_INDEX_POSIX_ACL_DEFAULT] =
- &posix_acl_default_xattr_handler,
- [BCH_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler,
- [BCH_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler,
+ [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler,
+ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] =
+ &nop_posix_acl_access,
+ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] =
+ &nop_posix_acl_default,
+ [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler,
+ [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler,
};
static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type)
diff --git a/libbcachefs/xattr.h b/libbcachefs/xattr.h
index 0689d327..132fbbd1 100644
--- a/libbcachefs/xattr.h
+++ b/libbcachefs/xattr.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_XATTR_H
#define _BCACHEFS_XATTR_H
@@ -5,13 +6,15 @@
extern const struct bch_hash_desc bch2_xattr_hash_desc;
-const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_xattr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-#define bch2_bkey_xattr_ops (struct bkey_ops) { \
- .key_invalid = bch2_xattr_invalid, \
+#define bch2_bkey_ops_xattr ((struct bkey_ops) { \
+ .key_validate = bch2_xattr_validate, \
.val_to_text = bch2_xattr_to_text, \
-}
+ .min_val_size = 8, \
+})
static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
{
@@ -35,14 +38,13 @@ struct xattr_handler;
struct bch_hash_info;
struct bch_inode_info;
-int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *,
- const char *, void *, size_t, int);
-
-int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *,
+/* Exported for cmd_migrate.c in tools: */
+int bch2_xattr_set(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *, const struct bch_hash_info *,
const char *, const void *, size_t, int, int);
ssize_t bch2_xattr_list(struct dentry *, char *, size_t);
-extern const struct xattr_handler *bch2_xattr_handlers[];
+extern const struct xattr_handler * const bch2_xattr_handlers[];
#endif /* _BCACHEFS_XATTR_H */
diff --git a/libbcachefs/xattr_format.h b/libbcachefs/xattr_format.h
new file mode 100644
index 00000000..c7916011
--- /dev/null
+++ b/libbcachefs/xattr_format.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_XATTR_FORMAT_H
+#define _BCACHEFS_XATTR_FORMAT_H
+
+#define KEY_TYPE_XATTR_INDEX_USER 0
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2
+#define KEY_TYPE_XATTR_INDEX_TRUSTED 3
+#define KEY_TYPE_XATTR_INDEX_SECURITY 4
+
+struct bch_xattr {
+ struct bch_val v;
+ __u8 x_type;
+ __u8 x_name_len;
+ __le16 x_val_len;
+ __u8 x_name[] __counted_by(x_name_len);
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_XATTR_FORMAT_H */
diff --git a/linux/atomic64.c b/linux/atomic64.c
index 4654d092..2dbbc995 100644
--- a/linux/atomic64.c
+++ b/linux/atomic64.c
@@ -157,6 +157,21 @@ long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n)
return val;
}
+bool atomic64_try_cmpxchg(atomic64_t *v, s64 *o, s64 n)
+{
+ unsigned long flags;
+ raw_spinlock_t *lock = lock_addr(v);
+
+ raw_spin_lock_irqsave(lock, flags);
+ bool ret = v->counter == *o;
+ if (ret)
+ v->counter = n;
+ else
+ *o = v->counter;
+ raw_spin_unlock_irqrestore(lock, flags);
+ return ret;
+}
+
long long atomic64_xchg(atomic64_t *v, long long new)
{
unsigned long flags;
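(Not part of the patch — the usual compare-and-swap loop the new atomic64_try_cmpxchg() shim enables; counter_add_bounded() is a made-up name, purely for illustration, and atomic64_read() is assumed to be available from the same shim layer.)

    /* Add @a to @v unless the result would exceed @max. */
    static bool counter_add_bounded(atomic64_t *v, s64 a, s64 max)
    {
        s64 old = atomic64_read(v);

        do {
            if (old + a > max)
                return false;
            /* on failure, @old has been updated to the current value */
        } while (!atomic64_try_cmpxchg(v, &old, old + a));

        return true;
    }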
diff --git a/linux/bio.c b/linux/bio.c
index c4cdceaa..08dcd584 100644
--- a/linux/bio.c
+++ b/linux/bio.c
@@ -52,6 +52,15 @@ int blk_status_to_errno(blk_status_t status)
return blk_errors[idx].err;
}
+const char *blk_status_to_str(blk_status_t status)
+{
+ int idx = (__force int)status;
+
+ if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
+ return "(invalid error)";
+ return blk_errors[idx].name;
+}
+
void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
struct bio *src, struct bvec_iter *src_iter)
{
@@ -111,29 +120,30 @@ void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
}
}
-void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
+static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
{
- /*
- * most users will be overriding ->bi_bdev with a new target,
- * so we don't set nor calculate new physical/hw segment counts here
- */
- bio->bi_bdev = bio_src->bi_bdev;
bio_set_flag(bio, BIO_CLONED);
- bio->bi_opf = bio_src->bi_opf;
+ bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_iter = bio_src->bi_iter;
- bio->bi_io_vec = bio_src->bi_io_vec;
+ return 0;
}
-struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
+struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src,
+ gfp_t gfp, struct bio_set *bs)
{
- struct bio *b;
+ struct bio *bio;
+
+ bio = bio_alloc_bioset(bdev, 0, bio_src->bi_opf, gfp, bs);
+ if (!bio)
+ return NULL;
- b = bio_alloc_bioset(gfp_mask, 0, bs);
- if (!b)
+ if (__bio_clone(bio, bio_src, gfp) < 0) {
+ bio_put(bio);
return NULL;
+ }
+ bio->bi_io_vec = bio_src->bi_io_vec;
- __bio_clone_fast(b, bio);
- return b;
+ return bio;
}
struct bio *bio_split(struct bio *bio, int sectors,
@@ -144,15 +154,7 @@ struct bio *bio_split(struct bio *bio, int sectors,
BUG_ON(sectors <= 0);
BUG_ON(sectors >= bio_sectors(bio));
- /*
- * Discards need a mutable bio_vec to accommodate the payload
- * required by the DSM TRIM and UNMAP commands.
- */
- if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE)
- split = bio_clone_bioset(bio, gfp, bs);
- else
- split = bio_clone_fast(bio, gfp, bs);
-
+ split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs);
if (!split)
return NULL;
@@ -165,10 +167,10 @@ struct bio *bio_split(struct bio *bio, int sectors,
void bio_free_pages(struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bvec;
- int i;
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, iter)
__free_page(bvec->bv_page);
}
@@ -179,9 +181,16 @@ void bio_advance(struct bio *bio, unsigned bytes)
static void bio_free(struct bio *bio)
{
- unsigned front_pad = bio->bi_pool ? bio->bi_pool->front_pad : 0;
+ struct bio_set *bs = bio->bi_pool;
- kfree((void *) bio - front_pad);
+ if (bs) {
+ if (bio->bi_max_vecs > BIO_INLINE_VECS)
+ mempool_free(bio->bi_io_vec, &bs->bvec_pool);
+
+ mempool_free((void *) bio - bs->front_pad, &bs->bio_pool);
+ } else {
+ kfree(bio);
+ }
}
void bio_put(struct bio *bio)
@@ -199,6 +208,23 @@ void bio_put(struct bio *bio)
}
}
+int bio_add_page(struct bio *bio, struct page *page,
+ unsigned int len, unsigned int off)
+{
+ struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
+
+ WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
+ WARN_ON_ONCE(bio->bi_vcnt >= bio->bi_max_vecs);
+
+ bv->bv_page = page;
+ bv->bv_offset = off;
+ bv->bv_len = len;
+
+ bio->bi_iter.bi_size += len;
+ bio->bi_vcnt++;
+ return len;
+}
+
static inline bool bio_remaining_done(struct bio *bio)
{
/*
@@ -256,64 +282,128 @@ again:
bio->bi_end_io(bio);
}
-void bio_reset(struct bio *bio)
+void bio_reset(struct bio *bio, struct block_device *bdev, unsigned int opf)
{
unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
memset(bio, 0, BIO_RESET_BYTES);
- bio->bi_flags = flags;
+ bio->bi_bdev = bdev;
+ bio->bi_opf = opf;
+ bio->bi_flags = flags;
atomic_set(&bio->__bi_remaining, 1);
}
-struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
+struct bio *bio_kmalloc(unsigned int nr_iovecs, gfp_t gfp_mask)
{
- unsigned front_pad = bs ? bs->front_pad : 0;
struct bio *bio;
- void *p;
-
- p = kmalloc(front_pad +
- sizeof(struct bio) +
- nr_iovecs * sizeof(struct bio_vec),
- gfp_mask);
- if (unlikely(!p))
+ bio = kmalloc(sizeof(struct bio) +
+ sizeof(struct bio_vec) * nr_iovecs, gfp_mask);
+ if (unlikely(!bio))
return NULL;
+ bio_init(bio, NULL, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs, 0);
+ bio->bi_pool = NULL;
+ return bio;
+}
- bio = p + front_pad;
- bio_init(bio, bio->bi_inline_vecs, nr_iovecs);
- bio->bi_pool = bs;
+struct bio *bio_alloc(struct block_device *bdev, unsigned nr_iovecs,
+ blk_opf_t opf, gfp_t gfp_mask)
+{
+ struct bio *bio;
+ bio = kmalloc(sizeof(struct bio) +
+ sizeof(struct bio_vec) * nr_iovecs, gfp_mask);
+ if (unlikely(!bio))
+ return NULL;
+ bio_init(bio, bdev, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs, opf);
+ bio->bi_pool = NULL;
return bio;
}
-struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
+static struct bio_vec *bvec_alloc(mempool_t *pool, int *nr_vecs,
+ gfp_t gfp_mask)
+{
+ *nr_vecs = roundup_pow_of_two(*nr_vecs);
+ /*
+ * Try a slab allocation first for all smaller allocations. If that
+ * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool.
+ * The mempool is sized to handle up to BIO_MAX_VECS entries.
+ */
+ if (*nr_vecs < BIO_MAX_VECS) {
+ struct bio_vec *bvl;
+
+ bvl = kmalloc(sizeof(*bvl) * *nr_vecs, gfp_mask);
+ if (likely(bvl))
+ return bvl;
+ *nr_vecs = BIO_MAX_VECS;
+ }
+
+ return mempool_alloc(pool, gfp_mask);
+}
+
+struct bio *bio_alloc_bioset(struct block_device *bdev,
+ unsigned nr_iovecs,
+ unsigned opf,
+ gfp_t gfp_mask,
struct bio_set *bs)
{
- struct bvec_iter iter;
- struct bio_vec bv;
struct bio *bio;
+ void *p;
- bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
- if (!bio)
+ if (nr_iovecs > BIO_MAX_VECS)
+ return NULL;
+
+ p = mempool_alloc(&bs->bio_pool, gfp_mask);
+ if (unlikely(!p))
return NULL;
- bio->bi_bdev = bio_src->bi_bdev;
- bio->bi_opf = bio_src->bi_opf;
- bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
- bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
-
- switch (bio_op(bio)) {
- case REQ_OP_DISCARD:
- case REQ_OP_SECURE_ERASE:
- break;
- case REQ_OP_WRITE_SAME:
- bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
- break;
- default:
- bio_for_each_segment(bv, bio_src, iter)
- bio->bi_io_vec[bio->bi_vcnt++] = bv;
- break;
+ bio = p + bs->front_pad;
+ if (nr_iovecs > BIO_INLINE_VECS) {
+ struct bio_vec *bvl = NULL;
+
+ bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask);
+ if (unlikely(!bvl))
+ goto err_free;
+
+ bio_init(bio, bdev, bvl, nr_iovecs, opf);
+ } else if (nr_iovecs) {
+ bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf);
+ } else {
+ bio_init(bio, bdev, NULL, 0, opf);
}
+ bio->bi_pool = bs;
return bio;
+
+err_free:
+ mempool_free(p, &bs->bio_pool);
+ return NULL;
+}
+
+void bioset_exit(struct bio_set *bs)
+{
+ mempool_exit(&bs->bio_pool);
+ mempool_exit(&bs->bvec_pool);
+}
+
+int bioset_init(struct bio_set *bs,
+ unsigned int pool_size,
+ unsigned int front_pad,
+ int flags)
+{
+ int ret;
+
+ bs->front_pad = front_pad;
+ if (flags & BIOSET_NEED_BVECS)
+ bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
+ else
+ bs->back_pad = 0;
+
+ ret = mempool_init_kmalloc_pool(&bs->bio_pool, pool_size, bs->front_pad +
+ sizeof(struct bio) + bs->back_pad) ?:
+ mempool_init_kmalloc_pool(&bs->bvec_pool, pool_size,
+ sizeof(struct bio_vec) * BIO_MAX_VECS);
+ if (ret)
+ bioset_exit(bs);
+ return ret;
}
diff --git a/linux/bitrev.c b/linux/bitrev.c
deleted file mode 100644
index 61207bb7..00000000
--- a/linux/bitrev.c
+++ /dev/null
@@ -1,37 +0,0 @@
-#include <linux/types.h>
-#include <linux/bitrev.h>
-
-const u8 byte_rev_table[256] = {
- 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0,
- 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
- 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8,
- 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
- 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4,
- 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
- 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec,
- 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
- 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2,
- 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
- 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea,
- 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
- 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6,
- 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
- 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee,
- 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
- 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1,
- 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
- 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9,
- 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9,
- 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5,
- 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
- 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed,
- 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
- 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3,
- 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3,
- 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb,
- 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
- 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7,
- 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7,
- 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef,
- 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff,
-};
diff --git a/linux/blkdev.c b/linux/blkdev.c
index 156d5353..e496fc11 100644
--- a/linux/blkdev.c
+++ b/linux/blkdev.c
@@ -10,6 +10,10 @@
#include <libaio.h>
+#ifdef CONFIG_VALGRIND
+#include <valgrind/memcheck.h>
+#endif
+
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/completion.h>
@@ -18,7 +22,16 @@
#include "tools-util.h"
+struct fops {
+ void (*init)(void);
+ void (*cleanup)(void);
+ void (*read)(struct bio *bio, struct iovec * iov, unsigned i);
+ void (*write)(struct bio *bio, struct iovec * iov, unsigned i);
+};
+
+static struct fops *fops;
static io_context_t aio_ctx;
+static atomic_t running_requests;
void generic_make_request(struct bio *bio)
{
@@ -45,39 +58,28 @@ void generic_make_request(struct bio *bio)
iov = alloca(sizeof(*iov) * i);
i = 0;
- bio_for_each_segment(bv, bio, iter)
+ bio_for_each_segment(bv, bio, iter) {
+ void *start = page_address(bv.bv_page) + bv.bv_offset;
+ size_t len = bv.bv_len;
+
iov[i++] = (struct iovec) {
- .iov_base = page_address(bv.bv_page) + bv.bv_offset,
- .iov_len = bv.bv_len,
+ .iov_base = start,
+ .iov_len = len,
};
- struct iocb iocb = {
- .data = bio,
- .aio_fildes = bio->bi_opf & REQ_FUA
- ? bio->bi_bdev->bd_sync_fd
- : bio->bi_bdev->bd_fd,
- }, *iocbp = &iocb;
+#ifdef CONFIG_VALGRIND
+ /* To be pedantic, this should only be done on IO completion. */
+ if (bio_op(bio) == REQ_OP_READ)
+ VALGRIND_MAKE_MEM_DEFINED(start, len);
+#endif
+ }
switch (bio_op(bio)) {
case REQ_OP_READ:
- iocb.aio_lio_opcode = IO_CMD_PREADV;
- iocb.u.v.vec = iov;
- iocb.u.v.nr = i;
- iocb.u.v.offset = bio->bi_iter.bi_sector << 9;
-
- ret = io_submit(aio_ctx, 1, &iocbp);
- if (ret != 1)
- die("io_submit err: %s", strerror(-ret));
+ fops->read(bio, iov, i);
break;
case REQ_OP_WRITE:
- iocb.aio_lio_opcode = IO_CMD_PWRITEV;
- iocb.u.v.vec = iov;
- iocb.u.v.nr = i;
- iocb.u.v.offset = bio->bi_iter.bi_sector << 9;
-
- ret = io_submit(aio_ctx, 1, &iocbp);
- if (ret != 1)
- die("io_submit err: %s", strerror(-ret));
+ fops->write(bio, iov, i);
break;
case REQ_OP_FLUSH:
ret = fsync(bio->bi_bdev->bd_fd);
@@ -111,11 +113,19 @@ int submit_bio_wait(struct bio *bio)
int blkdev_issue_discard(struct block_device *bdev,
sector_t sector, sector_t nr_sects,
- gfp_t gfp_mask, unsigned long flags)
+ gfp_t gfp_mask)
{
return 0;
}
+int blkdev_issue_zeroout(struct block_device *bdev,
+ sector_t sector, sector_t nr_sects,
+ gfp_t gfp_mask, unsigned flags)
+{
+ /* Not yet implemented: */
+ BUG();
+}
+
unsigned bdev_logical_block_size(struct block_device *bdev)
{
struct stat statbuf;
@@ -126,12 +136,10 @@ unsigned bdev_logical_block_size(struct block_device *bdev)
BUG_ON(ret);
if (!S_ISBLK(statbuf.st_mode))
- return statbuf.st_blksize >> 9;
+ return statbuf.st_blksize;
- ret = ioctl(bdev->bd_fd, BLKPBSZGET, &blksize);
- BUG_ON(ret);
-
- return blksize >> 9;
+ xioctl(bdev->bd_fd, BLKPBSZGET, &blksize);
+ return blksize;
}
sector_t get_capacity(struct gendisk *disk)
@@ -154,76 +162,118 @@ sector_t get_capacity(struct gendisk *disk)
return bytes >> 9;
}
-void blkdev_put(struct block_device *bdev, fmode_t mode)
+void bdev_fput(struct file *file)
{
+ struct block_device *bdev = file_bdev(file);
+
fdatasync(bdev->bd_fd);
- close(bdev->bd_sync_fd);
close(bdev->bd_fd);
free(bdev);
+ free(file);
}
-struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
- void *holder)
+struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode,
+ void *holder, const struct blk_holder_ops *hop)
{
- struct block_device *bdev;
- int fd, sync_fd, flags = O_DIRECT;
+ int fd, flags = 0;
- if ((mode & (FMODE_READ|FMODE_WRITE)) == (FMODE_READ|FMODE_WRITE))
+ if ((mode & (BLK_OPEN_READ|BLK_OPEN_WRITE)) == (BLK_OPEN_READ|BLK_OPEN_WRITE))
flags = O_RDWR;
- else if (mode & FMODE_READ)
+ else if (mode & BLK_OPEN_READ)
flags = O_RDONLY;
- else if (mode & FMODE_WRITE)
+ else if (mode & BLK_OPEN_WRITE)
flags = O_WRONLY;
-#if 0
- /* using O_EXCL doesn't work with opening twice for an O_SYNC fd: */
- if (mode & FMODE_EXCL)
+ if (!(mode & BLK_OPEN_BUFFERED))
+ flags |= O_DIRECT;
+
+ if (mode & BLK_OPEN_EXCL)
flags |= O_EXCL;
-#endif
fd = open(path, flags);
if (fd < 0)
return ERR_PTR(-errno);
- sync_fd = open(path, flags|O_SYNC);
- if (sync_fd < 0) {
- assert(0);
- close(fd);
- return ERR_PTR(-errno);
- }
-
- bdev = malloc(sizeof(*bdev));
+ struct block_device *bdev = malloc(sizeof(*bdev));
memset(bdev, 0, sizeof(*bdev));
strncpy(bdev->name, path, sizeof(bdev->name));
bdev->name[sizeof(bdev->name) - 1] = '\0';
+ bdev->bd_dev = xfstat(fd).st_rdev;
bdev->bd_fd = fd;
- bdev->bd_sync_fd = sync_fd;
bdev->bd_holder = holder;
bdev->bd_disk = &bdev->__bd_disk;
- bdev->bd_bdi = &bdev->__bd_bdi;
- bdev->queue.backing_dev_info = bdev->bd_bdi;
+ bdev->bd_disk->bdi = &bdev->bd_disk->__bdi;
+ bdev->queue.backing_dev_info = bdev->bd_disk->bdi;
+ bdev->bd_inode = &bdev->__bd_inode;
- return bdev;
+ struct file *file = calloc(sizeof(*file), 1);
+ file->f_inode = bdev->bd_inode;
+
+ return file;
}
-void bdput(struct block_device *bdev)
+int lookup_bdev(const char *path, dev_t *dev)
{
- BUG();
+ return -EINVAL;
+}
+
+static void io_fallback(void)
+{
+ fops++;
+ if (fops->init == NULL)
+ die("no fallback possible, something is very wrong");
+ fops->init();
+}
+
+static void sync_check(struct bio *bio, int ret)
+{
+ if (ret != bio->bi_iter.bi_size) {
+ die("IO error: %s\n", strerror(-ret));
+ }
+
+ if (bio->bi_opf & REQ_FUA) {
+ ret = fdatasync(bio->bi_bdev->bd_fd);
+ if (ret)
+ die("fsync error: %s\n", strerror(-ret));
+ }
+ bio_endio(bio);
}
-struct block_device *lookup_bdev(const char *path)
+static void sync_init(void) {}
+
+static void sync_cleanup(void)
{
- return ERR_PTR(-EINVAL);
+ /* not necessary? */
+ sync();
}
+static void sync_read(struct bio *bio, struct iovec * iov, unsigned i)
+{
+
+ ssize_t ret = preadv(bio->bi_bdev->bd_fd, iov, i,
+ bio->bi_iter.bi_sector << 9);
+ sync_check(bio, ret);
+}
+
+static void sync_write(struct bio *bio, struct iovec * iov, unsigned i)
+{
+ ssize_t ret = pwritev2(bio->bi_bdev->bd_fd, iov, i,
+ bio->bi_iter.bi_sector << 9,
+ bio->bi_opf & REQ_FUA ? RWF_SYNC : 0);
+ sync_check(bio, ret);
+}
+
+static DECLARE_WAIT_QUEUE_HEAD(aio_events_completed);
+
static int aio_completion_thread(void *arg)
{
struct io_event events[8], *ev;
int ret;
+ bool stop = false;
- while (1) {
+ while (!stop) {
ret = io_getevents(aio_ctx, 1, ARRAY_SIZE(events),
events, NULL);
@@ -231,28 +281,149 @@ static int aio_completion_thread(void *arg)
continue;
if (ret < 0)
die("io_getevents() error: %s", strerror(-ret));
+ if (ret)
+ wake_up(&aio_events_completed);
for (ev = events; ev < events + ret; ev++) {
struct bio *bio = (struct bio *) ev->data;
+ /* This should only happen during blkdev_cleanup() */
+ if (!bio) {
+ BUG_ON(atomic_read(&running_requests) != 0);
+ stop = true;
+ continue;
+ }
+
if (ev->res != bio->bi_iter.bi_size)
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
+ atomic_dec(&running_requests);
}
}
return 0;
}
-__attribute__((constructor(102)))
-static void blkdev_init(void)
+static struct task_struct *aio_task = NULL;
+
+static void aio_init(void)
{
struct task_struct *p;
+ long err = io_setup(256, &aio_ctx);
+ if (!err) {
+ p = kthread_run(aio_completion_thread, NULL, "aio_completion");
+ BUG_ON(IS_ERR(p));
+
+ aio_task = p;
- if (io_setup(256, &aio_ctx))
- die("io_setup() error: %m");
+ } else if (err == -ENOSYS) {
+ io_fallback();
+ } else {
+ die("io_setup() error: %s", strerror(err));
+ }
+}
+
+static void aio_cleanup(void)
+{
+ struct task_struct *p = NULL;
+ swap(aio_task, p);
+ get_task_struct(p);
+
+ /* I mean, really?! IO_CMD_NOOP is even defined, but not implemented. */
+ int fds[2];
+ int ret = pipe(fds);
+ if (ret != 0)
+ die("pipe err: %s", strerror(ret));
+
+ /* Wake up the completion thread with spurious work. */
+ int junk = 0;
+ struct iocb iocb = {
+ .aio_lio_opcode = IO_CMD_PWRITE,
+ .data = NULL, /* Signal to stop */
+ .aio_fildes = fds[1],
+ .u.c.buf = &junk,
+ .u.c.nbytes = 1,
+ }, *iocbp = &iocb;
+ ret = io_submit(aio_ctx, 1, &iocbp);
+ if (ret != 1)
+ die("io_submit cleanup err: %s", strerror(-ret));
- p = kthread_run(aio_completion_thread, NULL, "aio_completion");
- BUG_ON(IS_ERR(p));
+ ret = kthread_stop(p);
+ BUG_ON(ret);
+
+ put_task_struct(p);
+
+ close(fds[0]);
+ close(fds[1]);
+}
+
+static void aio_op(struct bio *bio, struct iovec *iov, unsigned i, int opcode)
+{
+ ssize_t ret;
+ struct iocb iocb = {
+ .data = bio,
+ .aio_fildes = bio->bi_bdev->bd_fd,
+ .aio_rw_flags = bio->bi_opf & REQ_FUA ? RWF_SYNC : 0,
+ .aio_lio_opcode = opcode,
+ .u.c.buf = iov,
+ .u.c.nbytes = i,
+ .u.c.offset = bio->bi_iter.bi_sector << 9,
+
+ }, *iocbp = &iocb;
+
+ atomic_inc(&running_requests);
+
+ wait_event(aio_events_completed,
+ (ret = io_submit(aio_ctx, 1, &iocbp)) != -EAGAIN);
+
+ if (ret != 1)
+ die("io_submit err: %s", strerror(-ret));
+}
+
+static void aio_read(struct bio *bio, struct iovec *iov, unsigned i)
+{
+ aio_op(bio, iov, i, IO_CMD_PREADV);
+}
+
+static void aio_write(struct bio *bio, struct iovec * iov, unsigned i)
+{
+ aio_op(bio, iov, i, IO_CMD_PWRITEV);
+}
+
+
+/* not implemented */
+static void uring_init(void) {
+ io_fallback();
+}
+
+struct fops fops_list[] = {
+ {
+ .init = uring_init,
+ }, {
+ .init = aio_init,
+ .cleanup = aio_cleanup,
+ .read = aio_read,
+ .write = aio_write,
+ }, {
+ .init = sync_init,
+ .cleanup = sync_cleanup,
+ .read = sync_read,
+ .write = sync_write,
+ }, {
+ /* NULL */
+ }
+};
+
+__attribute__((constructor(102)))
+static void blkdev_init(void)
+{
+ fops = fops_list;
+ fops->init();
+}
+
+__attribute__((destructor(102)))
+static void blkdev_cleanup(void)
+{
+ fops->cleanup();
}
diff --git a/linux/closure.c b/linux/closure.c
index 26a29356..2bfe7d2a 100644
--- a/linux/closure.c
+++ b/linux/closure.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Asynchronous refcounty things
*
@@ -8,16 +9,33 @@
#include <linux/closure.h>
#include <linux/debugfs.h>
#include <linux/export.h>
+#include <linux/rcupdate.h>
#include <linux/seq_file.h>
+#include <linux/sched/debug.h>
-static inline void closure_put_after_sub(struct closure *cl, int flags)
+static inline void closure_put_after_sub_checks(int flags)
{
int r = flags & CLOSURE_REMAINING_MASK;
- BUG_ON(flags & CLOSURE_GUARD_MASK);
- BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
+ if (WARN(flags & CLOSURE_GUARD_MASK,
+ "closure has guard bits set: %x (%u)",
+ flags & CLOSURE_GUARD_MASK, (unsigned) __fls(r)))
+ r &= ~CLOSURE_GUARD_MASK;
+
+ WARN(!r && (flags & ~CLOSURE_DESTRUCTOR),
+ "closure ref hit 0 with incorrect flags set: %x (%u)",
+ flags & ~CLOSURE_DESTRUCTOR, (unsigned) __fls(flags));
+}
+
+static inline void closure_put_after_sub(struct closure *cl, int flags)
+{
+ closure_put_after_sub_checks(flags);
+
+ if (!(flags & CLOSURE_REMAINING_MASK)) {
+ smp_acquire__after_ctrl_dep();
+
+ cl->closure_get_happened = false;
- if (!r) {
if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
atomic_set(&cl->remaining,
CLOSURE_REMAINING_INITIALIZER);
@@ -29,7 +47,7 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
closure_debug_destroy(cl);
if (destructor)
- destructor(cl);
+ destructor(&cl->work);
if (parent)
closure_put(parent);
@@ -40,37 +58,35 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
/* For clearing flags with the same atomic op as a put */
void closure_sub(struct closure *cl, int v)
{
- closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
+ closure_put_after_sub(cl, atomic_sub_return_release(v, &cl->remaining));
}
EXPORT_SYMBOL(closure_sub);
-/**
+/*
* closure_put - decrement a closure's refcount
*/
void closure_put(struct closure *cl)
{
- closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
+ closure_put_after_sub(cl, atomic_dec_return_release(&cl->remaining));
}
EXPORT_SYMBOL(closure_put);
-/**
+/*
* closure_wake_up - wake up all closures on a wait list, without memory barrier
*/
void __closure_wake_up(struct closure_waitlist *wait_list)
{
- struct llist_node *list, *next;
- struct closure *cl;
+ struct llist_node *list;
+ struct closure *cl, *t;
+ struct llist_node *reverse = NULL;
- /*
- * Grab entire list, reverse order to preserve FIFO ordering, and wake
- * everything up
- */
- for (list = llist_reverse_order(llist_del_all(&wait_list->list));
- list;
- list = next) {
- next = llist_next(list);
- cl = container_of(list, struct closure, list);
+ list = llist_del_all(&wait_list->list);
+ /* We first reverse the list to preserve FIFO ordering and fairness */
+ reverse = llist_reverse_order(list);
+
+ /* Then do the wakeups */
+ llist_for_each_entry_safe(cl, t, reverse, list) {
closure_set_waiting(cl, 0);
closure_sub(cl, CLOSURE_WAITING + 1);
}
@@ -79,9 +95,9 @@ EXPORT_SYMBOL(__closure_wake_up);
/**
* closure_wait - add a closure to a waitlist
- *
- * @waitlist will own a ref on @cl, which will be released when
+ * @waitlist: will own a ref on @cl, which will be released when
* closure_wake_up() is called on @waitlist.
+ * @cl: closure pointer.
*
*/
bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
@@ -89,6 +105,7 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
return false;
+ cl->closure_get_happened = true;
closure_set_waiting(cl, _RET_IP_);
atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
llist_add(&cl->list, &waitlist->list);
@@ -102,10 +119,17 @@ struct closure_syncer {
int done;
};
-static void closure_sync_fn(struct closure *cl)
+static CLOSURE_CALLBACK(closure_sync_fn)
{
- cl->s->done = 1;
- wake_up_process(cl->s->task);
+ struct closure *cl = container_of(ws, struct closure, work);
+ struct closure_syncer *s = cl->s;
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = READ_ONCE(s->task);
+ s->done = 1;
+ wake_up_process(p);
+ rcu_read_unlock();
}
void __sched __closure_sync(struct closure *cl)
@@ -113,11 +137,10 @@ void __sched __closure_sync(struct closure *cl)
struct closure_syncer s = { .task = current };
cl->s = &s;
- continue_at_noreturn(cl, closure_sync_fn, NULL);
+ continue_at(cl, closure_sync_fn, NULL);
while (1) {
- __set_current_state(TASK_UNINTERRUPTIBLE);
- smp_mb();
+ set_current_state(TASK_UNINTERRUPTIBLE);
if (s.done)
break;
schedule();
@@ -127,6 +150,78 @@ void __sched __closure_sync(struct closure *cl)
}
EXPORT_SYMBOL(__closure_sync);
+/*
+ * closure_return_sync - finish running a closure, synchronously (i.e. waiting
+ * for outstanding get()s to finish) and returning once closure refcount is 0.
+ *
+ * Unlike closure_sync() this doesn't reinit the ref to 1; subsequent
+ * closure_get_not_zero() calls will fail.
+ */
+void __sched closure_return_sync(struct closure *cl)
+{
+ struct closure_syncer s = { .task = current };
+
+ cl->s = &s;
+ set_closure_fn(cl, closure_sync_fn, NULL);
+
+ unsigned flags = atomic_sub_return_release(1 + CLOSURE_RUNNING - CLOSURE_DESTRUCTOR,
+ &cl->remaining);
+
+ closure_put_after_sub_checks(flags);
+
+ if (unlikely(flags & CLOSURE_REMAINING_MASK)) {
+ while (1) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (s.done)
+ break;
+ schedule();
+ }
+
+ __set_current_state(TASK_RUNNING);
+ }
+
+ if (cl->parent)
+ closure_put(cl->parent);
+}
+EXPORT_SYMBOL(closure_return_sync);
+
+int __sched __closure_sync_timeout(struct closure *cl, unsigned long timeout)
+{
+ struct closure_syncer s = { .task = current };
+ int ret = 0;
+
+ cl->s = &s;
+ continue_at(cl, closure_sync_fn, NULL);
+
+ while (1) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (s.done)
+ break;
+ if (!timeout) {
+ /*
+ * Carefully undo the continue_at() - but only if it
+ * hasn't completed, i.e. the final closure_put() hasn't
+ * happened yet:
+ */
+ unsigned old, new, v = atomic_read(&cl->remaining);
+ do {
+ old = v;
+ if (!old || (old & CLOSURE_RUNNING))
+ goto success;
+
+ new = old + CLOSURE_REMAINING_INITIALIZER;
+ } while ((v = atomic_cmpxchg(&cl->remaining, old, new)) != old);
+ ret = -ETIME;
+ }
+
+ timeout = schedule_timeout(timeout);
+ }
+success:
+ __set_current_state(TASK_RUNNING);
+ return ret;
+}
+EXPORT_SYMBOL(__closure_sync_timeout);
+
#ifdef CONFIG_DEBUG_CLOSURES
static LIST_HEAD(closure_list);
@@ -149,6 +244,9 @@ void closure_debug_destroy(struct closure *cl)
{
unsigned long flags;
+ if (cl->magic == CLOSURE_MAGIC_STACK)
+ return;
+
BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE);
cl->magic = CLOSURE_MAGIC_DEAD;
@@ -158,9 +256,7 @@ void closure_debug_destroy(struct closure *cl)
}
EXPORT_SYMBOL(closure_debug_destroy);
-static struct dentry *debug;
-
-static int debug_seq_show(struct seq_file *f, void *data)
+static int debug_show(struct seq_file *f, void *data)
{
struct closure *cl;
@@ -169,7 +265,7 @@ static int debug_seq_show(struct seq_file *f, void *data)
list_for_each_entry(cl, &closure_list, all) {
int r = atomic_read(&cl->remaining);
- seq_printf(f, "%p: %pF -> %pf p %p r %i ",
+ seq_printf(f, "%p: %pS -> %pS p %p r %i ",
cl, (void *) cl->ip, cl->fn, cl->parent,
r & CLOSURE_REMAINING_MASK);
@@ -179,31 +275,21 @@ static int debug_seq_show(struct seq_file *f, void *data)
r & CLOSURE_RUNNING ? "R" : "");
if (r & CLOSURE_WAITING)
- seq_printf(f, " W %pF\n",
+ seq_printf(f, " W %pS\n",
(void *) cl->waiting_on);
- seq_puts(f, "\n");
+ seq_putc(f, '\n');
}
spin_unlock_irq(&closure_list_lock);
return 0;
}
-static int debug_seq_open(struct inode *inode, struct file *file)
-{
- return single_open(file, debug_seq_show, NULL);
-}
-
-static const struct file_operations debug_ops = {
- .owner = THIS_MODULE,
- .open = debug_seq_open,
- .read = seq_read,
- .release = single_release
-};
+DEFINE_SHOW_ATTRIBUTE(debug);
static int __init closure_debug_init(void)
{
- debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops);
+ debugfs_create_file("closures", 0400, NULL, NULL, &debug_fops);
return 0;
}
late_initcall(closure_debug_init)
diff --git a/linux/crc64.c b/linux/crc64.c
new file mode 100644
index 00000000..0ef8ae6a
--- /dev/null
+++ b/linux/crc64.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Normal 64-bit CRC calculation.
+ *
+ * This is a basic crc64 implementation following the ECMA-182 specification,
+ * available at
+ * http://www.ecma-international.org/publications/standards/Ecma-182.htm
+ *
+ * Dr. Ross N. Williams's paper is an excellent introduction to CRC
+ * algorithms; the table-driven CRC64 code here follows the algorithm and
+ * worked example given there. The paper is available at
+ * http://www.ross.net/crc/download/crc_v3.txt
+ *
+ * crc64table[256] is the lookup table for the table-driven 64-bit CRC
+ * calculation; it is generated by gen_crc64table.c at kernel build time.
+ * The CRC64 polynomial also comes from the ECMA-182 specification and is
+ * defined as
+ *
+ * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
+ * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
+ * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
+ * x^7 + x^4 + x + 1
+ *
+ * Copyright 2018 SUSE Linux.
+ * Author: Coly Li <colyli@suse.de>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include "crc64table.h"
+
+MODULE_DESCRIPTION("CRC64 calculations");
+MODULE_LICENSE("GPL v2");
+
+/**
+ * crc64_be - Calculate bitwise big-endian ECMA-182 CRC64
+ * @crc: seed value for computation. 0 or (u64)~0 for a new CRC calculation,
+ or the previous crc64 value if computing incrementally.
+ * @p: pointer to buffer over which CRC64 is run
+ * @len: length of buffer @p
+ */
+u64 __pure crc64_be(u64 crc, const void *p, size_t len)
+{
+ size_t i, t;
+
+ const unsigned char *_p = p;
+
+ for (i = 0; i < len; i++) {
+ t = ((crc >> 56) ^ (*_p++)) & 0xFF;
+ crc = crc64table[t] ^ (crc << 8);
+ }
+
+ return crc;
+}
+EXPORT_SYMBOL_GPL(crc64_be);
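
For orientation, a hedged usage sketch of crc64_be() (not part of the patch): seed with 0 and chain the return value when hashing a buffer in pieces. buf and len are assumed caller-provided.

	static void example_crc64(const void *buf, size_t len)
	{
		/* one-shot vs. incremental use of crc64_be(); both yield the same CRC */
		u64 crc_oneshot = crc64_be(0, buf, len);

		u64 crc_incremental = crc64_be(0, buf, len / 2);
		crc_incremental = crc64_be(crc_incremental,
					   (const char *)buf + len / 2, len - len / 2);

		BUG_ON(crc_oneshot != crc_incremental);
	}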
diff --git a/linux/crc64table.h b/linux/crc64table.h
new file mode 100644
index 00000000..9964164d
--- /dev/null
+++ b/linux/crc64table.h
@@ -0,0 +1,135 @@
+/* this file is generated - do not edit */
+
+#include <linux/types.h>
+#include <linux/cache.h>
+
+static const u64 ____cacheline_aligned crc64table[256] = {
+ 0x0000000000000000ULL, 0x42f0e1eba9ea3693ULL,
+ 0x85e1c3d753d46d26ULL, 0xc711223cfa3e5bb5ULL,
+ 0x493366450e42ecdfULL, 0x0bc387aea7a8da4cULL,
+ 0xccd2a5925d9681f9ULL, 0x8e224479f47cb76aULL,
+ 0x9266cc8a1c85d9beULL, 0xd0962d61b56fef2dULL,
+ 0x17870f5d4f51b498ULL, 0x5577eeb6e6bb820bULL,
+ 0xdb55aacf12c73561ULL, 0x99a54b24bb2d03f2ULL,
+ 0x5eb4691841135847ULL, 0x1c4488f3e8f96ed4ULL,
+ 0x663d78ff90e185efULL, 0x24cd9914390bb37cULL,
+ 0xe3dcbb28c335e8c9ULL, 0xa12c5ac36adfde5aULL,
+ 0x2f0e1eba9ea36930ULL, 0x6dfeff5137495fa3ULL,
+ 0xaaefdd6dcd770416ULL, 0xe81f3c86649d3285ULL,
+ 0xf45bb4758c645c51ULL, 0xb6ab559e258e6ac2ULL,
+ 0x71ba77a2dfb03177ULL, 0x334a9649765a07e4ULL,
+ 0xbd68d2308226b08eULL, 0xff9833db2bcc861dULL,
+ 0x388911e7d1f2dda8ULL, 0x7a79f00c7818eb3bULL,
+ 0xcc7af1ff21c30bdeULL, 0x8e8a101488293d4dULL,
+ 0x499b3228721766f8ULL, 0x0b6bd3c3dbfd506bULL,
+ 0x854997ba2f81e701ULL, 0xc7b97651866bd192ULL,
+ 0x00a8546d7c558a27ULL, 0x4258b586d5bfbcb4ULL,
+ 0x5e1c3d753d46d260ULL, 0x1cecdc9e94ace4f3ULL,
+ 0xdbfdfea26e92bf46ULL, 0x990d1f49c77889d5ULL,
+ 0x172f5b3033043ebfULL, 0x55dfbadb9aee082cULL,
+ 0x92ce98e760d05399ULL, 0xd03e790cc93a650aULL,
+ 0xaa478900b1228e31ULL, 0xe8b768eb18c8b8a2ULL,
+ 0x2fa64ad7e2f6e317ULL, 0x6d56ab3c4b1cd584ULL,
+ 0xe374ef45bf6062eeULL, 0xa1840eae168a547dULL,
+ 0x66952c92ecb40fc8ULL, 0x2465cd79455e395bULL,
+ 0x3821458aada7578fULL, 0x7ad1a461044d611cULL,
+ 0xbdc0865dfe733aa9ULL, 0xff3067b657990c3aULL,
+ 0x711223cfa3e5bb50ULL, 0x33e2c2240a0f8dc3ULL,
+ 0xf4f3e018f031d676ULL, 0xb60301f359dbe0e5ULL,
+ 0xda050215ea6c212fULL, 0x98f5e3fe438617bcULL,
+ 0x5fe4c1c2b9b84c09ULL, 0x1d14202910527a9aULL,
+ 0x93366450e42ecdf0ULL, 0xd1c685bb4dc4fb63ULL,
+ 0x16d7a787b7faa0d6ULL, 0x5427466c1e109645ULL,
+ 0x4863ce9ff6e9f891ULL, 0x0a932f745f03ce02ULL,
+ 0xcd820d48a53d95b7ULL, 0x8f72eca30cd7a324ULL,
+ 0x0150a8daf8ab144eULL, 0x43a04931514122ddULL,
+ 0x84b16b0dab7f7968ULL, 0xc6418ae602954ffbULL,
+ 0xbc387aea7a8da4c0ULL, 0xfec89b01d3679253ULL,
+ 0x39d9b93d2959c9e6ULL, 0x7b2958d680b3ff75ULL,
+ 0xf50b1caf74cf481fULL, 0xb7fbfd44dd257e8cULL,
+ 0x70eadf78271b2539ULL, 0x321a3e938ef113aaULL,
+ 0x2e5eb66066087d7eULL, 0x6cae578bcfe24bedULL,
+ 0xabbf75b735dc1058ULL, 0xe94f945c9c3626cbULL,
+ 0x676dd025684a91a1ULL, 0x259d31cec1a0a732ULL,
+ 0xe28c13f23b9efc87ULL, 0xa07cf2199274ca14ULL,
+ 0x167ff3eacbaf2af1ULL, 0x548f120162451c62ULL,
+ 0x939e303d987b47d7ULL, 0xd16ed1d631917144ULL,
+ 0x5f4c95afc5edc62eULL, 0x1dbc74446c07f0bdULL,
+ 0xdaad56789639ab08ULL, 0x985db7933fd39d9bULL,
+ 0x84193f60d72af34fULL, 0xc6e9de8b7ec0c5dcULL,
+ 0x01f8fcb784fe9e69ULL, 0x43081d5c2d14a8faULL,
+ 0xcd2a5925d9681f90ULL, 0x8fdab8ce70822903ULL,
+ 0x48cb9af28abc72b6ULL, 0x0a3b7b1923564425ULL,
+ 0x70428b155b4eaf1eULL, 0x32b26afef2a4998dULL,
+ 0xf5a348c2089ac238ULL, 0xb753a929a170f4abULL,
+ 0x3971ed50550c43c1ULL, 0x7b810cbbfce67552ULL,
+ 0xbc902e8706d82ee7ULL, 0xfe60cf6caf321874ULL,
+ 0xe224479f47cb76a0ULL, 0xa0d4a674ee214033ULL,
+ 0x67c58448141f1b86ULL, 0x253565a3bdf52d15ULL,
+ 0xab1721da49899a7fULL, 0xe9e7c031e063acecULL,
+ 0x2ef6e20d1a5df759ULL, 0x6c0603e6b3b7c1caULL,
+ 0xf6fae5c07d3274cdULL, 0xb40a042bd4d8425eULL,
+ 0x731b26172ee619ebULL, 0x31ebc7fc870c2f78ULL,
+ 0xbfc9838573709812ULL, 0xfd39626eda9aae81ULL,
+ 0x3a28405220a4f534ULL, 0x78d8a1b9894ec3a7ULL,
+ 0x649c294a61b7ad73ULL, 0x266cc8a1c85d9be0ULL,
+ 0xe17dea9d3263c055ULL, 0xa38d0b769b89f6c6ULL,
+ 0x2daf4f0f6ff541acULL, 0x6f5faee4c61f773fULL,
+ 0xa84e8cd83c212c8aULL, 0xeabe6d3395cb1a19ULL,
+ 0x90c79d3fedd3f122ULL, 0xd2377cd44439c7b1ULL,
+ 0x15265ee8be079c04ULL, 0x57d6bf0317edaa97ULL,
+ 0xd9f4fb7ae3911dfdULL, 0x9b041a914a7b2b6eULL,
+ 0x5c1538adb04570dbULL, 0x1ee5d94619af4648ULL,
+ 0x02a151b5f156289cULL, 0x4051b05e58bc1e0fULL,
+ 0x87409262a28245baULL, 0xc5b073890b687329ULL,
+ 0x4b9237f0ff14c443ULL, 0x0962d61b56fef2d0ULL,
+ 0xce73f427acc0a965ULL, 0x8c8315cc052a9ff6ULL,
+ 0x3a80143f5cf17f13ULL, 0x7870f5d4f51b4980ULL,
+ 0xbf61d7e80f251235ULL, 0xfd913603a6cf24a6ULL,
+ 0x73b3727a52b393ccULL, 0x31439391fb59a55fULL,
+ 0xf652b1ad0167feeaULL, 0xb4a25046a88dc879ULL,
+ 0xa8e6d8b54074a6adULL, 0xea16395ee99e903eULL,
+ 0x2d071b6213a0cb8bULL, 0x6ff7fa89ba4afd18ULL,
+ 0xe1d5bef04e364a72ULL, 0xa3255f1be7dc7ce1ULL,
+ 0x64347d271de22754ULL, 0x26c49cccb40811c7ULL,
+ 0x5cbd6cc0cc10fafcULL, 0x1e4d8d2b65facc6fULL,
+ 0xd95caf179fc497daULL, 0x9bac4efc362ea149ULL,
+ 0x158e0a85c2521623ULL, 0x577eeb6e6bb820b0ULL,
+ 0x906fc95291867b05ULL, 0xd29f28b9386c4d96ULL,
+ 0xcedba04ad0952342ULL, 0x8c2b41a1797f15d1ULL,
+ 0x4b3a639d83414e64ULL, 0x09ca82762aab78f7ULL,
+ 0x87e8c60fded7cf9dULL, 0xc51827e4773df90eULL,
+ 0x020905d88d03a2bbULL, 0x40f9e43324e99428ULL,
+ 0x2cffe7d5975e55e2ULL, 0x6e0f063e3eb46371ULL,
+ 0xa91e2402c48a38c4ULL, 0xebeec5e96d600e57ULL,
+ 0x65cc8190991cb93dULL, 0x273c607b30f68faeULL,
+ 0xe02d4247cac8d41bULL, 0xa2dda3ac6322e288ULL,
+ 0xbe992b5f8bdb8c5cULL, 0xfc69cab42231bacfULL,
+ 0x3b78e888d80fe17aULL, 0x7988096371e5d7e9ULL,
+ 0xf7aa4d1a85996083ULL, 0xb55aacf12c735610ULL,
+ 0x724b8ecdd64d0da5ULL, 0x30bb6f267fa73b36ULL,
+ 0x4ac29f2a07bfd00dULL, 0x08327ec1ae55e69eULL,
+ 0xcf235cfd546bbd2bULL, 0x8dd3bd16fd818bb8ULL,
+ 0x03f1f96f09fd3cd2ULL, 0x41011884a0170a41ULL,
+ 0x86103ab85a2951f4ULL, 0xc4e0db53f3c36767ULL,
+ 0xd8a453a01b3a09b3ULL, 0x9a54b24bb2d03f20ULL,
+ 0x5d45907748ee6495ULL, 0x1fb5719ce1045206ULL,
+ 0x919735e51578e56cULL, 0xd367d40ebc92d3ffULL,
+ 0x1476f63246ac884aULL, 0x568617d9ef46bed9ULL,
+ 0xe085162ab69d5e3cULL, 0xa275f7c11f7768afULL,
+ 0x6564d5fde549331aULL, 0x279434164ca30589ULL,
+ 0xa9b6706fb8dfb2e3ULL, 0xeb46918411358470ULL,
+ 0x2c57b3b8eb0bdfc5ULL, 0x6ea7525342e1e956ULL,
+ 0x72e3daa0aa188782ULL, 0x30133b4b03f2b111ULL,
+ 0xf7021977f9cceaa4ULL, 0xb5f2f89c5026dc37ULL,
+ 0x3bd0bce5a45a6b5dULL, 0x79205d0e0db05dceULL,
+ 0xbe317f32f78e067bULL, 0xfcc19ed95e6430e8ULL,
+ 0x86b86ed5267cdbd3ULL, 0xc4488f3e8f96ed40ULL,
+ 0x0359ad0275a8b6f5ULL, 0x41a94ce9dc428066ULL,
+ 0xcf8b0890283e370cULL, 0x8d7be97b81d4019fULL,
+ 0x4a6acb477bea5a2aULL, 0x089a2aacd2006cb9ULL,
+ 0x14dea25f3af9026dULL, 0x562e43b4931334feULL,
+ 0x913f6188692d6f4bULL, 0xd3cf8063c0c759d8ULL,
+ 0x5dedc41a34bbeeb2ULL, 0x1f1d25f19d51d821ULL,
+ 0xd80c07cd676f8394ULL, 0x9afce626ce85b507ULL,
+};
diff --git a/linux/crypto/chacha20_generic.c b/linux/crypto/chacha20_generic.c
index c6f14945..914189e7 100644
--- a/linux/crypto/chacha20_generic.c
+++ b/linux/crypto/chacha20_generic.c
@@ -17,7 +17,7 @@
#include <linux/crypto.h>
#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
#include <crypto/skcipher.h>
#include <sodium/crypto_stream_chacha20.h>
@@ -36,7 +36,7 @@ static int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
container_of(tfm, struct chacha20_tfm, tfm);
int i;
- if (keysize != CHACHA20_KEY_SIZE)
+ if (keysize != CHACHA_KEY_SIZE)
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(ctx->key); i++)
@@ -72,8 +72,8 @@ static int crypto_chacha20_crypt(struct skcipher_request *req)
if (sg_is_last(sg))
break;
- BUG_ON(sg->length % CHACHA20_BLOCK_SIZE);
- iv[0] += sg->length / CHACHA20_BLOCK_SIZE;
+ BUG_ON(sg->length % CHACHA_BLOCK_SIZE);
+ iv[0] += sg->length / CHACHA_BLOCK_SIZE;
sg = sg_next(sg);
};
@@ -93,8 +93,8 @@ static void *crypto_chacha20_alloc_tfm(void)
tfm->tfm.setkey = crypto_chacha20_setkey;
tfm->tfm.encrypt = crypto_chacha20_crypt;
tfm->tfm.decrypt = crypto_chacha20_crypt;
- tfm->tfm.ivsize = CHACHA20_IV_SIZE;
- tfm->tfm.keysize = CHACHA20_KEY_SIZE;
+ tfm->tfm.ivsize = CHACHA_IV_SIZE;
+ tfm->tfm.keysize = CHACHA_KEY_SIZE;
return tfm;
}
diff --git a/linux/fs.c b/linux/fs.c
index 00028469..623ca266 100644
--- a/linux/fs.c
+++ b/linux/fs.c
@@ -3,12 +3,12 @@
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>
-const struct xattr_handler posix_acl_access_xattr_handler = {
+const struct xattr_handler nop_posix_acl_access = {
.name = XATTR_NAME_POSIX_ACL_ACCESS,
.flags = ACL_TYPE_ACCESS,
};
-const struct xattr_handler posix_acl_default_xattr_handler = {
+const struct xattr_handler nop_posix_acl_default = {
.name = XATTR_NAME_POSIX_ACL_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
};
diff --git a/linux/fs_parser.c b/linux/fs_parser.c
new file mode 100644
index 00000000..b1cd0c72
--- /dev/null
+++ b/linux/fs_parser.c
@@ -0,0 +1,36 @@
+
+#include <linux/kernel.h>
+#include <linux/fs_parser.h>
+#include <string.h>
+
+const struct constant_table bool_names[] = {
+ { "0", false },
+ { "1", true },
+ { "false", false },
+ { "no", false },
+ { "true", true },
+ { "yes", true },
+ { },
+};
+
+static const struct constant_table *
+__lookup_constant(const struct constant_table *tbl, const char *name)
+{
+ for ( ; tbl->name; tbl++)
+ if (strcmp(name, tbl->name) == 0)
+ return tbl;
+ return NULL;
+}
+
+/**
+ * lookup_constant - Look up a constant by name in an ordered table
+ * @tbl: The table of constants to search.
+ * @name: The name to look up.
+ * @not_found: The value to return if the name is not found.
+ */
+int lookup_constant(const struct constant_table *tbl, const char *name, int not_found)
+{
+ const struct constant_table *p = __lookup_constant(tbl, name);
+
+ return p ? p->value : not_found;
+}
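
A brief usage sketch (illustrative, not part of the patch), using the bool_names table defined above; -1 is an arbitrary caller-chosen "not found" sentinel:

	int v = lookup_constant(bool_names, "yes", -1);		/* returns 1 */
	int w = lookup_constant(bool_names, "maybe", -1);	/* returns -1: not in the table */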
diff --git a/linux/generic-radix-tree.c b/linux/generic-radix-tree.c
index 5c4a275e..79e067b5 100644
--- a/linux/generic-radix-tree.c
+++ b/linux/generic-radix-tree.c
@@ -1,33 +1,9 @@
+#include <linux/atomic.h>
#include <linux/export.h>
#include <linux/generic-radix-tree.h>
#include <linux/gfp.h>
-
-#define GENRADIX_ARY (PAGE_SIZE / sizeof(struct genradix_node *))
-#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY)
-
-struct genradix_node {
- union {
- /* Interior node: */
- struct genradix_node *children[GENRADIX_ARY];
-
- /* Leaf: */
- u8 data[PAGE_SIZE];
- };
-};
-
-static inline unsigned genradix_depth_shift(unsigned depth)
-{
- return PAGE_SHIFT + GENRADIX_ARY_SHIFT * depth;
-}
-
-/*
- * Returns size (of data, in bytes) that a tree of a given depth holds:
- */
-static inline size_t genradix_depth_size(unsigned depth)
-{
- return 1UL << genradix_depth_shift(depth);
-}
+#include <linux/kmemleak.h>
/*
* Returns pointer to the specified byte @offset within @radix, or NULL if not
@@ -35,25 +11,7 @@ static inline size_t genradix_depth_size(unsigned depth)
*/
void *__genradix_ptr(struct __genradix *radix, size_t offset)
{
- size_t level = radix->depth;
- struct genradix_node *n = radix->root;
-
- if (offset >= genradix_depth_size(radix->depth))
- return NULL;
-
- while (1) {
- if (!n)
- return NULL;
- if (!level)
- break;
-
- level--;
-
- n = n->children[offset >> genradix_depth_shift(level)];
- offset &= genradix_depth_size(level) - 1;
- }
-
- return &n->data[offset];
+ return __genradix_ptr_inlined(radix, offset);
}
EXPORT_SYMBOL(__genradix_ptr);
@@ -62,45 +20,66 @@ EXPORT_SYMBOL(__genradix_ptr);
* necessary - newly allocated slots are always zeroed out:
*/
void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
+ struct genradix_node **preallocated,
gfp_t gfp_mask)
{
- struct genradix_node **n;
- size_t level;
+ struct genradix_root *v = READ_ONCE(radix->root);
+ struct genradix_node *n, *new_node = NULL;
+ unsigned level;
- /* Increase tree depth if necessary: */
+ if (preallocated)
+ swap(new_node, *preallocated);
- while (offset >= genradix_depth_size(radix->depth)) {
- struct genradix_node *new_root =
- (void *) __get_free_page(gfp_mask|__GFP_ZERO);
-
- if (!new_root)
- return NULL;
+ /* Increase tree depth if necessary: */
+ while (1) {
+ struct genradix_root *r = v, *new_root;
- new_root->children[0] = radix->root;
- radix->root = new_root;
- radix->depth++;
- }
+ n = genradix_root_to_node(r);
+ level = genradix_root_to_depth(r);
- n = &radix->root;
- level = radix->depth;
+ if (n && ilog2(offset) < genradix_depth_shift(level))
+ break;
- while (1) {
- if (!*n) {
- *n = (void *) __get_free_page(gfp_mask|__GFP_ZERO);
- if (!*n)
+ if (!new_node) {
+ new_node = genradix_alloc_node(gfp_mask);
+ if (!new_node)
return NULL;
}
- if (!level)
- break;
+ new_node->children[0] = n;
+ new_root = ((struct genradix_root *)
+ ((unsigned long) new_node | (n ? level + 1 : 0)));
- level--;
+ if ((v = cmpxchg_release(&radix->root, r, new_root)) == r) {
+ v = new_root;
+ new_node = NULL;
+ } else {
+ new_node->children[0] = NULL;
+ }
+ }
- n = &(*n)->children[offset >> genradix_depth_shift(level)];
+ while (level--) {
+ struct genradix_node **p =
+ &n->children[offset >> genradix_depth_shift(level)];
offset &= genradix_depth_size(level) - 1;
+
+ n = READ_ONCE(*p);
+ if (!n) {
+ if (!new_node) {
+ new_node = genradix_alloc_node(gfp_mask);
+ if (!new_node)
+ return NULL;
+ }
+
+ if (!(n = cmpxchg_release(p, NULL, new_node)))
+ swap(n, new_node);
+ }
}
- return &(*n)->data[offset];
+ if (new_node)
+ genradix_free_node(new_node);
+
+ return &n->data[offset];
}
EXPORT_SYMBOL(__genradix_ptr_alloc);
@@ -108,17 +87,23 @@ void *__genradix_iter_peek(struct genradix_iter *iter,
struct __genradix *radix,
size_t objs_per_page)
{
+ struct genradix_root *r;
struct genradix_node *n;
- size_t level, i;
+ unsigned level, i;
- if (!radix->root)
+ if (iter->offset == SIZE_MAX)
return NULL;
+
restart:
- if (iter->offset >= genradix_depth_size(radix->depth))
+ r = READ_ONCE(radix->root);
+ if (!r)
return NULL;
- n = radix->root;
- level = radix->depth;
+ n = genradix_root_to_node(r);
+ level = genradix_root_to_depth(r);
+
+ if (ilog2(iter->offset) >= genradix_depth_shift(level))
+ return NULL;
while (level) {
level--;
@@ -127,11 +112,18 @@ restart:
(GENRADIX_ARY - 1);
while (!n->children[i]) {
+ size_t objs_per_ptr = genradix_depth_size(level);
+
+ if (iter->offset + objs_per_ptr < iter->offset) {
+ iter->offset = SIZE_MAX;
+ iter->pos = SIZE_MAX;
+ return NULL;
+ }
+
i++;
- iter->offset = round_down(iter->offset +
- genradix_depth_size(level),
- genradix_depth_size(level));
- iter->pos = (iter->offset >> PAGE_SHIFT) *
+ iter->offset = round_down(iter->offset + objs_per_ptr,
+ objs_per_ptr);
+ iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) *
objs_per_page;
if (i == GENRADIX_ARY)
goto restart;
@@ -140,10 +132,68 @@ restart:
n = n->children[i];
}
- return &n->data[iter->offset & (PAGE_SIZE - 1)];
+ return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)];
}
EXPORT_SYMBOL(__genradix_iter_peek);
+void *__genradix_iter_peek_prev(struct genradix_iter *iter,
+ struct __genradix *radix,
+ size_t objs_per_page,
+ size_t obj_size_plus_page_remainder)
+{
+ struct genradix_root *r;
+ struct genradix_node *n;
+ unsigned level, i;
+
+ if (iter->offset == SIZE_MAX)
+ return NULL;
+
+restart:
+ r = READ_ONCE(radix->root);
+ if (!r)
+ return NULL;
+
+ n = genradix_root_to_node(r);
+ level = genradix_root_to_depth(r);
+
+ if (ilog2(iter->offset) >= genradix_depth_shift(level)) {
+ iter->offset = genradix_depth_size(level);
+ iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page;
+
+ iter->offset -= obj_size_plus_page_remainder;
+ iter->pos--;
+ }
+
+ while (level) {
+ level--;
+
+ i = (iter->offset >> genradix_depth_shift(level)) &
+ (GENRADIX_ARY - 1);
+
+ while (!n->children[i]) {
+ size_t objs_per_ptr = genradix_depth_size(level);
+
+ iter->offset = round_down(iter->offset, objs_per_ptr);
+ iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page;
+
+ if (!iter->offset)
+ return NULL;
+
+ iter->offset -= obj_size_plus_page_remainder;
+ iter->pos--;
+
+ if (!i)
+ goto restart;
+ --i;
+ }
+
+ n = n->children[i];
+ }
+
+ return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)];
+}
+EXPORT_SYMBOL(__genradix_iter_peek_prev);
+
static void genradix_free_recurse(struct genradix_node *n, unsigned level)
{
if (level) {
@@ -154,14 +204,27 @@ static void genradix_free_recurse(struct genradix_node *n, unsigned level)
genradix_free_recurse(n->children[i], level - 1);
}
- free_page((unsigned long) n);
+ genradix_free_node(n);
+}
+
+int __genradix_prealloc(struct __genradix *radix, size_t size,
+ gfp_t gfp_mask)
+{
+ size_t offset;
+
+ for (offset = 0; offset < size; offset += GENRADIX_NODE_SIZE)
+ if (!__genradix_ptr_alloc(radix, offset, NULL, gfp_mask))
+ return -ENOMEM;
+
+ return 0;
}
+EXPORT_SYMBOL(__genradix_prealloc);
void __genradix_free(struct __genradix *radix)
{
- genradix_free_recurse(radix->root, radix->depth);
+ struct genradix_root *r = xchg(&radix->root, NULL);
- radix->root = NULL;
- radix->depth = 0;
+ genradix_free_recurse(genradix_root_to_node(r),
+ genradix_root_to_depth(r));
}
EXPORT_SYMBOL(__genradix_free);
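
For context, a hedged sketch of how the macro front end in <linux/generic-radix-tree.h> sits on top of these primitives. The genradix()/genradix_init()/genradix_ptr_alloc()/genradix_free() names are assumed from that header, which is not part of this hunk, and struct foo is hypothetical:

	genradix(struct foo) entries;
	struct foo *e;

	genradix_init(&entries);
	/* allocates any missing interior nodes on the way to index 123 */
	e = genradix_ptr_alloc(&entries, 123, GFP_KERNEL);
	if (e)
		memset(e, 0, sizeof(*e));
	genradix_free(&entries);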
diff --git a/linux/int_sqrt.c b/linux/int_sqrt.c
new file mode 100644
index 00000000..a8170bb9
--- /dev/null
+++ b/linux/int_sqrt.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2013 Davidlohr Bueso <davidlohr.bueso@hp.com>
+ *
+ * Based on the shift-and-subtract algorithm for computing integer
+ * square root from Guy L. Steele.
+ */
+
+#include <linux/export.h>
+#include <linux/bitops.h>
+#include <linux/limits.h>
+#include <linux/math.h>
+
+/**
+ * int_sqrt - computes the integer square root
+ * @x: integer of which to calculate the sqrt
+ *
+ * Computes: floor(sqrt(x))
+ */
+unsigned long int_sqrt(unsigned long x)
+{
+ unsigned long b, m, y = 0;
+
+ if (x <= 1)
+ return x;
+
+ m = 1UL << (__fls(x) & ~1UL);
+ while (m != 0) {
+ b = y + m;
+ y >>= 1;
+
+ if (x >= b) {
+ x -= b;
+ y += m;
+ }
+ m >>= 2;
+ }
+
+ return y;
+}
+EXPORT_SYMBOL(int_sqrt);
+
+#if BITS_PER_LONG < 64
+/**
+ * int_sqrt64 - integer square root for a 64-bit input, even when
+ * unsigned long is only 32 bits wide.
+ * @x: 64-bit integer of which to calculate the sqrt
+ */
+u32 int_sqrt64(u64 x)
+{
+ u64 b, m, y = 0;
+
+ if (x <= ULONG_MAX)
+ return int_sqrt((unsigned long) x);
+
+ m = 1ULL << ((fls64(x) - 1) & ~1ULL);
+ while (m != 0) {
+ b = y + m;
+ y >>= 1;
+
+ if (x >= b) {
+ x -= b;
+ y += m;
+ }
+ m >>= 2;
+ }
+
+ return y;
+}
+EXPORT_SYMBOL(int_sqrt64);
+#endif
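
A couple of worked values for the shift-and-subtract loop above (illustrative only, not part of the patch):

	BUG_ON(int_sqrt(0)   != 0);
	BUG_ON(int_sqrt(99)  != 9);	/* 9*9 = 81 <= 99 < 100 = 10*10 */
	BUG_ON(int_sqrt(100) != 10);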
diff --git a/linux/kstrtox.c b/linux/kstrtox.c
index 9164fb96..bde55808 100644
--- a/linux/kstrtox.c
+++ b/linux/kstrtox.c
@@ -12,10 +12,8 @@
* If -E is returned, result is not touched.
*/
#include <errno.h>
-#include <linux/ctype.h>
+#include <ctype.h>
#include <linux/kernel.h>
-#include <linux/math64.h>
-#include <linux/export.h>
#include <linux/types.h>
#include "kstrtox.h"
@@ -71,7 +69,7 @@ unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long
* it in the max base we support (16)
*/
if (unlikely(res & (~0ull << 60))) {
- if (res > div_u64(ULLONG_MAX - val, base))
+ if (res > (ULLONG_MAX - val) / base)
overflow = 1;
}
res = res * base + val;
@@ -126,7 +124,6 @@ int kstrtoull(const char *s, unsigned int base, unsigned long long *res)
s++;
return _kstrtoull(s, base, res);
}
-EXPORT_SYMBOL(kstrtoull);
/**
* kstrtoll - convert a string to a long long
@@ -166,7 +163,6 @@ int kstrtoll(const char *s, unsigned int base, long long *res)
}
return 0;
}
-EXPORT_SYMBOL(kstrtoll);
/* Internal, do not use. */
int _kstrtoul(const char *s, unsigned int base, unsigned long *res)
@@ -182,7 +178,6 @@ int _kstrtoul(const char *s, unsigned int base, unsigned long *res)
*res = tmp;
return 0;
}
-EXPORT_SYMBOL(_kstrtoul);
/* Internal, do not use. */
int _kstrtol(const char *s, unsigned int base, long *res)
@@ -198,7 +193,6 @@ int _kstrtol(const char *s, unsigned int base, long *res)
*res = tmp;
return 0;
}
-EXPORT_SYMBOL(_kstrtol);
/**
* kstrtouint - convert a string to an unsigned int
@@ -229,7 +223,6 @@ int kstrtouint(const char *s, unsigned int base, unsigned int *res)
*res = tmp;
return 0;
}
-EXPORT_SYMBOL(kstrtouint);
/**
* kstrtoint - convert a string to an int
@@ -260,7 +253,6 @@ int kstrtoint(const char *s, unsigned int base, int *res)
*res = tmp;
return 0;
}
-EXPORT_SYMBOL(kstrtoint);
int kstrtou16(const char *s, unsigned int base, u16 *res)
{
@@ -275,7 +267,6 @@ int kstrtou16(const char *s, unsigned int base, u16 *res)
*res = tmp;
return 0;
}
-EXPORT_SYMBOL(kstrtou16);
int kstrtos16(const char *s, unsigned int base, s16 *res)
{
@@ -290,7 +281,6 @@ int kstrtos16(const char *s, unsigned int base, s16 *res)
*res = tmp;
return 0;
}
-EXPORT_SYMBOL(kstrtos16);
int kstrtou8(const char *s, unsigned int base, u8 *res)
{
@@ -305,7 +295,6 @@ int kstrtou8(const char *s, unsigned int base, u8 *res)
*res = tmp;
return 0;
}
-EXPORT_SYMBOL(kstrtou8);
int kstrtos8(const char *s, unsigned int base, s8 *res)
{
@@ -320,7 +309,6 @@ int kstrtos8(const char *s, unsigned int base, s8 *res)
*res = tmp;
return 0;
}
-EXPORT_SYMBOL(kstrtos8);
/**
* kstrtobool - convert common user inputs into boolean values
@@ -367,4 +355,3 @@ int kstrtobool(const char *s, bool *res)
return -EINVAL;
}
-EXPORT_SYMBOL(kstrtobool);
diff --git a/linux/kthread.c b/linux/kthread.c
index eef73fe8..17830e5f 100644
--- a/linux/kthread.c
+++ b/linux/kthread.c
@@ -7,6 +7,8 @@
#include <linux/rcupdate.h>
#include <linux/sched.h>
+#include "tools-util.h"
+
enum KTHREAD_BITS {
KTHREAD_IS_PER_CPU = 0,
KTHREAD_SHOULD_STOP,
@@ -57,6 +59,7 @@ struct task_struct *kthread_create(int (*thread_fn)(void *data),
{
va_list args;
struct task_struct *p = malloc(sizeof(*p));
+ int ret;
memset(p, 0, sizeof(*p));
@@ -68,10 +71,24 @@ struct task_struct *kthread_create(int (*thread_fn)(void *data),
p->thread_fn = thread_fn;
p->thread_data = thread_data;
p->state = TASK_UNINTERRUPTIBLE;
+ p->signal = &p->_signal;
atomic_set(&p->usage, 1);
init_completion(&p->exited);
+ init_rwsem(&p->_signal.exec_update_lock);
+
+ pthread_attr_t attr;
+ pthread_attr_init(&attr);
+ pthread_attr_setstacksize(&attr, 32 << 10);
- pthread_create(&p->thread, NULL, kthread_start_fn, p);
+ for (unsigned i = 0; i < 10; i++) {
+ ret = pthread_create(&p->thread, &attr, kthread_start_fn, p);
+ if (!ret)
+ break;
+
+ run_shrinkers(GFP_KERNEL, true);
+ }
+ if (ret)
+ return ERR_PTR(-ret);
pthread_setname_np(p->thread, p->comm);
return p;
}
@@ -88,6 +105,11 @@ bool kthread_should_stop(void)
return test_bit(KTHREAD_SHOULD_STOP, &current->kthread_flags);
}
+bool kthread_freezable_should_stop(bool *was_frozen)
+{
+ return test_bit(KTHREAD_SHOULD_STOP, &current->kthread_flags);
+}
+
/**
* kthread_stop - stop a thread created by kthread_create().
* @k: thread created by kthread_create().
diff --git a/linux/llist.c b/linux/llist.c
index ae5872b1..611ce488 100644
--- a/linux/llist.c
+++ b/linux/llist.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Lock-less NULL terminated single linked list
*
@@ -8,19 +9,6 @@
*
* Copyright 2010,2011 Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation;
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/kernel.h>
#include <linux/export.h>
@@ -41,7 +29,7 @@ bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
struct llist_node *first;
do {
- new_last->next = first = ACCESS_ONCE(head->first);
+ new_last->next = first = READ_ONCE(head->first);
} while (cmpxchg(&head->first, first, new_first) != first);
return !first;
diff --git a/linux/mempool.c b/linux/mempool.c
new file mode 100644
index 00000000..74ed17bf
--- /dev/null
+++ b/linux/mempool.c
@@ -0,0 +1,554 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/mm/mempool.c
+ *
+ * memory buffer pool support. Such pools are mostly used
+ * for guaranteed, deadlock-free memory allocations during
+ * extreme VM load.
+ *
+ * started by Ingo Molnar, Copyright (C) 2001
+ * debugging by David Rientjes, Copyright (C) 2015
+ */
+
+#include <linux/slab.h>
+//#include <linux/kasan.h>
+//#include <linux/kmemleak.h>
+#include <linux/export.h>
+#include <linux/jiffies.h>
+#include <linux/mempool.h>
+#include <linux/sched.h>
+
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
+static void poison_error(mempool_t *pool, void *element, size_t size,
+ size_t byte)
+{
+ const int nr = pool->curr_nr;
+ const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0);
+ const int end = min_t(int, byte + (BITS_PER_LONG / 8), size);
+ int i;
+
+ pr_err("BUG: mempool element poison mismatch\n");
+ pr_err("Mempool %p size %zu\n", pool, size);
+ pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : "");
+ for (i = start; i < end; i++)
+ pr_cont("%x ", *(u8 *)(element + i));
+ pr_cont("%s\n", end < size ? "..." : "");
+ dump_stack();
+}
+
+static void __check_element(mempool_t *pool, void *element, size_t size)
+{
+ u8 *obj = element;
+ size_t i;
+
+ for (i = 0; i < size; i++) {
+ u8 exp = (i < size - 1) ? POISON_FREE : POISON_END;
+
+ if (obj[i] != exp) {
+ poison_error(pool, element, size, i);
+ return;
+ }
+ }
+ memset(obj, POISON_INUSE, size);
+}
+
+static void check_element(mempool_t *pool, void *element)
+{
+ /* Mempools backed by slab allocator */
+ if (pool->free == mempool_free_slab || pool->free == mempool_kfree) {
+ __check_element(pool, element, ksize(element));
+ } else if (pool->free == mempool_free_pages) {
+ /* Mempools backed by page allocator */
+ int order = (int)(long)pool->pool_data;
+ void *addr = kmap_atomic((struct page *)element);
+
+ __check_element(pool, addr, 1UL << (PAGE_SHIFT + order));
+ kunmap_atomic(addr);
+ }
+}
+
+static void __poison_element(void *element, size_t size)
+{
+ u8 *obj = element;
+
+ memset(obj, POISON_FREE, size - 1);
+ obj[size - 1] = POISON_END;
+}
+
+static void poison_element(mempool_t *pool, void *element)
+{
+ /* Mempools backed by slab allocator */
+ if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) {
+ __poison_element(element, ksize(element));
+ } else if (pool->alloc == mempool_alloc_pages) {
+ /* Mempools backed by page allocator */
+ int order = (int)(long)pool->pool_data;
+ void *addr = kmap_atomic((struct page *)element);
+
+ __poison_element(addr, 1UL << (PAGE_SHIFT + order));
+ kunmap_atomic(addr);
+ }
+}
+#else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
+static inline void check_element(mempool_t *pool, void *element)
+{
+}
+static inline void poison_element(mempool_t *pool, void *element)
+{
+}
+#endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
+
+static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
+{
+#if 0
+ if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
+ kasan_poison_kfree(element, _RET_IP_);
+ else if (pool->alloc == mempool_alloc_pages)
+ kasan_free_pages(element, (unsigned long)pool->pool_data);
+#endif
+}
+
+static void kasan_unpoison_element(mempool_t *pool, void *element)
+{
+#if 0
+ if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
+ kasan_unpoison_slab(element);
+ else if (pool->alloc == mempool_alloc_pages)
+ kasan_alloc_pages(element, (unsigned long)pool->pool_data);
+#endif
+}
+
+static __always_inline void add_element(mempool_t *pool, void *element)
+{
+ BUG_ON(pool->curr_nr >= pool->min_nr);
+ poison_element(pool, element);
+ kasan_poison_element(pool, element);
+ pool->elements[pool->curr_nr++] = element;
+}
+
+static void *remove_element(mempool_t *pool)
+{
+ void *element = pool->elements[--pool->curr_nr];
+
+ BUG_ON(pool->curr_nr < 0);
+ kasan_unpoison_element(pool, element);
+ check_element(pool, element);
+ return element;
+}
+
+/**
+ * mempool_exit - exit a mempool initialized with mempool_init()
+ * @pool: pointer to the memory pool which was initialized with
+ * mempool_init().
+ *
+ * Free all reserved elements in @pool and @pool itself. This function
+ * only sleeps if the free_fn() function sleeps.
+ *
+ * May be called on a zeroed but uninitialized mempool (i.e. allocated with
+ * kzalloc()).
+ */
+void mempool_exit(mempool_t *pool)
+{
+ while (pool->curr_nr) {
+ void *element = remove_element(pool);
+ pool->free(element, pool->pool_data);
+ }
+ kfree(pool->elements);
+ pool->elements = NULL;
+}
+EXPORT_SYMBOL(mempool_exit);
+
+/**
+ * mempool_destroy - deallocate a memory pool
+ * @pool: pointer to the memory pool which was allocated via
+ * mempool_create().
+ *
+ * Free all reserved elements in @pool and @pool itself. This function
+ * only sleeps if the free_fn() function sleeps.
+ */
+void mempool_destroy(mempool_t *pool)
+{
+ if (unlikely(!pool))
+ return;
+
+ mempool_exit(pool);
+ kfree(pool);
+}
+EXPORT_SYMBOL(mempool_destroy);
+
+int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data,
+ gfp_t gfp_mask, int node_id)
+{
+ spin_lock_init(&pool->lock);
+ pool->min_nr = min_nr;
+ pool->pool_data = pool_data;
+ pool->alloc = alloc_fn;
+ pool->free = free_fn;
+ init_waitqueue_head(&pool->wait);
+
+ pool->elements = kmalloc_array(min_nr, sizeof(void *), gfp_mask);
+ if (!pool->elements)
+ return -ENOMEM;
+
+ /*
+ * First pre-allocate the guaranteed number of buffers.
+ */
+ while (pool->curr_nr < pool->min_nr) {
+ void *element;
+
+ element = pool->alloc(gfp_mask, pool->pool_data);
+ if (unlikely(!element)) {
+ mempool_exit(pool);
+ return -ENOMEM;
+ }
+ add_element(pool, element);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(mempool_init_node);
+
+/**
+ * mempool_init - initialize a memory pool
+ * @pool: pointer to the memory pool that should be initialized
+ * @min_nr: the minimum number of elements guaranteed to be
+ * allocated for this pool.
+ * @alloc_fn: user-defined element-allocation function.
+ * @free_fn: user-defined element-freeing function.
+ * @pool_data: optional private data available to the user-defined functions.
+ *
+ * Like mempool_create(), but initializes a pool that is embedded in another
+ * structure instead of allocating it.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data)
+{
+ return mempool_init_node(pool, min_nr, alloc_fn, free_fn,
+ pool_data, GFP_KERNEL, 0);
+
+}
+EXPORT_SYMBOL(mempool_init);
+
+/**
+ * mempool_create - create a memory pool
+ * @min_nr: the minimum number of elements guaranteed to be
+ * allocated for this pool.
+ * @alloc_fn: user-defined element-allocation function.
+ * @free_fn: user-defined element-freeing function.
+ * @pool_data: optional private data available to the user-defined functions.
+ *
+ * This function creates and allocates a preallocated memory pool of a
+ * guaranteed minimum size. The pool can be used from the mempool_alloc() and
+ * mempool_free() functions. This function might sleep. Both alloc_fn() and
+ * free_fn() may sleep, as long as mempool_alloc() is never called from IRQ
+ * context.
+ *
+ * Return: pointer to the created memory pool object or %NULL on error.
+ */
+mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data)
+{
+ return mempool_create_node(min_nr, alloc_fn, free_fn, pool_data,
+ GFP_KERNEL, 0);
+}
+EXPORT_SYMBOL(mempool_create);
+
+mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data,
+ gfp_t gfp_mask, int node_id)
+{
+ mempool_t *pool;
+
+ pool = kzalloc(sizeof(*pool), gfp_mask);
+ if (!pool)
+ return NULL;
+
+ if (mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data,
+ gfp_mask, node_id)) {
+ kfree(pool);
+ return NULL;
+ }
+
+ return pool;
+}
+EXPORT_SYMBOL(mempool_create_node);
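
A hedged usage sketch tying the pieces together (not part of the patch; mempool_kmalloc() and mempool_kfree() are the helpers defined later in this file):

	static int example_pool_user(void)
	{
		/* a pool guaranteeing at least four preallocated 512-byte buffers */
		mempool_t *pool = mempool_create(4, mempool_kmalloc, mempool_kfree,
						 (void *)(size_t)512);
		void *buf;

		if (!pool)
			return -ENOMEM;

		buf = mempool_alloc(pool, GFP_KERNEL);	/* never fails in process context */
		/* ... use buf ... */
		mempool_free(buf, pool);
		mempool_destroy(pool);
		return 0;
	}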
+
+/**
+ * mempool_resize - resize an existing memory pool
+ * @pool: pointer to the memory pool which was allocated via
+ * mempool_create().
+ * @new_min_nr: the new minimum number of elements guaranteed to be
+ * allocated for this pool.
+ *
+ * This function shrinks/grows the pool. In the case of growing,
+ * it cannot be guaranteed that the pool will be grown to the new
+ * size immediately, but new mempool_free() calls will refill it.
+ * This function may sleep.
+ *
+ * Note, the caller must guarantee that no mempool_destroy is called
+ * while this function is running. mempool_alloc() & mempool_free()
+ * might be called (eg. from IRQ contexts) while this function executes.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int mempool_resize(mempool_t *pool, int new_min_nr)
+{
+ void *element;
+ void **new_elements;
+ unsigned long flags;
+
+ BUG_ON(new_min_nr <= 0);
+ might_sleep();
+
+ spin_lock_irqsave(&pool->lock, flags);
+ if (new_min_nr <= pool->min_nr) {
+ while (new_min_nr < pool->curr_nr) {
+ element = remove_element(pool);
+ spin_unlock_irqrestore(&pool->lock, flags);
+ pool->free(element, pool->pool_data);
+ spin_lock_irqsave(&pool->lock, flags);
+ }
+ pool->min_nr = new_min_nr;
+ goto out_unlock;
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ /* Grow the pool */
+ new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements),
+ GFP_KERNEL);
+ if (!new_elements)
+ return -ENOMEM;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ if (unlikely(new_min_nr <= pool->min_nr)) {
+ /* Raced, other resize will do our work */
+ spin_unlock_irqrestore(&pool->lock, flags);
+ kfree(new_elements);
+ goto out;
+ }
+ memcpy(new_elements, pool->elements,
+ pool->curr_nr * sizeof(*new_elements));
+ kfree(pool->elements);
+ pool->elements = new_elements;
+ pool->min_nr = new_min_nr;
+
+ while (pool->curr_nr < pool->min_nr) {
+ spin_unlock_irqrestore(&pool->lock, flags);
+ element = pool->alloc(GFP_KERNEL, pool->pool_data);
+ if (!element)
+ goto out;
+ spin_lock_irqsave(&pool->lock, flags);
+ if (pool->curr_nr < pool->min_nr) {
+ add_element(pool, element);
+ } else {
+ spin_unlock_irqrestore(&pool->lock, flags);
+ pool->free(element, pool->pool_data); /* Raced */
+ goto out;
+ }
+ }
+out_unlock:
+ spin_unlock_irqrestore(&pool->lock, flags);
+out:
+ return 0;
+}
+EXPORT_SYMBOL(mempool_resize);
+
+/**
+ * mempool_alloc - allocate an element from a specific memory pool
+ * @pool: pointer to the memory pool which was allocated via
+ * mempool_create().
+ * @gfp_mask: the usual allocation bitmask.
+ *
+ * This function only sleeps if the alloc_fn() function sleeps or
+ * returns NULL. Note that due to preallocation, this function
+ * *never* fails when called from process context (it might
+ * fail if called from an IRQ context).
+ * Note: using __GFP_ZERO is not supported.
+ *
+ * Return: pointer to the allocated element or %NULL on error.
+ */
+void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
+{
+ void *element;
+ unsigned long flags;
+ DEFINE_WAIT(wait);
+ gfp_t gfp_temp;
+
+ WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
+
+ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
+ gfp_mask |= __GFP_NOWARN; /* failures are OK */
+
+ gfp_temp = gfp_mask & ~(__GFP_IO);
+
+repeat_alloc:
+
+ element = pool->alloc(gfp_temp, pool->pool_data);
+ if (likely(element != NULL))
+ return element;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ if (likely(pool->curr_nr)) {
+ element = remove_element(pool);
+ spin_unlock_irqrestore(&pool->lock, flags);
+ /* paired with rmb in mempool_free(), read comment there */
+ smp_wmb();
+ return element;
+ }
+
+ /*
+ * We use gfp mask w/o direct reclaim or IO for the first round. If
+ * alloc failed with that and @pool was empty, retry immediately.
+ */
+ if (gfp_temp != gfp_mask) {
+ spin_unlock_irqrestore(&pool->lock, flags);
+ gfp_temp = gfp_mask;
+ goto repeat_alloc;
+ }
+
+ /* Let's wait for someone else to return an element to @pool */
+ prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
+
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ /*
+ * FIXME: this should be io_schedule(). The timeout is there as a
+ * workaround for some DM problems in 2.6.18.
+ */
+ io_schedule_timeout(5*HZ);
+
+ finish_wait(&pool->wait, &wait);
+ goto repeat_alloc;
+}
+EXPORT_SYMBOL(mempool_alloc);
+
+/**
+ * mempool_free - return an element to the pool.
+ * @element: pool element pointer.
+ * @pool: pointer to the memory pool which was allocated via
+ * mempool_create().
+ *
+ * this function only sleeps if the free_fn() function sleeps.
+ */
+void mempool_free(void *element, mempool_t *pool)
+{
+ unsigned long flags;
+
+ if (unlikely(element == NULL))
+ return;
+
+ /*
+ * Paired with the wmb in mempool_alloc(). The preceding read is
+ * for @element and the following @pool->curr_nr. This ensures
+ * that the visible value of @pool->curr_nr is from after the
+ * allocation of @element. This is necessary for fringe cases
+ * where @element was passed to this task without going through
+ * barriers.
+ *
+ * For example, assume @p is %NULL at the beginning and one task
+ * performs "p = mempool_alloc(...);" while another task is doing
+ * "while (!p) cpu_relax(); mempool_free(p, ...);". This function
+ * may end up using curr_nr value which is from before allocation
+ * of @p without the following rmb.
+ */
+ smp_rmb();
+
+ /*
+ * For correctness, we need a test which is guaranteed to trigger
+ * if curr_nr + #allocated == min_nr. Testing curr_nr < min_nr
+ * without locking achieves that and refilling as soon as possible
+ * is desirable.
+ *
+ * Because curr_nr visible here is always a value after the
+ * allocation of @element, any task which decremented curr_nr below
+ * min_nr is guaranteed to see curr_nr < min_nr unless curr_nr gets
+ * incremented to min_nr afterwards. If curr_nr gets incremented
+ * to min_nr after the allocation of @element, the elements
+ * allocated after that are subject to the same guarantee.
+ *
+ * Waiters happen iff curr_nr is 0 and the above guarantee also
+ * ensures that there will be frees which return elements to the
+ * pool waking up the waiters.
+ */
+ if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) {
+ spin_lock_irqsave(&pool->lock, flags);
+ if (likely(pool->curr_nr < pool->min_nr)) {
+ add_element(pool, element);
+ spin_unlock_irqrestore(&pool->lock, flags);
+ wake_up(&pool->wait);
+ return;
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+ }
+ pool->free(element, pool->pool_data);
+}
+EXPORT_SYMBOL(mempool_free);
+
+/*
+ * A commonly used alloc and free fn.
+ */
+void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
+{
+ struct kmem_cache *mem = pool_data;
+ return kmem_cache_alloc(mem, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_alloc_slab);
+
+void mempool_free_slab(void *element, void *pool_data)
+{
+ struct kmem_cache *mem = pool_data;
+ kmem_cache_free(mem, element);
+}
+EXPORT_SYMBOL(mempool_free_slab);
+
+/*
+ * A commonly used alloc and free fn that kmalloc/kfrees the amount of memory
+ * specified by pool_data
+ */
+void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
+{
+ size_t size = (size_t)pool_data;
+ return kmalloc(size, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_kmalloc);
+
+void mempool_kfree(void *element, void *pool_data)
+{
+ kfree(element);
+}
+EXPORT_SYMBOL(mempool_kfree);
+
+void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data)
+{
+ size_t size = (size_t)pool_data;
+ return kvmalloc(size, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_kvmalloc);
+
+void mempool_kvfree(void *element, void *pool_data)
+{
+ kvfree(element);
+}
+EXPORT_SYMBOL(mempool_kvfree);
+
+/*
+ * A simple mempool-backed page allocator that allocates pages
+ * of the order specified by pool_data.
+ */
+void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data)
+{
+ int order = (int)(long)pool_data;
+ return alloc_pages(gfp_mask, order);
+}
+EXPORT_SYMBOL(mempool_alloc_pages);
+
+void mempool_free_pages(void *element, void *pool_data)
+{
+ int order = (int)(long)pool_data;
+ __free_pages(element, order);
+}
+EXPORT_SYMBOL(mempool_free_pages);
diff --git a/linux/preempt.c b/linux/preempt.c
new file mode 100644
index 00000000..72eceed3
--- /dev/null
+++ b/linux/preempt.c
@@ -0,0 +1,37 @@
+#include <pthread.h>
+
+#include "linux/preempt.h"
+
+/*
+ * In userspace, pthreads are preemptible and can migrate CPUs at any time.
+ *
+ * In the kernel, preempt_disable() logic essentially guarantees that a marked
+ * critical section owns its CPU for the relevant block. This is necessary for
+ * various code paths, critically including the percpu system as it allows for
+ * non-atomic reads and writes to CPU-local data structures.
+ *
+ * The high performance userspace equivalent would be to use thread local
+ * storage to replace percpu data, but that would be complicated. It should be
+ * correct to instead guarantee mutual exclusion for the critical sections.
+ */
+
+static pthread_mutex_t preempt_lock;
+
+__attribute__((constructor))
+static void preempt_init(void) {
+ pthread_mutexattr_t attr;
+ pthread_mutexattr_init(&attr);
+ pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+ pthread_mutex_init(&preempt_lock, &attr);
+ pthread_mutexattr_destroy(&attr);
+}
+
+void preempt_disable(void)
+{
+ pthread_mutex_lock(&preempt_lock);
+}
+
+void preempt_enable(void)
+{
+ pthread_mutex_unlock(&preempt_lock);
+}
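
A short sketch of how ported code uses this (illustrative only): the recursive mutex above serializes every "preemption disabled" region, which is what the per-cpu emulation relies on. The counter below is a hypothetical stand-in for per-cpu state.

	static unsigned long counter;	/* stand-in for "per-cpu" data */

	preempt_disable();
	counter++;	/* non-atomic access is safe: all preempt-off sections
			 * are mutually excluded by preempt_lock */
	preempt_enable();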
diff --git a/linux/ratelimit.c b/linux/ratelimit.c
new file mode 100644
index 00000000..21a6d6c8
--- /dev/null
+++ b/linux/ratelimit.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * ratelimit.c - rate-limit how often a callback may fire.
+ *
+ * Isolated from kernel/printk.c by Dave Young <hidave.darkstar@gmail.com>
+ *
+ * 2008-05-01 rewrite the function and use a ratelimit_state data struct as
+ * parameter. Now every user can use their own standalone ratelimit_state.
+ */
+
+#include <linux/ratelimit.h>
+#include <linux/jiffies.h>
+#include <linux/export.h>
+
+/*
+ * __ratelimit - rate limiting
+ * @rs: ratelimit_state data
+ * @func: name of calling function
+ *
+ * This enforces a rate limit: not more than @rs->burst callbacks
+ * in every @rs->interval
+ *
+ * RETURNS:
+ * 0 means callbacks will be suppressed.
+ * 1 means go ahead and do it.
+ */
+int ___ratelimit(struct ratelimit_state *rs, const char *func)
+{
+ int ret;
+
+ if (!rs->interval)
+ return 1;
+
+ /*
+ * If we contend on this state's lock then almost
+ * by definition we are too busy to print a message,
+ * in addition to the one that will be printed by
+ * the entity that is holding the lock already:
+ */
+ if (!raw_spin_trylock(&rs->lock))
+ return 0;
+
+ if (!rs->begin)
+ rs->begin = jiffies;
+
+ if (time_is_before_jiffies(rs->begin + rs->interval)) {
+ if (rs->missed) {
+ if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) {
+ printk(KERN_WARNING
+ "%s: %d callbacks suppressed\n",
+ func, rs->missed);
+ rs->missed = 0;
+ }
+ }
+ rs->begin = jiffies;
+ rs->printed = 0;
+ }
+ if (rs->burst && rs->burst > rs->printed) {
+ rs->printed++;
+ ret = 1;
+ } else {
+ rs->missed++;
+ ret = 0;
+ }
+ raw_spin_unlock(&rs->lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(___ratelimit);
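
A hedged usage sketch (assuming the usual DEFINE_RATELIMIT_STATE()/__ratelimit() wrappers from <linux/ratelimit.h>, which are not part of this file):

	static DEFINE_RATELIMIT_STATE(rs, HZ, 10);	/* at most 10 messages per second */

	if (__ratelimit(&rs))
		pr_warn("something noisy happened\n");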
diff --git a/linux/rhashtable.c b/linux/rhashtable.c
index 03369ead..ba2196fc 100644
--- a/linux/rhashtable.c
+++ b/linux/rhashtable.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Resizable, Scalable, Concurrent Hash Table
*
@@ -8,27 +9,29 @@
* Code partially derived from nft_hash
* Rewritten with rehash code from br_multicast plus single list
* pointer as suggested by Josh Triplett
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/log2.h>
#include <linux/sched.h>
+#include <linux/rculist.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
-#include <linux/mm.h>
#include <linux/jhash.h>
+#include <linux/overflow.h>
#include <linux/random.h>
#include <linux/rhashtable.h>
#include <linux/err.h>
+#include <linux/export.h>
#define HASH_DEFAULT_SIZE 64UL
#define HASH_MIN_SIZE 4U
-#define BUCKET_LOCKS_PER_CPU 32UL
+
+union nested_table {
+ union nested_table __rcu *table;
+ struct rhash_lock_head __rcu *bucket;
+};
static u32 head_hashfn(struct rhashtable *ht,
const struct bucket_table *tbl,
@@ -37,40 +40,75 @@ static u32 head_hashfn(struct rhashtable *ht,
return rht_head_hashfn(ht, tbl, he, ht->p);
}
-static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl,
- gfp_t gfp)
-{
- unsigned int i, size;
- unsigned int nr_pcpus = num_possible_cpus();
+#ifdef CONFIG_PROVE_LOCKING
+#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
- nr_pcpus = min_t(unsigned int, nr_pcpus, 64UL);
- size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul);
+int lockdep_rht_mutex_is_held(struct rhashtable *ht)
+{
+ return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
- /* Never allocate more than 0.5 locks per bucket */
- size = min_t(unsigned int, size, tbl->size >> 1);
+int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
+{
+ if (!debug_locks)
+ return 1;
+ if (unlikely(tbl->nest))
+ return 1;
+ return bit_spin_is_locked(0, (unsigned long *)&tbl->buckets[hash]);
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
+#else
+#define ASSERT_RHT_MUTEX(HT)
+#endif
- if (sizeof(spinlock_t) != 0) {
- tbl->locks = NULL;
- if (gfp != GFP_KERNEL)
- gfp |= __GFP_NOWARN | __GFP_NORETRY;
+static inline union nested_table *nested_table_top(
+ const struct bucket_table *tbl)
+{
+ /* The top-level bucket entry does not need RCU protection
+ * because it's set at the same time as tbl->nest.
+ */
+ return (void *)rcu_dereference_protected(tbl->buckets[0], 1);
+}
- if (!tbl->locks)
- tbl->locks = kmalloc_array(size, sizeof(spinlock_t),
- gfp);
- if (!tbl->locks)
- return -ENOMEM;
- for (i = 0; i < size; i++)
- spin_lock_init(&tbl->locks[i]);
+static void nested_table_free(union nested_table *ntbl, unsigned int size)
+{
+ const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+ const unsigned int len = 1 << shift;
+ unsigned int i;
+
+ ntbl = rcu_dereference_protected(ntbl->table, 1);
+ if (!ntbl)
+ return;
+
+ if (size > len) {
+ size >>= shift;
+ for (i = 0; i < len; i++)
+ nested_table_free(ntbl + i, size);
}
- tbl->locks_mask = size - 1;
- return 0;
+ kfree(ntbl);
+}
+
+static void nested_bucket_table_free(const struct bucket_table *tbl)
+{
+ unsigned int size = tbl->size >> tbl->nest;
+ unsigned int len = 1 << tbl->nest;
+ union nested_table *ntbl;
+ unsigned int i;
+
+ ntbl = nested_table_top(tbl);
+
+ for (i = 0; i < len; i++)
+ nested_table_free(ntbl + i, size);
+
+ kfree(ntbl);
}
static void bucket_table_free(struct bucket_table *tbl)
{
- if (tbl)
- kvfree(tbl->locks);
+ if (tbl->nest)
+ nested_bucket_table_free(tbl);
kvfree(tbl);
}
@@ -80,6 +118,59 @@ static void bucket_table_free_rcu(struct rcu_head *head)
bucket_table_free(container_of(head, struct bucket_table, rcu));
}
+static union nested_table *nested_table_alloc(struct rhashtable *ht,
+ union nested_table __rcu **prev,
+ bool leaf)
+{
+ union nested_table *ntbl;
+ int i;
+
+ ntbl = rcu_dereference(*prev);
+ if (ntbl)
+ return ntbl;
+
+ ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC);
+
+ if (ntbl && leaf) {
+ for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++)
+ INIT_RHT_NULLS_HEAD(ntbl[i].bucket);
+ }
+
+ if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL)
+ return ntbl;
+ /* Raced with another thread. */
+ kfree(ntbl);
+ return rcu_dereference(*prev);
+}
+
+static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
+ size_t nbuckets,
+ gfp_t gfp)
+{
+ const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+ struct bucket_table *tbl;
+ size_t size;
+
+ if (nbuckets < (1 << (shift + 1)))
+ return NULL;
+
+ size = sizeof(*tbl) + sizeof(tbl->buckets[0]);
+
+ tbl = kzalloc(size, gfp);
+ if (!tbl)
+ return NULL;
+
+ if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets,
+ false)) {
+ kfree(tbl);
+ return NULL;
+ }
+
+ tbl->nest = (ilog2(nbuckets) - 1) % shift + 1;
+
+ return tbl;
+}
+
static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
size_t nbuckets,
gfp_t gfp)
@@ -88,28 +179,27 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
size_t size;
int i;
- size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
- if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) ||
- gfp != GFP_KERNEL)
- tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY);
- if (tbl == NULL && gfp == GFP_KERNEL)
- tbl = vzalloc(size);
- if (tbl == NULL)
- return NULL;
+ tbl = kvzalloc(struct_size(tbl, buckets, nbuckets), gfp);
- tbl->size = nbuckets;
+ size = nbuckets;
- if (alloc_bucket_locks(ht, tbl, gfp) < 0) {
- bucket_table_free(tbl);
- return NULL;
+ if (tbl == NULL && (gfp & ~__GFP_NOFAIL) != GFP_KERNEL) {
+ tbl = nested_bucket_table_alloc(ht, nbuckets, gfp);
+ nbuckets = 0;
}
+ if (tbl == NULL)
+ return NULL;
+
+ tbl->size = size;
+
+ rcu_head_init(&tbl->rcu);
INIT_LIST_HEAD(&tbl->walkers);
- get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
+ tbl->hash_rnd = get_random_u32();
for (i = 0; i < nbuckets; i++)
- INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i);
+ INIT_RHT_NULLS_HEAD(tbl->buckets[i]);
return tbl;
}
@@ -127,18 +217,24 @@ static struct bucket_table *rhashtable_last_table(struct rhashtable *ht,
return new_tbl;
}
-static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
+static int rhashtable_rehash_one(struct rhashtable *ht,
+ struct rhash_lock_head __rcu **bkt,
+ unsigned int old_hash)
{
struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
- struct bucket_table *new_tbl = rhashtable_last_table(ht,
- rht_dereference_rcu(old_tbl->future_tbl, ht));
- struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash];
- int err = -ENOENT;
+ struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl);
+ int err = -EAGAIN;
struct rhash_head *head, *next, *entry;
- spinlock_t *new_bucket_lock;
+ struct rhash_head __rcu **pprev = NULL;
unsigned int new_hash;
- rht_for_each(entry, old_tbl, old_hash) {
+ if (new_tbl->nest)
+ goto out;
+
+ err = -ENOENT;
+
+ rht_for_each_from(entry, rht_ptr(bkt, old_tbl, old_hash),
+ old_tbl, old_hash) {
err = 0;
next = rht_dereference_bucket(entry->next, old_tbl, old_hash);
@@ -153,57 +249,58 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
new_hash = head_hashfn(ht, new_tbl, entry);
- new_bucket_lock = rht_bucket_lock(new_tbl, new_hash);
+ rht_lock(new_tbl, &new_tbl->buckets[new_hash]);
- spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING);
- head = rht_dereference_bucket(new_tbl->buckets[new_hash],
- new_tbl, new_hash);
+ head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash);
RCU_INIT_POINTER(entry->next, head);
- rcu_assign_pointer(new_tbl->buckets[new_hash], entry);
- spin_unlock(new_bucket_lock);
+ rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry);
- rcu_assign_pointer(*pprev, next);
+ if (pprev)
+ rcu_assign_pointer(*pprev, next);
+ else
+ /* Need to preserve the bit lock. */
+ rht_assign_locked(bkt, next);
out:
return err;
}
-static void rhashtable_rehash_chain(struct rhashtable *ht,
+static int rhashtable_rehash_chain(struct rhashtable *ht,
unsigned int old_hash)
{
struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
- spinlock_t *old_bucket_lock;
+ struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash);
+ int err;
- old_bucket_lock = rht_bucket_lock(old_tbl, old_hash);
+ if (!bkt)
+ return 0;
+ rht_lock(old_tbl, bkt);
- spin_lock_bh(old_bucket_lock);
- while (!rhashtable_rehash_one(ht, old_hash))
+ while (!(err = rhashtable_rehash_one(ht, bkt, old_hash)))
;
- old_tbl->rehash++;
- spin_unlock_bh(old_bucket_lock);
+
+ if (err == -ENOENT)
+ err = 0;
+ rht_unlock(old_tbl, bkt);
+
+ return err;
}
static int rhashtable_rehash_attach(struct rhashtable *ht,
struct bucket_table *old_tbl,
struct bucket_table *new_tbl)
{
- /* Protect future_tbl using the first bucket lock. */
- spin_lock_bh(old_tbl->locks);
-
- /* Did somebody beat us to it? */
- if (rcu_access_pointer(old_tbl->future_tbl)) {
- spin_unlock_bh(old_tbl->locks);
- return -EEXIST;
- }
-
/* Make insertions go into the new, empty table right away. Deletions
* and lookups will be attempted in both tables until we synchronize.
+ * As cmpxchg() provides strong barriers, we do not need
+ * rcu_assign_pointer().
*/
- rcu_assign_pointer(old_tbl->future_tbl, new_tbl);
- spin_unlock_bh(old_tbl->locks);
+ if (cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL,
+ new_tbl) != NULL)
+ return -EEXIST;
return 0;
}
@@ -214,13 +311,18 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
struct bucket_table *new_tbl;
struct rhashtable_walker *walker;
unsigned int old_hash;
+ int err;
new_tbl = rht_dereference(old_tbl->future_tbl, ht);
if (!new_tbl)
return 0;
- for (old_hash = 0; old_hash < old_tbl->size; old_hash++)
- rhashtable_rehash_chain(ht, old_hash);
+ for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
+ err = rhashtable_rehash_chain(ht, old_hash);
+ if (err)
+ return err;
+ cond_resched();
+ }
/* Publish the new table pointer. */
rcu_assign_pointer(ht->tbl, new_tbl);
@@ -228,25 +330,30 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
spin_lock(&ht->lock);
list_for_each_entry(walker, &old_tbl->walkers, list)
walker->tbl = NULL;
- spin_unlock(&ht->lock);
/* Wait for readers. All new readers will see the new
* table, and thus no references to the old table will
* remain.
+ * We do this inside the locked region so that
+ * rhashtable_walk_stop() can use rcu_head_after_call_rcu()
+ * to check if it should not re-link the table.
*/
call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
+ spin_unlock(&ht->lock);
return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
}
-static int rhashtable_expand(struct rhashtable *ht)
+static int rhashtable_rehash_alloc(struct rhashtable *ht,
+ struct bucket_table *old_tbl,
+ unsigned int size)
{
- struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
+ struct bucket_table *new_tbl;
int err;
- old_tbl = rhashtable_last_table(ht, old_tbl);
+ ASSERT_RHT_MUTEX(ht);
- new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL);
+ new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
if (new_tbl == NULL)
return -ENOMEM;
@@ -257,12 +364,27 @@ static int rhashtable_expand(struct rhashtable *ht)
return err;
}
+/**
+ * rhashtable_shrink - Shrink hash table while allowing concurrent lookups
+ * @ht: the hash table to shrink
+ *
+ * This function shrinks the hash table to fit, i.e., to the smallest
+ * size that would not cause it to expand again right away.
+ *
+ * The caller must ensure that no concurrent resizing occurs by holding
+ * ht->mutex.
+ *
+ * The caller must ensure that no concurrent table mutations take place.
+ * It is however valid to have concurrent lookups if they are RCU protected.
+ *
+ * It is valid to have concurrent insertions and deletions protected by per
+ * bucket locks or concurrent RCU protected lookups and traversals.
+ */
static int rhashtable_shrink(struct rhashtable *ht)
{
- struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
+ struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
unsigned int nelems = atomic_read(&ht->nelems);
unsigned int size = 0;
- int err;
if (nelems)
size = roundup_pow_of_two(nelems * 3 / 2);
@@ -275,15 +397,7 @@ static int rhashtable_shrink(struct rhashtable *ht)
if (rht_dereference(old_tbl->future_tbl, ht))
return -EEXIST;
- new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
- if (new_tbl == NULL)
- return -ENOMEM;
-
- err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
- if (err)
- bucket_table_free(new_tbl);
-
- return err;
+ return rhashtable_rehash_alloc(ht, old_tbl, size);
}
static void rht_deferred_worker(struct work_struct *work)
@@ -299,11 +413,18 @@ static void rht_deferred_worker(struct work_struct *work)
tbl = rhashtable_last_table(ht, tbl);
if (rht_grow_above_75(ht, tbl))
- rhashtable_expand(ht);
+ err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2);
else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))
- rhashtable_shrink(ht);
+ err = rhashtable_shrink(ht);
+ else if (tbl->nest)
+ err = rhashtable_rehash_alloc(ht, tbl, tbl->size);
+
+ if (!err || err == -EEXIST) {
+ int nerr;
- err = rhashtable_rehash_table(ht);
+ nerr = rhashtable_rehash_table(ht);
+ err = err ?: nerr;
+ }
mutex_unlock(&ht->mutex);
@@ -311,22 +432,8 @@ static void rht_deferred_worker(struct work_struct *work)
schedule_work(&ht->run_work);
}
-static bool rhashtable_check_elasticity(struct rhashtable *ht,
- struct bucket_table *tbl,
- unsigned int hash)
-{
- unsigned int elasticity = ht->elasticity;
- struct rhash_head *head;
-
- rht_for_each(head, tbl, hash)
- if (!--elasticity)
- return true;
-
- return false;
-}
-
-int rhashtable_insert_rehash(struct rhashtable *ht,
- struct bucket_table *tbl)
+static int rhashtable_insert_rehash(struct rhashtable *ht,
+ struct bucket_table *tbl)
{
struct bucket_table *old_tbl;
struct bucket_table *new_tbl;
@@ -347,7 +454,7 @@ int rhashtable_insert_rehash(struct rhashtable *ht,
err = -ENOMEM;
- new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC);
+ new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN);
if (new_tbl == NULL)
goto fail;
@@ -363,7 +470,7 @@ int rhashtable_insert_rehash(struct rhashtable *ht,
fail:
/* Do not fail the insert if someone else did a rehash. */
- if (likely(rcu_dereference_raw(tbl->future_tbl)))
+ if (likely(rcu_access_pointer(tbl->future_tbl)))
return 0;
/* Schedule async rehash to retry allocation in process context. */
@@ -373,57 +480,485 @@ fail:
return err;
}
-struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
- const void *key,
- struct rhash_head *obj,
- struct bucket_table *tbl)
+static void *rhashtable_lookup_one(struct rhashtable *ht,
+ struct rhash_lock_head __rcu **bkt,
+ struct bucket_table *tbl, unsigned int hash,
+ const void *key, struct rhash_head *obj)
{
+ struct rhashtable_compare_arg arg = {
+ .ht = ht,
+ .key = key,
+ };
+ struct rhash_head __rcu **pprev = NULL;
struct rhash_head *head;
- unsigned int hash;
- int err;
+ int elasticity;
+
+ elasticity = RHT_ELASTICITY;
+ rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
+ struct rhlist_head *list;
+ struct rhlist_head *plist;
+
+ elasticity--;
+ if (!key ||
+ (ht->p.obj_cmpfn ?
+ ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) :
+ rhashtable_compare(&arg, rht_obj(ht, head)))) {
+ pprev = &head->next;
+ continue;
+ }
- tbl = rhashtable_last_table(ht, tbl);
- hash = head_hashfn(ht, tbl, obj);
- spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING);
+ if (!ht->rhlist)
+ return rht_obj(ht, head);
- err = -EEXIST;
- if (key && rhashtable_lookup_fast(ht, key, ht->p))
- goto exit;
+ list = container_of(obj, struct rhlist_head, rhead);
+ plist = container_of(head, struct rhlist_head, rhead);
- err = -E2BIG;
- if (unlikely(rht_grow_above_max(ht, tbl)))
- goto exit;
+ RCU_INIT_POINTER(list->next, plist);
+ head = rht_dereference_bucket(head->next, tbl, hash);
+ RCU_INIT_POINTER(list->rhead.next, head);
+ if (pprev)
+ rcu_assign_pointer(*pprev, obj);
+ else
+ /* Need to preserve the bit lock */
+ rht_assign_locked(bkt, obj);
+
+ return NULL;
+ }
+
+ if (elasticity <= 0)
+ return ERR_PTR(-EAGAIN);
+
+ return ERR_PTR(-ENOENT);
+}
+
+static struct bucket_table *rhashtable_insert_one(
+ struct rhashtable *ht, struct rhash_lock_head __rcu **bkt,
+ struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj,
+ void *data)
+{
+ struct bucket_table *new_tbl;
+ struct rhash_head *head;
+
+ if (!IS_ERR_OR_NULL(data))
+ return ERR_PTR(-EEXIST);
+
+ if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT)
+ return ERR_CAST(data);
+
+ new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+ if (new_tbl)
+ return new_tbl;
+
+ if (PTR_ERR(data) != -ENOENT)
+ return ERR_CAST(data);
- err = -EAGAIN;
- if (rhashtable_check_elasticity(ht, tbl, hash) ||
- rht_grow_above_100(ht, tbl))
- goto exit;
+ if (unlikely(rht_grow_above_max(ht, tbl)))
+ return ERR_PTR(-E2BIG);
- err = 0;
+ if (unlikely(rht_grow_above_100(ht, tbl)))
+ return ERR_PTR(-EAGAIN);
- head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
+ head = rht_ptr(bkt, tbl, hash);
RCU_INIT_POINTER(obj->next, head);
+ if (ht->rhlist) {
+ struct rhlist_head *list;
- rcu_assign_pointer(tbl->buckets[hash], obj);
+ list = container_of(obj, struct rhlist_head, rhead);
+ RCU_INIT_POINTER(list->next, NULL);
+ }
+
+ /* bkt is always the head of the list, so it holds
+ * the lock, which we need to preserve
+ */
+ rht_assign_locked(bkt, obj);
atomic_inc(&ht->nelems);
+ if (rht_grow_above_75(ht, tbl))
+ schedule_work(&ht->run_work);
+
+ return NULL;
+}
+
+static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
+ struct rhash_head *obj)
+{
+ struct bucket_table *new_tbl;
+ struct bucket_table *tbl;
+ struct rhash_lock_head __rcu **bkt;
+ unsigned int hash;
+ void *data;
+
+ new_tbl = rcu_dereference(ht->tbl);
+
+ do {
+ tbl = new_tbl;
+ hash = rht_head_hashfn(ht, tbl, obj, ht->p);
+ if (rcu_access_pointer(tbl->future_tbl))
+ /* Failure is OK */
+ bkt = rht_bucket_var(tbl, hash);
+ else
+ bkt = rht_bucket_insert(ht, tbl, hash);
+ if (bkt == NULL) {
+ new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+ data = ERR_PTR(-EAGAIN);
+ } else {
+ rht_lock(tbl, bkt);
+ data = rhashtable_lookup_one(ht, bkt, tbl,
+ hash, key, obj);
+ new_tbl = rhashtable_insert_one(ht, bkt, tbl,
+ hash, obj, data);
+ if (PTR_ERR(new_tbl) != -EEXIST)
+ data = ERR_CAST(new_tbl);
+
+ rht_unlock(tbl, bkt);
+ }
+ } while (!IS_ERR_OR_NULL(new_tbl));
+
+ if (PTR_ERR(data) == -EAGAIN)
+ data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?:
+ -EAGAIN);
+
+ return data;
+}
+
+void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
+ struct rhash_head *obj)
+{
+ void *data;
+
+ do {
+ rcu_read_lock();
+ data = rhashtable_try_insert(ht, key, obj);
+ rcu_read_unlock();
+ } while (PTR_ERR(data) == -EAGAIN);
-exit:
- spin_unlock(rht_bucket_lock(tbl, hash));
+ return data;
+}
+EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
- if (err == 0)
+/**
+ * rhashtable_walk_enter - Initialise an iterator
+ * @ht: Table to walk over
+ * @iter: Hash table Iterator
+ *
+ * This function prepares a hash table walk.
+ *
+ * Note that if you restart a walk after rhashtable_walk_stop you
+ * may see the same object twice. Also, you may miss objects if
+ * there are removals in between rhashtable_walk_stop and the next
+ * call to rhashtable_walk_start.
+ *
+ * For a completely stable walk you should construct your own data
+ * structure outside the hash table.
+ *
+ * This function may be called from any process context, including
+ * non-preemptable context, but cannot be called from softirq or
+ * hardirq context.
+ *
+ * You must call rhashtable_walk_exit after this function returns.
+ */
+void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter)
+{
+ iter->ht = ht;
+ iter->p = NULL;
+ iter->slot = 0;
+ iter->skip = 0;
+ iter->end_of_table = 0;
+
+ spin_lock(&ht->lock);
+ iter->walker.tbl =
+ rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock));
+ list_add(&iter->walker.list, &iter->walker.tbl->walkers);
+ spin_unlock(&ht->lock);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_enter);
+
+/**
+ * rhashtable_walk_exit - Free an iterator
+ * @iter: Hash table Iterator
+ *
+ * This function frees resources allocated by rhashtable_walk_enter.
+ */
+void rhashtable_walk_exit(struct rhashtable_iter *iter)
+{
+ spin_lock(&iter->ht->lock);
+ if (iter->walker.tbl)
+ list_del(&iter->walker.list);
+ spin_unlock(&iter->ht->lock);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_exit);
+
+/**
+ * rhashtable_walk_start_check - Start a hash table walk
+ * @iter: Hash table iterator
+ *
+ * Start a hash table walk at the current iterator position. Note that we take
+ * the RCU lock in all cases including when we return an error. So you must
+ * always call rhashtable_walk_stop to clean up.
+ *
+ * Returns zero if successful.
+ *
+ * Returns -EAGAIN if a resize event occurred. Note that the iterator
+ * will rewind back to the beginning and you may use it immediately
+ * by calling rhashtable_walk_next.
+ *
+ * rhashtable_walk_start is defined as an inline variant that returns
+ * void. This is preferred in cases where the caller would ignore
+ * resize events and always continue.
+ */
+int rhashtable_walk_start_check(struct rhashtable_iter *iter)
+ __acquires(RCU)
+{
+ struct rhashtable *ht = iter->ht;
+ bool rhlist = ht->rhlist;
+
+ rcu_read_lock();
+
+ spin_lock(&ht->lock);
+ if (iter->walker.tbl)
+ list_del(&iter->walker.list);
+ spin_unlock(&ht->lock);
+
+ if (iter->end_of_table)
+ return 0;
+ if (!iter->walker.tbl) {
+ iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
+ iter->slot = 0;
+ iter->skip = 0;
+ return -EAGAIN;
+ }
+
+ if (iter->p && !rhlist) {
+ /*
+ * We need to validate that 'p' is still in the table, and
+ * if so, update 'skip'
+ */
+ struct rhash_head *p;
+ int skip = 0;
+ rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+ skip++;
+ if (p == iter->p) {
+ iter->skip = skip;
+ goto found;
+ }
+ }
+ iter->p = NULL;
+ } else if (iter->p && rhlist) {
+ /* Need to validate that 'list' is still in the table, and
+ * if so, update 'skip' and 'p'.
+ */
+ struct rhash_head *p;
+ struct rhlist_head *list;
+ int skip = 0;
+ rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+ for (list = container_of(p, struct rhlist_head, rhead);
+ list;
+ list = rcu_dereference(list->next)) {
+ skip++;
+ if (list == iter->list) {
+ iter->p = p;
+ iter->skip = skip;
+ goto found;
+ }
+ }
+ }
+ iter->p = NULL;
+ }
+found:
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_start_check);
+
+/**
+ * __rhashtable_walk_find_next - Find the next element in a table (or the first
+ * one in case of a new walk).
+ *
+ * @iter: Hash table iterator
+ *
+ * Returns the found object or NULL when the end of the table is reached.
+ *
+ * Returns -EAGAIN if a resize event occurred.
+ */
+static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter)
+{
+ struct bucket_table *tbl = iter->walker.tbl;
+ struct rhlist_head *list = iter->list;
+ struct rhashtable *ht = iter->ht;
+ struct rhash_head *p = iter->p;
+ bool rhlist = ht->rhlist;
+
+ if (!tbl)
return NULL;
- else if (err == -EAGAIN)
- return tbl;
+
+ for (; iter->slot < tbl->size; iter->slot++) {
+ int skip = iter->skip;
+
+ rht_for_each_rcu(p, tbl, iter->slot) {
+ if (rhlist) {
+ list = container_of(p, struct rhlist_head,
+ rhead);
+ do {
+ if (!skip)
+ goto next;
+ skip--;
+ list = rcu_dereference(list->next);
+ } while (list);
+
+ continue;
+ }
+ if (!skip)
+ break;
+ skip--;
+ }
+
+next:
+ if (!rht_is_a_nulls(p)) {
+ iter->skip++;
+ iter->p = p;
+ iter->list = list;
+ return rht_obj(ht, rhlist ? &list->rhead : p);
+ }
+
+ iter->skip = 0;
+ }
+
+ iter->p = NULL;
+
+ /* Ensure we see any new tables. */
+ smp_rmb();
+
+ iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+ if (iter->walker.tbl) {
+ iter->slot = 0;
+ iter->skip = 0;
+ return ERR_PTR(-EAGAIN);
+ } else {
+ iter->end_of_table = true;
+ }
+
+ return NULL;
+}
+
+/**
+ * rhashtable_walk_next - Return the next object and advance the iterator
+ * @iter: Hash table iterator
+ *
+ * Note that you must call rhashtable_walk_stop when you are finished
+ * with the walk.
+ *
+ * Returns the next object or NULL when the end of the table is reached.
+ *
+ * Returns -EAGAIN if a resize event occurred. Note that the iterator
+ * will rewind back to the beginning and you may continue to use it.
+ */
+void *rhashtable_walk_next(struct rhashtable_iter *iter)
+{
+ struct rhlist_head *list = iter->list;
+ struct rhashtable *ht = iter->ht;
+ struct rhash_head *p = iter->p;
+ bool rhlist = ht->rhlist;
+
+ if (p) {
+ if (!rhlist || !(list = rcu_dereference(list->next))) {
+ p = rcu_dereference(p->next);
+ list = container_of(p, struct rhlist_head, rhead);
+ }
+ if (!rht_is_a_nulls(p)) {
+ iter->skip++;
+ iter->p = p;
+ iter->list = list;
+ return rht_obj(ht, rhlist ? &list->rhead : p);
+ }
+
+ /* At the end of this slot, switch to the next one and then find
+ * the next entry from that point.
+ */
+ iter->skip = 0;
+ iter->slot++;
+ }
+
+ return __rhashtable_walk_find_next(iter);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_next);
+
+/**
+ * rhashtable_walk_peek - Return the next object but don't advance the iterator
+ * @iter: Hash table iterator
+ *
+ * Returns the next object or NULL when the end of the table is reached.
+ *
+ * Returns -EAGAIN if a resize event occurred. Note that the iterator
+ * will rewind back to the beginning and you may continue to use it.
+ */
+void *rhashtable_walk_peek(struct rhashtable_iter *iter)
+{
+ struct rhlist_head *list = iter->list;
+ struct rhashtable *ht = iter->ht;
+ struct rhash_head *p = iter->p;
+
+ if (p)
+ return rht_obj(ht, ht->rhlist ? &list->rhead : p);
+
+ /* No object found in current iter, find next one in the table. */
+
+ if (iter->skip) {
+ /* A nonzero skip value points to the next entry in the table
+ * beyond the last one that was found. Decrement skip so
+ * we find the current value. __rhashtable_walk_find_next
+ * will restore the original value of skip assuming that
+ * the table hasn't changed.
+ */
+ iter->skip--;
+ }
+
+ return __rhashtable_walk_find_next(iter);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_peek);
+
+/**
+ * rhashtable_walk_stop - Finish a hash table walk
+ * @iter: Hash table iterator
+ *
+ * Finish a hash table walk. Does not reset the iterator to the start of the
+ * hash table.
+ */
+void rhashtable_walk_stop(struct rhashtable_iter *iter)
+ __releases(RCU)
+{
+ struct rhashtable *ht;
+ struct bucket_table *tbl = iter->walker.tbl;
+
+ if (!tbl)
+ goto out;
+
+ ht = iter->ht;
+
+ spin_lock(&ht->lock);
+ if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu))
+ /* This bucket table is being freed, don't re-link it. */
+ iter->walker.tbl = NULL;
else
- return ERR_PTR(err);
+ list_add(&iter->walker.list, &tbl->walkers);
+ spin_unlock(&ht->lock);
+
+out:
+ rcu_read_unlock();
}
+EXPORT_SYMBOL_GPL(rhashtable_walk_stop);
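Taken together, the walker functions above form an enter/start/next/stop loop. A minimal sketch follows (assuming an element type such as the struct test_obj from the rhashtable_init() examples further down, and the inline void-returning rhashtable_walk_start() variant mentioned in the rhashtable_walk_start_check() kernel-doc):

static void walk_all(struct rhashtable *ht)
{
	struct rhashtable_iter iter;
	struct test_obj *obj;

	rhashtable_walk_enter(ht, &iter);
	rhashtable_walk_start(&iter);

	while ((obj = rhashtable_walk_next(&iter)) != NULL) {
		if (IS_ERR(obj)) {
			/* -EAGAIN: a resize rewound the iterator; keep going */
			continue;
		}
		/* ... inspect obj; removals may still happen concurrently ... */
	}

	rhashtable_walk_stop(&iter);
	rhashtable_walk_exit(&iter);
}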
static size_t rounded_hashtable_size(const struct rhashtable_params *params)
{
- return max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
- (unsigned long)params->min_size);
+ size_t retsize;
+
+ if (params->nelem_hint)
+ retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
+ (unsigned long)params->min_size);
+ else
+ retsize = max(HASH_DEFAULT_SIZE,
+ (unsigned long)params->min_size);
+
+ return retsize;
}
static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
@@ -431,21 +966,58 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
return jhash2(key, length, seed);
}
+/**
+ * rhashtable_init - initialize a new hash table
+ * @ht: hash table to be initialized
+ * @params: configuration parameters
+ *
+ * Initializes a new hash table based on the provided configuration
+ * parameters. A table can be configured either with a variable or
+ * fixed length key:
+ *
+ * Configuration Example 1: Fixed length keys
+ * struct test_obj {
+ * int key;
+ * void * my_member;
+ * struct rhash_head node;
+ * };
+ *
+ * struct rhashtable_params params = {
+ * .head_offset = offsetof(struct test_obj, node),
+ * .key_offset = offsetof(struct test_obj, key),
+ * .key_len = sizeof(int),
+ * .hashfn = jhash,
+ * };
+ *
+ * Configuration Example 2: Variable length keys
+ * struct test_obj {
+ * [...]
+ * struct rhash_head node;
+ * };
+ *
+ * u32 my_hash_fn(const void *data, u32 len, u32 seed)
+ * {
+ * struct test_obj *obj = data;
+ *
+ * return [... hash ...];
+ * }
+ *
+ * struct rhashtable_params params = {
+ * .head_offset = offsetof(struct test_obj, node),
+ * .hashfn = jhash,
+ * .obj_hashfn = my_hash_fn,
+ * };
+ */
int rhashtable_init(struct rhashtable *ht,
const struct rhashtable_params *params)
{
struct bucket_table *tbl;
size_t size;
- size = HASH_DEFAULT_SIZE;
-
if ((!params->key_len && !params->obj_hashfn) ||
(params->obj_hashfn && !params->obj_cmpfn))
return -EINVAL;
- if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT))
- return -EINVAL;
-
memset(ht, 0, sizeof(*ht));
mutex_init(&ht->mutex);
spin_lock_init(&ht->lock);
@@ -454,39 +1026,18 @@ int rhashtable_init(struct rhashtable *ht,
if (params->min_size)
ht->p.min_size = roundup_pow_of_two(params->min_size);
- if (params->max_size)
- ht->p.max_size = rounddown_pow_of_two(params->max_size);
+ /* Cap total entries at 2^31 to avoid nelems overflow. */
+ ht->max_elems = 1u << 31;
- if (params->insecure_max_entries)
- ht->p.insecure_max_entries =
- rounddown_pow_of_two(params->insecure_max_entries);
- else
- ht->p.insecure_max_entries = ht->p.max_size * 2;
-
- ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE);
+ if (params->max_size) {
+ ht->p.max_size = rounddown_pow_of_two(params->max_size);
+ if (ht->p.max_size < ht->max_elems / 2)
+ ht->max_elems = ht->p.max_size * 2;
+ }
- if (params->nelem_hint)
- size = rounded_hashtable_size(&ht->p);
-
- /* The maximum (not average) chain length grows with the
- * size of the hash table, at a rate of (log N)/(log log N).
- * The value of 16 is selected so that even if the hash
- * table grew to 2^32 you would not expect the maximum
- * chain length to exceed it unless we are under attack
- * (or extremely unlucky).
- *
- * As this limit is only to detect attacks, we don't need
- * to set it to a lower value as you'd need the chain
- * length to vastly exceed 16 to have any real effect
- * on the system.
- */
- if (!params->insecure_elasticity)
- ht->elasticity = 16;
+ ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
- if (params->locks_mul)
- ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);
- else
- ht->p.locks_mul = BUCKET_LOCKS_PER_CPU;
+ size = rounded_hashtable_size(&ht->p);
ht->key_len = ht->p.key_len;
if (!params->hashfn) {
@@ -498,9 +1049,16 @@ int rhashtable_init(struct rhashtable *ht,
}
}
+ /*
+ * This is API initialization and thus we need to guarantee the
+ * initial rhashtable allocation. Upon failure, retry with the
+ * smallest possible size with __GFP_NOFAIL semantics.
+ */
tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
- if (tbl == NULL)
- return -ENOMEM;
+ if (unlikely(tbl == NULL)) {
+ size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
+ tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL);
+ }
atomic_set(&ht->nelems, 0);
@@ -510,15 +1068,170 @@ int rhashtable_init(struct rhashtable *ht,
return 0;
}
+EXPORT_SYMBOL_GPL(rhashtable_init);
-void rhashtable_destroy(struct rhashtable *ht)
+/**
+ * rhltable_init - initialize a new hash list table
+ * @hlt: hash list table to be initialized
+ * @params: configuration parameters
+ *
+ * Initializes a new hash list table.
+ *
+ * See documentation for rhashtable_init.
+ */
+int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params)
{
- struct bucket_table *tbl;
+ int err;
+
+ err = rhashtable_init(&hlt->ht, params);
+ hlt->ht.rhlist = true;
+ return err;
+}
+EXPORT_SYMBOL_GPL(rhltable_init);
+
+static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj,
+ void (*free_fn)(void *ptr, void *arg),
+ void *arg)
+{
+ struct rhlist_head *list;
+
+ if (!ht->rhlist) {
+ free_fn(rht_obj(ht, obj), arg);
+ return;
+ }
+
+ list = container_of(obj, struct rhlist_head, rhead);
+ do {
+ obj = &list->rhead;
+ list = rht_dereference(list->next, ht);
+ free_fn(rht_obj(ht, obj), arg);
+ } while (list);
+}
+
+/**
+ * rhashtable_free_and_destroy - free elements and destroy hash table
+ * @ht: the hash table to destroy
+ * @free_fn: callback to release resources of element
+ * @arg: pointer passed to free_fn
+ *
+ * Stops an eventual async resize. If defined, invokes free_fn for each
+ * element to release resources. Please note that RCU protected
+ * readers may still be accessing the elements. Releasing of resources
+ * must occur in a compatible manner. Then frees the bucket array.
+ *
+ * This function will eventually sleep to wait for an async resize
+ * to complete. The caller is responsible for ensuring that no further
+ * write operations occur in parallel.
+ */
+void rhashtable_free_and_destroy(struct rhashtable *ht,
+ void (*free_fn)(void *ptr, void *arg),
+ void *arg)
+{
+ struct bucket_table *tbl, *next_tbl;
+ unsigned int i;
cancel_work_sync(&ht->run_work);
mutex_lock(&ht->mutex);
tbl = rht_dereference(ht->tbl, ht);
+restart:
+ if (free_fn) {
+ for (i = 0; i < tbl->size; i++) {
+ struct rhash_head *pos, *next;
+
+ cond_resched();
+ for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)),
+ next = !rht_is_a_nulls(pos) ?
+ rht_dereference(pos->next, ht) : NULL;
+ !rht_is_a_nulls(pos);
+ pos = next,
+ next = !rht_is_a_nulls(pos) ?
+ rht_dereference(pos->next, ht) : NULL)
+ rhashtable_free_one(ht, pos, free_fn, arg);
+ }
+ }
+
+ next_tbl = rht_dereference(tbl->future_tbl, ht);
bucket_table_free(tbl);
+ if (next_tbl) {
+ tbl = next_tbl;
+ goto restart;
+ }
mutex_unlock(&ht->mutex);
}
+EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy);
+
+void rhashtable_destroy(struct rhashtable *ht)
+{
+ return rhashtable_free_and_destroy(ht, NULL, NULL);
+}
+EXPORT_SYMBOL_GPL(rhashtable_destroy);
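A compact lifecycle sketch tying the pieces above together, assuming the fixed-length-key struct test_obj and params from the rhashtable_init() kernel-doc plus the rhashtable_insert_fast()/rhashtable_lookup_fast() inline helpers from the header (not part of this diff):

static void free_obj(void *ptr, void *arg)
{
	kfree(ptr);
}

static int example(void)
{
	struct rhashtable ht;
	struct test_obj *obj, *found;
	int key = 42, err;

	err = rhashtable_init(&ht, &params);
	if (err)
		return err;

	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
	if (!obj) {
		err = -ENOMEM;
		goto out;
	}
	obj->key = key;

	err = rhashtable_insert_fast(&ht, &obj->node, params);
	if (err) {
		kfree(obj);	/* never made it into the table */
		goto out;
	}

	rcu_read_lock();
	found = rhashtable_lookup_fast(&ht, &key, params);	/* NULL if absent */
	rcu_read_unlock();
	(void) found;

out:
	/* free_obj() runs for every element still left in the table */
	rhashtable_free_and_destroy(&ht, free_obj, NULL);
	return err;
}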
+
+struct rhash_lock_head __rcu **__rht_bucket_nested(
+ const struct bucket_table *tbl, unsigned int hash)
+{
+ const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+ unsigned int index = hash & ((1 << tbl->nest) - 1);
+ unsigned int size = tbl->size >> tbl->nest;
+ unsigned int subhash = hash;
+ union nested_table *ntbl;
+
+ ntbl = nested_table_top(tbl);
+ ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash);
+ subhash >>= tbl->nest;
+
+ while (ntbl && size > (1 << shift)) {
+ index = subhash & ((1 << shift) - 1);
+ ntbl = rht_dereference_bucket_rcu(ntbl[index].table,
+ tbl, hash);
+ size >>= shift;
+ subhash >>= shift;
+ }
+
+ if (!ntbl)
+ return NULL;
+
+ return &ntbl[subhash].bucket;
+
+}
+EXPORT_SYMBOL_GPL(__rht_bucket_nested);
+
+struct rhash_lock_head __rcu **rht_bucket_nested(
+ const struct bucket_table *tbl, unsigned int hash)
+{
+ static struct rhash_lock_head __rcu *rhnull;
+
+ if (!rhnull)
+ INIT_RHT_NULLS_HEAD(rhnull);
+ return __rht_bucket_nested(tbl, hash) ?: &rhnull;
+}
+EXPORT_SYMBOL_GPL(rht_bucket_nested);
+
+struct rhash_lock_head __rcu **rht_bucket_nested_insert(
+ struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
+{
+ const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+ unsigned int index = hash & ((1 << tbl->nest) - 1);
+ unsigned int size = tbl->size >> tbl->nest;
+ union nested_table *ntbl;
+
+ ntbl = nested_table_top(tbl);
+ hash >>= tbl->nest;
+ ntbl = nested_table_alloc(ht, &ntbl[index].table,
+ size <= (1 << shift));
+
+ while (ntbl && size > (1 << shift)) {
+ index = hash & ((1 << shift) - 1);
+ size >>= shift;
+ hash >>= shift;
+ ntbl = nested_table_alloc(ht, &ntbl[index].table,
+ size <= (1 << shift));
+ }
+
+ if (!ntbl)
+ return NULL;
+
+ return &ntbl[hash].bucket;
+
+}
+EXPORT_SYMBOL_GPL(rht_bucket_nested_insert);
diff --git a/linux/sched.c b/linux/sched.c
index c996945e..1c7198d2 100644
--- a/linux/sched.c
+++ b/linux/sched.c
@@ -1,17 +1,15 @@
-#include <linux/futex.h>
+#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
+#include <linux/futex.h>
/* hack for mips: */
#define CONFIG_RCU_HAVE_FUTEX 1
#include <urcu/futex.h>
-#include <linux/math64.h>
-#include <linux/printk.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
-#include <linux/slab.h>
#include <linux/timer.h>
__thread struct task_struct *current;
@@ -39,7 +37,7 @@ void schedule(void)
rcu_quiescent_state();
- while ((v = current->state) != TASK_RUNNING)
+ while ((v = READ_ONCE(current->state)) != TASK_RUNNING)
futex(&current->state, FUTEX_WAIT|FUTEX_PRIVATE_FLAG,
v, NULL, NULL, 0);
}
@@ -83,7 +81,7 @@ long schedule_timeout(long timeout)
* that will tell you if something is gone wrong and where.
*/
if (timeout < 0) {
- printk(KERN_ERR "schedule_timeout: wrong timeout "
+ fprintf(stderr, "schedule_timeout: wrong timeout "
"value %lx\n", timeout);
current->state = TASK_RUNNING;
goto out;
@@ -108,8 +106,6 @@ static void sched_init(void)
{
struct task_struct *p = malloc(sizeof(*p));
- mlockall(MCL_CURRENT|MCL_FUTURE);
-
memset(p, 0, sizeof(*p));
p->state = TASK_RUNNING;
@@ -122,7 +118,7 @@ static void sched_init(void)
rcu_register_thread();
}
-#ifndef __NR_getrandom
+#ifndef SYS_getrandom
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
diff --git a/linux/semaphore.c b/linux/semaphore.c
new file mode 100644
index 00000000..b7d4b517
--- /dev/null
+++ b/linux/semaphore.c
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2008 Intel Corporation
+ * Author: Matthew Wilcox <willy@linux.intel.com>
+ *
+ * This file implements counting semaphores.
+ * A counting semaphore may be acquired 'n' times before sleeping.
+ * See mutex.c for single-acquisition sleeping locks, which enforce
+ * rules that allow code to be debugged more easily.
+ */
+
+/*
+ * Some notes on the implementation:
+ *
+ * The spinlock controls access to the other members of the semaphore.
+ * down_trylock() and up() can be called from interrupt context, so we
+ * have to disable interrupts when taking the lock. It turns out various
+ * parts of the kernel expect to be able to use down() on a semaphore in
+ * interrupt context when they know it will succeed, so we have to use
+ * irqsave variants for down(), down_interruptible() and down_killable()
+ * too.
+ *
+ * The ->count variable represents how many more tasks can acquire this
+ * semaphore. If it's zero, there may be tasks waiting on the wait_list.
+ */
+
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/semaphore.h>
+#include <linux/spinlock.h>
+
+static noinline void __down(struct semaphore *sem);
+static noinline int __down_timeout(struct semaphore *sem, long timeout);
+static noinline void __up(struct semaphore *sem);
+
+/**
+ * down - acquire the semaphore
+ * @sem: the semaphore to be acquired
+ *
+ * Acquires the semaphore. If no more tasks are allowed to acquire the
+ * semaphore, calling this function will put the task to sleep until the
+ * semaphore is released.
+ *
+ * Use of this function is deprecated, please use down_interruptible() or
+ * down_killable() instead.
+ */
+void down(struct semaphore *sem)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->lock, flags);
+ if (likely(sem->count > 0))
+ sem->count--;
+ else
+ __down(sem);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
+}
+EXPORT_SYMBOL(down);
+
+/**
+ * down_trylock - try to acquire the semaphore, without waiting
+ * @sem: the semaphore to be acquired
+ *
+ * Try to acquire the semaphore atomically. Returns 0 if the semaphore has
+ * been acquired successfully or 1 if it cannot be acquired.
+ *
+ * NOTE: This return value is inverted from both spin_trylock and
+ * mutex_trylock! Be careful about this when converting code.
+ *
+ * Unlike mutex_trylock, this function can be used from interrupt context,
+ * and the semaphore can be released by any task or interrupt.
+ */
+int down_trylock(struct semaphore *sem)
+{
+ unsigned long flags;
+ int count;
+
+ raw_spin_lock_irqsave(&sem->lock, flags);
+ count = sem->count - 1;
+ if (likely(count >= 0))
+ sem->count = count;
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
+
+ return (count < 0);
+}
+EXPORT_SYMBOL(down_trylock);
+
+/**
+ * down_timeout - acquire the semaphore within a specified time
+ * @sem: the semaphore to be acquired
+ * @timeout: how long to wait before failing
+ *
+ * Attempts to acquire the semaphore. If no more tasks are allowed to
+ * acquire the semaphore, calling this function will put the task to sleep.
+ * If the semaphore is not released within the specified number of jiffies,
+ * this function returns -ETIME. It returns 0 if the semaphore was acquired.
+ */
+int down_timeout(struct semaphore *sem, long timeout)
+{
+ unsigned long flags;
+ int result = 0;
+
+ raw_spin_lock_irqsave(&sem->lock, flags);
+ if (likely(sem->count > 0))
+ sem->count--;
+ else
+ result = __down_timeout(sem, timeout);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
+
+ return result;
+}
+EXPORT_SYMBOL(down_timeout);
+
+/**
+ * up - release the semaphore
+ * @sem: the semaphore to release
+ *
+ * Release the semaphore. Unlike mutexes, up() may be called from any
+ * context and even by tasks which have never called down().
+ */
+void up(struct semaphore *sem)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->lock, flags);
+ if (likely(list_empty(&sem->wait_list)))
+ sem->count++;
+ else
+ __up(sem);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
+}
+EXPORT_SYMBOL(up);
+
+/* Functions for the contended case */
+
+struct semaphore_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ bool up;
+};
+
+/*
+ * Because this function is inlined, the 'state' parameter will be
+ * constant, and thus optimised away by the compiler. Likewise the
+ * 'timeout' parameter for the cases without timeouts.
+ */
+static inline int __sched __down_common(struct semaphore *sem, long state,
+ long timeout)
+{
+ struct semaphore_waiter waiter;
+
+ list_add_tail(&waiter.list, &sem->wait_list);
+ waiter.task = current;
+ waiter.up = false;
+
+ for (;;) {
+ if (unlikely(timeout <= 0))
+ goto timed_out;
+ __set_current_state(state);
+ raw_spin_unlock_irq(&sem->lock);
+ timeout = schedule_timeout(timeout);
+ raw_spin_lock_irq(&sem->lock);
+ if (waiter.up)
+ return 0;
+ }
+
+ timed_out:
+ list_del(&waiter.list);
+ return -ETIME;
+}
+
+static noinline void __sched __down(struct semaphore *sem)
+{
+ __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+}
+
+static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
+{
+ return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
+}
+
+static noinline void __sched __up(struct semaphore *sem)
+{
+ struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
+ struct semaphore_waiter, list);
+ list_del(&waiter->list);
+ waiter->up = true;
+ wake_up_process(waiter->task);
+}
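A usage sketch for the API above, modelling a hypothetical fixed-size resource pool (sema_init() is the usual initialiser from the semaphore header; the pool itself is not part of this file):

static struct semaphore pool_sem;

static void pool_init(unsigned int nr_slots)
{
	sema_init(&pool_sem, nr_slots);	/* up to nr_slots concurrent holders */
}

static void use_slot(void)
{
	down(&pool_sem);		/* sleeps until a slot is free */
	/* ... use the resource ... */
	up(&pool_sem);			/* may wake the first waiter */
}

static bool try_use_slot(void)
{
	if (down_trylock(&pool_sem))	/* note: returns 1 on failure */
		return false;
	/* ... safe even from interrupt context ... */
	up(&pool_sem);
	return true;
}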
diff --git a/linux/seq_buf.c b/linux/seq_buf.c
new file mode 100644
index 00000000..cf8709ad
--- /dev/null
+++ b/linux/seq_buf.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * seq_buf.c
+ *
+ * Copyright (C) 2014 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * The seq_buf is a handy tool that allows you to pass a descriptor around
+ * to a buffer that other functions can write to. It is similar to the
+ * seq_file functionality but has some differences.
+ *
+ * To use it, the seq_buf must be initialized with seq_buf_init().
+ * This will set up the counters within the descriptor. You can call
+ * seq_buf_init() more than once to reset the seq_buf to start
+ * from scratch.
+ */
+#include <linux/seq_buf.h>
+#include <stdio.h>
+
+/**
+ * seq_buf_can_fit - can the new data fit in the current buffer?
+ * @s: the seq_buf descriptor
+ * @len: The length to see if it can fit in the current buffer
+ *
+ * Returns true if there's enough unused space in the seq_buf buffer
+ * to fit the amount of new data according to @len.
+ */
+static bool seq_buf_can_fit(struct seq_buf *s, size_t len)
+{
+ return s->len + len <= s->size;
+}
+
+/**
+ * seq_buf_vprintf - sequence printing of information.
+ * @s: seq_buf descriptor
+ * @fmt: printf format string
+ * @args: va_list of arguments from a printf() type function
+ *
+ * Writes a vsnprintf() format into the sequence buffer.
+ *
+ * Returns zero on success, -1 on overflow.
+ */
+int seq_buf_vprintf(struct seq_buf *s, const char *fmt, va_list args)
+{
+ int len;
+
+ WARN_ON(s->size == 0);
+
+ if (s->len < s->size) {
+ len = vsnprintf(s->buffer + s->len, s->size - s->len, fmt, args);
+ if (s->len + len < s->size) {
+ s->len += len;
+ return 0;
+ }
+ }
+ seq_buf_set_overflow(s);
+ return -1;
+}
+
+/**
+ * seq_buf_printf - sequence printing of information
+ * @s: seq_buf descriptor
+ * @fmt: printf format string
+ *
+ * Writes a printf() format into the sequence buffer.
+ *
+ * Returns zero on success, -1 on overflow.
+ */
+int seq_buf_printf(struct seq_buf *s, const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = seq_buf_vprintf(s, fmt, ap);
+ va_end(ap);
+
+ return ret;
+}
+
+/**
+ * seq_buf_puts - sequence printing of simple string
+ * @s: seq_buf descriptor
+ * @str: simple string to record
+ *
+ * Copy a simple string into the sequence buffer.
+ *
+ * Returns zero on success, -1 on overflow
+ */
+int seq_buf_puts(struct seq_buf *s, const char *str)
+{
+ size_t len = strlen(str);
+
+ WARN_ON(s->size == 0);
+
+ /* Add 1 to len for the trailing null byte which must be there */
+ len += 1;
+
+ if (seq_buf_can_fit(s, len)) {
+ memcpy(s->buffer + s->len, str, len);
+ /* Don't count the trailing null byte against the capacity */
+ s->len += len - 1;
+ return 0;
+ }
+ seq_buf_set_overflow(s);
+ return -1;
+}
+
+/**
+ * seq_buf_putc - sequence printing of simple character
+ * @s: seq_buf descriptor
+ * @c: simple character to record
+ *
+ * Copy a single character into the sequence buffer.
+ *
+ * Returns zero on success, -1 on overflow
+ */
+int seq_buf_putc(struct seq_buf *s, unsigned char c)
+{
+ WARN_ON(s->size == 0);
+
+ if (seq_buf_can_fit(s, 1)) {
+ s->buffer[s->len++] = c;
+ return 0;
+ }
+ seq_buf_set_overflow(s);
+ return -1;
+}
+
+/**
+ * seq_buf_putmem - write raw data into the sequence buffer
+ * @s: seq_buf descriptor
+ * @mem: The raw memory to copy into the buffer
+ * @len: The length of the raw memory to copy (in bytes)
+ *
+ * There may be cases where raw memory needs to be written into the
+ * buffer and a strcpy() would not work. Using this function allows
+ * for such cases.
+ *
+ * Returns zero on success, -1 on overflow
+ */
+int seq_buf_putmem(struct seq_buf *s, const void *mem, unsigned int len)
+{
+ WARN_ON(s->size == 0);
+
+ if (seq_buf_can_fit(s, len)) {
+ memcpy(s->buffer + s->len, mem, len);
+ s->len += len;
+ return 0;
+ }
+ seq_buf_set_overflow(s);
+ return -1;
+}
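A short usage sketch of the helpers above, assuming the seq_buf_init(), seq_buf_used() and seq_buf_has_overflowed() helpers from linux/seq_buf.h (declared in the header, not in this file):

static void report_status(unsigned int nr_errors)
{
	char buf[128];
	struct seq_buf s;

	seq_buf_init(&s, buf, sizeof(buf));

	seq_buf_puts(&s, "status: ");
	seq_buf_printf(&s, "%u errors", nr_errors);
	seq_buf_putc(&s, '\n');

	if (seq_buf_has_overflowed(&s))
		return;		/* output was truncated, drop it */

	fwrite(buf, 1, seq_buf_used(&s), stderr);	/* userspace shim: plain stdio */
}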
diff --git a/linux/shrinker.c b/linux/shrinker.c
index b8fc2464..ca34ebc7 100644
--- a/linux/shrinker.c
+++ b/linux/shrinker.c
@@ -1,7 +1,10 @@
#include <stdio.h>
+#include <unistd.h>
+#include <linux/kthread.h>
#include <linux/list.h>
+#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/shrinker.h>
@@ -10,7 +13,22 @@
static LIST_HEAD(shrinker_list);
static DEFINE_MUTEX(shrinker_lock);
-int register_shrinker(struct shrinker *shrinker)
+void shrinker_free(struct shrinker *s)
+{
+ if (s->list.next) {
+ mutex_lock(&shrinker_lock);
+ list_del(&s->list);
+ mutex_unlock(&shrinker_lock);
+ }
+ free(s);
+}
+
+struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)
+{
+ return calloc(sizeof(struct shrinker), 1);
+}
+
+int shrinker_register(struct shrinker *shrinker)
{
mutex_lock(&shrinker_lock);
list_add_tail(&shrinker->list, &shrinker_list);
@@ -18,70 +36,105 @@ int register_shrinker(struct shrinker *shrinker)
return 0;
}
-void unregister_shrinker(struct shrinker *shrinker)
+static void run_shrinkers_allocation_failed(gfp_t gfp_mask)
{
- mutex_lock(&shrinker_lock);
- list_del(&shrinker->list);
- mutex_unlock(&shrinker_lock);
-}
+ struct shrinker *shrinker;
-struct meminfo {
- u64 total;
- u64 available;
+ mutex_lock(&shrinker_lock);
+ list_for_each_entry(shrinker, &shrinker_list, list) {
+ struct shrink_control sc = { .gfp_mask = gfp_mask, };
-};
+ unsigned long have = shrinker->count_objects(shrinker, &sc);
-static u64 parse_meminfo_line(const char *line)
-{
- u64 v;
+ sc.nr_to_scan = have / 8;
- if (sscanf(line, " %llu kB", &v) < 1)
- die("sscanf error");
- return v << 10;
+ shrinker->scan_objects(shrinker, &sc);
+ }
+ mutex_unlock(&shrinker_lock);
}
-static struct meminfo read_meminfo(void)
+void run_shrinkers(gfp_t gfp_mask, bool allocation_failed)
{
- struct meminfo ret = { 0 };
- size_t len, n = 0;
- char *line = NULL;
- const char *v;
- FILE *f;
-
- f = fopen("/proc/meminfo", "r");
- if (!f)
- die("error opening /proc/meminfo: %m");
-
- while ((len = getline(&line, &n, f)) != -1) {
- if ((v = strcmp_prefix(line, "MemTotal:")))
- ret.total = parse_meminfo_line(v);
-
- if ((v = strcmp_prefix(line, "MemAvailable:")))
- ret.available = parse_meminfo_line(v);
- }
+ struct shrinker *shrinker;
+ struct sysinfo info;
+ s64 want_shrink;
- fclose(f);
- free(line);
+ if (!(gfp_mask & GFP_KERNEL))
+ return;
- return ret;
-}
+ /* Fast out if there are no shrinkers to run. */
+ if (list_empty(&shrinker_list))
+ return;
-void run_shrinkers(void)
-{
- struct shrinker *shrinker;
- struct meminfo info = read_meminfo();
- s64 want_shrink = (info.total >> 2) - info.available;
+ if (allocation_failed) {
+ run_shrinkers_allocation_failed(gfp_mask);
+ return;
+ }
+ si_meminfo(&info);
+
+ /* Aim for 6% of physical RAM free without anything in swap */
+ want_shrink = (info.totalram >> 4) - info.freeram
+ + info.totalswap - info.freeswap;
if (want_shrink <= 0)
return;
mutex_lock(&shrinker_lock);
list_for_each_entry(shrinker, &shrinker_list, list) {
struct shrink_control sc = {
- .nr_to_scan = want_shrink >> PAGE_SHIFT
+ .gfp_mask = gfp_mask,
+ .nr_to_scan = want_shrink >> PAGE_SHIFT
};
shrinker->scan_objects(shrinker, &sc);
}
mutex_unlock(&shrinker_lock);
}
+
+static int shrinker_thread(void *arg)
+{
+ while (!kthread_should_stop()) {
+ struct timespec to;
+ int v;
+
+ clock_gettime(CLOCK_MONOTONIC, &to);
+ to.tv_sec += 1;
+ __set_current_state(TASK_INTERRUPTIBLE);
+ errno = 0;
+ while ((v = READ_ONCE(current->state)) != TASK_RUNNING &&
+ errno != ETIMEDOUT)
+ futex(&current->state, FUTEX_WAIT_BITSET|FUTEX_PRIVATE_FLAG,
+ v, &to, NULL, (uint32_t)~0);
+ if (kthread_should_stop())
+ break;
+ if (v != TASK_RUNNING)
+ __set_current_state(TASK_RUNNING);
+ run_shrinkers(GFP_KERNEL, false);
+ }
+
+ return 0;
+}
+
+struct task_struct *shrinker_task;
+
+__attribute__((constructor(103)))
+static void shrinker_thread_init(void)
+{
+ shrinker_task = kthread_run(shrinker_thread, NULL, "shrinkers");
+ BUG_ON(IS_ERR(shrinker_task));
+}
+
+#if 0
+/*
+ * We seem to be hitting a rare segfault when shutting down the shrinker thread.
+ * Disabling this is going to cause some harmless warnings about memory leaks:
+ */
+__attribute__((destructor(103)))
+static void shrinker_thread_exit(void)
+{
+ int ret = kthread_stop(shrinker_task);
+ BUG_ON(ret);
+
+ shrinker_task = NULL;
+}
+#endif
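A registration sketch against the shim interface above (shrinker_alloc()/shrinker_register()); the cache type is hypothetical, and the private_data field is assumed to exist in this shim's struct shrinker, as it does in recent kernels:

struct my_cache {
	struct shrinker	*shrink;
	atomic_long_t	nr_cached;
};

static unsigned long my_count(struct shrinker *s, struct shrink_control *sc)
{
	struct my_cache *c = s->private_data;		/* assumed field */

	return atomic_long_read(&c->nr_cached);
}

static unsigned long my_scan(struct shrinker *s, struct shrink_control *sc)
{
	/* evict up to sc->nr_to_scan entries, return how many were freed */
	return 0;
}

static int my_cache_init(struct my_cache *c)
{
	c->shrink = shrinker_alloc(0, "my_cache");
	if (!c->shrink)
		return -ENOMEM;

	c->shrink->count_objects = my_count;
	c->shrink->scan_objects  = my_scan;
	c->shrink->private_data  = c;

	return shrinker_register(c->shrink);
}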
diff --git a/linux/siphash.c b/linux/siphash.c
new file mode 100644
index 00000000..f8dbecea
--- /dev/null
+++ b/linux/siphash.c
@@ -0,0 +1,552 @@
+/* Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.
+ *
+ * SipHash: a fast short-input PRF
+ * https://131002.net/siphash/
+ *
+ * This implementation is specifically for SipHash2-4 for a secure PRF
+ * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for
+ * hashtables.
+ */
+
+#include <linux/siphash.h>
+#include <linux/bitops.h>
+#include <asm/unaligned.h>
+
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+#include <linux/dcache.h>
+#include <asm/word-at-a-time.h>
+#endif
+
+#define SIPROUND \
+ do { \
+ v0 += v1; v1 = rol64(v1, 13); v1 ^= v0; v0 = rol64(v0, 32); \
+ v2 += v3; v3 = rol64(v3, 16); v3 ^= v2; \
+ v0 += v3; v3 = rol64(v3, 21); v3 ^= v0; \
+ v2 += v1; v1 = rol64(v1, 17); v1 ^= v2; v2 = rol64(v2, 32); \
+ } while (0)
+
+#define PREAMBLE(len) \
+ u64 v0 = 0x736f6d6570736575ULL; \
+ u64 v1 = 0x646f72616e646f6dULL; \
+ u64 v2 = 0x6c7967656e657261ULL; \
+ u64 v3 = 0x7465646279746573ULL; \
+ u64 b = ((u64)(len)) << 56; \
+ v3 ^= key->key[1]; \
+ v2 ^= key->key[0]; \
+ v1 ^= key->key[1]; \
+ v0 ^= key->key[0];
+
+#define POSTAMBLE \
+ v3 ^= b; \
+ SIPROUND; \
+ SIPROUND; \
+ v0 ^= b; \
+ v2 ^= 0xff; \
+ SIPROUND; \
+ SIPROUND; \
+ SIPROUND; \
+ SIPROUND; \
+ return (v0 ^ v1) ^ (v2 ^ v3);
+
+u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key)
+{
+ const u8 *end = data + len - (len % sizeof(u64));
+ const u8 left = len & (sizeof(u64) - 1);
+ u64 m;
+ PREAMBLE(len)
+ for (; data != end; data += sizeof(u64)) {
+ m = le64_to_cpup(data);
+ v3 ^= m;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= m;
+ }
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+ if (left)
+ b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+ bytemask_from_count(left)));
+#else
+ switch (left) {
+ case 7: b |= ((u64)end[6]) << 48; fallthrough;
+ case 6: b |= ((u64)end[5]) << 40; fallthrough;
+ case 5: b |= ((u64)end[4]) << 32; fallthrough;
+ case 4: b |= le32_to_cpup(data); break;
+ case 3: b |= ((u64)end[2]) << 16; fallthrough;
+ case 2: b |= le16_to_cpup(data); break;
+ case 1: b |= end[0];
+ }
+#endif
+ POSTAMBLE
+}
+EXPORT_SYMBOL(__siphash_aligned);
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key)
+{
+ const u8 *end = data + len - (len % sizeof(u64));
+ const u8 left = len & (sizeof(u64) - 1);
+ u64 m;
+ PREAMBLE(len)
+ for (; data != end; data += sizeof(u64)) {
+ m = get_unaligned_le64(data);
+ v3 ^= m;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= m;
+ }
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+ if (left)
+ b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+ bytemask_from_count(left)));
+#else
+ switch (left) {
+ case 7: b |= ((u64)end[6]) << 48; fallthrough;
+ case 6: b |= ((u64)end[5]) << 40; fallthrough;
+ case 5: b |= ((u64)end[4]) << 32; fallthrough;
+ case 4: b |= get_unaligned_le32(end); break;
+ case 3: b |= ((u64)end[2]) << 16; fallthrough;
+ case 2: b |= get_unaligned_le16(end); break;
+ case 1: b |= end[0];
+ }
+#endif
+ POSTAMBLE
+}
+EXPORT_SYMBOL(__siphash_unaligned);
+#endif
+
+/**
+ * siphash_1u64 - compute 64-bit siphash PRF value of a u64
+ * @first: first u64
+ * @key: the siphash key
+ */
+u64 siphash_1u64(const u64 first, const siphash_key_t *key)
+{
+ PREAMBLE(8)
+ v3 ^= first;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= first;
+ POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_1u64);
+
+/**
+ * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64
+ * @first: first u64
+ * @second: second u64
+ * @key: the siphash key
+ */
+u64 siphash_2u64(const u64 first, const u64 second, const siphash_key_t *key)
+{
+ PREAMBLE(16)
+ v3 ^= first;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= first;
+ v3 ^= second;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= second;
+ POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_2u64);
+
+/**
+ * siphash_3u64 - compute 64-bit siphash PRF value of 3 u64
+ * @first: first u64
+ * @second: second u64
+ * @third: third u64
+ * @key: the siphash key
+ */
+u64 siphash_3u64(const u64 first, const u64 second, const u64 third,
+ const siphash_key_t *key)
+{
+ PREAMBLE(24)
+ v3 ^= first;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= first;
+ v3 ^= second;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= second;
+ v3 ^= third;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= third;
+ POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_3u64);
+
+/**
+ * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64
+ * @first: first u64
+ * @second: second u64
+ * @third: third u64
+ * @forth: forth u64
+ * @key: the siphash key
+ */
+u64 siphash_4u64(const u64 first, const u64 second, const u64 third,
+ const u64 forth, const siphash_key_t *key)
+{
+ PREAMBLE(32)
+ v3 ^= first;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= first;
+ v3 ^= second;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= second;
+ v3 ^= third;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= third;
+ v3 ^= forth;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= forth;
+ POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_4u64);
+
+u64 siphash_1u32(const u32 first, const siphash_key_t *key)
+{
+ PREAMBLE(4)
+ b |= first;
+ POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_1u32);
+
+u64 siphash_3u32(const u32 first, const u32 second, const u32 third,
+ const siphash_key_t *key)
+{
+ u64 combined = (u64)second << 32 | first;
+ PREAMBLE(12)
+ v3 ^= combined;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= combined;
+ b |= third;
+ POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_3u32);
+
+#if BITS_PER_LONG == 64
+/* Note that on 64-bit, we make HalfSipHash1-3 actually be SipHash1-3, for
+ * performance reasons. On 32-bit, below, we actually implement HalfSipHash1-3.
+ */
+
+#define HSIPROUND SIPROUND
+#define HPREAMBLE(len) PREAMBLE(len)
+#define HPOSTAMBLE \
+ v3 ^= b; \
+ HSIPROUND; \
+ v0 ^= b; \
+ v2 ^= 0xff; \
+ HSIPROUND; \
+ HSIPROUND; \
+ HSIPROUND; \
+ return (v0 ^ v1) ^ (v2 ^ v3);
+
+u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key)
+{
+ const u8 *end = data + len - (len % sizeof(u64));
+ const u8 left = len & (sizeof(u64) - 1);
+ u64 m;
+ HPREAMBLE(len)
+ for (; data != end; data += sizeof(u64)) {
+ m = le64_to_cpup(data);
+ v3 ^= m;
+ HSIPROUND;
+ v0 ^= m;
+ }
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+ if (left)
+ b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+ bytemask_from_count(left)));
+#else
+ switch (left) {
+ case 7: b |= ((u64)end[6]) << 48; fallthrough;
+ case 6: b |= ((u64)end[5]) << 40; fallthrough;
+ case 5: b |= ((u64)end[4]) << 32; fallthrough;
+ case 4: b |= le32_to_cpup(data); break;
+ case 3: b |= ((u64)end[2]) << 16; fallthrough;
+ case 2: b |= le16_to_cpup(data); break;
+ case 1: b |= end[0];
+ }
+#endif
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_aligned);
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u32 __hsiphash_unaligned(const void *data, size_t len,
+ const hsiphash_key_t *key)
+{
+ const u8 *end = data + len - (len % sizeof(u64));
+ const u8 left = len & (sizeof(u64) - 1);
+ u64 m;
+ HPREAMBLE(len)
+ for (; data != end; data += sizeof(u64)) {
+ m = get_unaligned_le64(data);
+ v3 ^= m;
+ HSIPROUND;
+ v0 ^= m;
+ }
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+ if (left)
+ b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+ bytemask_from_count(left)));
+#else
+ switch (left) {
+ case 7: b |= ((u64)end[6]) << 48; fallthrough;
+ case 6: b |= ((u64)end[5]) << 40; fallthrough;
+ case 5: b |= ((u64)end[4]) << 32; fallthrough;
+ case 4: b |= get_unaligned_le32(end); break;
+ case 3: b |= ((u64)end[2]) << 16; fallthrough;
+ case 2: b |= get_unaligned_le16(end); break;
+ case 1: b |= end[0];
+ }
+#endif
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_unaligned);
+#endif
+
+/**
+ * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32
+ * @first: first u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key)
+{
+ HPREAMBLE(4)
+ b |= first;
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_1u32);
+
+/**
+ * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32
+ * @first: first u32
+ * @second: second u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key)
+{
+ u64 combined = (u64)second << 32 | first;
+ HPREAMBLE(8)
+ v3 ^= combined;
+ HSIPROUND;
+ v0 ^= combined;
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_2u32);
+
+/**
+ * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third,
+ const hsiphash_key_t *key)
+{
+ u64 combined = (u64)second << 32 | first;
+ HPREAMBLE(12)
+ v3 ^= combined;
+ HSIPROUND;
+ v0 ^= combined;
+ b |= third;
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_3u32);
+
+/**
+ * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @forth: forth u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third,
+ const u32 forth, const hsiphash_key_t *key)
+{
+ u64 combined = (u64)second << 32 | first;
+ HPREAMBLE(16)
+ v3 ^= combined;
+ HSIPROUND;
+ v0 ^= combined;
+ combined = (u64)forth << 32 | third;
+ v3 ^= combined;
+ HSIPROUND;
+ v0 ^= combined;
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_4u32);
+#else
+#define HSIPROUND \
+ do { \
+ v0 += v1; v1 = rol32(v1, 5); v1 ^= v0; v0 = rol32(v0, 16); \
+ v2 += v3; v3 = rol32(v3, 8); v3 ^= v2; \
+ v0 += v3; v3 = rol32(v3, 7); v3 ^= v0; \
+ v2 += v1; v1 = rol32(v1, 13); v1 ^= v2; v2 = rol32(v2, 16); \
+ } while (0)
+
+#define HPREAMBLE(len) \
+ u32 v0 = 0; \
+ u32 v1 = 0; \
+ u32 v2 = 0x6c796765U; \
+ u32 v3 = 0x74656462U; \
+ u32 b = ((u32)(len)) << 24; \
+ v3 ^= key->key[1]; \
+ v2 ^= key->key[0]; \
+ v1 ^= key->key[1]; \
+ v0 ^= key->key[0];
+
+#define HPOSTAMBLE \
+ v3 ^= b; \
+ HSIPROUND; \
+ v0 ^= b; \
+ v2 ^= 0xff; \
+ HSIPROUND; \
+ HSIPROUND; \
+ HSIPROUND; \
+ return v1 ^ v3;
+
+u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key)
+{
+ const u8 *end = data + len - (len % sizeof(u32));
+ const u8 left = len & (sizeof(u32) - 1);
+ u32 m;
+ HPREAMBLE(len)
+ for (; data != end; data += sizeof(u32)) {
+ m = le32_to_cpup(data);
+ v3 ^= m;
+ HSIPROUND;
+ v0 ^= m;
+ }
+ switch (left) {
+ case 3: b |= ((u32)end[2]) << 16; fallthrough;
+ case 2: b |= le16_to_cpup(data); break;
+ case 1: b |= end[0];
+ }
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_aligned);
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u32 __hsiphash_unaligned(const void *data, size_t len,
+ const hsiphash_key_t *key)
+{
+ const u8 *end = data + len - (len % sizeof(u32));
+ const u8 left = len & (sizeof(u32) - 1);
+ u32 m;
+ HPREAMBLE(len)
+ for (; data != end; data += sizeof(u32)) {
+ m = get_unaligned_le32(data);
+ v3 ^= m;
+ HSIPROUND;
+ v0 ^= m;
+ }
+ switch (left) {
+ case 3: b |= ((u32)end[2]) << 16; fallthrough;
+ case 2: b |= get_unaligned_le16(end); break;
+ case 1: b |= end[0];
+ }
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_unaligned);
+#endif
+
+/**
+ * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32
+ * @first: first u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key)
+{
+ HPREAMBLE(4)
+ v3 ^= first;
+ HSIPROUND;
+ v0 ^= first;
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_1u32);
+
+/**
+ * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32
+ * @first: first u32
+ * @second: second u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key)
+{
+ HPREAMBLE(8)
+ v3 ^= first;
+ HSIPROUND;
+ v0 ^= first;
+ v3 ^= second;
+ HSIPROUND;
+ v0 ^= second;
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_2u32);
+
+/**
+ * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third,
+ const hsiphash_key_t *key)
+{
+ HPREAMBLE(12)
+ v3 ^= first;
+ HSIPROUND;
+ v0 ^= first;
+ v3 ^= second;
+ HSIPROUND;
+ v0 ^= second;
+ v3 ^= third;
+ HSIPROUND;
+ v0 ^= third;
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_3u32);
+
+/**
+ * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @forth: forth u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third,
+ const u32 forth, const hsiphash_key_t *key)
+{
+ HPREAMBLE(16)
+ v3 ^= first;
+ HSIPROUND;
+ v0 ^= first;
+ v3 ^= second;
+ HSIPROUND;
+ v0 ^= second;
+ v3 ^= third;
+ HSIPROUND;
+ v0 ^= third;
+ v3 ^= forth;
+ HSIPROUND;
+ v0 ^= forth;
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_4u32);
+#endif
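A usage sketch for the interface above; the siphash() dispatcher is the inline from linux/siphash.h, and get_random_bytes() is assumed to be provided by the shim headers:

static siphash_key_t hash_key;

static void hash_key_init(void)
{
	get_random_bytes(&hash_key, sizeof(hash_key));	/* secret 128-bit key */
}

static u64 hash_buf(const void *buf, size_t len)
{
	/* picks __siphash_aligned()/__siphash_unaligned() as appropriate */
	return siphash(buf, len, &hash_key);
}

static u64 hash_pair(u64 a, u64 b)
{
	return siphash_2u64(a, b, &hash_key);	/* fixed-width fast path */
}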
diff --git a/linux/sort.c b/linux/sort.c
index 15e8d117..b01a9d91 100644
--- a/linux/sort.c
+++ b/linux/sort.c
@@ -1,143 +1,280 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * A fast, small, non-recursive O(nlog n) sort for the Linux kernel
+ * A fast, small, non-recursive O(n log n) sort for the Linux kernel
*
- * Jan 23 2005 Matt Mackall <mpm@selenic.com>
+ * This performs n*log2(n) + 0.37*n + o(n) comparisons on average,
+ * and 1.5*n*log2(n) + O(n) in the (very contrived) worst case.
+ *
+ * Glibc qsort() manages n*log2(n) - 1.26*n for random inputs (1.63*n
+ * better) at the expense of stack usage and much larger code to avoid
+ * quicksort's O(n^2) worst case.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/export.h>
-#include <linux/kernel.h>
#include <linux/sort.h>
-static int alignment_ok(const void *base, int align)
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required alignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be aligned too if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
{
- return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
- ((unsigned long)base & (align - 1)) == 0;
-}
+ unsigned char lsbits = (unsigned char)size;
-static void u32_swap(void *a, void *b, int size)
-{
- u32 t = *(u32 *)a;
- *(u32 *)a = *(u32 *)b;
- *(u32 *)b = t;
+ (void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+ return (lsbits & (align - 1)) == 0;
}
-static void u64_swap(void *a, void *b, int size)
+/**
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory. This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is still valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static void swap_words_32(void *a, void *b, size_t n)
{
- u64 t = *(u64 *)a;
- *(u64 *)a = *(u64 *)b;
- *(u64 *)b = t;
+ do {
+ u32 t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+ } while (n);
}
-static void generic_swap(void *a, void *b, int size)
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory. This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible. If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI). Are there any cases the kernel needs to worry about?
+ */
+static void swap_words_64(void *a, void *b, size_t n)
{
- char t;
-
do {
- t = *(char *)a;
- *(char *)a++ = *(char *)b;
- *(char *)b++ = t;
- } while (--size > 0);
+#ifdef CONFIG_64BIT
+ u64 t = *(u64 *)(a + (n -= 8));
+ *(u64 *)(a + n) = *(u64 *)(b + n);
+ *(u64 *)(b + n) = t;
+#else
+ /* Use two 32-bit transfers to avoid base+index+4 addressing */
+ u32 t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+
+ t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+#endif
+ } while (n);
}
/**
- * sort - sort an array of elements
- * @base: pointer to data to sort
- * @num: number of elements
- * @size: size of each element
- * @cmp_func: pointer to comparison function
- * @swap_func: pointer to swap function or NULL
+ * swap_bytes - swap two elements a byte at a time
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size
*
- * This function does a heapsort on the given array. You may provide a
- * swap_func function optimized to your element type.
- *
- * Sorting time is O(n log n) both on average and worst-case. While
- * qsort is about 20% faster on average, it suffers from exploitable
- * O(n*n) worst-case behavior and extra memory requirements that make
- * it less suitable for kernel use.
+ * This is the fallback if alignment doesn't allow using larger chunks.
*/
-
-void sort(void *base, size_t num, size_t size,
- int (*cmp_func)(const void *, const void *),
- void (*swap_func)(void *, void *, int size))
+static void swap_bytes(void *a, void *b, size_t n)
{
- /* pre-scale counters for performance */
- int i = (num/2 - 1) * size, n = num * size, c, r;
+ do {
+ char t = ((char *)a)[--n];
+ ((char *)a)[n] = ((char *)b)[n];
+ ((char *)b)[n] = t;
+ } while (n);
+}
- if (!swap_func) {
- if (size == 4 && alignment_ok(base, 4))
- swap_func = u32_swap;
- else if (size == 8 && alignment_ok(base, 8))
- swap_func = u64_swap;
- else
- swap_func = generic_swap;
- }
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define SWAP_WORDS_64 (swap_r_func_t)0
+#define SWAP_WORDS_32 (swap_r_func_t)1
+#define SWAP_BYTES (swap_r_func_t)2
+#define SWAP_WRAPPER (swap_r_func_t)3
- /* heapify */
- for ( ; i >= 0; i -= size) {
- for (r = i; r * 2 + size < n; r = c) {
- c = r * 2 + size;
- if (c < n - size &&
- cmp_func(base + c, base + c + size) < 0)
- c += size;
- if (cmp_func(base + r, base + c) >= 0)
- break;
- swap_func(base + r, base + c, size);
- }
- }
+struct wrapper {
+ cmp_func_t cmp;
+ swap_func_t swap;
+};
- /* sort */
- for (i = n - size; i > 0; i -= size) {
- swap_func(base, base + i, size);
- for (r = 0; r * 2 + size < i; r = c) {
- c = r * 2 + size;
- if (c < i - size &&
- cmp_func(base + c, base + c + size) < 0)
- c += size;
- if (cmp_func(base + r, base + c) >= 0)
- break;
- swap_func(base + r, base + c, size);
- }
+/*
+ * The function pointer is last to make tail calls most efficient if the
+ * compiler decides not to inline this function.
+ */
+static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
+{
+ if (swap_func == SWAP_WRAPPER) {
+ ((const struct wrapper *)priv)->swap(a, b, (int)size);
+ return;
}
+
+ if (swap_func == SWAP_WORDS_64)
+ swap_words_64(a, b, size);
+ else if (swap_func == SWAP_WORDS_32)
+ swap_words_32(a, b, size);
+ else if (swap_func == SWAP_BYTES)
+ swap_bytes(a, b, size);
+ else
+ swap_func(a, b, (int)size, priv);
}
-EXPORT_SYMBOL(sort);
+#define _CMP_WRAPPER ((cmp_r_func_t)0L)
-#if 0
-#include <linux/slab.h>
-/* a simple boot-time regression test */
+static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv)
+{
+ if (cmp == _CMP_WRAPPER)
+ return ((const struct wrapper *)priv)->cmp(a, b);
+ return cmp(a, b, priv);
+}
-int cmpint(const void *a, const void *b)
+/**
+ * parent - given the offset of the child, find the offset of the parent.
+ * @i: the offset of the heap element whose parent is sought. Non-zero.
+ * @lsbit: a precomputed 1-bit mask, equal to "size & -size"
+ * @size: size of each element
+ *
+ * In terms of array indexes, the parent of element j = @i/@size is simply
+ * (j-1)/2. But when working in byte offsets, we can't use implicit
+ * truncation of integer divides.
+ *
+ * Fortunately, we only need one bit of the quotient, not the full divide.
+ * @size has a least significant bit. That bit will be clear if @i is
+ * an even multiple of @size, and set if it's an odd multiple.
+ *
+ * Logically, we're doing "if (i & lsbit) i -= size;", but since the
+ * branch is unpredictable, it's done with a bit of clever branch-free
+ * code instead.
+ */
+__attribute_const__ __always_inline
+static size_t parent(size_t i, unsigned int lsbit, size_t size)
{
- return *(int *)a - *(int *)b;
+ i -= size;
+ i -= size & -(i & lsbit);
+ return i / 2;
}
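
(Editor's note, not part of the patch: a small standalone userspace check of the
branch-free parent() computation above. It assumes only standard C and re-implements
the same two-subtraction trick so it can be compared against the plain array-index
formula, parent offset == ((j - 1) / 2) * size.)

/* Hypothetical sketch: verify the branch-free step against the index formula. */
#include <assert.h>
#include <stddef.h>

static size_t parent_offset(size_t i, unsigned int lsbit, size_t size)
{
	i -= size;
	i -= size & -(i & lsbit);	/* same branch-free step as parent() */
	return i / 2;
}

int main(void)
{
	size_t sizes[] = { 4, 8, 12, 24 };

	for (size_t s = 0; s < sizeof(sizes) / sizeof(sizes[0]); s++) {
		size_t size = sizes[s];
		unsigned int lsbit = size & -size;

		/* the child at index j lives at byte offset j * size */
		for (size_t j = 1; j < 100; j++)
			assert(parent_offset(j * size, lsbit, size) ==
			       ((j - 1) / 2) * size);
	}
	return 0;
}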
-static int sort_test(void)
+/**
+ * sort_r - sort an array of elements
+ * @base: pointer to data to sort
+ * @num: number of elements
+ * @size: size of each element
+ * @cmp_func: pointer to comparison function
+ * @swap_func: pointer to swap function or NULL
+ * @priv: third argument passed to comparison function
+ *
+ * This function does a heapsort on the given array. You may provide
+ * a swap_func function if you need to do something more than a memory
+ * copy (e.g. fix up pointers or auxiliary data), but the built-in swap
+ * avoids a slow retpoline and so is significantly faster.
+ *
+ * Sorting time is O(n log n) both on average and worst-case. While
+ * quicksort is slightly faster on average, it suffers from exploitable
+ * O(n*n) worst-case behavior and extra memory requirements that make
+ * it less suitable for kernel use.
+ */
+void sort_r(void *base, size_t num, size_t size,
+ cmp_r_func_t cmp_func,
+ swap_r_func_t swap_func,
+ const void *priv)
{
- int *a, i, r = 1;
+ /* pre-scale counters for performance */
+ size_t n = num * size, a = (num/2) * size;
+ const unsigned int lsbit = size & -size; /* Used to find parent */
- a = kmalloc(1000 * sizeof(int), GFP_KERNEL);
- BUG_ON(!a);
+ if (!a) /* num < 2 || size == 0 */
+ return;
- printk("testing sort()\n");
+ /* called from 'sort' without swap function, let's pick the default */
+ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap)
+ swap_func = NULL;
- for (i = 0; i < 1000; i++) {
- r = (r * 725861) % 6599;
- a[i] = r;
+ if (!swap_func) {
+ if (is_aligned(base, size, 8))
+ swap_func = SWAP_WORDS_64;
+ else if (is_aligned(base, size, 4))
+ swap_func = SWAP_WORDS_32;
+ else
+ swap_func = SWAP_BYTES;
}
- sort(a, 1000, sizeof(int), cmpint, NULL);
+ /*
+ * Loop invariants:
+ * 1. elements [a,n) satisfy the heap property (compare greater than
+ * all of their children),
+ * 2. elements [n,num*size) are sorted, and
+ * 3. a <= b <= c <= d <= n (whenever they are valid).
+ */
+ for (;;) {
+ size_t b, c, d;
- for (i = 0; i < 999; i++)
- if (a[i] > a[i+1]) {
- printk("sort() failed!\n");
+ if (a) /* Building heap: sift down --a */
+ a -= size;
+ else if (n -= size) /* Sorting: Extract root to --n */
+ do_swap(base, base + n, size, swap_func, priv);
+ else /* Sort complete */
break;
- }
- kfree(a);
+ /*
+ * Sift element at "a" down into heap. This is the
+ * "bottom-up" variant, which significantly reduces
+ * calls to cmp_func(): we find the sift-down path all
+ * the way to the leaves (one compare per level), then
+ * backtrack to find where to insert the target element.
+ *
+ * Because elements tend to sift down close to the leaves,
+ * this uses fewer compares than doing two per level
+ * on the way down. (A bit more than half as many on
+ * average, 3/4 worst-case.)
+ */
+ for (b = a; c = 2*b + size, (d = c + size) < n;)
+ b = do_cmp(base + c, base + d, cmp_func, priv) >= 0 ? c : d;
+ if (d == n) /* Special case last leaf with no sibling */
+ b = c;
- return 0;
+ /* Now backtrack from "b" to the correct location for "a" */
+ while (b != a && do_cmp(base + a, base + b, cmp_func, priv) >= 0)
+ b = parent(b, lsbit, size);
+ c = b; /* Where "a" belongs */
+ while (b != a) { /* Shift it into place */
+ b = parent(b, lsbit, size);
+ do_swap(base + b, base + c, size, swap_func, priv);
+ }
+ }
}
-
-module_init(sort_test);
-#endif
+EXPORT_SYMBOL(sort_r);
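
(Editor's note, not part of the patch: a minimal sketch of a typical sort_r() call,
assuming the sort_r()/cmp_r_func_t declarations from this tree's linux/sort.h shim.
The helper names below are illustrative only.)

#include <linux/sort.h>

static int cmp_ints(const void *a, const void *b, const void *priv)
{
	int x = *(const int *)a, y = *(const int *)b;

	(void)priv;			/* unused in this sketch */
	return (x > y) - (x < y);	/* ascending order */
}

static void sort_r_example(void)
{
	int v[] = { 9, 3, 7, 1 };

	/* A NULL swap_func lets sort_r() pick one of the built-in swaps above */
	sort_r(v, sizeof(v) / sizeof(v[0]), sizeof(v[0]), cmp_ints, NULL, NULL);
	/* v is now { 1, 3, 7, 9 } */
}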
diff --git a/linux/string.c b/linux/string.c
index 4fa3f64b..f91b5380 100644
--- a/linux/string.c
+++ b/linux/string.c
@@ -19,37 +19,22 @@
* - Kissed strtok() goodbye
*/
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/ctype.h>
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/bug.h>
-#include <linux/errno.h>
-
+#include <ctype.h>
+#include <errno.h>
+#include <limits.h>
#include <string.h>
-/**
- * skip_spaces - Removes leading whitespace from @str.
- * @str: The string to be stripped.
- *
- * Returns a pointer to the first non-whitespace character in @str.
- */
-char *skip_spaces(const char *str)
+#include <linux/bug.h>
+#include <linux/compiler.h>
+#include <linux/string.h>
+
+static char *skip_spaces(const char *str)
{
while (isspace(*str))
++str;
return (char *)str;
}
-/**
- * strim - Removes leading and trailing whitespace from @s.
- * @s: The string to be stripped.
- *
- * Note that the first trailing whitespace is replaced with a %NUL-terminator
- * in the given string @s. Returns a pointer to the first non-whitespace
- * character in @s.
- */
char *strim(char *s)
{
size_t size;
@@ -67,17 +52,6 @@ char *strim(char *s)
return skip_spaces(s);
}
-/**
- * strlcpy - Copy a C-string into a sized buffer
- * @dest: Where to copy the string to
- * @src: Where to copy the string from
- * @size: size of destination buffer
- *
- * Compatible with *BSD: the result is always a valid
- * NUL-terminated string that fits in the buffer (unless,
- * of course, the buffer size is zero). It does not pad
- * out the result like strncpy() does.
- */
size_t strlcpy(char *dest, const char *src, size_t size)
{
size_t ret = strlen(src);
@@ -90,6 +64,31 @@ size_t strlcpy(char *dest, const char *src, size_t size)
return ret;
}
+ssize_t strscpy(char *dest, const char *src, size_t count)
+{
+ long res = 0;
+
+ if (count == 0 || WARN_ON_ONCE(count > INT_MAX))
+ return -E2BIG;
+
+ while (count) {
+ char c;
+
+ c = src[res];
+ dest[res] = c;
+ if (!c)
+ return res;
+ res++;
+ count--;
+ }
+
+ /* Hit buffer length without finding a NUL; force NUL-termination. */
+ if (res)
+ dest[res-1] = '\0';
+
+ return -E2BIG;
+}
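
(Editor's note, not part of the patch: a tiny illustration of the strscpy() return
convention above, assuming a caller that includes linux/string.h; the buffer contents
in the comments are what the code produces.)

#include <linux/string.h>

static void strscpy_example(void)
{
	char buf[8];
	ssize_t n;

	n = strscpy(buf, "hello", sizeof(buf));
	/* n == 5, buf == "hello" */

	n = strscpy(buf, "a longer string", sizeof(buf));
	/* n == -E2BIG; buf holds "a longe" plus the forced NUL terminator */
	(void)n;
}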
+
void memzero_explicit(void *s, size_t count)
{
memset(s, 0, count);
@@ -111,3 +110,16 @@ int match_string(const char * const *array, size_t n, const char *string)
return -EINVAL;
}
+
+void *memscan(void *addr, int c, size_t size)
+{
+ unsigned char *p = addr;
+
+ while (size) {
+ if (*p == (unsigned char)c)
+ return (void *)p;
+ p++;
+ size--;
+ }
+ return (void *)p;
+}
diff --git a/linux/string_helpers.c b/linux/string_helpers.c
new file mode 100644
index 00000000..0810ca13
--- /dev/null
+++ b/linux/string_helpers.c
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Helpers for formatting and printing strings
+ *
+ * Copyright 31 August 2008 James Bottomley
+ * Copyright (C) 2013, Intel Corporation
+ */
+#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/math64.h>
+#include <linux/export.h>
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/limits.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/string_helpers.h>
+
+/**
+ * string_get_size - get the size in the specified units
+ * @size: The size to be converted in blocks
+ * @blk_size: Size of the block (use 1 for size in bytes)
+ * @units: units to use (powers of 1000 or 1024)
+ * @buf: buffer to format to
+ * @len: length of buffer
+ *
+ * This function returns a string formatted to 3 significant figures
+ * giving the size in the required units. @buf should have room for
+ * at least 9 bytes and will always be zero terminated.
+ *
+ */
+int string_get_size(u64 size, u64 blk_size, const enum string_size_units units,
+ char *buf, int len)
+{
+ static const char *const units_10[] = {
+ "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"
+ };
+ static const char *const units_2[] = {
+ "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"
+ };
+ static const char *const *const units_str[] = {
+ [STRING_UNITS_10] = units_10,
+ [STRING_UNITS_2] = units_2,
+ };
+ static const unsigned int divisor[] = {
+ [STRING_UNITS_10] = 1000,
+ [STRING_UNITS_2] = 1024,
+ };
+ static const unsigned int rounding[] = { 500, 50, 5 };
+ int i = 0, j;
+ u32 remainder = 0, sf_cap;
+ char tmp[12];
+ const char *unit;
+
+ tmp[0] = '\0';
+
+ if (blk_size == 0)
+ size = 0;
+ if (size == 0)
+ goto out;
+
+ /* This is Napier's algorithm. Reduce the original block size to
+ *
+ * coefficient * divisor[units]^i
+ *
+ * we do the reduction so both coefficients are just under 32 bits so
+ * that multiplying them together won't overflow 64 bits and we keep
+ * as much precision as possible in the numbers.
+ *
+ * Note: it's safe to throw away the remainders here because all the
+ * precision is in the coefficients.
+ */
+ while (blk_size >> 32) {
+ do_div(blk_size, divisor[units]);
+ i++;
+ }
+
+ while (size >> 32) {
+ do_div(size, divisor[units]);
+ i++;
+ }
+
+ /* now perform the actual multiplication keeping i as the sum of the
+ * two logarithms */
+ size *= blk_size;
+
+ /* and logarithmically reduce it until it's just under the divisor */
+ while (size >= divisor[units]) {
+ remainder = do_div(size, divisor[units]);
+ i++;
+ }
+
+ /* work out in j how many digits of precision we need from the
+ * remainder */
+ sf_cap = size;
+ for (j = 0; sf_cap*10 < 1000; j++)
+ sf_cap *= 10;
+
+ if (units == STRING_UNITS_2) {
+ /* express the remainder as a decimal. It's currently the
+ * numerator of a fraction whose denominator is
+ * divisor[units], which is 1 << 10 for STRING_UNITS_2 */
+ remainder *= 1000;
+ remainder >>= 10;
+ }
+
+ /* add a 5 to the digit below what will be printed to ensure
+ * an arithmetical round up and carry it through to size */
+ remainder += rounding[j];
+ if (remainder >= 1000) {
+ remainder -= 1000;
+ size += 1;
+ }
+
+ if (j) {
+ snprintf(tmp, sizeof(tmp), ".%03u", remainder);
+ tmp[j+1] = '\0';
+ }
+
+ out:
+ if (i >= ARRAY_SIZE(units_2))
+ unit = "UNK";
+ else
+ unit = units_str[units][i];
+
+ return snprintf(buf, len, "%u%s %s", (u32)size, tmp, unit);
+}
+EXPORT_SYMBOL(string_get_size);
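
(Editor's note, not part of the patch: two illustrative calls. 4096 blocks of 512
bytes is exactly 2 MiB, i.e. 2097152 bytes, which rounds to 2.10 MB at three
significant figures in the decimal unit system.)

static void string_get_size_example(void)
{
	char buf[16];

	string_get_size(4096, 512, STRING_UNITS_2, buf, sizeof(buf));
	/* buf == "2.00 MiB" */

	string_get_size(4096, 512, STRING_UNITS_10, buf, sizeof(buf));
	/* buf == "2.10 MB" */
}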
diff --git a/linux/timer.c b/linux/timer.c
index dd5aba18..7d519a4d 100644
--- a/linux/timer.c
+++ b/linux/timer.c
@@ -93,9 +93,11 @@ do { \
\
BUG_ON(_i >= (h)->used); \
(h)->used--; \
- heap_swap(h, _i, (h)->used); \
- heap_sift_down(h, _i, cmp); \
- heap_sift(h, _i, cmp); \
+ if ((_i) < (h)->used) { \
+ heap_swap(h, _i, (h)->used); \
+ heap_sift_down(h, _i, cmp); \
+ heap_sift(h, _i, cmp); \
+ } \
} while (0)
#define heap_pop(h, d, cmp) \
@@ -244,6 +246,8 @@ out:
return idx >= 0;
}
+static bool timer_thread_stop = false;
+
static int timer_thread(void *arg)
{
struct pending_timer *p;
@@ -253,7 +257,7 @@ static int timer_thread(void *arg)
pthread_mutex_lock(&timer_lock);
- while (1) {
+ while (!timer_thread_stop) {
now = jiffies;
p = heap_peek(&pending_timers);
@@ -295,14 +299,31 @@ static int timer_thread(void *arg)
return 0;
}
+struct task_struct *timer_task;
+
__attribute__((constructor(103)))
static void timers_init(void)
{
- struct task_struct *p;
-
heap_init(&pending_timers, 64);
BUG_ON(!pending_timers.data);
- p = kthread_run(timer_thread, NULL, "timers");
- BUG_ON(IS_ERR(p));
+ timer_task = kthread_run(timer_thread, NULL, "timers");
+ BUG_ON(IS_ERR(timer_task));
+}
+
+__attribute__((destructor(103)))
+static void timers_cleanup(void)
+{
+ get_task_struct(timer_task);
+
+ pthread_mutex_lock(&timer_lock);
+ timer_thread_stop = true;
+ pthread_cond_signal(&timer_cond);
+ pthread_mutex_unlock(&timer_lock);
+
+ int ret = kthread_stop(timer_task);
+ BUG_ON(ret);
+
+ put_task_struct(timer_task);
+ timer_task = NULL;
}
diff --git a/linux/vsprintf.c b/linux/vsprintf.c
deleted file mode 100644
index 1dca320c..00000000
--- a/linux/vsprintf.c
+++ /dev/null
@@ -1,21 +0,0 @@
-#include <stdlib.h>
-
-unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base)
-{
- return strtoull(cp, endp, base);
-}
-
-unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base)
-{
- return strtoul(cp, endp, base);
-}
-
-long long simple_strtoll(const char *cp, char **endp, unsigned int base)
-{
- return strtoll(cp, endp, base);
-}
-
-long simple_strtol(const char *cp, char **endp, unsigned int base)
-{
- return strtol(cp, endp, base);
-}
diff --git a/linux/wait.c b/linux/wait.c
index 991875c5..b1f002b9 100644
--- a/linux/wait.c
+++ b/linux/wait.c
@@ -66,6 +66,11 @@ void wake_up(wait_queue_head_t *q)
__wake_up(q, TASK_NORMAL, 1, NULL);
}
+void wake_up_all(wait_queue_head_t *q)
+{
+ __wake_up(q, TASK_NORMAL, 0, NULL);
+}
+
static void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
{
__wake_up_common(q, mode, nr, 0, NULL);
diff --git a/linux/workqueue.c b/linux/workqueue.c
index 4dfd6cd9..0d5af3fb 100644
--- a/linux/workqueue.c
+++ b/linux/workqueue.c
@@ -5,6 +5,7 @@
#include <linux/workqueue.h>
static pthread_mutex_t wq_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t work_finished = PTHREAD_COND_INITIALIZER;
static LIST_HEAD(wq_list);
struct workqueue_struct {
@@ -13,8 +14,6 @@ struct workqueue_struct {
struct work_struct *current_work;
struct list_head pending_work;
- pthread_cond_t work_finished;
-
struct task_struct *worker;
char name[24];
};
@@ -23,6 +22,11 @@ enum {
WORK_PENDING_BIT,
};
+static bool work_pending(struct work_struct *work)
+{
+ return test_bit(WORK_PENDING_BIT, work_data_bits(work));
+}
+
static void clear_work_pending(struct work_struct *work)
{
clear_bit(WORK_PENDING_BIT, work_data_bits(work));
@@ -36,7 +40,7 @@ static bool set_work_pending(struct work_struct *work)
static void __queue_work(struct workqueue_struct *wq,
struct work_struct *work)
{
- BUG_ON(!test_bit(WORK_PENDING_BIT, work_data_bits(work)));
+ BUG_ON(!work_pending(work));
BUG_ON(!list_empty(&work->entry));
list_add_tail(&work->entry, &wq->pending_work);
@@ -130,17 +134,39 @@ retry:
goto retry;
}
-static bool __flush_work(struct work_struct *work)
+static bool work_running(struct work_struct *work)
{
struct workqueue_struct *wq;
- bool ret = false;
-retry:
+
list_for_each_entry(wq, &wq_list, list)
- if (wq->current_work == work) {
- pthread_cond_wait(&wq->work_finished, &wq_lock);
- ret = true;
- goto retry;
- }
+ if (wq->current_work == work)
+ return true;
+
+ return false;
+}
+
+bool flush_work(struct work_struct *work)
+{
+ bool ret = false;
+
+ pthread_mutex_lock(&wq_lock);
+ while (work_pending(work) || work_running(work)) {
+ pthread_cond_wait(&work_finished, &wq_lock);
+ ret = true;
+ }
+ pthread_mutex_unlock(&wq_lock);
+
+ return ret;
+}
+
+static bool __flush_work(struct work_struct *work)
+{
+ bool ret = false;
+
+ while (work_running(work)) {
+ pthread_cond_wait(&work_finished, &wq_lock);
+ ret = true;
+ }
return ret;
}
@@ -228,7 +254,7 @@ static int worker_thread(void *arg)
continue;
}
- BUG_ON(!test_bit(WORK_PENDING_BIT, work_data_bits(work)));
+ BUG_ON(!work_pending(work));
list_del_init(&work->entry);
clear_work_pending(work);
@@ -236,7 +262,7 @@ static int worker_thread(void *arg)
work->func(work);
pthread_mutex_lock(&wq_lock);
- pthread_cond_broadcast(&wq->work_finished);
+ pthread_cond_broadcast(&work_finished);
}
pthread_mutex_unlock(&wq_lock);
@@ -269,8 +295,6 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
INIT_LIST_HEAD(&wq->list);
INIT_LIST_HEAD(&wq->pending_work);
- pthread_cond_init(&wq->work_finished, NULL);
-
va_start(args, max_active);
vsnprintf(wq->name, sizeof(wq->name), fmt, args);
va_end(args);
@@ -307,3 +331,16 @@ static void wq_init(void)
BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
!system_unbound_wq || !system_freezable_wq);
}
+
+__attribute__((destructor(102)))
+static void wq_cleanup(void)
+{
+ destroy_workqueue(system_freezable_wq);
+ destroy_workqueue(system_unbound_wq);
+ destroy_workqueue(system_long_wq);
+ destroy_workqueue(system_highpri_wq);
+ destroy_workqueue(system_wq);
+
+ system_wq = system_highpri_wq = system_long_wq = system_unbound_wq =
+ system_freezable_wq = NULL;
+}
diff --git a/linux/xxhash.c b/linux/xxhash.c
new file mode 100644
index 00000000..b5bd567a
--- /dev/null
+++ b/linux/xxhash.c
@@ -0,0 +1,500 @@
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Copyright (C) 2012-2016, Yann Collet.
+ *
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ *
+ * You can contact the author at:
+ * - xxHash homepage: https://cyan4973.github.io/xxHash/
+ * - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+#include <linux/unaligned.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/xxhash.h>
+
+/*-*************************************
+ * Macros
+ **************************************/
+#define xxh_rotl32(x, r) ((x << r) | (x >> (32 - r)))
+#define xxh_rotl64(x, r) ((x << r) | (x >> (64 - r)))
+
+#ifdef __LITTLE_ENDIAN
+# define XXH_CPU_LITTLE_ENDIAN 1
+#else
+# define XXH_CPU_LITTLE_ENDIAN 0
+#endif
+
+/*-*************************************
+ * Constants
+ **************************************/
+static const uint32_t PRIME32_1 = 2654435761U;
+static const uint32_t PRIME32_2 = 2246822519U;
+static const uint32_t PRIME32_3 = 3266489917U;
+static const uint32_t PRIME32_4 = 668265263U;
+static const uint32_t PRIME32_5 = 374761393U;
+
+static const uint64_t PRIME64_1 = 11400714785074694791ULL;
+static const uint64_t PRIME64_2 = 14029467366897019727ULL;
+static const uint64_t PRIME64_3 = 1609587929392839161ULL;
+static const uint64_t PRIME64_4 = 9650029242287828579ULL;
+static const uint64_t PRIME64_5 = 2870177450012600261ULL;
+
+/*-**************************
+ * Utils
+ ***************************/
+void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src)
+{
+ memcpy(dst, src, sizeof(*dst));
+}
+EXPORT_SYMBOL(xxh32_copy_state);
+
+void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src)
+{
+ memcpy(dst, src, sizeof(*dst));
+}
+EXPORT_SYMBOL(xxh64_copy_state);
+
+/*-***************************
+ * Simple Hash Functions
+ ****************************/
+static uint32_t xxh32_round(uint32_t seed, const uint32_t input)
+{
+ seed += input * PRIME32_2;
+ seed = xxh_rotl32(seed, 13);
+ seed *= PRIME32_1;
+ return seed;
+}
+
+uint32_t xxh32(const void *input, const size_t len, const uint32_t seed)
+{
+ const uint8_t *p = (const uint8_t *)input;
+ const uint8_t *b_end = p + len;
+ uint32_t h32;
+
+ if (len >= 16) {
+ const uint8_t *const limit = b_end - 16;
+ uint32_t v1 = seed + PRIME32_1 + PRIME32_2;
+ uint32_t v2 = seed + PRIME32_2;
+ uint32_t v3 = seed + 0;
+ uint32_t v4 = seed - PRIME32_1;
+
+ do {
+ v1 = xxh32_round(v1, get_unaligned_le32(p));
+ p += 4;
+ v2 = xxh32_round(v2, get_unaligned_le32(p));
+ p += 4;
+ v3 = xxh32_round(v3, get_unaligned_le32(p));
+ p += 4;
+ v4 = xxh32_round(v4, get_unaligned_le32(p));
+ p += 4;
+ } while (p <= limit);
+
+ h32 = xxh_rotl32(v1, 1) + xxh_rotl32(v2, 7) +
+ xxh_rotl32(v3, 12) + xxh_rotl32(v4, 18);
+ } else {
+ h32 = seed + PRIME32_5;
+ }
+
+ h32 += (uint32_t)len;
+
+ while (p + 4 <= b_end) {
+ h32 += get_unaligned_le32(p) * PRIME32_3;
+ h32 = xxh_rotl32(h32, 17) * PRIME32_4;
+ p += 4;
+ }
+
+ while (p < b_end) {
+ h32 += (*p) * PRIME32_5;
+ h32 = xxh_rotl32(h32, 11) * PRIME32_1;
+ p++;
+ }
+
+ h32 ^= h32 >> 15;
+ h32 *= PRIME32_2;
+ h32 ^= h32 >> 13;
+ h32 *= PRIME32_3;
+ h32 ^= h32 >> 16;
+
+ return h32;
+}
+EXPORT_SYMBOL(xxh32);
+
+static uint64_t xxh64_round(uint64_t acc, const uint64_t input)
+{
+ acc += input * PRIME64_2;
+ acc = xxh_rotl64(acc, 31);
+ acc *= PRIME64_1;
+ return acc;
+}
+
+static uint64_t xxh64_merge_round(uint64_t acc, uint64_t val)
+{
+ val = xxh64_round(0, val);
+ acc ^= val;
+ acc = acc * PRIME64_1 + PRIME64_4;
+ return acc;
+}
+
+uint64_t xxh64(const void *input, const size_t len, const uint64_t seed)
+{
+ const uint8_t *p = (const uint8_t *)input;
+ const uint8_t *const b_end = p + len;
+ uint64_t h64;
+
+ if (len >= 32) {
+ const uint8_t *const limit = b_end - 32;
+ uint64_t v1 = seed + PRIME64_1 + PRIME64_2;
+ uint64_t v2 = seed + PRIME64_2;
+ uint64_t v3 = seed + 0;
+ uint64_t v4 = seed - PRIME64_1;
+
+ do {
+ v1 = xxh64_round(v1, get_unaligned_le64(p));
+ p += 8;
+ v2 = xxh64_round(v2, get_unaligned_le64(p));
+ p += 8;
+ v3 = xxh64_round(v3, get_unaligned_le64(p));
+ p += 8;
+ v4 = xxh64_round(v4, get_unaligned_le64(p));
+ p += 8;
+ } while (p <= limit);
+
+ h64 = xxh_rotl64(v1, 1) + xxh_rotl64(v2, 7) +
+ xxh_rotl64(v3, 12) + xxh_rotl64(v4, 18);
+ h64 = xxh64_merge_round(h64, v1);
+ h64 = xxh64_merge_round(h64, v2);
+ h64 = xxh64_merge_round(h64, v3);
+ h64 = xxh64_merge_round(h64, v4);
+
+ } else {
+ h64 = seed + PRIME64_5;
+ }
+
+ h64 += (uint64_t)len;
+
+ while (p + 8 <= b_end) {
+ const uint64_t k1 = xxh64_round(0, get_unaligned_le64(p));
+
+ h64 ^= k1;
+ h64 = xxh_rotl64(h64, 27) * PRIME64_1 + PRIME64_4;
+ p += 8;
+ }
+
+ if (p + 4 <= b_end) {
+ h64 ^= (uint64_t)(get_unaligned_le32(p)) * PRIME64_1;
+ h64 = xxh_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+ p += 4;
+ }
+
+ while (p < b_end) {
+ h64 ^= (*p) * PRIME64_5;
+ h64 = xxh_rotl64(h64, 11) * PRIME64_1;
+ p++;
+ }
+
+ h64 ^= h64 >> 33;
+ h64 *= PRIME64_2;
+ h64 ^= h64 >> 29;
+ h64 *= PRIME64_3;
+ h64 ^= h64 >> 32;
+
+ return h64;
+}
+EXPORT_SYMBOL(xxh64);
+
+/*-**************************************************
+ * Advanced Hash Functions
+ ***************************************************/
+void xxh32_reset(struct xxh32_state *statePtr, const uint32_t seed)
+{
+ /* use a local state for memcpy() to avoid strict-aliasing warnings */
+ struct xxh32_state state;
+
+ memset(&state, 0, sizeof(state));
+ state.v1 = seed + PRIME32_1 + PRIME32_2;
+ state.v2 = seed + PRIME32_2;
+ state.v3 = seed + 0;
+ state.v4 = seed - PRIME32_1;
+ memcpy(statePtr, &state, sizeof(state));
+}
+EXPORT_SYMBOL(xxh32_reset);
+
+void xxh64_reset(struct xxh64_state *statePtr, const uint64_t seed)
+{
+ /* use a local state for memcpy() to avoid strict-aliasing warnings */
+ struct xxh64_state state;
+
+ memset(&state, 0, sizeof(state));
+ state.v1 = seed + PRIME64_1 + PRIME64_2;
+ state.v2 = seed + PRIME64_2;
+ state.v3 = seed + 0;
+ state.v4 = seed - PRIME64_1;
+ memcpy(statePtr, &state, sizeof(state));
+}
+EXPORT_SYMBOL(xxh64_reset);
+
+int xxh32_update(struct xxh32_state *state, const void *input, const size_t len)
+{
+ const uint8_t *p = (const uint8_t *)input;
+ const uint8_t *const b_end = p + len;
+
+ if (input == NULL)
+ return -EINVAL;
+
+ state->total_len_32 += (uint32_t)len;
+ state->large_len |= (len >= 16) | (state->total_len_32 >= 16);
+
+ if (state->memsize + len < 16) { /* fill in tmp buffer */
+ memcpy((uint8_t *)(state->mem32) + state->memsize, input, len);
+ state->memsize += (uint32_t)len;
+ return 0;
+ }
+
+ if (state->memsize) { /* some data left from previous update */
+ const uint32_t *p32 = state->mem32;
+
+ memcpy((uint8_t *)(state->mem32) + state->memsize, input,
+ 16 - state->memsize);
+
+ state->v1 = xxh32_round(state->v1, get_unaligned_le32(p32));
+ p32++;
+ state->v2 = xxh32_round(state->v2, get_unaligned_le32(p32));
+ p32++;
+ state->v3 = xxh32_round(state->v3, get_unaligned_le32(p32));
+ p32++;
+ state->v4 = xxh32_round(state->v4, get_unaligned_le32(p32));
+ p32++;
+
+ p += 16-state->memsize;
+ state->memsize = 0;
+ }
+
+ if (p <= b_end - 16) {
+ const uint8_t *const limit = b_end - 16;
+ uint32_t v1 = state->v1;
+ uint32_t v2 = state->v2;
+ uint32_t v3 = state->v3;
+ uint32_t v4 = state->v4;
+
+ do {
+ v1 = xxh32_round(v1, get_unaligned_le32(p));
+ p += 4;
+ v2 = xxh32_round(v2, get_unaligned_le32(p));
+ p += 4;
+ v3 = xxh32_round(v3, get_unaligned_le32(p));
+ p += 4;
+ v4 = xxh32_round(v4, get_unaligned_le32(p));
+ p += 4;
+ } while (p <= limit);
+
+ state->v1 = v1;
+ state->v2 = v2;
+ state->v3 = v3;
+ state->v4 = v4;
+ }
+
+ if (p < b_end) {
+ memcpy(state->mem32, p, (size_t)(b_end-p));
+ state->memsize = (uint32_t)(b_end-p);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(xxh32_update);
+
+uint32_t xxh32_digest(const struct xxh32_state *state)
+{
+ const uint8_t *p = (const uint8_t *)state->mem32;
+ const uint8_t *const b_end = (const uint8_t *)(state->mem32) +
+ state->memsize;
+ uint32_t h32;
+
+ if (state->large_len) {
+ h32 = xxh_rotl32(state->v1, 1) + xxh_rotl32(state->v2, 7) +
+ xxh_rotl32(state->v3, 12) + xxh_rotl32(state->v4, 18);
+ } else {
+ h32 = state->v3 /* == seed */ + PRIME32_5;
+ }
+
+ h32 += state->total_len_32;
+
+ while (p + 4 <= b_end) {
+ h32 += get_unaligned_le32(p) * PRIME32_3;
+ h32 = xxh_rotl32(h32, 17) * PRIME32_4;
+ p += 4;
+ }
+
+ while (p < b_end) {
+ h32 += (*p) * PRIME32_5;
+ h32 = xxh_rotl32(h32, 11) * PRIME32_1;
+ p++;
+ }
+
+ h32 ^= h32 >> 15;
+ h32 *= PRIME32_2;
+ h32 ^= h32 >> 13;
+ h32 *= PRIME32_3;
+ h32 ^= h32 >> 16;
+
+ return h32;
+}
+EXPORT_SYMBOL(xxh32_digest);
+
+int xxh64_update(struct xxh64_state *state, const void *input, const size_t len)
+{
+ const uint8_t *p = (const uint8_t *)input;
+ const uint8_t *const b_end = p + len;
+
+ if (input == NULL)
+ return -EINVAL;
+
+ state->total_len += len;
+
+ if (state->memsize + len < 32) { /* fill in tmp buffer */
+ memcpy(((uint8_t *)state->mem64) + state->memsize, input, len);
+ state->memsize += (uint32_t)len;
+ return 0;
+ }
+
+ if (state->memsize) { /* tmp buffer is full */
+ uint64_t *p64 = state->mem64;
+
+ memcpy(((uint8_t *)p64) + state->memsize, input,
+ 32 - state->memsize);
+
+ state->v1 = xxh64_round(state->v1, get_unaligned_le64(p64));
+ p64++;
+ state->v2 = xxh64_round(state->v2, get_unaligned_le64(p64));
+ p64++;
+ state->v3 = xxh64_round(state->v3, get_unaligned_le64(p64));
+ p64++;
+ state->v4 = xxh64_round(state->v4, get_unaligned_le64(p64));
+
+ p += 32 - state->memsize;
+ state->memsize = 0;
+ }
+
+ if (p + 32 <= b_end) {
+ const uint8_t *const limit = b_end - 32;
+ uint64_t v1 = state->v1;
+ uint64_t v2 = state->v2;
+ uint64_t v3 = state->v3;
+ uint64_t v4 = state->v4;
+
+ do {
+ v1 = xxh64_round(v1, get_unaligned_le64(p));
+ p += 8;
+ v2 = xxh64_round(v2, get_unaligned_le64(p));
+ p += 8;
+ v3 = xxh64_round(v3, get_unaligned_le64(p));
+ p += 8;
+ v4 = xxh64_round(v4, get_unaligned_le64(p));
+ p += 8;
+ } while (p <= limit);
+
+ state->v1 = v1;
+ state->v2 = v2;
+ state->v3 = v3;
+ state->v4 = v4;
+ }
+
+ if (p < b_end) {
+ memcpy(state->mem64, p, (size_t)(b_end-p));
+ state->memsize = (uint32_t)(b_end - p);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(xxh64_update);
+
+uint64_t xxh64_digest(const struct xxh64_state *state)
+{
+ const uint8_t *p = (const uint8_t *)state->mem64;
+ const uint8_t *const b_end = (const uint8_t *)state->mem64 +
+ state->memsize;
+ uint64_t h64;
+
+ if (state->total_len >= 32) {
+ const uint64_t v1 = state->v1;
+ const uint64_t v2 = state->v2;
+ const uint64_t v3 = state->v3;
+ const uint64_t v4 = state->v4;
+
+ h64 = xxh_rotl64(v1, 1) + xxh_rotl64(v2, 7) +
+ xxh_rotl64(v3, 12) + xxh_rotl64(v4, 18);
+ h64 = xxh64_merge_round(h64, v1);
+ h64 = xxh64_merge_round(h64, v2);
+ h64 = xxh64_merge_round(h64, v3);
+ h64 = xxh64_merge_round(h64, v4);
+ } else {
+ h64 = state->v3 + PRIME64_5;
+ }
+
+ h64 += (uint64_t)state->total_len;
+
+ while (p + 8 <= b_end) {
+ const uint64_t k1 = xxh64_round(0, get_unaligned_le64(p));
+
+ h64 ^= k1;
+ h64 = xxh_rotl64(h64, 27) * PRIME64_1 + PRIME64_4;
+ p += 8;
+ }
+
+ if (p + 4 <= b_end) {
+ h64 ^= (uint64_t)(get_unaligned_le32(p)) * PRIME64_1;
+ h64 = xxh_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+ p += 4;
+ }
+
+ while (p < b_end) {
+ h64 ^= (*p) * PRIME64_5;
+ h64 = xxh_rotl64(h64, 11) * PRIME64_1;
+ p++;
+ }
+
+ h64 ^= h64 >> 33;
+ h64 *= PRIME64_2;
+ h64 ^= h64 >> 29;
+ h64 *= PRIME64_3;
+ h64 ^= h64 >> 32;
+
+ return h64;
+}
+EXPORT_SYMBOL(xxh64_digest);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("xxHash");
diff --git a/linux/zstd_compress_module.c b/linux/zstd_compress_module.c
new file mode 100644
index 00000000..35cc5cba
--- /dev/null
+++ b/linux/zstd_compress_module.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+/*
+ * Copyright (c) Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/zstd.h>
+
+#define ZSTD_FORWARD_IF_ERR(ret) \
+ do { \
+ size_t const __ret = (ret); \
+ if (ZSTD_isError(__ret)) \
+ return __ret; \
+ } while (0)
+
+static size_t zstd_cctx_init(zstd_cctx *cctx, const zstd_parameters *parameters,
+ unsigned long long pledged_src_size)
+{
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_reset(
+ cctx, ZSTD_reset_session_and_parameters));
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setPledgedSrcSize(
+ cctx, pledged_src_size));
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+ cctx, ZSTD_c_windowLog, parameters->cParams.windowLog));
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+ cctx, ZSTD_c_hashLog, parameters->cParams.hashLog));
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+ cctx, ZSTD_c_chainLog, parameters->cParams.chainLog));
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+ cctx, ZSTD_c_searchLog, parameters->cParams.searchLog));
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+ cctx, ZSTD_c_minMatch, parameters->cParams.minMatch));
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+ cctx, ZSTD_c_targetLength, parameters->cParams.targetLength));
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+ cctx, ZSTD_c_strategy, parameters->cParams.strategy));
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+ cctx, ZSTD_c_contentSizeFlag, parameters->fParams.contentSizeFlag));
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+ cctx, ZSTD_c_checksumFlag, parameters->fParams.checksumFlag));
+ ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter(
+ cctx, ZSTD_c_dictIDFlag, !parameters->fParams.noDictIDFlag));
+ return 0;
+}
+
+int zstd_min_clevel(void)
+{
+ return ZSTD_minCLevel();
+}
+EXPORT_SYMBOL(zstd_min_clevel);
+
+int zstd_max_clevel(void)
+{
+ return ZSTD_maxCLevel();
+}
+EXPORT_SYMBOL(zstd_max_clevel);
+
+size_t zstd_compress_bound(size_t src_size)
+{
+ return ZSTD_compressBound(src_size);
+}
+EXPORT_SYMBOL(zstd_compress_bound);
+
+zstd_parameters zstd_get_params(int level,
+ unsigned long long estimated_src_size)
+{
+ return ZSTD_getParams(level, estimated_src_size, 0);
+}
+EXPORT_SYMBOL(zstd_get_params);
+
+size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *cparams)
+{
+ return ZSTD_estimateCCtxSize_usingCParams(*cparams);
+}
+EXPORT_SYMBOL(zstd_cctx_workspace_bound);
+
+zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size)
+{
+ if (workspace == NULL)
+ return NULL;
+ return ZSTD_initStaticCCtx(workspace, workspace_size);
+}
+EXPORT_SYMBOL(zstd_init_cctx);
+
+size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity,
+ const void *src, size_t src_size, const zstd_parameters *parameters)
+{
+ ZSTD_FORWARD_IF_ERR(zstd_cctx_init(cctx, parameters, src_size));
+ return ZSTD_compress2(cctx, dst, dst_capacity, src, src_size);
+}
+EXPORT_SYMBOL(zstd_compress_cctx);
+
+size_t zstd_cstream_workspace_bound(const zstd_compression_parameters *cparams)
+{
+ return ZSTD_estimateCStreamSize_usingCParams(*cparams);
+}
+EXPORT_SYMBOL(zstd_cstream_workspace_bound);
+
+zstd_cstream *zstd_init_cstream(const zstd_parameters *parameters,
+ unsigned long long pledged_src_size, void *workspace, size_t workspace_size)
+{
+ zstd_cstream *cstream;
+
+ if (workspace == NULL)
+ return NULL;
+
+ cstream = ZSTD_initStaticCStream(workspace, workspace_size);
+ if (cstream == NULL)
+ return NULL;
+
+ /* 0 means unknown in linux zstd API but means 0 in new zstd API */
+ if (pledged_src_size == 0)
+ pledged_src_size = ZSTD_CONTENTSIZE_UNKNOWN;
+
+ if (ZSTD_isError(zstd_cctx_init(cstream, parameters, pledged_src_size)))
+ return NULL;
+
+ return cstream;
+}
+EXPORT_SYMBOL(zstd_init_cstream);
+
+size_t zstd_reset_cstream(zstd_cstream *cstream,
+ unsigned long long pledged_src_size)
+{
+ return ZSTD_resetCStream(cstream, pledged_src_size);
+}
+EXPORT_SYMBOL(zstd_reset_cstream);
+
+size_t zstd_compress_stream(zstd_cstream *cstream, zstd_out_buffer *output,
+ zstd_in_buffer *input)
+{
+ return ZSTD_compressStream(cstream, output, input);
+}
+EXPORT_SYMBOL(zstd_compress_stream);
+
+size_t zstd_flush_stream(zstd_cstream *cstream, zstd_out_buffer *output)
+{
+ return ZSTD_flushStream(cstream, output);
+}
+EXPORT_SYMBOL(zstd_flush_stream);
+
+size_t zstd_end_stream(zstd_cstream *cstream, zstd_out_buffer *output)
+{
+ return ZSTD_endStream(cstream, output);
+}
+EXPORT_SYMBOL(zstd_end_stream);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Zstd Compressor");
diff --git a/linux/zstd_decompress_module.c b/linux/zstd_decompress_module.c
new file mode 100644
index 00000000..7e8cd446
--- /dev/null
+++ b/linux/zstd_decompress_module.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+/*
+ * Copyright (c) Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/zstd.h>
+
+/* Common symbols. zstd_compress must depend on zstd_decompress. */
+
+unsigned int zstd_is_error(size_t code)
+{
+ return ZSTD_isError(code);
+}
+EXPORT_SYMBOL(zstd_is_error);
+
+zstd_error_code zstd_get_error_code(size_t code)
+{
+ return ZSTD_getErrorCode(code);
+}
+EXPORT_SYMBOL(zstd_get_error_code);
+
+const char *zstd_get_error_name(size_t code)
+{
+ return ZSTD_getErrorName(code);
+}
+EXPORT_SYMBOL(zstd_get_error_name);
+
+/* Decompression symbols. */
+
+size_t zstd_dctx_workspace_bound(void)
+{
+ return ZSTD_estimateDCtxSize();
+}
+EXPORT_SYMBOL(zstd_dctx_workspace_bound);
+
+zstd_dctx *zstd_init_dctx(void *workspace, size_t workspace_size)
+{
+ if (workspace == NULL)
+ return NULL;
+ return ZSTD_initStaticDCtx(workspace, workspace_size);
+}
+EXPORT_SYMBOL(zstd_init_dctx);
+
+size_t zstd_decompress_dctx(zstd_dctx *dctx, void *dst, size_t dst_capacity,
+ const void *src, size_t src_size)
+{
+ return ZSTD_decompressDCtx(dctx, dst, dst_capacity, src, src_size);
+}
+EXPORT_SYMBOL(zstd_decompress_dctx);
+
+size_t zstd_dstream_workspace_bound(size_t max_window_size)
+{
+ return ZSTD_estimateDStreamSize(max_window_size);
+}
+EXPORT_SYMBOL(zstd_dstream_workspace_bound);
+
+zstd_dstream *zstd_init_dstream(size_t max_window_size, void *workspace,
+ size_t workspace_size)
+{
+ if (workspace == NULL)
+ return NULL;
+ (void)max_window_size;
+ return ZSTD_initStaticDStream(workspace, workspace_size);
+}
+EXPORT_SYMBOL(zstd_init_dstream);
+
+size_t zstd_reset_dstream(zstd_dstream *dstream)
+{
+ return ZSTD_resetDStream(dstream);
+}
+EXPORT_SYMBOL(zstd_reset_dstream);
+
+size_t zstd_decompress_stream(zstd_dstream *dstream, zstd_out_buffer *output,
+ zstd_in_buffer *input)
+{
+ return ZSTD_decompressStream(dstream, output, input);
+}
+EXPORT_SYMBOL(zstd_decompress_stream);
+
+size_t zstd_find_frame_compressed_size(const void *src, size_t src_size)
+{
+ return ZSTD_findFrameCompressedSize(src, src_size);
+}
+EXPORT_SYMBOL(zstd_find_frame_compressed_size);
+
+size_t zstd_get_frame_header(zstd_frame_header *header, const void *src,
+ size_t src_size)
+{
+ return ZSTD_getFrameHeader(header, src, src_size);
+}
+EXPORT_SYMBOL(zstd_get_frame_header);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Zstd Decompressor");
diff --git a/make-release-tarball.sh b/make-release-tarball.sh
new file mode 100755
index 00000000..c468da77
--- /dev/null
+++ b/make-release-tarball.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+set -o errexit
+
+version=$1
+
+git checkout v$version
+git clean -xfd
+
+cargo license > COPYING.rust-dependencies
+
+git ls-files|
+ tar --create --file bcachefs-tools-$version.tar -T - \
+ --transform="s_^_bcachefs-tools-$version/_"
+
+tar --append --file bcachefs-tools-$version.tar \
+ --transform="s_^_bcachefs-tools-$version/_" \
+ COPYING.rust-dependencies
+
+zstd -z --ultra bcachefs-tools-$version.tar
+
+gpg --armor --detach-sign bcachefs-tools-$version.tar
+mv bcachefs-tools-$version.tar.asc bcachefs-tools-$version.tar.sign
+
+gpg --armor --sign bcachefs-tools-$version.tar
+
+scp bcachefs-tools-$version.tar.zst evilpiepirate.org:/var/www/htdocs/bcachefs-tools/
+scp bcachefs-tools-$version.tar.asc evilpiepirate.org:/var/www/htdocs/bcachefs-tools/
+scp bcachefs-tools-$version.tar.sign evilpiepirate.org:/var/www/htdocs/bcachefs-tools/
+
+cargo vendor
+
+mkdir .cargo
+cat > .cargo/config.toml <<-ZZ
+[source.crates-io]
+replace-with = "vendored-sources"
+
+[source."git+https://evilpiepirate.org/git/rust-bindgen.git"]
+git = "https://evilpiepirate.org/git/rust-bindgen.git"
+replace-with = "vendored-sources"
+
+[source.vendored-sources]
+directory = "vendor"
+ZZ
+
+cp bcachefs-tools-$version.tar bcachefs-tools-vendored-$version.tar
+tar --append --file bcachefs-tools-vendored-$version.tar \
+ --transform="s_^_bcachefs-tools-$version/_" \
+ .cargo vendor
+
+zstd -z --ultra bcachefs-tools-vendored-$version.tar
+
+gpg --armor --detach-sign bcachefs-tools-vendored-$version.tar
+mv bcachefs-tools-vendored-$version.tar.asc bcachefs-tools-vendored-$version.tar.sign
+
+gpg --armor --sign bcachefs-tools-vendored-$version.tar
+
+scp bcachefs-tools-vendored-$version.tar.zst evilpiepirate.org:/var/www/htdocs/bcachefs-tools/
+scp bcachefs-tools-vendored-$version.tar.asc evilpiepirate.org:/var/www/htdocs/bcachefs-tools/
+scp bcachefs-tools-vendored-$version.tar.sign evilpiepirate.org:/var/www/htdocs/bcachefs-tools/
diff --git a/mkfs.bcachefs b/mkfs.bcachefs
deleted file mode 100755
index b3631bad..00000000
--- a/mkfs.bcachefs
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/sh
-
-SDIR="$(readlink -f "$0")"
-exec "${SDIR%/*}/bcachefs" format "$@"
diff --git a/mount.bcachefs.sh b/mount.bcachefs.sh
new file mode 100755
index 00000000..b75fbf8b
--- /dev/null
+++ b/mount.bcachefs.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+join_by()
+{
+ local IFS="$1"
+ shift
+ echo "$*"
+}
+
+args=$(getopt -u -o 'sfnvo:t:N:' -n 'mount.bcachefs' -- "$@")
+if [ $? -ne 0 ]; then
+ echo 'Terminating...' >&2
+ exit 1
+fi
+
+read -r -a argv <<< "$args"
+
+for i in ${!argv[@]}; do
+ [[ ${argv[$i]} == '--' ]] && break
+done
+
+i=$((i+1))
+
+if (( ${#argv[@]} < i + 2 )); then
+ echo "Insufficient arguments"
+ exit 1
+fi
+
+UUID=${argv[$i]}
+
+if [[ ${UUID//-/} =~ ^[[:xdigit:]]{32}$ ]]; then
+ PARTS=()
+
+ for part in $(tail -n +3 /proc/partitions|awk '{print $4}'); do
+ uuid_line=$(bcachefs show-super /dev/$part|& head -n1)
+
+ if [[ $uuid_line =~ $UUID ]]; then
+ PARTS+=(/dev/$part)
+ fi
+ done
+
+ if [[ ${#PARTS[@]} == 0 ]]; then
+ echo "uuid $UUID not found"
+ exit 1
+ fi
+
+ argv[$i]=$(join_by : "${PARTS[@]}")
+fi
+
+exec mount -i -t bcachefs ${argv[@]}
diff --git a/nix/fetchnix.nix b/nix/fetchnix.nix
deleted file mode 100644
index 2f98788f..00000000
--- a/nix/fetchnix.nix
+++ /dev/null
@@ -1,48 +0,0 @@
-# `builtins.fetchTarball` only accepts a `sha256` argument in Nix version 1.12
-# or later, so here we provide a function that can provide a compatible interface
-# to Nix 1.11 or Nix 1.12
-#
-# TODO FIXME: remove this sometime after Nix 1.12 goes stable
-
-{ url # URL of the nixpkgs tarball to download
-, rev # The Git revision of nixpkgs to fetch
-, sha256 # The SHA256 of the downloaded data
-, system ? builtins.currentSystem # This is overridable if necessary
-}:
-
-with {
- ifThenElse = { bool, thenValue, elseValue }: (
- if bool then thenValue else elseValue);
-};
-
-ifThenElse {
- bool = (0 <= builtins.compareVersions builtins.nixVersion "1.12");
-
- # In Nix 1.12, we can just give a `sha256` to `builtins.fetchTarball`.
- thenValue = (builtins.fetchTarball { inherit url sha256; });
-
- # This hack should at least work for Nix 1.11
- elseValue = (
- (rec {
- tarball = import <nix/fetchurl.nix> { inherit url sha256; };
- builtin-paths = import <nix/config.nix>;
-
- script = builtins.toFile "nixpkgs-unpacker" ''
- "$coreutils/mkdir" "$out"
- cd "$out"
- "$gzip" --decompress < "$tarball" | "$tar" -x --strip-components=1
- '';
-
- nixpkgs = builtins.derivation {
- name = "nixpkgs-${builtins.substring 0 6 rev}";
-
- builder = builtins.storePath builtin-paths.shell;
- args = [ script ];
-
- inherit tarball system;
- tar = builtins.storePath builtin-paths.tar;
- gzip = builtins.storePath builtin-paths.gzip;
- coreutils = builtins.storePath builtin-paths.coreutils;
- };
- }).nixpkgs);
-}
diff --git a/nix/nixpkgs.json b/nix/nixpkgs.json
deleted file mode 100644
index a5a11d05..00000000
--- a/nix/nixpkgs.json
+++ /dev/null
@@ -1,5 +0,0 @@
-{
- "url": "https://github.com/nixos/nixpkgs/archive/5ae883b8c3b04e0c4a9c92a5ab3c7c84b9942943.tar.gz",
- "rev": "5ae883b8c3b04e0c4a9c92a5ab3c7c84b9942943",
- "sha256": "1s2nhax586v2fax7r5qd1s3d2gdg25isva7k7r9pf9x9ay630cmb"
-}
diff --git a/nix/nixpkgs.nix b/nix/nixpkgs.nix
deleted file mode 100644
index 00673665..00000000
--- a/nix/nixpkgs.nix
+++ /dev/null
@@ -1,9 +0,0 @@
-let
- # Grab the versions we specified in the JSON file
- nixpkgs = builtins.fromJSON (builtins.readFile ./nixpkgs.json);
-
- # Bootstrap a copy of nixpkgs, based on this.
- src = import ./fetchnix.nix { inherit (nixpkgs) url rev sha256; };
-
-# We use the default nixpkgs configuration during bootstrap.
-in import src { config = {}; }
diff --git a/nix/update-nixpkgs.sh b/nix/update-nixpkgs.sh
deleted file mode 100755
index 770d2801..00000000
--- a/nix/update-nixpkgs.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env bash
-set -e
-
-if [[ "x$1" == "x" ]]; then
- echo "Must provide a revision argument"
- echo "Usage:"
- echo " ./update-nixpkgs.sh <rev>"
- echo " ./update-nixpkgs.sh https://github.com/foo/nixpkgs <rev>"
- exit 1
-fi
-
-if [[ "x$2" == "x" ]]; then
- REV="$1"
- URL="https://github.com/nixos/nixpkgs"
-else
- REV="$2"
- URL="$1"
-fi
-
-DOWNLOAD="$URL/archive/$REV.tar.gz"
-echo "Updating to nixpkgs revision $REV from $URL"
-SHA256=$(nix-prefetch-url "$DOWNLOAD")
-
-cat > nixpkgs.json <<EOF
-{
- "url": "$DOWNLOAD",
- "rev": "$REV",
- "sha256": "$SHA256"
-}
-EOF
-
-echo "Updated nixpkgs.json"
diff --git a/packaging/README b/packaging/README
new file mode 100644
index 00000000..704c60d6
--- /dev/null
+++ b/packaging/README
@@ -0,0 +1,32 @@
+==== liburcu requirement ====
+
+https://liburcu.org/ is needed for this software to work, but older RHEL/CentOS and other
+distros might not ship a usable package. You might have to do the following...
+
+1. Make & install the latest liburcu tarball from their site.
+2. In the Makefile for bcachefs-tools...
+ a. Remove liburcu from PKGCONFIG_LIBS
+ b. Add -lurcu to LDLIBS
+3. Add LD_LIBRARY_PATH=/usr/local/lib to /etc/environment
+4. Remove "BuildRequires: userspace-rcu-devel" and "Requires: userspace-rcu" from the
+ spec file here.
+
+==== RHEL-CentOS 7 ====
+
+You need to use https://www.softwarecollections.org/ to install a newer GCC.
+
+As root: yum install devtoolset-8-gcc
+
+Before building: scl enable devtoolset-8 bash
+
+==== RHEL-CentOS (any) ====
+
+1. As root, "yum install rpmdevtools help2man"
+2. Make a non-root user to build RPMs with.
+3. "su - (non-root user)" and use "rpmdev-setuptree" to create an RPM build structure.
+4. Copy the SPEC file from this directory to the "~/rpmbuild/SPECS/" folder.
+5. Rename the top-level source directory (the parent of this packaging/ directory) to
+   bcachefs-tools-(VERSION), and "cd" to its parent.
+6. "tar cjf bcachefs-tools-(VERSION).tar.bz2 bcachefs-tools-(VERSION)"
+7. "rpmbuild -bs ~/rpmbuild/SPECS/bcachefs-tools.spec"
+8. "rpmbuild -bb ~/rpmbuild/SPECS/bcachefs-tools.spec"
+9. The RPMs will be in "~/rpmbuild/RPMS" and "~/rpmbuild/SRPMS".
diff --git a/packaging/bcachefs-tools.spec b/packaging/bcachefs-tools.spec
new file mode 100644
index 00000000..97a83e27
--- /dev/null
+++ b/packaging/bcachefs-tools.spec
@@ -0,0 +1,75 @@
+Name: bcachefs-tools
+# define with i.e. --define '_version 1.0'
+Version: %{_version}
+Release: 1%{?dist}
+Summary: Userspace tools for bcachefs
+
+License: GPLv2
+URL: https://github.com/koverstreet/bcachefs-tools
+
+BuildRequires: gcc
+BuildRequires: make
+BuildRequires: cargo
+BuildRequires: clang-devel
+BuildRequires: keyutils-libs-devel
+BuildRequires: libaio-devel
+BuildRequires: libattr-devel
+BuildRequires: libblkid-devel
+BuildRequires: libsodium-devel
+BuildRequires: libuuid-devel
+BuildRequires: libzstd-devel
+BuildRequires: lz4-devel
+BuildRequires: systemd-devel
+BuildRequires: systemd-rpm-macros
+BuildRequires: udev
+BuildRequires: userspace-rcu-devel
+BuildRequires: zlib-devel
+
+%description
+The bcachefs tool has a number of subcommands for formatting and managing bcachefs filesystems. Run bcachefs --help for a full list of commands.
+
+%prep
+%setup -q
+
+%build
+%make_build V=0 --no-print-directory
+
+%install
+rm -rf $RPM_BUILD_ROOT
+mkdir -p $RPM_BUILD_ROOT%{_sbindir}
+mkdir -p $RPM_BUILD_ROOT%{_mandir}/man8
+%make_install PREFIX=%{_exec_prefix} ROOT_SBINDIR=%{_sbindir}
+
+# These may be Debian-specific; they are used for unlocking an encrypted root fs
+rm -f %{buildroot}/%{_datadir}/initramfs-tools/hooks/bcachefs
+rm -f %{buildroot}/%{_datadir}/initramfs-tools/scripts/local-premount/bcachefs
+# The library is not needed by userspace
+rm -f %{buildroot}/usr/lib/libbcachefs.so
+
+%files
+%{_sbindir}/bcachefs
+%{_sbindir}/mount.bcachefs
+%{_sbindir}/fsck.bcachefs
+%{_sbindir}/mkfs.bcachefs
+%{_sbindir}/mount.fuse.bcachefs
+%{_sbindir}/fsck.fuse.bcachefs
+%{_sbindir}/mkfs.fuse.bcachefs
+%{_mandir}/man8/bcachefs.8.gz
+%{_udevrulesdir}/64-bcachefs.rules
+%{_unitdir}/bcachefsck*
+%{_unitdir}/system-bcachefsck.slice
+%{_libexecdir}/bcachefsck*
+
+%changelog
+* Tue Nov 15 2022 Eric Sandeen <sandeen@sandeen.net> - 2022.11.15-1
+- NOTE: This binary RPM has been built directly from the bcachefs-tools
+ git tree with "make rpm" from the git hash indicated in the package version.
+- Update spec file to allow in-tree rpm builds
+- Remove manually added Requires: and unneeded build-requires
+
+* Tue Jan 21 2020 Michael Adams <unquietwiki@gmail.com> - 2020.01.21-1
+- Updated RPM package definition to reflect changes in the codebase.
+
+* Tue Jan 07 2020 Michael Adams <unquietwiki@gmail.com> - 2020.01.07-1
+- Initial RPM package definition
+- Makefile needs further work to accommodate RPM macros.
diff --git a/packaging/userspace-rcu.spec b/packaging/userspace-rcu.spec
new file mode 100644
index 00000000..40516cc2
--- /dev/null
+++ b/packaging/userspace-rcu.spec
@@ -0,0 +1,238 @@
+# rpmbuild with QA_RPATHS=$[0x0001]
+
+Name: userspace-rcu
+Version: 0.11.1
+Release: 2%{?dist}
+Summary: liburcu is an LGPLv2.1 userspace RCU (read-copy-update) library.
+
+License: LGPLv2.1
+URL: https://liburcu.org/
+Source0: https://lttng.org/files/urcu/%{name}-%{version}.tar.bz2
+Source1: https://lttng.org/files/urcu/%{name}-%{version}.tar.bz2.asc
+
+# "devel" files are installed with this package, also.
+Provides: userspace-rcu-devel
+
+# Recommend using https://www.softwarecollections.org/en/scls/rhscl/devtoolset-8/ for this
+
+BuildRequires: bzip2
+BuildRequires: gcc
+BuildRequires: make
+BuildRequires: m4
+
+%description
+liburcu provides efficient data structures based on RCU and lock-free algorithms. Those structures include hash tables, queues, stacks, and doubly-linked lists.
+
+%prep
+%setup -q
+
+%configure
+
+%build
+make
+
+%install
+rm -rf $RPM_BUILD_ROOT
+%make_install
+
+%files
+%{_datadir}/doc/userspace-rcu/cds-api.md
+%{_datadir}/doc/userspace-rcu/examples/hlist/cds_hlist_add_head_rcu.c
+%{_datadir}/doc/userspace-rcu/examples/hlist/cds_hlist_del_rcu.c
+%{_datadir}/doc/userspace-rcu/examples/hlist/cds_hlist_for_each_entry_rcu.c
+%{_datadir}/doc/userspace-rcu/examples/hlist/cds_hlist_for_each_rcu.c
+%{_datadir}/doc/userspace-rcu/examples/hlist/Makefile
+%{_datadir}/doc/userspace-rcu/examples/hlist/Makefile.cds_hlist_add_head_rcu
+%{_datadir}/doc/userspace-rcu/examples/hlist/Makefile.cds_hlist_del_rcu
+%{_datadir}/doc/userspace-rcu/examples/hlist/Makefile.cds_hlist_for_each_entry_rcu
+%{_datadir}/doc/userspace-rcu/examples/hlist/Makefile.cds_hlist_for_each_rcu
+%{_datadir}/doc/userspace-rcu/examples/lfstack/cds_lfs_pop_all_blocking.c
+%{_datadir}/doc/userspace-rcu/examples/lfstack/cds_lfs_pop_blocking.c
+%{_datadir}/doc/userspace-rcu/examples/lfstack/cds_lfs_push.c
+%{_datadir}/doc/userspace-rcu/examples/lfstack/Makefile
+%{_datadir}/doc/userspace-rcu/examples/lfstack/Makefile.cds_lfs_pop_all_blocking
+%{_datadir}/doc/userspace-rcu/examples/lfstack/Makefile.cds_lfs_pop_blocking
+%{_datadir}/doc/userspace-rcu/examples/lfstack/Makefile.cds_lfs_push
+%{_datadir}/doc/userspace-rcu/examples/list/cds_list_add_rcu.c
+%{_datadir}/doc/userspace-rcu/examples/list/cds_list_add_tail_rcu.c
+%{_datadir}/doc/userspace-rcu/examples/list/cds_list_del_rcu.c
+%{_datadir}/doc/userspace-rcu/examples/list/cds_list_for_each_entry_rcu.c
+%{_datadir}/doc/userspace-rcu/examples/list/cds_list_for_each_rcu.c
+%{_datadir}/doc/userspace-rcu/examples/list/cds_list_replace_rcu.c
+%{_datadir}/doc/userspace-rcu/examples/list/Makefile
+%{_datadir}/doc/userspace-rcu/examples/list/Makefile.cds_list_add_rcu
+%{_datadir}/doc/userspace-rcu/examples/list/Makefile.cds_list_add_tail_rcu
+%{_datadir}/doc/userspace-rcu/examples/list/Makefile.cds_list_del_rcu
+%{_datadir}/doc/userspace-rcu/examples/list/Makefile.cds_list_for_each_entry_rcu
+%{_datadir}/doc/userspace-rcu/examples/list/Makefile.cds_list_for_each_rcu
+%{_datadir}/doc/userspace-rcu/examples/list/Makefile.cds_list_replace_rcu
+%{_datadir}/doc/userspace-rcu/examples/Makefile
+%{_datadir}/doc/userspace-rcu/examples/Makefile.examples.template
+%{_datadir}/doc/userspace-rcu/examples/rculfhash/cds_lfht_add_replace.c
+%{_datadir}/doc/userspace-rcu/examples/rculfhash/cds_lfht_add_unique.c
+%{_datadir}/doc/userspace-rcu/examples/rculfhash/cds_lfht_add.c
+%{_datadir}/doc/userspace-rcu/examples/rculfhash/cds_lfht_del.c
+%{_datadir}/doc/userspace-rcu/examples/rculfhash/cds_lfht_destroy.c
+%{_datadir}/doc/userspace-rcu/examples/rculfhash/cds_lfht_for_each_entry_duplicate.c
+%{_datadir}/doc/userspace-rcu/examples/rculfhash/cds_lfht_lookup.c
+%{_datadir}/doc/userspace-rcu/examples/rculfhash/jhash.h
+%{_datadir}/doc/userspace-rcu/examples/rculfhash/Makefile
+%{_datadir}/doc/userspace-rcu/examples/rculfhash/Makefile.cds_lfht_add
+%{_datadir}/doc/userspace-rcu/examples/rculfhash/Makefile.cds_lfht_add_replace
+%{_datadir}/doc/userspace-rcu/examples/rculfhash/Makefile.cds_lfht_add_unique
+%{_datadir}/doc/userspace-rcu/examples/rculfhash/Makefile.cds_lfht_del
+%{_datadir}/doc/userspace-rcu/examples/rculfhash/Makefile.cds_lfht_destroy
+%{_datadir}/doc/userspace-rcu/examples/rculfhash/Makefile.cds_lfht_for_each_entry_duplicate
+%{_datadir}/doc/userspace-rcu/examples/rculfhash/Makefile.cds_lfht_lookup
+%{_datadir}/doc/userspace-rcu/examples/rculfqueue/cds_lfq_dequeue.c
+%{_datadir}/doc/userspace-rcu/examples/rculfqueue/cds_lfq_enqueue.c
+%{_datadir}/doc/userspace-rcu/examples/rculfqueue/Makefile
+%{_datadir}/doc/userspace-rcu/examples/rculfqueue/Makefile.cds_lfq_dequeue
+%{_datadir}/doc/userspace-rcu/examples/rculfqueue/Makefile.cds_lfq_enqueue
+%{_datadir}/doc/userspace-rcu/examples/urcu-flavors/bp.c
+%{_datadir}/doc/userspace-rcu/examples/urcu-flavors/Makefile
+%{_datadir}/doc/userspace-rcu/examples/urcu-flavors/Makefile.bp
+%{_datadir}/doc/userspace-rcu/examples/urcu-flavors/Makefile.mb
+%{_datadir}/doc/userspace-rcu/examples/urcu-flavors/Makefile.membarrier
+%{_datadir}/doc/userspace-rcu/examples/urcu-flavors/Makefile.qsbr
+%{_datadir}/doc/userspace-rcu/examples/urcu-flavors/Makefile.signal
+%{_datadir}/doc/userspace-rcu/examples/urcu-flavors/mb.c
+%{_datadir}/doc/userspace-rcu/examples/urcu-flavors/membarrier.c
+%{_datadir}/doc/userspace-rcu/examples/urcu-flavors/qsbr.c
+%{_datadir}/doc/userspace-rcu/examples/urcu-flavors/signal.c
+%{_datadir}/doc/userspace-rcu/examples/wfcqueue/cds_wfcq_dequeue.c
+%{_datadir}/doc/userspace-rcu/examples/wfcqueue/cds_wfcq_enqueue.c
+%{_datadir}/doc/userspace-rcu/examples/wfcqueue/cds_wfcq_splice.c
+%{_datadir}/doc/userspace-rcu/examples/wfcqueue/Makefile
+%{_datadir}/doc/userspace-rcu/examples/wfcqueue/Makefile.cds_wfcq_dequeue
+%{_datadir}/doc/userspace-rcu/examples/wfcqueue/Makefile.cds_wfcq_enqueue
+%{_datadir}/doc/userspace-rcu/examples/wfcqueue/Makefile.cds_wfcq_splice
+%{_datadir}/doc/userspace-rcu/examples/wfstack/cds_wfs_pop_all_blocking.c
+%{_datadir}/doc/userspace-rcu/examples/wfstack/cds_wfs_pop.c
+%{_datadir}/doc/userspace-rcu/examples/wfstack/cds_wfs_push.c
+%{_datadir}/doc/userspace-rcu/examples/wfstack/Makefile
+%{_datadir}/doc/userspace-rcu/examples/wfstack/Makefile.cds_wfs_pop
+%{_datadir}/doc/userspace-rcu/examples/wfstack/Makefile.cds_wfs_pop_all_blocking
+%{_datadir}/doc/userspace-rcu/examples/wfstack/Makefile.cds_wfs_push
+%{_datadir}/doc/userspace-rcu/LICENSE
+%{_datadir}/doc/userspace-rcu/rcu-api.md
+%{_datadir}/doc/userspace-rcu/README.md
+%{_datadir}/doc/userspace-rcu/solaris-build.md
+%{_datadir}/doc/userspace-rcu/uatomic-api.md
+%{_includedir}/urcu-bp.h
+%{_includedir}/urcu-call-rcu.h
+%{_includedir}/urcu-defer.h
+%{_includedir}/urcu-flavor.h
+%{_includedir}/urcu-pointer.h
+%{_includedir}/urcu-qsbr.h
+%{_includedir}/urcu.h
+%{_includedir}/urcu/arch.h
+%{_includedir}/urcu/arch/generic.h
+%{_includedir}/urcu/call-rcu.h
+%{_includedir}/urcu/cds.h
+%{_includedir}/urcu/compiler.h
+%{_includedir}/urcu/config.h
+%{_includedir}/urcu/debug.h
+%{_includedir}/urcu/defer.h
+%{_includedir}/urcu/flavor.h
+%{_includedir}/urcu/futex.h
+%{_includedir}/urcu/hlist.h
+%{_includedir}/urcu/lfstack.h
+%{_includedir}/urcu/list.h
+%{_includedir}/urcu/map/clear.h
+%{_includedir}/urcu/map/urcu-bp.h
+%{_includedir}/urcu/map/urcu-mb.h
+%{_includedir}/urcu/map/urcu-memb.h
+%{_includedir}/urcu/map/urcu-qsbr.h
+%{_includedir}/urcu/map/urcu-signal.h
+%{_includedir}/urcu/map/urcu.h
+%{_includedir}/urcu/pointer.h
+%{_includedir}/urcu/rcuhlist.h
+%{_includedir}/urcu/rculfhash.h
+%{_includedir}/urcu/rculfqueue.h
+%{_includedir}/urcu/rculfstack.h
+%{_includedir}/urcu/rculist.h
+%{_includedir}/urcu/ref.h
+%{_includedir}/urcu/static/lfstack.h
+%{_includedir}/urcu/static/pointer.h
+%{_includedir}/urcu/static/rculfqueue.h
+%{_includedir}/urcu/static/rculfstack.h
+%{_includedir}/urcu/static/urcu-bp.h
+%{_includedir}/urcu/static/urcu-common.h
+%{_includedir}/urcu/static/urcu-mb.h
+%{_includedir}/urcu/static/urcu-memb.h
+%{_includedir}/urcu/static/urcu-qsbr.h
+%{_includedir}/urcu/static/urcu-signal.h
+%{_includedir}/urcu/static/urcu.h
+%{_includedir}/urcu/static/wfcqueue.h
+%{_includedir}/urcu/static/wfqueue.h
+%{_includedir}/urcu/static/wfstack.h
+%{_includedir}/urcu/syscall-compat.h
+%{_includedir}/urcu/system.h
+%{_includedir}/urcu/tls-compat.h
+%{_includedir}/urcu/uatomic_arch.h
+%{_includedir}/urcu/uatomic.h
+%{_includedir}/urcu/uatomic/generic.h
+%{_includedir}/urcu/urcu_ref.h
+%{_includedir}/urcu/urcu-bp.h
+%{_includedir}/urcu/urcu-futex.h
+%{_includedir}/urcu/urcu-mb.h
+%{_includedir}/urcu/urcu-memb.h
+%{_includedir}/urcu/urcu-qsbr.h
+%{_includedir}/urcu/urcu-signal.h
+%{_includedir}/urcu/urcu.h
+%{_includedir}/urcu/wfcqueue.h
+%{_includedir}/urcu/wfqueue.h
+%{_includedir}/urcu/wfstack.h
+%{_libdir}/liburcu-bp.a
+%{_libdir}/liburcu-bp.la
+%{_libdir}/liburcu-bp.so
+%{_libdir}/liburcu-bp.so.6
+%{_libdir}/liburcu-bp.so.6.1.0
+%{_libdir}/liburcu-cds.a
+%{_libdir}/liburcu-cds.la
+%{_libdir}/liburcu-cds.so
+%{_libdir}/liburcu-cds.so.6
+%{_libdir}/liburcu-cds.so.6.1.0
+%{_libdir}/liburcu-common.a
+%{_libdir}/liburcu-common.la
+%{_libdir}/liburcu-common.so
+%{_libdir}/liburcu-common.so.6
+%{_libdir}/liburcu-common.so.6.1.0
+%{_libdir}/liburcu-mb.a
+%{_libdir}/liburcu-mb.la
+%{_libdir}/liburcu-mb.so
+%{_libdir}/liburcu-mb.so.6
+%{_libdir}/liburcu-mb.so.6.1.0
+%{_libdir}/liburcu-memb.a
+%{_libdir}/liburcu-memb.la
+%{_libdir}/liburcu-memb.so
+%{_libdir}/liburcu-memb.so.6
+%{_libdir}/liburcu-memb.so.6.1.0
+%{_libdir}/liburcu-qsbr.a
+%{_libdir}/liburcu-qsbr.la
+%{_libdir}/liburcu-qsbr.so
+%{_libdir}/liburcu-qsbr.so.6
+%{_libdir}/liburcu-qsbr.so.6.1.0
+%{_libdir}/liburcu-signal.a
+%{_libdir}/liburcu-signal.la
+%{_libdir}/liburcu-signal.so
+%{_libdir}/liburcu-signal.so.6
+%{_libdir}/liburcu-signal.so.6.1.0
+%{_libdir}/liburcu.a
+%{_libdir}/liburcu.la
+%{_libdir}/liburcu.so
+%{_libdir}/liburcu.so.6
+%{_libdir}/liburcu.so.6.1.0
+%{_libdir}/pkgconfig/liburcu-bp.pc
+%{_libdir}/pkgconfig/liburcu-cds.pc
+%{_libdir}/pkgconfig/liburcu-mb.pc
+%{_libdir}/pkgconfig/liburcu-qsbr.pc
+%{_libdir}/pkgconfig/liburcu-signal.pc
+%{_libdir}/pkgconfig/liburcu.pc
+
+%changelog
+* Mon Feb 24 2020 Michael Adams <unquietwiki@gmail.com> - 0.11-2
+- Try to fix RPM package install warning
+* Tue Jan 07 2020 Michael Adams <unquietwiki@gmail.com> - 0.11-1
+- Initial RPM package
diff --git a/raid/COPYING b/raid/COPYING
new file mode 100644
index 00000000..a43ea212
--- /dev/null
+++ b/raid/COPYING
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 675 Mass Ave, Cambridge, MA 02139, USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ Appendix: How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) 19yy <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) 19yy name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/raid/check.c b/raid/check.c
new file mode 100644
index 00000000..9bed9337
--- /dev/null
+++ b/raid/check.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright (C) 2015 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "internal.h"
+#include "combo.h"
+#include "gf.h"
+
+/**
+ * Validate the provided failed blocks.
+ *
+ * This function checks if the specified failed blocks satisfy the redundancy
+ * information using the data from the known valid parity blocks.
+ *
+ * It's similar to raid_check(), just with a different argument format.
+ *
+ * The number of failed blocks @nr must be strictly less than the number of
+ * parities @nv, because one extra parity is needed to validate the recovery.
+ *
+ * No data or parity blocks are modified.
+ *
+ * @nr Number of failed data blocks.
+ * @id[] Vector of @nr indexes of the failed data blocks.
+ * The indexes start from 0. They must be in order.
+ * @nv Number of valid parity blocks.
+ * @ip[] Vector of @nv indexes of the valid parity blocks.
+ * The indexes start from 0. They must be in order.
+ * @nd Number of data blocks.
+ * @size Size of the blocks pointed to by @v. It must be a multiple of 64.
+ * @v Vector of pointers to the blocks of data and parity.
+ * It has (@nd + @ip[@nv - 1] + 1) elements. The starting elements are the
+ * blocks for data, followed by the parity blocks.
+ * Each block has @size bytes.
+ * @return 0 if the check is satisfied. -1 otherwise.
+ */
+static int raid_validate(int nr, int *id, int nv, int *ip, int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ const uint8_t *T[RAID_PARITY_MAX][RAID_PARITY_MAX];
+ uint8_t G[RAID_PARITY_MAX * RAID_PARITY_MAX];
+ uint8_t V[RAID_PARITY_MAX * RAID_PARITY_MAX];
+ size_t i;
+ int j, k, l;
+
+ BUG_ON(nr >= nv);
+
+ /* setup the coefficients matrix */
+ for (j = 0; j < nr; ++j)
+ for (k = 0; k < nr; ++k)
+ G[j * nr + k] = A(ip[j], id[k]);
+
+ /* invert it to solve the system of linear equations */
+ raid_invert(G, V, nr);
+
+ /* get multiplication tables */
+ for (j = 0; j < nr; ++j)
+ for (k = 0; k < nr; ++k)
+ T[j][k] = table(V[j * nr + k]);
+
+ /* check all positions */
+ for (i = 0; i < size; ++i) {
+ uint8_t p[RAID_PARITY_MAX];
+
+ /* get parity */
+ for (j = 0; j < nv; ++j)
+ p[j] = v[nd + ip[j]][i];
+
+ /* compute delta parity, skipping broken disks */
+ for (j = 0, k = 0; j < nd; ++j) {
+ uint8_t b;
+
+ /* skip broken disks */
+ if (k < nr && id[k] == j) {
+ ++k;
+ continue;
+ }
+
+ b = v[j][i];
+ for (l = 0; l < nv; ++l)
+ p[l] ^= gfmul[b][gfgen[ip[l]][j]];
+ }
+
+ /* reconstruct data */
+ for (j = 0; j < nr; ++j) {
+ uint8_t b = 0;
+ int idj = id[j];
+
+ /* recompute the data */
+ for (k = 0; k < nr; ++k)
+ b ^= T[j][k][p[k]];
+
+ /* add the parity contribution of the reconstructed data */
+ for (l = nr; l < nv; ++l)
+ p[l] ^= gfmul[b][gfgen[ip[l]][idj]];
+ }
+
+ /* check that the final parity is 0 */
+ for (l = nr; l < nv; ++l)
+ if (p[l] != 0)
+ return -1;
+ }
+
+ return 0;
+}
+
+int raid_check(int nr, int *ir, int nd, int np, size_t size, void **v)
+{
+ /* valid parity index */
+ int ip[RAID_PARITY_MAX];
+ int vp;
+ int rd;
+ int i, j;
+
+ /* enforce limit on size */
+ BUG_ON(size % 64 != 0);
+
+ /* enforce limit on number of failures */
+ BUG_ON(nr >= np); /* >= because we check with extra parity */
+ BUG_ON(np > RAID_PARITY_MAX);
+
+ /* enforce order in index vector */
+ BUG_ON(nr >= 2 && ir[0] >= ir[1]);
+ BUG_ON(nr >= 3 && ir[1] >= ir[2]);
+ BUG_ON(nr >= 4 && ir[2] >= ir[3]);
+ BUG_ON(nr >= 5 && ir[3] >= ir[4]);
+ BUG_ON(nr >= 6 && ir[4] >= ir[5]);
+
+ /* enforce limit on index vector */
+ BUG_ON(nr > 0 && ir[nr-1] >= nd + np);
+
+	/* count failed data disks */
+ rd = 0;
+ while (rd < nr && ir[rd] < nd)
+ ++rd;
+
+ /* put valid parities into ip[] */
+ vp = 0;
+ for (i = rd, j = 0; j < np; ++j) {
+ /* if parity is failed */
+ if (i < nr && ir[i] == nd + j) {
+ /* skip broken parity */
+ ++i;
+ } else {
+ /* store valid parity */
+ ip[vp] = j;
+ ++vp;
+ }
+ }
+
+ return raid_validate(rd, ir, vp, ip, nd, size, v);
+}
+
+int raid_scan(int *ir, int nd, int np, size_t size, void **v)
+{
+ int r;
+
+ /* check the special case of no failure */
+ if (np != 0 && raid_check(0, 0, nd, np, size, v) == 0)
+ return 0;
+
+ /* for each number of possible failures */
+ for (r = 1; r < np; ++r) {
+ /* try all combinations of r failures on n disks */
+ combination_first(r, nd + np, ir);
+ do {
+ /* verify if the combination is a valid one */
+ if (raid_check(r, ir, nd, np, size, v) == 0)
+ return r;
+ } while (combination_next(r, nd + np, ir));
+ }
+
+ /* no solution found */
+ return -1;
+}
+
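Usage sketch for the raid_check()/raid_scan() entry points above (illustrative only, not part of the patch; the wrapper name, the caller-provided buffer vector and the error handling are assumptions):

    /* Verify a stripe of nd data blocks and np parity blocks of 'size' bytes each. */
    static int verify_stripe(void **v, int nd, int np, size_t size)
    {
    	int ir[RAID_PARITY_MAX];                 /* receives failed indexes, sorted */
    	int nr = raid_scan(ir, nd, np, size, v);

    	if (nr == 0)
    		return 0;    /* all parities match, the stripe is consistent */
    	if (nr < 0)
    		return -1;   /* too many failures to identify with np parities */
    	return nr;           /* nr suspect blocks, listed in ir[0..nr-1] */
    }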
diff --git a/raid/combo.h b/raid/combo.h
new file mode 100644
index 00000000..8efc31ad
--- /dev/null
+++ b/raid/combo.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __RAID_COMBO_H
+#define __RAID_COMBO_H
+
+#include <assert.h>
+
+/**
+ * Get the first permutation with repetition of r of n elements.
+ *
+ * Typical use is with permutation_next() in the form:
+ *
+ * int i[R];
+ * permutation_first(R, N, i);
+ * do {
+ * code using i[0], i[1], ..., i[R-1]
+ * } while (permutation_next(R, N, i));
+ *
+ * It's equivalent to the code:
+ *
+ * for(i[0]=0;i[0]<N;++i[0])
+ * for(i[1]=0;i[1]<N;++i[1])
+ * ...
+ * for(i[R-2]=0;i[R-2]<N;++i[R-2])
+ * for(i[R-1]=0;i[R-1]<N;++i[R-1])
+ * code using i[0], i[1], ..., i[R-1]
+ */
+static __always_inline void permutation_first(int r, int n, int *c)
+{
+ int i;
+
+ (void)n; /* unused, but kept for clarity */
+ assert(0 < r && r <= n);
+
+ for (i = 0; i < r; ++i)
+ c[i] = 0;
+}
+
+/**
+ * Get the next permutation with repetition of r of n elements.
+ * Return ==0 when finished.
+ */
+static __always_inline int permutation_next(int r, int n, int *c)
+{
+ int i = r - 1; /* present position */
+
+recurse:
+ /* next element at position i */
+ ++c[i];
+
+ /* if the position has reached the max */
+ if (c[i] >= n) {
+
+ /* if we are at the first level, we have finished */
+ if (i == 0)
+ return 0;
+
+ /* increase the previous position */
+ --i;
+ goto recurse;
+ }
+
+ ++i;
+
+ /* initialize all the next positions, if any */
+ while (i < r) {
+ c[i] = 0;
+ ++i;
+ }
+
+ return 1;
+}
+
+/**
+ * Get the first combination without repetition of r of n elements.
+ *
+ * Typical use is with combination_next() in the form:
+ *
+ * int i[R];
+ * combination_first(R, N, i);
+ * do {
+ * code using i[0], i[1], ..., i[R-1]
+ * } while (combination_next(R, N, i));
+ *
+ * It's equivalent to the code:
+ *
+ * for(i[0]=0;i[0]<N-(R-1);++i[0])
+ * for(i[1]=i[0]+1;i[1]<N-(R-2);++i[1])
+ * ...
+ * for(i[R-2]=i[R-3]+1;i[R-2]<N-1;++i[R-2])
+ * for(i[R-1]=i[R-2]+1;i[R-1]<N;++i[R-1])
+ * code using i[0], i[1], ..., i[R-1]
+ */
+static __always_inline void combination_first(int r, int n, int *c)
+{
+ int i;
+
+ (void)n; /* unused, but kept for clarity */
+ assert(0 < r && r <= n);
+
+ for (i = 0; i < r; ++i)
+ c[i] = i;
+}
+
+/**
+ * Get the next combination without repetition of r of n elements.
+ * Return ==0 when finished.
+ */
+static __always_inline int combination_next(int r, int n, int *c)
+{
+ int i = r - 1; /* present position */
+ int h = n; /* high limit for this position */
+
+recurse:
+ /* next element at position i */
+ ++c[i];
+
+ /* if the position has reached the max */
+ if (c[i] >= h) {
+
+ /* if we are at the first level, we have finished */
+ if (i == 0)
+ return 0;
+
+ /* increase the previous position */
+ --i;
+ --h;
+ goto recurse;
+ }
+
+ ++i;
+
+ /* initialize all the next positions, if any */
+ while (i < r) {
+		/* each position starts at the next value of the previous one */
+ c[i] = c[i - 1] + 1;
+ ++i;
+ }
+
+ return 1;
+}
+#endif
+
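The iteration pattern documented above can be exercised directly; for example (illustrative only, not part of the patch; the printf() call and the loop body are assumptions), enumerating every pair out of 4 disks:

    int i[2];

    combination_first(2, 4, i);
    do {
    	/* visits (0,1) (0,2) (0,3) (1,2) (1,3) (2,3) in this order */
    	printf("%d %d\n", i[0], i[1]);
    } while (combination_next(2, 4, i));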
diff --git a/raid/cpu.h b/raid/cpu.h
new file mode 100644
index 00000000..164ac8ce
--- /dev/null
+++ b/raid/cpu.h
@@ -0,0 +1,331 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __RAID_CPU_H
+#define __RAID_CPU_H
+
+#ifdef CONFIG_X86
+
+static inline void raid_cpuid(uint32_t func_eax, uint32_t sub_ecx, uint32_t *reg)
+{
+ asm volatile (
+#if defined(__i386__) && defined(__PIC__)
+ /* allow compilation in PIC mode saving ebx */
+ "xchgl %%ebx, %1\n"
+ "cpuid\n"
+ "xchgl %%ebx, %1\n"
+ : "=a" (reg[0]), "=r" (reg[1]), "=c" (reg[2]), "=d" (reg[3])
+ : "0" (func_eax), "2" (sub_ecx)
+#else
+ "cpuid\n"
+ : "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3])
+ : "0" (func_eax), "2" (sub_ecx)
+#endif
+ );
+}
+
+static inline void raid_xgetbv(uint32_t* reg)
+{
+ /* get the value of the Extended Control Register ecx=0 */
+ asm volatile (
+ /* uses a direct encoding of the XGETBV instruction as only recent */
+ /* assemblers support it. */
+	/* the next line is equivalent to: "xgetbv\n" */
+ ".byte 0x0f, 0x01, 0xd0\n"
+ : "=a" (reg[0]), "=d" (reg[3])
+ : "c" (0)
+ );
+}
+
+#define CPU_VENDOR_MAX 13
+
+static inline void raid_cpu_info(char *vendor, unsigned *family, unsigned *model)
+{
+ uint32_t reg[4];
+ unsigned f, ef, m, em;
+
+ raid_cpuid(0, 0, reg);
+
+ memcpy(vendor, &reg[1], 4);
+ memcpy(vendor + 4, &reg[3], 4);
+ memcpy(vendor + 8, &reg[2], 4);
+ vendor[12] = 0;
+
+ raid_cpuid(1, 0, reg);
+
+ f = (reg[0] >> 8) & 0xF;
+ ef = (reg[0] >> 20) & 0xFF;
+ m = (reg[0] >> 4) & 0xF;
+ em = (reg[0] >> 16) & 0xF;
+
+ if (strcmp(vendor, "AuthenticAMD") == 0) {
+ if (f < 15) {
+ *family = f;
+ *model = m;
+ } else {
+ *family = f + ef;
+ *model = m + (em << 4);
+ }
+ } else {
+ *family = f + ef;
+ *model = m + (em << 4);
+ }
+}
+
+static inline int raid_cpu_match_sse(uint32_t cpuid_1_ecx, uint32_t cpuid_1_edx)
+{
+ uint32_t reg[4];
+
+ raid_cpuid(1, 0, reg);
+ if ((reg[2] & cpuid_1_ecx) != cpuid_1_ecx)
+ return 0;
+ if ((reg[3] & cpuid_1_edx) != cpuid_1_edx)
+ return 0;
+
+ return 1;
+}
+
+static inline int raid_cpu_match_avx(uint32_t cpuid_1_ecx, uint32_t cpuid_7_ebx, uint32_t xcr0)
+{
+ uint32_t reg[4];
+
+ raid_cpuid(1, 0, reg);
+ if ((reg[2] & cpuid_1_ecx) != cpuid_1_ecx)
+ return 0;
+
+ raid_xgetbv(reg);
+ if ((reg[0] & xcr0) != xcr0)
+ return 0;
+
+ raid_cpuid(7, 0, reg);
+ if ((reg[1] & cpuid_7_ebx) != cpuid_7_ebx)
+ return 0;
+
+ return 1;
+}
+
+static inline int raid_cpu_has_sse2(void)
+{
+ /*
+ * Intel® 64 and IA-32 Architectures Software Developer's Manual
+ * 325462-048US September 2013
+ *
+ * 11.6.2 Checking for SSE/SSE2 Support
+ * Before an application attempts to use the SSE and/or SSE2 extensions, it should check
+ * that they are present on the processor:
+ * 1. Check that the processor supports the CPUID instruction. Bit 21 of the EFLAGS
+ * register can be used to check the processor's support for the CPUID instruction.
+ * 2. Check that the processor supports the SSE and/or SSE2 extensions (true if
+ * CPUID.01H:EDX.SSE[bit 25] = 1 and/or CPUID.01H:EDX.SSE2[bit 26] = 1).
+ */
+ return raid_cpu_match_sse(
+ 0,
+ 1 << 26); /* SSE2 */
+}
+
+static inline int raid_cpu_has_ssse3(void)
+{
+ /*
+ * Intel® 64 and IA-32 Architectures Software Developer's Manual
+ * 325462-048US September 2013
+ *
+ * 12.7.2 Checking for SSSE3 Support
+ * Before an application attempts to use the SSSE3 extensions, the application should
+ * follow the steps illustrated in Section 11.6.2, "Checking for SSE/SSE2 Support."
+ * Next, use the additional step provided below:
+ * Check that the processor supports SSSE3 (if CPUID.01H:ECX.SSSE3[bit 9] = 1).
+ */
+ return raid_cpu_match_sse(
+ 1 << 9, /* SSSE3 */
+ 1 << 26); /* SSE2 */
+}
+
+static inline int raid_cpu_has_crc32(void)
+{
+ /*
+ * Intel® 64 and IA-32 Architectures Software Developer's Manual
+ * 325462-048US September 2013
+ *
+ * 12.12.3 Checking for SSE4.2 Support
+ * ...
+ * Before an application attempts to use the CRC32 instruction, it must check
+ * that the processor supports SSE4.2 (if CPUID.01H:ECX.SSE4_2[bit 20] = 1).
+ */
+ return raid_cpu_match_sse(
+ 1 << 20, /* CRC32 */
+ 0);
+}
+
+static inline int raid_cpu_has_avx2(void)
+{
+ /*
+ * Intel Architecture Instruction Set Extensions Programming Reference
+ * 319433-022 October 2014
+ *
+ * 14.3 Detection of AVX instructions
+ * 1) Detect CPUID.1:ECX.OSXSAVE[bit 27] = 1 (XGETBV enabled for application use)
+ * 2) Issue XGETBV and verify that XCR0[2:1] = `11b' (XMM state and YMM state are enabled by OS).
+ * 3) detect CPUID.1:ECX.AVX[bit 28] = 1 (AVX instructions supported).
+ * (Step 3 can be done in any order relative to 1 and 2)
+ *
+ * 14.7.1 Detection of AVX2
+ * Hardware support for AVX2 is indicated by CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]=1.
+ * Application Software must identify that hardware supports AVX, after that it must
+ * also detect support for AVX2 by checking CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5].
+ */
+ return raid_cpu_match_avx(
+ (1 << 27) | (1 << 28), /* OSXSAVE and AVX */
+ 1 << 5, /* AVX2 */
+ 3 << 1); /* OS saves XMM and YMM registers */
+}
+
+static inline int raid_cpu_has_avx512bw(void)
+{
+ /*
+ * Intel Architecture Instruction Set Extensions Programming Reference
+ * 319433-022 October 2014
+ *
+ * 2.2 Detection of 512-bit Instruction Groups of Intel AVX-512 Family
+ * 1) Detect CPUID.1:ECX.OSXSAVE[bit 27] = 1 (XGETBV enabled for application use)
+ * 2) Execute XGETBV and verify that XCR0[7:5] = `111b' (OPMASK state, upper 256-bit of
+ * ZMM0-ZMM15 and ZMM16-ZMM31 state are enabled by OS) and that XCR0[2:1] = `11b'
+ * (XMM state and YMM state are enabled by OS).
+ * 3) Verify both CPUID.0x7.0:EBX.AVX512F[bit 16] = 1, CPUID.0x7.0:EBX.AVX512BW[bit 30] = 1.
+ */
+
+ /* note that intentionally we don't check for AVX and AVX2 */
+ /* because the documentation doesn't require that */
+ return raid_cpu_match_avx(
+ 1 << 27, /* XSAVE/XGETBV */
+ (1 << 16) | (1 << 30), /* AVX512F and AVX512BW */
+ (3 << 1) | (7 << 5)); /* OS saves XMM, YMM and ZMM registers */
+}
+
+/**
+ * Check if it's an Intel Atom CPU.
+ */
+static inline int raid_cpu_is_atom(unsigned family, unsigned model)
+{
+ if (family != 6)
+ return 0;
+
+ /*
+ * x86 Architecture CPUID
+ * http://www.sandpile.org/x86/cpuid.htm
+ *
+ * Intel Atom
+ * 1C (28) Atom (45 nm) with 512 KB on-die L2
+ * 26 (38) Atom (45 nm) with 512 KB on-die L2
+ * 36 (54) Atom (32 nm) with 512 KB on-die L2
+ * 27 (39) Atom (32 nm) with 512 KB on-die L2
+ * 35 (53) Atom (?? nm) with ??? KB on-die L2
+ * 4A (74) Atom 2C (22 nm) 1 MB L2 + PowerVR (TGR)
+ * 5A (90) Atom 4C (22 nm) 2 MB L2 + PowerVR (ANN)
+ * 37 (55) Atom 4C (22 nm) 2 MB L2 + Intel Gen7 (BYT)
+ * 4C (76) Atom 4C (14 nm) 2 MB L2 + Intel Gen8 (BSW)
+ * 5D (93) Atom 4C (28 nm TSMC) 1 MB L2 + Mali (SoFIA)
+ * 4D (77) Atom 8C (22 nm) 4 MB L2 (AVN)
+ * ?? Atom ?C (14 nm) ? MB L2 (DVN)
+ */
+ return model == 28 || model == 38 || model == 54
+ || model == 39 || model == 53 || model == 74
+ || model == 90 || model == 55 || model == 76
+ || model == 93 || model == 77;
+}
+
+/**
+ * Check if the processor has a slow MULT implementation.
+ * If yes, it's better to use a hash not based on multiplication.
+ */
+static inline int raid_cpu_has_slowmult(void)
+{
+ char vendor[CPU_VENDOR_MAX];
+ unsigned family;
+ unsigned model;
+
+ /*
+	 * In some cases Murmur3, which is based on the MUL instruction,
+	 * is a LOT slower than Spooky2, which is based on SHIFTs.
+ */
+ raid_cpu_info(vendor, &family, &model);
+
+ if (strcmp(vendor, "GenuineIntel") == 0) {
+ /*
+ * Intel Atom (Model 28)
+ * murmur3:378 MB/s, spooky2:3413 MB/s (x86)
+ *
+ * Intel Atom (Model 77)
+ * murmur3:1311 MB/s, spooky2:4056 MB/s (x64)
+ */
+ if (raid_cpu_is_atom(family, model))
+ return 1;
+ }
+
+ return 0;
+}
+
+/**
+ * Check if the processor has a slow extended set of SSE registers.
+ * If yes, it's better to limit the unroll to the first 8 registers.
+ */
+static inline int raid_cpu_has_slowextendedreg(void)
+{
+ char vendor[CPU_VENDOR_MAX];
+ unsigned family;
+ unsigned model;
+
+ /*
+ * In some cases the PAR2 implementation using 16 SSE registers
+ * is a LITTLE slower than the one using only the first 8 registers.
+ * This doesn't happen for PARZ.
+ */
+ raid_cpu_info(vendor, &family, &model);
+
+ if (strcmp(vendor, "AuthenticAMD") == 0) {
+ /*
+ * AMD Bulldozer
+ * par2_sse2:4922 MB/s, par2_sse2e:4465 MB/s
+ */
+ if (family == 21)
+ return 1;
+ }
+
+ if (strcmp(vendor, "GenuineIntel") == 0) {
+ /*
+ * Intel Atom (Model 77)
+ * par2_sse2:5686 MB/s, par2_sse2e:5250 MB/s
+ * parz_sse2:3100 MB/s, parz_sse2e:3400 MB/s
+ * par3_sse3:1921 MB/s, par3_sse3e:1813 MB/s
+ * par4_sse3:1175 MB/s, par4_sse3e:1113 MB/s
+ * par5_sse3:876 MB/s, par5_sse3e:675 MB/s
+ * par6_sse3:705 MB/s, par6_sse3e:529 MB/s
+ *
+ * Intel Atom (Model 77) "Avoton C2750"
+ * par2_sse2:5661 MB/s, par2_sse2e:5382 MB/s
+ * parz_sse2:3110 MB/s, parz_sse2e:3450 MB/s
+ * par3_sse3:1769 MB/s, par3_sse3e:1856 MB/s
+ * par4_sse3:1221 MB/s, par4_sse3e:1141 MB/s
+ * par5_sse3:910 MB/s, par5_sse3e:675 MB/s
+ * par6_sse3:720 MB/s, par6_sse3e:534 MB/s
+ */
+ if (raid_cpu_is_atom(family, model))
+ return 1;
+ }
+
+ return 0;
+}
+#endif
+
+#endif
+
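These feature tests are intended for one-time runtime dispatch of the parity kernels; a minimal sketch (illustrative only, not part of the patch; raid_gen2_avx2 and raid_gen2_sse2 are hypothetical kernel names, only raid_gen2_int64 appears in this series):

    void (*raid_gen2)(int nd, size_t size, void **vv);

    static void raid_select_kernels(void)
    {
    	raid_gen2 = raid_gen2_int64;          /* portable C fallback */
    #ifdef CONFIG_X86
    	if (raid_cpu_has_avx2())
    		raid_gen2 = raid_gen2_avx2;   /* hypothetical AVX2 kernel */
    	else if (raid_cpu_has_sse2())
    		raid_gen2 = raid_gen2_sse2;   /* hypothetical SSE2 kernel */
    #endif
    }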
diff --git a/raid/gf.h b/raid/gf.h
new file mode 100644
index 00000000..1702c287
--- /dev/null
+++ b/raid/gf.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __RAID_GF_H
+#define __RAID_GF_H
+
+/*
+ * Galois field operations.
+ *
+ * Basic range checks are implemented using BUG_ON().
+ */
+
+/*
+ * GF a*b.
+ */
+static __always_inline uint8_t mul(uint8_t a, uint8_t b)
+{
+ return gfmul[a][b];
+}
+
+/*
+ * GF 1/a.
+ * Not defined for a == 0.
+ */
+static __always_inline uint8_t inv(uint8_t v)
+{
+ BUG_ON(v == 0); /* division by zero */
+
+ return gfinv[v];
+}
+
+/*
+ * GF 2^a.
+ */
+static __always_inline uint8_t pow2(int v)
+{
+ BUG_ON(v < 0 || v > 254); /* invalid exponent */
+
+ return gfexp[v];
+}
+
+/*
+ * Gets the multiplication table for a specified value.
+ */
+static __always_inline const uint8_t *table(uint8_t v)
+{
+ return gfmul[v];
+}
+
+/*
+ * Gets the generator matrix coefficient for parity 'p' and disk 'd'.
+ */
+static __always_inline uint8_t A(int p, int d)
+{
+ return gfgen[p][d];
+}
+
+/*
+ * Dereference as uint8_t
+ */
+#define v_8(p) (*(uint8_t *)&(p))
+
+/*
+ * Dereference as uint32_t
+ */
+#define v_32(p) (*(uint32_t *)&(p))
+
+/*
+ * Dereference as uint64_t
+ */
+#define v_64(p) (*(uint64_t *)&(p))
+
+/*
+ * Multiply each byte of a uint32 by 2 in the GF(2^8).
+ */
+static __always_inline uint32_t x2_32(uint32_t v)
+{
+ uint32_t mask = v & 0x80808080U;
+
+ mask = (mask << 1) - (mask >> 7);
+ v = (v << 1) & 0xfefefefeU;
+ v ^= mask & 0x1d1d1d1dU;
+ return v;
+}
+
+/*
+ * Multiply each byte of a uint64 by 2 in the GF(2^8).
+ */
+static __always_inline uint64_t x2_64(uint64_t v)
+{
+ uint64_t mask = v & 0x8080808080808080ULL;
+
+ mask = (mask << 1) - (mask >> 7);
+ v = (v << 1) & 0xfefefefefefefefeULL;
+ v ^= mask & 0x1d1d1d1d1d1d1d1dULL;
+ return v;
+}
+
+/*
+ * Divide each byte of a uint32 by 2 in the GF(2^8).
+ */
+static __always_inline uint32_t d2_32(uint32_t v)
+{
+ uint32_t mask = v & 0x01010101U;
+
+ mask = (mask << 8) - mask;
+ v = (v >> 1) & 0x7f7f7f7fU;
+ v ^= mask & 0x8e8e8e8eU;
+ return v;
+}
+
+/*
+ * Divide each byte of a uint64 by 2 in the GF(2^8).
+ */
+static __always_inline uint64_t d2_64(uint64_t v)
+{
+ uint64_t mask = v & 0x0101010101010101ULL;
+
+ mask = (mask << 8) - mask;
+ v = (v >> 1) & 0x7f7f7f7f7f7f7f7fULL;
+ v ^= mask & 0x8e8e8e8e8e8e8e8eULL;
+ return v;
+}
+
+#endif
+
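The bit tricks above apply the GF(2^8) reduction (polynomial 0x11d) to every byte of a word in parallel; the per-byte scalar equivalent is (illustrative only, not part of the patch):

    static uint8_t gf_x2(uint8_t b)
    {
    	/* multiply by 2 in GF(2^8): shift, then XOR 0x1d if the high bit
    	 * was set; x2_32()/x2_64() do this for 4/8 bytes at once */
    	return (uint8_t)((b << 1) ^ ((b & 0x80) ? 0x1d : 0));
    }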
diff --git a/raid/helper.c b/raid/helper.c
new file mode 100644
index 00000000..f66093fa
--- /dev/null
+++ b/raid/helper.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "internal.h"
+
+#define RAID_SWAP(a, b) \
+ do { \
+ if (v[a] > v[b]) { \
+ int t = v[a]; \
+ v[a] = v[b]; \
+ v[b] = t; \
+ } \
+ } while (0)
+
+void raid_sort(int n, int *v)
+{
+ /* sorting networks generated with Batcher's Merge-Exchange */
+ switch (n) {
+ case 2:
+ RAID_SWAP(0, 1);
+ break;
+ case 3:
+ RAID_SWAP(0, 2);
+ RAID_SWAP(0, 1);
+ RAID_SWAP(1, 2);
+ break;
+ case 4:
+ RAID_SWAP(0, 2);
+ RAID_SWAP(1, 3);
+ RAID_SWAP(0, 1);
+ RAID_SWAP(2, 3);
+ RAID_SWAP(1, 2);
+ break;
+ case 5:
+ RAID_SWAP(0, 4);
+ RAID_SWAP(0, 2);
+ RAID_SWAP(1, 3);
+ RAID_SWAP(2, 4);
+ RAID_SWAP(0, 1);
+ RAID_SWAP(2, 3);
+ RAID_SWAP(1, 4);
+ RAID_SWAP(1, 2);
+ RAID_SWAP(3, 4);
+ break;
+ case 6:
+ RAID_SWAP(0, 4);
+ RAID_SWAP(1, 5);
+ RAID_SWAP(0, 2);
+ RAID_SWAP(1, 3);
+ RAID_SWAP(2, 4);
+ RAID_SWAP(3, 5);
+ RAID_SWAP(0, 1);
+ RAID_SWAP(2, 3);
+ RAID_SWAP(4, 5);
+ RAID_SWAP(1, 4);
+ RAID_SWAP(1, 2);
+ RAID_SWAP(3, 4);
+ break;
+ }
+}
+
+void raid_insert(int n, int *v, int i)
+{
+ /* we don't use binary search because this is intended */
+ /* for very small vectors and we want to optimize the case */
+ /* of elements inserted already in order */
+
+ /* insert at the end */
+ v[n] = i;
+
+ /* swap until in the correct position */
+ while (n > 0 && v[n - 1] > v[n]) {
+ /* swap */
+ int t = v[n - 1];
+
+ v[n - 1] = v[n];
+ v[n] = t;
+
+ /* previous position */
+ --n;
+ }
+}
+
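Together these helpers keep the failed-index vector sorted, as required by raid_rec() and raid_check(); for example (illustrative only, not part of the patch):

    int ir[RAID_PARITY_MAX];
    int nr = 0;

    raid_insert(nr++, ir, 5);   /* ir = { 5 } */
    raid_insert(nr++, ir, 2);   /* ir = { 2, 5 } */
    raid_insert(nr++, ir, 7);   /* ir = { 2, 5, 7 }, ready to pass to raid_rec() */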
diff --git a/raid/helper.h b/raid/helper.h
new file mode 100644
index 00000000..bf682882
--- /dev/null
+++ b/raid/helper.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __RAID_HELPER_H
+#define __RAID_HELPER_H
+
+/**
+ * Inserts an integer in a sorted vector.
+ *
+ * This function can be used to insert indexes in order, ready to be used for
+ * calling raid_rec().
+ *
+ * @n Number of integers currently in the vector.
+ * @v Vector of integers already sorted.
+ * It must have extra space for the new element at the end.
+ * @i Value to insert.
+ */
+void raid_insert(int n, int *v, int i);
+
+/**
+ * Sorts a small vector of integers.
+ *
+ * If you have indexes not in order, you can use this function to sort them
+ * before calling raid_rec().
+ *
+ * @n Number of integers. No more than RAID_PARITY_MAX.
+ * @v Vector of integers.
+ */
+void raid_sort(int n, int *v);
+
+#endif
+
diff --git a/raid/int.c b/raid/int.c
new file mode 100644
index 00000000..e16332a5
--- /dev/null
+++ b/raid/int.c
@@ -0,0 +1,556 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "internal.h"
+#include "gf.h"
+
+/*
+ * GEN1 (RAID5 with xor) 32bit C implementation
+ */
+void raid_gen1_int32(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ int d, l;
+ size_t i;
+
+ uint32_t p0;
+ uint32_t p1;
+
+ l = nd - 1;
+ p = v[nd];
+
+ for (i = 0; i < size; i += 8) {
+ p0 = v_32(v[l][i]);
+ p1 = v_32(v[l][i + 4]);
+ for (d = l - 1; d >= 0; --d) {
+ p0 ^= v_32(v[d][i]);
+ p1 ^= v_32(v[d][i + 4]);
+ }
+ v_32(p[i]) = p0;
+ v_32(p[i + 4]) = p1;
+ }
+}
+
+/*
+ * GEN1 (RAID5 with xor) 64bit C implementation
+ */
+void raid_gen1_int64(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ int d, l;
+ size_t i;
+
+ uint64_t p0;
+ uint64_t p1;
+
+ l = nd - 1;
+ p = v[nd];
+
+ for (i = 0; i < size; i += 16) {
+ p0 = v_64(v[l][i]);
+ p1 = v_64(v[l][i + 8]);
+ for (d = l - 1; d >= 0; --d) {
+ p0 ^= v_64(v[d][i]);
+ p1 ^= v_64(v[d][i + 8]);
+ }
+ v_64(p[i]) = p0;
+ v_64(p[i + 8]) = p1;
+ }
+}
+
+/*
+ * GEN2 (RAID6 with powers of 2) 32bit C implementation
+ */
+void raid_gen2_int32(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *q;
+ int d, l;
+ size_t i;
+
+ uint32_t d0, q0, p0;
+ uint32_t d1, q1, p1;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+
+ for (i = 0; i < size; i += 8) {
+ q0 = p0 = v_32(v[l][i]);
+ q1 = p1 = v_32(v[l][i + 4]);
+ for (d = l - 1; d >= 0; --d) {
+ d0 = v_32(v[d][i]);
+ d1 = v_32(v[d][i + 4]);
+
+ p0 ^= d0;
+ p1 ^= d1;
+
+ q0 = x2_32(q0);
+ q1 = x2_32(q1);
+
+ q0 ^= d0;
+ q1 ^= d1;
+ }
+ v_32(p[i]) = p0;
+ v_32(p[i + 4]) = p1;
+ v_32(q[i]) = q0;
+ v_32(q[i + 4]) = q1;
+ }
+}
+
+/*
+ * GEN2 (RAID6 with powers of 2) 64bit C implementation
+ */
+void raid_gen2_int64(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *q;
+ int d, l;
+ size_t i;
+
+ uint64_t d0, q0, p0;
+ uint64_t d1, q1, p1;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+
+ for (i = 0; i < size; i += 16) {
+ q0 = p0 = v_64(v[l][i]);
+ q1 = p1 = v_64(v[l][i + 8]);
+ for (d = l - 1; d >= 0; --d) {
+ d0 = v_64(v[d][i]);
+ d1 = v_64(v[d][i + 8]);
+
+ p0 ^= d0;
+ p1 ^= d1;
+
+ q0 = x2_64(q0);
+ q1 = x2_64(q1);
+
+ q0 ^= d0;
+ q1 ^= d1;
+ }
+ v_64(p[i]) = p0;
+ v_64(p[i + 8]) = p1;
+ v_64(q[i]) = q0;
+ v_64(q[i + 8]) = q1;
+ }
+}
+
+/*
+ * GEN3 (triple parity with Cauchy matrix) 8bit C implementation
+ *
+ * Note that instead of the generic multiplication table, which likely
+ * results in multiple cache misses, a precomputed table could be used.
+ * But this is only a reference implementation, and we are not really
+ * interested in speed.
+ */
+void raid_gen3_int8(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *r;
+ int d, l;
+ size_t i;
+
+ uint8_t d0, r0, q0, p0;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+ r = v[nd + 2];
+
+ for (i = 0; i < size; i += 1) {
+ p0 = q0 = r0 = 0;
+ for (d = l; d > 0; --d) {
+ d0 = v_8(v[d][i]);
+
+ p0 ^= d0;
+ q0 ^= gfmul[d0][gfgen[1][d]];
+ r0 ^= gfmul[d0][gfgen[2][d]];
+ }
+
+ /* first disk with all coefficients at 1 */
+ d0 = v_8(v[0][i]);
+
+ p0 ^= d0;
+ q0 ^= d0;
+ r0 ^= d0;
+
+ v_8(p[i]) = p0;
+ v_8(q[i]) = q0;
+ v_8(r[i]) = r0;
+ }
+}
+
+/*
+ * GEN4 (quad parity with Cauchy matrix) 8bit C implementation
+ *
+ * Note that instead of the generic multiplication table, which likely
+ * results in multiple cache misses, a precomputed table could be used.
+ * But this is only a reference implementation, and we are not really
+ * interested in speed.
+ */
+void raid_gen4_int8(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *r;
+ uint8_t *s;
+ int d, l;
+ size_t i;
+
+ uint8_t d0, s0, r0, q0, p0;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+ r = v[nd + 2];
+ s = v[nd + 3];
+
+ for (i = 0; i < size; i += 1) {
+ p0 = q0 = r0 = s0 = 0;
+ for (d = l; d > 0; --d) {
+ d0 = v_8(v[d][i]);
+
+ p0 ^= d0;
+ q0 ^= gfmul[d0][gfgen[1][d]];
+ r0 ^= gfmul[d0][gfgen[2][d]];
+ s0 ^= gfmul[d0][gfgen[3][d]];
+ }
+
+ /* first disk with all coefficients at 1 */
+ d0 = v_8(v[0][i]);
+
+ p0 ^= d0;
+ q0 ^= d0;
+ r0 ^= d0;
+ s0 ^= d0;
+
+ v_8(p[i]) = p0;
+ v_8(q[i]) = q0;
+ v_8(r[i]) = r0;
+ v_8(s[i]) = s0;
+ }
+}
+
+/*
+ * GEN5 (penta parity with Cauchy matrix) 8bit C implementation
+ *
+ * Note that instead of the generic multiplication table, which likely
+ * results in multiple cache misses, a precomputed table could be used.
+ * But this is only a reference implementation, and we are not really
+ * interested in speed.
+ */
+void raid_gen5_int8(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *r;
+ uint8_t *s;
+ uint8_t *t;
+ int d, l;
+ size_t i;
+
+ uint8_t d0, t0, s0, r0, q0, p0;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+ r = v[nd + 2];
+ s = v[nd + 3];
+ t = v[nd + 4];
+
+ for (i = 0; i < size; i += 1) {
+ p0 = q0 = r0 = s0 = t0 = 0;
+ for (d = l; d > 0; --d) {
+ d0 = v_8(v[d][i]);
+
+ p0 ^= d0;
+ q0 ^= gfmul[d0][gfgen[1][d]];
+ r0 ^= gfmul[d0][gfgen[2][d]];
+ s0 ^= gfmul[d0][gfgen[3][d]];
+ t0 ^= gfmul[d0][gfgen[4][d]];
+ }
+
+ /* first disk with all coefficients at 1 */
+ d0 = v_8(v[0][i]);
+
+ p0 ^= d0;
+ q0 ^= d0;
+ r0 ^= d0;
+ s0 ^= d0;
+ t0 ^= d0;
+
+ v_8(p[i]) = p0;
+ v_8(q[i]) = q0;
+ v_8(r[i]) = r0;
+ v_8(s[i]) = s0;
+ v_8(t[i]) = t0;
+ }
+}
+
+/*
+ * GEN6 (hexa parity with Cauchy matrix) 8bit C implementation
+ *
+ * Note that instead of the generic multiplication table, which likely
+ * results in multiple cache misses, a precomputed table could be used.
+ * But this is only a reference implementation, and we are not really
+ * interested in speed.
+ */
+void raid_gen6_int8(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *r;
+ uint8_t *s;
+ uint8_t *t;
+ uint8_t *u;
+ int d, l;
+ size_t i;
+
+ uint8_t d0, u0, t0, s0, r0, q0, p0;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+ r = v[nd + 2];
+ s = v[nd + 3];
+ t = v[nd + 4];
+ u = v[nd + 5];
+
+ for (i = 0; i < size; i += 1) {
+ p0 = q0 = r0 = s0 = t0 = u0 = 0;
+ for (d = l; d > 0; --d) {
+ d0 = v_8(v[d][i]);
+
+ p0 ^= d0;
+ q0 ^= gfmul[d0][gfgen[1][d]];
+ r0 ^= gfmul[d0][gfgen[2][d]];
+ s0 ^= gfmul[d0][gfgen[3][d]];
+ t0 ^= gfmul[d0][gfgen[4][d]];
+ u0 ^= gfmul[d0][gfgen[5][d]];
+ }
+
+ /* first disk with all coefficients at 1 */
+ d0 = v_8(v[0][i]);
+
+ p0 ^= d0;
+ q0 ^= d0;
+ r0 ^= d0;
+ s0 ^= d0;
+ t0 ^= d0;
+ u0 ^= d0;
+
+ v_8(p[i]) = p0;
+ v_8(q[i]) = q0;
+ v_8(r[i]) = r0;
+ v_8(s[i]) = s0;
+ v_8(t[i]) = t0;
+ v_8(u[i]) = u0;
+ }
+}
+
+/*
+ * Recover failure of one data block at index id[0] using parity at index
+ * ip[0] for any RAID level.
+ *
+ * Starting from the equation:
+ *
+ * Pd = A[ip[0],id[0]] * Dx
+ *
+ * and solving we get:
+ *
+ * Dx = A[ip[0],id[0]]^-1 * Pd
+ */
+void raid_rec1_int8(int nr, int *id, int *ip, int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *pa;
+ const uint8_t *T;
+ uint8_t G;
+ uint8_t V;
+ size_t i;
+
+ (void)nr; /* unused, it's always 1 */
+
+	/* if it's RAID5, use the faster function */
+ if (ip[0] == 0) {
+ raid_rec1of1(id, nd, size, vv);
+ return;
+ }
+
+ /* setup the coefficients matrix */
+ G = A(ip[0], id[0]);
+
+ /* invert it to solve the system of linear equations */
+ V = inv(G);
+
+ /* get multiplication tables */
+ T = table(V);
+
+ /* compute delta parity */
+ raid_delta_gen(1, id, ip, nd, size, vv);
+
+ p = v[nd + ip[0]];
+ pa = v[id[0]];
+
+ for (i = 0; i < size; ++i) {
+ /* delta */
+ uint8_t Pd = p[i] ^ pa[i];
+
+ /* reconstruct */
+ pa[i] = T[Pd];
+ }
+}
+
+/*
+ * Recover failure of two data blocks at indexes id[0],id[1] using parity at
+ * indexes ip[0],ip[1] for any RAID level.
+ *
+ * Starting from the equations:
+ *
+ * Pd = A[ip[0],id[0]] * Dx + A[ip[0],id[1]] * Dy
+ * Qd = A[ip[1],id[0]] * Dx + A[ip[1],id[1]] * Dy
+ *
+ * we solve by inverting the coefficients matrix.
+ */
+void raid_rec2_int8(int nr, int *id, int *ip, int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *pa;
+ uint8_t *q;
+ uint8_t *qa;
+ const int N = 2;
+ const uint8_t *T[N][N];
+ uint8_t G[N * N];
+ uint8_t V[N * N];
+ size_t i;
+ int j, k;
+
+ (void)nr; /* unused, it's always 2 */
+
+	/* if it's RAID6 recovering with P and Q, use the faster function */
+ if (ip[0] == 0 && ip[1] == 1) {
+ raid_rec2of2_int8(id, ip, nd, size, vv);
+ return;
+ }
+
+ /* setup the coefficients matrix */
+ for (j = 0; j < N; ++j)
+ for (k = 0; k < N; ++k)
+ G[j * N + k] = A(ip[j], id[k]);
+
+ /* invert it to solve the system of linear equations */
+ raid_invert(G, V, N);
+
+ /* get multiplication tables */
+ for (j = 0; j < N; ++j)
+ for (k = 0; k < N; ++k)
+ T[j][k] = table(V[j * N + k]);
+
+ /* compute delta parity */
+ raid_delta_gen(2, id, ip, nd, size, vv);
+
+ p = v[nd + ip[0]];
+ q = v[nd + ip[1]];
+ pa = v[id[0]];
+ qa = v[id[1]];
+
+ for (i = 0; i < size; ++i) {
+ /* delta */
+ uint8_t Pd = p[i] ^ pa[i];
+ uint8_t Qd = q[i] ^ qa[i];
+
+ /* reconstruct */
+ pa[i] = T[0][0][Pd] ^ T[0][1][Qd];
+ qa[i] = T[1][0][Pd] ^ T[1][1][Qd];
+ }
+}
+
+/*
+ * Recover failure of N data blocks at indexes id[N] using parity at indexes
+ * ip[N] for any RAID level.
+ *
+ * Starting from the N equations, with 0<=i<N:
+ *
+ * PD[i] = sum(A[ip[i],id[j]] * D[j]) 0<=j<N
+ *
+ * we solve by inverting the coefficients matrix.
+ *
+ * Note that, referring to the previous equations, you have:
+ * PD[0] = Pd, PD[1] = Qd, PD[2] = Rd, ...
+ * D[0] = Dx, D[1] = Dy, D[2] = Dz, ...
+ */
+void raid_recX_int8(int nr, int *id, int *ip, int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p[RAID_PARITY_MAX];
+ uint8_t *pa[RAID_PARITY_MAX];
+ const uint8_t *T[RAID_PARITY_MAX][RAID_PARITY_MAX];
+ uint8_t G[RAID_PARITY_MAX * RAID_PARITY_MAX];
+ uint8_t V[RAID_PARITY_MAX * RAID_PARITY_MAX];
+ size_t i;
+ int j, k;
+
+ /* setup the coefficients matrix */
+ for (j = 0; j < nr; ++j)
+ for (k = 0; k < nr; ++k)
+ G[j * nr + k] = A(ip[j], id[k]);
+
+ /* invert it to solve the system of linear equations */
+ raid_invert(G, V, nr);
+
+ /* get multiplication tables */
+ for (j = 0; j < nr; ++j)
+ for (k = 0; k < nr; ++k)
+ T[j][k] = table(V[j * nr + k]);
+
+ /* compute delta parity */
+ raid_delta_gen(nr, id, ip, nd, size, vv);
+
+ for (j = 0; j < nr; ++j) {
+ p[j] = v[nd + ip[j]];
+ pa[j] = v[id[j]];
+ }
+
+ for (i = 0; i < size; ++i) {
+ uint8_t PD[RAID_PARITY_MAX];
+
+ /* delta */
+ for (j = 0; j < nr; ++j)
+ PD[j] = p[j][i] ^ pa[j][i];
+
+ /* reconstruct */
+ for (j = 0; j < nr; ++j) {
+ uint8_t b = 0;
+
+ for (k = 0; k < nr; ++k)
+ b ^= T[j][k][PD[k]];
+ pa[j][i] = b;
+ }
+ }
+}
+
diff --git a/raid/internal.h b/raid/internal.h
new file mode 100644
index 00000000..4465cb97
--- /dev/null
+++ b/raid/internal.h
@@ -0,0 +1,274 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __RAID_INTERNAL_H
+#define __RAID_INTERNAL_H
+
+/*
+ * Supported instruction sets.
+ *
+ * It may happen that the assembler is too old to support
+ * all instructions, even if the architecture supports them.
+ * These defines allow excluding the unsupported ones from the build.
+ *
+ * If your project uses a predefined assembler, you can define them
+ * with fixed values, instead of using the HAVE_* defines.
+ */
+#if HAVE_CONFIG_H
+
+/* Includes the project configuration for HAVE_* defines */
+#include "config.h"
+
+/* If the compiler supports assembly */
+#if HAVE_ASSEMBLY
+/* Autodetect from the compiler */
+#if defined(__i386__)
+#define CONFIG_X86 1
+#define CONFIG_X86_32 1
+#endif
+#if defined(__x86_64__)
+#define CONFIG_X86 1
+#define CONFIG_X86_64 1
+#endif
+#endif
+
+/* Enables SSE2, SSSE3, AVX2 only if the assembler supports it */
+#if HAVE_SSE2
+#define CONFIG_SSE2 1
+#endif
+#if HAVE_SSSE3
+#define CONFIG_SSSE3 1
+#endif
+#if HAVE_AVX2
+#define CONFIG_AVX2 1
+#endif
+
+#else /* if HAVE_CONFIG_H is not defined */
+
+/* Assume that assembly is always supported */
+#if defined(__i386__)
+#define CONFIG_X86 1
+#define CONFIG_X86_32 1
+#endif
+
+#if defined(__x86_64__)
+#define CONFIG_X86 1
+#define CONFIG_X86_64 1
+#endif
+
+/* Assumes that the assembler supports everything */
+#ifdef CONFIG_X86
+#define CONFIG_SSE2 1
+#define CONFIG_SSSE3 1
+#define CONFIG_AVX2 1
+#endif
+#endif
+
+/*
+ * Includes anything required for compatibility.
+ */
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Inverse assert.
+ */
+#define BUG_ON(a) assert(!(a))
+
+/*
+ * Forced inline.
+ */
+#ifndef __always_inline
+#define __always_inline inline __attribute__((always_inline))
+#endif
+
+/*
+ * Forced alignment.
+ */
+#ifndef __aligned
+#define __aligned(a) __attribute__((aligned(a)))
+#endif
+
+/*
+ * Align a pointer at the specified size.
+ */
+static __always_inline void *__align_ptr(void *ptr, uintptr_t size)
+{
+ uintptr_t offset = (uintptr_t)ptr;
+
+ offset = (offset + size - 1U) & ~(size - 1U);
+
+ return (void *)offset;
+}
+
+/*
+ * Includes the main interface headers.
+ */
+#include "raid.h"
+#include "helper.h"
+
+/*
+ * Internal functions.
+ *
+ * These are intended to provide access for testing.
+ */
+int raid_selftest(void);
+void raid_gen_ref(int nd, int np, size_t size, void **vv);
+void raid_invert(uint8_t *M, uint8_t *V, int n);
+void raid_delta_gen(int nr, int *id, int *ip, int nd, size_t size, void **v);
+void raid_rec1of1(int *id, int nd, size_t size, void **v);
+void raid_rec2of2_int8(int *id, int *ip, int nd, size_t size, void **vv);
+void raid_gen1_int32(int nd, size_t size, void **vv);
+void raid_gen1_int64(int nd, size_t size, void **vv);
+void raid_gen1_sse2(int nd, size_t size, void **vv);
+void raid_gen1_avx2(int nd, size_t size, void **vv);
+void raid_gen2_int32(int nd, size_t size, void **vv);
+void raid_gen2_int64(int nd, size_t size, void **vv);
+void raid_gen2_sse2(int nd, size_t size, void **vv);
+void raid_gen2_avx2(int nd, size_t size, void **vv);
+void raid_gen2_sse2ext(int nd, size_t size, void **vv);
+void raid_genz_int32(int nd, size_t size, void **vv);
+void raid_genz_int64(int nd, size_t size, void **vv);
+void raid_genz_sse2(int nd, size_t size, void **vv);
+void raid_genz_sse2ext(int nd, size_t size, void **vv);
+void raid_genz_avx2ext(int nd, size_t size, void **vv);
+void raid_gen3_int8(int nd, size_t size, void **vv);
+void raid_gen3_ssse3(int nd, size_t size, void **vv);
+void raid_gen3_ssse3ext(int nd, size_t size, void **vv);
+void raid_gen3_avx2ext(int nd, size_t size, void **vv);
+void raid_gen4_int8(int nd, size_t size, void **vv);
+void raid_gen4_ssse3(int nd, size_t size, void **vv);
+void raid_gen4_ssse3ext(int nd, size_t size, void **vv);
+void raid_gen4_avx2ext(int nd, size_t size, void **vv);
+void raid_gen5_int8(int nd, size_t size, void **vv);
+void raid_gen5_ssse3(int nd, size_t size, void **vv);
+void raid_gen5_ssse3ext(int nd, size_t size, void **vv);
+void raid_gen5_avx2ext(int nd, size_t size, void **vv);
+void raid_gen6_int8(int nd, size_t size, void **vv);
+void raid_gen6_ssse3(int nd, size_t size, void **vv);
+void raid_gen6_ssse3ext(int nd, size_t size, void **vv);
+void raid_gen6_avx2ext(int nd, size_t size, void **vv);
+void raid_rec1_int8(int nr, int *id, int *ip, int nd, size_t size, void **vv);
+void raid_rec2_int8(int nr, int *id, int *ip, int nd, size_t size, void **vv);
+void raid_recX_int8(int nr, int *id, int *ip, int nd, size_t size, void **vv);
+void raid_rec1_ssse3(int nr, int *id, int *ip, int nd, size_t size, void **vv);
+void raid_rec2_ssse3(int nr, int *id, int *ip, int nd, size_t size, void **vv);
+void raid_recX_ssse3(int nr, int *id, int *ip, int nd, size_t size, void **vv);
+void raid_rec1_avx2(int nr, int *id, int *ip, int nd, size_t size, void **vv);
+void raid_rec2_avx2(int nr, int *id, int *ip, int nd, size_t size, void **vv);
+void raid_recX_avx2(int nr, int *id, int *ip, int nd, size_t size, void **vv);
+
+/*
+ * Internal naming.
+ *
+ * These are intended to provide access for testing.
+ */
+const char *raid_gen1_tag(void);
+const char *raid_gen2_tag(void);
+const char *raid_genz_tag(void);
+const char *raid_gen3_tag(void);
+const char *raid_gen4_tag(void);
+const char *raid_gen5_tag(void);
+const char *raid_gen6_tag(void);
+const char *raid_rec1_tag(void);
+const char *raid_rec2_tag(void);
+const char *raid_recX_tag(void);
+
+/*
+ * Internal forwarders.
+ */
+extern void (*raid_gen3_ptr)(int nd, size_t size, void **vv);
+extern void (*raid_genz_ptr)(int nd, size_t size, void **vv);
+extern void (*raid_gen_ptr[RAID_PARITY_MAX])(
+ int nd, size_t size, void **vv);
+extern void (*raid_rec_ptr[RAID_PARITY_MAX])(
+ int nr, int *id, int *ip, int nd, size_t size, void **vv);
+
+/*
+ * Tables.
+ */
+extern const uint8_t raid_gfmul[256][256] __aligned(256);
+extern const uint8_t raid_gfexp[256] __aligned(256);
+extern const uint8_t raid_gfinv[256] __aligned(256);
+extern const uint8_t raid_gfvandermonde[3][256] __aligned(256);
+extern const uint8_t raid_gfcauchy[6][256] __aligned(256);
+extern const uint8_t raid_gfcauchypshufb[251][4][2][16] __aligned(256);
+extern const uint8_t raid_gfmulpshufb[256][2][16] __aligned(256);
+extern const uint8_t (*raid_gfgen)[256];
+#define gfmul raid_gfmul
+#define gfexp raid_gfexp
+#define gfinv raid_gfinv
+#define gfvandermonde raid_gfvandermonde
+#define gfcauchy raid_gfcauchy
+#define gfgenpshufb raid_gfcauchypshufb
+#define gfmulpshufb raid_gfmulpshufb
+#define gfgen raid_gfgen
+
+/*
+ * Assembler blocks.
+ */
+#ifdef CONFIG_X86
+#ifdef CONFIG_SSE2
+static __always_inline void raid_sse_begin(void)
+{
+}
+
+static __always_inline void raid_sse_end(void)
+{
+ /* SSE and AVX code uses non-temporal writes, like MOVNTDQ, */
+ /* that follow a weak memory model. To ensure that other processors */
+ /* correctly see the written data, we use a store-store memory */
+ /* barrier at the end of the asm code */
+ asm volatile ("sfence" : : : "memory");
+
+ /* clobber the registers used in the asm code */
+ /* this is required because in the Windows ABI */
+ /* registers xmm6-xmm15 must be preserved by the callee. */
+ /* this clobber list forces the compiler to save any */
+ /* register that needs to be saved */
+ /* we check for __SSE2__ because we require that the */
+ /* compiler supports SSE2 registers in the clobber list */
+#ifdef __SSE2__
+ asm volatile ("" : : : "%xmm0", "%xmm1", "%xmm2", "%xmm3");
+ asm volatile ("" : : : "%xmm4", "%xmm5", "%xmm6", "%xmm7");
+#ifdef CONFIG_X86_64
+ asm volatile ("" : : : "%xmm8", "%xmm9", "%xmm10", "%xmm11");
+ asm volatile ("" : : : "%xmm12", "%xmm13", "%xmm14", "%xmm15");
+#endif
+#endif
+}
+#endif
+
+#ifdef CONFIG_AVX2
+static __always_inline void raid_avx_begin(void)
+{
+ raid_sse_begin();
+}
+
+static __always_inline void raid_avx_end(void)
+{
+ raid_sse_end();
+
+ /* reset the upper part of the ymm registers */
+ /* to avoid the 70 clocks penalty on the next */
+ /* xmm register use */
+ asm volatile ("vzeroupper" : : : "memory");
+}
+#endif
+#endif /* CONFIG_X86 */
+
+#endif
+
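As a quick check of the align-up arithmetic used by __align_ptr() above: rounding the hypothetical address 0x1003 to a 256-byte boundary gives (0x1003 + 0xff) & ~0xff = 0x1100. A minimal standalone sketch of the same computation (illustrative names, not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* round 'addr' up to a multiple of 'size' (a power of two),
 * same formula as __align_ptr() */
static uintptr_t align_up(uintptr_t addr, uintptr_t size)
{
	return (addr + size - 1U) & ~(size - 1U);
}

int main(void)
{
	/* prints 0x1100 */
	printf("%#lx\n", (unsigned long)align_up(0x1003, 256));
	return 0;
}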
diff --git a/raid/intz.c b/raid/intz.c
new file mode 100644
index 00000000..80c20142
--- /dev/null
+++ b/raid/intz.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "internal.h"
+#include "gf.h"
+
+/*
+ * GENz (triple parity with powers of 2^-1) 32bit C implementation
+ */
+void raid_genz_int32(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t**)vv;
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *r;
+ int d, l;
+ size_t i;
+
+ uint32_t d0, r0, q0, p0;
+ uint32_t d1, r1, q1, p1;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+ r = v[nd + 2];
+
+ for (i = 0; i < size; i += 8) {
+ r0 = q0 = p0 = v_32(v[l][i]);
+ r1 = q1 = p1 = v_32(v[l][i + 4]);
+ for (d = l - 1; d >= 0; --d) {
+ d0 = v_32(v[d][i]);
+ d1 = v_32(v[d][i + 4]);
+
+ p0 ^= d0;
+ p1 ^= d1;
+
+ q0 = x2_32(q0);
+ q1 = x2_32(q1);
+
+ q0 ^= d0;
+ q1 ^= d1;
+
+ r0 = d2_32(r0);
+ r1 = d2_32(r1);
+
+ r0 ^= d0;
+ r1 ^= d1;
+ }
+ v_32(p[i]) = p0;
+ v_32(p[i + 4]) = p1;
+ v_32(q[i]) = q0;
+ v_32(q[i + 4]) = q1;
+ v_32(r[i]) = r0;
+ v_32(r[i + 4]) = r1;
+ }
+}
+
+/*
+ * GENz (triple parity with powers of 2^-1) 64bit C implementation
+ */
+void raid_genz_int64(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t**)vv;
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *r;
+ int d, l;
+ size_t i;
+
+ uint64_t d0, r0, q0, p0;
+ uint64_t d1, r1, q1, p1;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+ r = v[nd + 2];
+
+ for (i = 0; i < size; i += 16) {
+ r0 = q0 = p0 = v_64(v[l][i]);
+ r1 = q1 = p1 = v_64(v[l][i + 8]);
+ for (d = l - 1; d >= 0; --d) {
+ d0 = v_64(v[d][i]);
+ d1 = v_64(v[d][i + 8]);
+
+ p0 ^= d0;
+ p1 ^= d1;
+
+ q0 = x2_64(q0);
+ q1 = x2_64(q1);
+
+ q0 ^= d0;
+ q1 ^= d1;
+
+ r0 = d2_64(r0);
+ r1 = d2_64(r1);
+
+ r0 ^= d0;
+ r1 ^= d1;
+ }
+ v_64(p[i]) = p0;
+ v_64(p[i + 8]) = p1;
+ v_64(q[i]) = q0;
+ v_64(q[i + 8]) = q1;
+ v_64(r[i]) = r0;
+ v_64(r[i + 8]) = r1;
+ }
+}
+
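The GENz loops above rely on the x2_*() and d2_*() helpers from gf.h (not shown in this hunk) to multiply and divide every byte of a word by 2 in GF(2^8). A per-byte scalar sketch of the same operations, assuming the field polynomial 0x11d (285 decimal) stated in raid/raid.c; the names are illustrative, not part of the patch:

#include <stdint.h>

/* multiply by 2 in GF(2^8) with polynomial 0x11d: shift left and
 * reduce when the high bit falls out */
static uint8_t gf2_mul2(uint8_t a)
{
	return (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
}

/* multiply by 2^-1 (divide by 2): shift right and add back the
 * representation of x^-1 (0x8e) when the low bit falls out */
static uint8_t gf2_div2(uint8_t a)
{
	return (uint8_t)((a >> 1) ^ ((a & 1) ? 0x8e : 0));
}

The parallel x2_32()/d2_32() versions apply the same transformation to the four bytes packed in a 32-bit word.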
diff --git a/raid/memory.c b/raid/memory.c
new file mode 100644
index 00000000..02a5a927
--- /dev/null
+++ b/raid/memory.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "internal.h"
+#include "memory.h"
+
+void *raid_malloc_align(size_t size, size_t align_size, void **freeptr)
+{
+ unsigned char *ptr;
+ uintptr_t offset;
+
+ ptr = malloc(size + align_size);
+ if (!ptr) {
+ /* LCOV_EXCL_START */
+ return 0;
+ /* LCOV_EXCL_STOP */
+ }
+
+ *freeptr = ptr;
+
+ offset = ((uintptr_t)ptr) % align_size;
+
+ if (offset != 0)
+ ptr += align_size - offset;
+
+ return ptr;
+}
+
+void *raid_malloc(size_t size, void **freeptr)
+{
+ return raid_malloc_align(size, RAID_MALLOC_ALIGN, freeptr);
+}
+
+void **raid_malloc_vector_align(int nd, int n, size_t size, size_t align_size, size_t displacement_size, void **freeptr)
+{
+ void **v;
+ unsigned char *va;
+ int i;
+
+ BUG_ON(n <= 0 || nd < 0);
+
+ v = malloc(n * sizeof(void *));
+ if (!v) {
+ /* LCOV_EXCL_START */
+ return 0;
+ /* LCOV_EXCL_STOP */
+ }
+
+ va = raid_malloc_align(n * (size + displacement_size), align_size, freeptr);
+ if (!va) {
+ /* LCOV_EXCL_START */
+ free(v);
+ return 0;
+ /* LCOV_EXCL_STOP */
+ }
+
+ for (i = 0; i < n; ++i) {
+ v[i] = va;
+ va += size + displacement_size;
+ }
+
+ /* reverse the order of the data blocks */
+ /* because they are usually accessed starting from the last one */
+ for (i = 0; i < nd / 2; ++i) {
+ void *ptr = v[i];
+
+ v[i] = v[nd - 1 - i];
+ v[nd - 1 - i] = ptr;
+ }
+
+ return v;
+}
+
+void **raid_malloc_vector(int nd, int n, size_t size, void **freeptr)
+{
+ return raid_malloc_vector_align(nd, n, size, RAID_MALLOC_ALIGN, RAID_MALLOC_DISPLACEMENT, freeptr);
+}
+
+void raid_mrand_vector(unsigned seed, int n, size_t size, void **vv)
+{
+ unsigned char **v = (unsigned char **)vv;
+ int i;
+ size_t j;
+
+ for (i = 0; i < n; ++i)
+ for (j = 0; j < size; ++j) {
+ /* basic C99/C11 linear congruential generator */
+ seed = seed * 1103515245U + 12345U;
+
+ v[i][j] = seed >> 16;
+ }
+}
+
+int raid_mtest_vector(int n, size_t size, void **vv)
+{
+ unsigned char **v = (unsigned char **)vv;
+ int i;
+ size_t j;
+ unsigned k;
+ unsigned char d;
+ unsigned char p;
+
+ /* fill with 0 */
+ d = 0;
+ for (i = 0; i < n; ++i)
+ for (j = 0; j < size; ++j)
+ v[i][j] = d;
+
+ /* test with all the byte patterns */
+ for (k = 1; k < 256; ++k) {
+ p = d;
+ d = k;
+
+ /* forward fill */
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < size; ++j) {
+ if (v[i][j] != p) {
+ /* LCOV_EXCL_START */
+ return -1;
+ /* LCOV_EXCL_STOP */
+ }
+ v[i][j] = d;
+ }
+ }
+
+ p = d;
+ d = ~p;
+ /* backward fill with complement */
+ for (i = 0; i < n; ++i) {
+ for (j = size; j > 0; --j) {
+ if (v[i][j - 1] != p) {
+ /* LCOV_EXCL_START */
+ return -1;
+ /* LCOV_EXCL_STOP */
+ }
+ v[i][j - 1] = d;
+ }
+ }
+ }
+
+ return 0;
+}
+
diff --git a/raid/memory.h b/raid/memory.h
new file mode 100644
index 00000000..de00614f
--- /dev/null
+++ b/raid/memory.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __RAID_MEMORY_H
+#define __RAID_MEMORY_H
+
+/**
+ * Memory alignment provided by raid_malloc().
+ *
+ * It should guarantee good cache performance everywhere.
+ */
+#define RAID_MALLOC_ALIGN 256
+
+/**
+ * Memory displacement to avoid cache address sharing on contiguous blocks,
+ * used by raid_malloc_vector().
+ *
+ * When allocating a sequence of blocks whose size is a power of 2,
+ * there is a risk that the addresses of the blocks map to the same
+ * cache lines and prefetch predictors, resulting in a lot of cache
+ * sharing if you access all the blocks in parallel, from the start to
+ * the end.
+ *
+ * To avoid this effect, it's better if all the blocks are allocated
+ * with a fixed displacement that reduces the cache address sharing.
+ *
+ * The selected displacement was chosen empirically with some speed tests
+ * with 8/12/16/20/24 data buffers of 256 KB.
+ *
+ * These are the results in MB/s with no displacement:
+ *
+ * sse2
+ * gen1 15368 [MB/s]
+ * gen2 6814 [MB/s]
+ * genz 3033 [MB/s]
+ *
+ * These are the results with displacement, showing improvements
+ * of 20% or more:
+ *
+ * sse2
+ * gen1 21936 [MB/s]
+ * gen2 11902 [MB/s]
+ * genz 5838 [MB/s]
+ *
+ */
+#define RAID_MALLOC_DISPLACEMENT (7*256)
+
+/**
+ * Aligned malloc.
+ * Use an alignment suitable for the raid functions.
+ */
+void *raid_malloc(size_t size, void **freeptr);
+
+/**
+ * Arbitrary aligned malloc.
+ */
+void *raid_malloc_align(size_t size, size_t align_size, void **freeptr);
+
+/**
+ * Aligned vector allocation.
+ * Use an alignment suitable for the raid functions.
+ * Returns a vector of @n pointers, each one pointing to a block of
+ * the specified @size.
+ * The first @nd elements are reversed in order.
+ */
+void **raid_malloc_vector(int nd, int n, size_t size, void **freeptr);
+
+/**
+ * Arbitrary aligned vector allocation.
+ */
+void **raid_malloc_vector_align(int nd, int n, size_t size, size_t align_size, size_t displacement_size, void **freeptr);
+
+/**
+ * Fills the memory vector with pseudo-random data based on the specified seed.
+ */
+void raid_mrand_vector(unsigned seed, int n, size_t size, void **vv);
+
+/**
+ * Tests the memory vector for RAM problems.
+ * Returns 0 on success, or -1 if a problem is found.
+ */
+int raid_mtest_vector(int n, size_t size, void **vv);
+
+#endif
+
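A hypothetical usage sketch of the allocation helpers declared above, modeled on how raid_selftest() in raid/module.c uses them; the block counts and sizes here are arbitrary examples, not part of the patch:

#include <stdlib.h>
/* assumes the raid/memory.h prototypes are in scope */

int alloc_example(void)
{
	int nd = 8;                  /* data blocks */
	int np = 3;                  /* parity blocks */
	size_t size = 256 * 1024;    /* block size, a multiple of 64 */
	void *v_alloc;
	void **v;

	/* one aligned and displaced buffer per data and parity block */
	v = raid_malloc_vector(nd, nd + np, size, &v_alloc);
	if (!v)
		return -1;

	/* ... v[0..nd-1] are data blocks, v[nd..nd+np-1] are parity ... */

	free(v);        /* the pointer vector itself */
	free(v_alloc);  /* the underlying aligned allocation */
	return 0;
}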
diff --git a/raid/module.c b/raid/module.c
new file mode 100644
index 00000000..b688d22c
--- /dev/null
+++ b/raid/module.c
@@ -0,0 +1,473 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "internal.h"
+#include "memory.h"
+#include "cpu.h"
+
+/*
+ * Initializes and selects the best algorithm.
+ */
+void raid_init(void)
+{
+ raid_gen3_ptr = raid_gen3_int8;
+ raid_gen_ptr[3] = raid_gen4_int8;
+ raid_gen_ptr[4] = raid_gen5_int8;
+ raid_gen_ptr[5] = raid_gen6_int8;
+
+ if (sizeof(void *) == 4) {
+ raid_gen_ptr[0] = raid_gen1_int32;
+ raid_gen_ptr[1] = raid_gen2_int32;
+ raid_genz_ptr = raid_genz_int32;
+ } else {
+ raid_gen_ptr[0] = raid_gen1_int64;
+ raid_gen_ptr[1] = raid_gen2_int64;
+ raid_genz_ptr = raid_genz_int64;
+ }
+
+ raid_rec_ptr[0] = raid_rec1_int8;
+ raid_rec_ptr[1] = raid_rec2_int8;
+ raid_rec_ptr[2] = raid_recX_int8;
+ raid_rec_ptr[3] = raid_recX_int8;
+ raid_rec_ptr[4] = raid_recX_int8;
+ raid_rec_ptr[5] = raid_recX_int8;
+
+#ifdef CONFIG_X86
+#ifdef CONFIG_SSE2
+ if (raid_cpu_has_sse2()) {
+ raid_gen_ptr[0] = raid_gen1_sse2;
+#ifdef CONFIG_X86_64
+ if (raid_cpu_has_slowextendedreg()) {
+ raid_gen_ptr[1] = raid_gen2_sse2;
+ } else {
+ raid_gen_ptr[1] = raid_gen2_sse2ext;
+ }
+ /* note that raid_cpu_has_slowextendedreg() doesn't affect parz */
+ raid_genz_ptr = raid_genz_sse2ext;
+#else
+ raid_gen_ptr[1] = raid_gen2_sse2;
+ raid_genz_ptr = raid_genz_sse2;
+#endif
+ }
+#endif
+
+#ifdef CONFIG_SSSE3
+ if (raid_cpu_has_ssse3()) {
+#ifdef CONFIG_X86_64
+ if (raid_cpu_has_slowextendedreg()) {
+ raid_gen3_ptr = raid_gen3_ssse3;
+ raid_gen_ptr[3] = raid_gen4_ssse3;
+ raid_gen_ptr[4] = raid_gen5_ssse3;
+ raid_gen_ptr[5] = raid_gen6_ssse3;
+ } else {
+ raid_gen3_ptr = raid_gen3_ssse3ext;
+ raid_gen_ptr[3] = raid_gen4_ssse3ext;
+ raid_gen_ptr[4] = raid_gen5_ssse3ext;
+ raid_gen_ptr[5] = raid_gen6_ssse3ext;
+ }
+#else
+ raid_gen3_ptr = raid_gen3_ssse3;
+ raid_gen_ptr[3] = raid_gen4_ssse3;
+ raid_gen_ptr[4] = raid_gen5_ssse3;
+ raid_gen_ptr[5] = raid_gen6_ssse3;
+#endif
+ raid_rec_ptr[0] = raid_rec1_ssse3;
+ raid_rec_ptr[1] = raid_rec2_ssse3;
+ raid_rec_ptr[2] = raid_recX_ssse3;
+ raid_rec_ptr[3] = raid_recX_ssse3;
+ raid_rec_ptr[4] = raid_recX_ssse3;
+ raid_rec_ptr[5] = raid_recX_ssse3;
+ }
+#endif
+
+#ifdef CONFIG_AVX2
+ if (raid_cpu_has_avx2()) {
+ raid_gen_ptr[0] = raid_gen1_avx2;
+ raid_gen_ptr[1] = raid_gen2_avx2;
+#ifdef CONFIG_X86_64
+ raid_gen3_ptr = raid_gen3_avx2ext;
+ raid_genz_ptr = raid_genz_avx2ext;
+ raid_gen_ptr[3] = raid_gen4_avx2ext;
+ raid_gen_ptr[4] = raid_gen5_avx2ext;
+ raid_gen_ptr[5] = raid_gen6_avx2ext;
+#endif
+ raid_rec_ptr[0] = raid_rec1_avx2;
+ raid_rec_ptr[1] = raid_rec2_avx2;
+ raid_rec_ptr[2] = raid_recX_avx2;
+ raid_rec_ptr[3] = raid_recX_avx2;
+ raid_rec_ptr[4] = raid_recX_avx2;
+ raid_rec_ptr[5] = raid_recX_avx2;
+ }
+#endif
+#endif /* CONFIG_X86 */
+
+ /* set the default mode */
+ raid_mode(RAID_MODE_CAUCHY);
+}
+
+/*
+ * Reference parity computation.
+ */
+void raid_gen_ref(int nd, int np, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ size_t i;
+
+ for (i = 0; i < size; ++i) {
+ uint8_t p[RAID_PARITY_MAX];
+ int j, d;
+
+ for (j = 0; j < np; ++j)
+ p[j] = 0;
+
+ for (d = 0; d < nd; ++d) {
+ uint8_t b = v[d][i];
+
+ for (j = 0; j < np; ++j)
+ p[j] ^= gfmul[b][gfgen[j][d]];
+ }
+
+ for (j = 0; j < np; ++j)
+ v[nd + j][i] = p[j];
+ }
+}
+
+/*
+ * Size of the blocks to test.
+ */
+#define TEST_SIZE 4096
+
+/*
+ * Number of data blocks to test.
+ */
+#define TEST_COUNT (65536 / TEST_SIZE)
+
+/*
+ * Parity generation test.
+ */
+static int raid_test_par(int nd, int np, size_t size, void **v, void **ref)
+{
+ int i;
+ void *t[TEST_COUNT + RAID_PARITY_MAX];
+
+ /* setup data */
+ for (i = 0; i < nd; ++i)
+ t[i] = ref[i];
+
+ /* setup parity */
+ for (i = 0; i < np; ++i)
+ t[nd + i] = v[nd + i];
+
+ raid_gen(nd, np, size, t);
+
+ /* compare parity */
+ for (i = 0; i < np; ++i) {
+ if (memcmp(t[nd + i], ref[nd + i], size) != 0) {
+ /* LCOV_EXCL_START */
+ return -1;
+ /* LCOV_EXCL_STOP */
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Recovering test.
+ */
+static int raid_test_rec(int nr, int *ir, int nd, int np, size_t size, void **v, void **ref)
+{
+ int i, j;
+ void *t[TEST_COUNT + RAID_PARITY_MAX];
+
+ /* setup data and parity vector */
+ for (i = 0, j = 0; i < nd + np; ++i) {
+ if (j < nr && ir[j] == i) {
+ /* this block has to be recovered */
+ t[i] = v[i];
+ ++j;
+ } else {
+ /* this block is used for recovering */
+ t[i] = ref[i];
+ }
+ }
+
+ raid_rec(nr, ir, nd, np, size, t);
+
+ /* compare all data and parity */
+ for (i = 0; i < nd + np; ++i) {
+ if (t[i] != ref[i]
+ && memcmp(t[i], ref[i], size) != 0) {
+ /* LCOV_EXCL_START */
+ return -1;
+ /* LCOV_EXCL_STOP */
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Recovering test for data.
+ */
+static int raid_test_data(int nr, int *id, int *ip, int nd, int np, size_t size, void **v, void **ref)
+{
+ int i, j;
+ void *t[TEST_COUNT + RAID_PARITY_MAX];
+
+ /* setup data vector */
+ for (i = 0, j = 0; i < nd; ++i) {
+ if (j < nr && id[j] == i) {
+ /* this block has to be recovered */
+ t[i] = v[i];
+ ++j;
+ } else {
+ /* this block is left unchanged */
+ t[i] = ref[i];
+ }
+ }
+
+ /* setup parity vector */
+ for (i = 0, j = 0; i < np; ++i) {
+ if (j < nr && ip[j] == i) {
+ /* this block is used for recovering */
+ t[nd + i] = ref[nd + i];
+ ++j;
+ } else {
+ /* this block should not be read or written */
+ t[nd + i] = 0;
+ }
+ }
+
+ raid_data(nr, id, ip, nd, size, t);
+
+ /* compare all data and parity */
+ for (i = 0; i < nd; ++i) {
+ if (t[i] != ref[i]
+ && t[i] != 0
+ && memcmp(t[i], ref[i], size) != 0) {
+ /* LCOV_EXCL_START */
+ return -1;
+ /* LCOV_EXCL_STOP */
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Scan test.
+ */
+static int raid_test_scan(int nr, int *ir, int nd, int np, size_t size, void **v, void **ref)
+{
+ int i, j, ret;
+ void *t[TEST_COUNT + RAID_PARITY_MAX];
+ int is[RAID_PARITY_MAX];
+
+ /* setup data and parity vector */
+ for (i = 0, j = 0; i < nd + np; ++i) {
+ if (j < nr && ir[j] == i) {
+ /* this block is bad */
+ t[i] = v[i];
+ ++j;
+ } else {
+ /* this block is used for recovering */
+ t[i] = ref[i];
+ }
+ }
+
+ ret = raid_scan(is, nd, np, size, t);
+
+ /* compare identified bad blocks */
+ if (ret != nr)
+ return -1;
+ for (i = 0; i < nr; ++i) {
+ if (ir[i] != is[i]) {
+ /* LCOV_EXCL_START */
+ return -1;
+ /* LCOV_EXCL_STOP */
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Basic functionality self test.
+ */
+int raid_selftest(void)
+{
+ const int nd = TEST_COUNT;
+ const size_t size = TEST_SIZE;
+ const int nv = nd + RAID_PARITY_MAX * 2 + 1;
+ void *v_alloc;
+ void **v;
+ void *ref[nd + RAID_PARITY_MAX];
+ int ir[RAID_PARITY_MAX];
+ int ip[RAID_PARITY_MAX];
+ int i, np;
+ int ret = 0;
+
+ /* ensure to have enough space for data */
+ BUG_ON(nd * size > 65536);
+
+ v = raid_malloc_vector(nd, nv, size, &v_alloc);
+ if (!v) {
+ /* LCOV_EXCL_START */
+ return -1;
+ /* LCOV_EXCL_STOP */
+ }
+
+ memset(v[nv - 1], 0, size);
+ raid_zero(v[nv - 1]);
+
+ /* use the multiplication table as data */
+ for (i = 0; i < nd; ++i)
+ ref[i] = ((uint8_t *)gfmul) + size * i;
+
+ /* setup reference parity */
+ for (i = 0; i < RAID_PARITY_MAX; ++i)
+ ref[nd + i] = v[nd + RAID_PARITY_MAX + i];
+
+ /* compute reference parity */
+ raid_gen_ref(nd, RAID_PARITY_MAX, size, ref);
+
+ /* test for each parity level */
+ for (np = 1; np <= RAID_PARITY_MAX; ++np) {
+ /* test parity generation */
+ ret = raid_test_par(nd, np, size, v, ref);
+ if (ret != 0) {
+ /* LCOV_EXCL_START */
+ goto bail;
+ /* LCOV_EXCL_STOP */
+ }
+
+ /* test recovering with broken ending data disks */
+ for (i = 0; i < np; ++i) {
+ /* bad data */
+ ir[i] = nd - np + i;
+
+ /* good parity */
+ ip[i] = i;
+ }
+
+ ret = raid_test_rec(np, ir, nd, np, size, v, ref);
+ if (ret != 0) {
+ /* LCOV_EXCL_START */
+ goto bail;
+ /* LCOV_EXCL_STOP */
+ }
+
+ ret = raid_test_data(np, ir, ip, nd, np, size, v, ref);
+ if (ret != 0) {
+ /* LCOV_EXCL_START */
+ goto bail;
+ /* LCOV_EXCL_STOP */
+ }
+
+ /* test recovering with broken leading data and broken leading parity */
+ for (i = 0; i < np / 2; ++i) {
+ /* bad data */
+ ir[i] = i;
+
+ /* good parity */
+ ip[i] = (np + 1) / 2 + i;
+ }
+
+ /* bad parity */
+ for (i = 0; i < (np + 1) / 2; ++i)
+ ir[np / 2 + i] = nd + i;
+
+ ret = raid_test_rec(np, ir, nd, np, size, v, ref);
+ if (ret != 0) {
+ /* LCOV_EXCL_START */
+ goto bail;
+ /* LCOV_EXCL_STOP */
+ }
+
+ ret = raid_test_data(np / 2, ir, ip, nd, np, size, v, ref);
+ if (ret != 0) {
+ /* LCOV_EXCL_START */
+ goto bail;
+ /* LCOV_EXCL_STOP */
+ }
+
+ /* test recovering with broken leading data and broken ending parity */
+ for (i = 0; i < np / 2; ++i) {
+ /* bad data */
+ ir[i] = i;
+
+ /* good parity */
+ ip[i] = i;
+ }
+
+ /* bad parity */
+ for (i = 0; i < (np + 1) / 2; ++i)
+ ir[np / 2 + i] = nd + np - (np + 1) / 2 + i;
+
+ ret = raid_test_rec(np, ir, nd, np, size, v, ref);
+ if (ret != 0) {
+ /* LCOV_EXCL_START */
+ goto bail;
+ /* LCOV_EXCL_STOP */
+ }
+
+ ret = raid_test_data(np / 2, ir, ip, nd, np, size, v, ref);
+ if (ret != 0) {
+ /* LCOV_EXCL_START */
+ goto bail;
+ /* LCOV_EXCL_STOP */
+ }
+
+ /* scan test with broken data and parity */
+ for (i = 0; i < np / 2; ++i) {
+ /* bad data */
+ ir[i] = i;
+ }
+ for (i = 0; i < (np - 1) / 2; ++i) {
+ /* bad parity */
+ ir[np / 2 + i] = nd + i;
+ }
+ for (i = 0; i < np - 1; ++i) {
+ /* make blocks bad */
+ /* we cannot fill them with 0, because the original */
+ /* data may be already filled with 0 */
+ memset(v[ir[i]], 0x55, size);
+ }
+
+ ret = raid_test_scan(np - 1, ir, nd, np, size, v, ref);
+ if (ret != 0) {
+ /* LCOV_EXCL_START */
+ goto bail;
+ /* LCOV_EXCL_STOP */
+ }
+ }
+
+ /* scan test with no parity */
+ ret = raid_test_scan(0, 0, nd, 0, size, v, ref);
+ if (ret != -1) {
+ /* LCOV_EXCL_START */
+ goto bail;
+ /* LCOV_EXCL_STOP */
+ }
+
+ ret = 0;
+
+bail:
+ free(v);
+ free(v_alloc);
+
+ return ret;
+}
+
diff --git a/raid/raid.c b/raid/raid.c
new file mode 100644
index 00000000..3052675f
--- /dev/null
+++ b/raid/raid.c
@@ -0,0 +1,586 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "internal.h"
+#include "gf.h"
+
+/*
+ * This is a RAID implementation working in the Galois Field GF(2^8) with
+ * the primitive polynomial x^8 + x^4 + x^3 + x^2 + 1 (285 decimal), and
+ * supporting up to six parity levels.
+ *
+ * For RAID5 and RAID6 it works as described in H. Peter Anvin's
+ * paper "The mathematics of RAID-6" [1]. Please refer to this paper for a
+ * complete explanation.
+ *
+ * To support triple parity, an extension of the same approach was first
+ * evaluated and then dropped; it used additional parity coefficients set
+ * as powers of 2^-1, with equations:
+ *
+ * P = sum(Di)
+ * Q = sum(2^i * Di)
+ * R = sum(2^-i * Di) with 0<=i<N
+ *
+ * This approach works well for triple parity and it's very efficient,
+ * because we can implement very fast parallel multiplications and
+ * divisions by 2 in GF(2^8).
+ *
+ * It's also similar to the approach used by ZFS RAIDZ3, with the
+ * difference that ZFS uses powers of 4 instead of 2^-1.
+ *
+ * Unfortunately it doesn't work beyond triple parity, because whatever
+ * value we choose to generate the power coefficients to compute other
+ * parities, the resulting equations are not solvable for some
+ * combinations of missing disks.
+ *
+ * This is expected, because the Vandermonde matrix used to compute the
+ * parity is not guaranteed to have all submatrices non-singular
+ * [2, Chap 11, Problem 7], and this is a requirement to have
+ * an MDS (Maximum Distance Separable) code [2, Chap 11, Theorem 8].
+ *
+ * To overcome this limitation, we use a Cauchy matrix [3][4] to compute
+ * the parity. A Cauchy matrix has the property that all of its square
+ * submatrices are non-singular, resulting in equations that are always
+ * solvable for any combination of missing disks.
+ *
+ * The drawback of this approach is that it requires generic
+ * multiplications, and not only by 2 or 2^-1, potentially
+ * hurting performance badly.
+ *
+ * Fortunately, there is a method to implement parallel multiplications
+ * using SSSE3 or AVX2 instructions [1][5] that is competitive with the
+ * computation of triple parity using power coefficients.
+ *
+ * Another important property of the Cauchy matrix is that we can set up
+ * the first two rows with coefficients equal to the RAID5 and RAID6
+ * approach described above, resulting in a compatible extension that
+ * requires SSSE3 or AVX2 instructions only if triple parity or beyond
+ * is used.
+ *
+ * The matrix is also adjusted, multiplying each row by a constant factor
+ * to make the first column all 1s, to optimize the computation for
+ * the first disk.
+ *
+ * This results in the matrix A[row,col] defined as:
+ *
+ * 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01...
+ * 01 02 04 08 10 20 40 80 1d 3a 74 e8 cd 87 13 26 4c 98 2d 5a b4 75...
+ * 01 f5 d2 c4 9a 71 f1 7f fc 87 c1 c6 19 2f 40 55 3d ba 53 04 9c 61...
+ * 01 bb a6 d7 c7 07 ce 82 4a 2f a5 9b b6 60 f1 ad e7 f4 06 d2 df 2e...
+ * 01 97 7f 9c 7c 18 bd a2 58 1a da 74 70 a3 e5 47 29 07 f5 80 23 e9...
+ * 01 2b 3f cf 73 2c d6 ed cb 74 15 78 8a c1 17 c9 89 68 21 ab 76 3b...
+ *
+ * This matrix supports 6 levels of parity, one for each row, for up to 251
+ * data disks, one for each column, with all the 377,342,351,231 square
+ * submatrices non-singular, also verified with brute force.
+ *
+ * This matrix can be extended to support any number of parities, just by
+ * adding additional rows and removing one column for each new row.
+ * (see mktables.c for more details on how the matrix is generated)
+ *
+ * In details, parity is computed as:
+ *
+ * P = sum(Di)
+ * Q = sum(2^i * Di)
+ * R = sum(A[2,i] * Di)
+ * S = sum(A[3,i] * Di)
+ * T = sum(A[4,i] * Di)
+ * U = sum(A[5,i] * Di) with 0<=i<N
+ *
+ * To recover from a failure of six disks at indexes x,y,z,h,v,w,
+ * with 0<=x<y<z<h<v<w<N, we compute the parity of the available N-6
+ * disks as:
+ *
+ * Pa = sum(Di)
+ * Qa = sum(2^i * Di)
+ * Ra = sum(A[2,i] * Di)
+ * Sa = sum(A[3,i] * Di)
+ * Ta = sum(A[4,i] * Di)
+ * Ua = sum(A[5,i] * Di) with 0<=i<N,i!=x,i!=y,i!=z,i!=h,i!=v,i!=w.
+ *
+ * And if we define:
+ *
+ * Pd = Pa + P
+ * Qd = Qa + Q
+ * Rd = Ra + R
+ * Sd = Sa + S
+ * Td = Ta + T
+ * Ud = Ua + U
+ *
+ * we can sum these two sets of equations, obtaining:
+ *
+ * Pd = Dx + Dy + Dz + Dh + Dv + Dw
+ * Qd = 2^x * Dx + 2^y * Dy + 2^z * Dz + 2^h * Dh + 2^v * Dv + 2^w * Dw
+ * Rd = A[2,x] * Dx + A[2,y] * Dy + A[2,z] * Dz + A[2,h] * Dh + A[2,v] * Dv + A[2,w] * Dw
+ * Sd = A[3,x] * Dx + A[3,y] * Dy + A[3,z] * Dz + A[3,h] * Dh + A[3,v] * Dv + A[3,w] * Dw
+ * Td = A[4,x] * Dx + A[4,y] * Dy + A[4,z] * Dz + A[4,h] * Dh + A[4,v] * Dv + A[4,w] * Dw
+ * Ud = A[5,x] * Dx + A[5,y] * Dy + A[5,z] * Dz + A[5,h] * Dh + A[5,v] * Dv + A[5,w] * Dw
+ *
+ * This is a linear system that is always solvable, because the coefficient
+ * matrix is never singular due to the properties of the matrix A[].
+ *
+ * Resulting speed in x64, with 8 data disks, using a stripe of 256 KiB,
+ * for a Core i5-4670K Haswell Quad-Core 3.4GHz is:
+ *
+ * int8 int32 int64 sse2 ssse3 avx2
+ * gen1 13339 25438 45438 50588
+ * gen2 4115 6514 21840 32201
+ * gen3 814 10154 18613
+ * gen4 620 7569 14229
+ * gen5 496 5149 10051
+ * gen6 413 4239 8190
+ *
+ * Values are in MiB/s of data processed by a single thread, not counting
+ * generated parity.
+ *
+ * You can replicate these results in your machine using the
+ * "raid/test/speedtest.c" program.
+ *
+ * For comparison, the triple parity computation using the power
+ * coefficients "1,2,2^-1" is only a little faster than the one based on
+ * the Cauchy matrix if SSSE3 or AVX2 is present.
+ *
+ * int8 int32 int64 sse2 ssse3 avx2
+ * genz 2337 2874 10920 18944
+ *
+ * In conclusion, the use of power coefficients, and specifically powers
+ * of 1,2,2^-1, is the best option to implement triple parity in CPUs
+ * without SSSE3 and AVX2.
+ * But if a modern CPU with SSSE3 or AVX2 is available, the Cauchy
+ * matrix is the best option because it provides a fast and general
+ * approach working for any number of parities.
+ *
+ * References:
+ * [1] Anvin, "The mathematics of RAID-6", 2004
+ * [2] MacWilliams, Sloane, "The Theory of Error-Correcting Codes", 1977
+ * [3] Blomer, "An XOR-Based Erasure-Resilient Coding Scheme", 1995
+ * [4] Roth, "Introduction to Coding Theory", 2006
+ * [5] Plank, "Screaming Fast Galois Field Arithmetic Using Intel SIMD Instructions", 2013
+ */
+
+/**
+ * Generator matrix currently used.
+ */
+const uint8_t (*raid_gfgen)[256];
+
+void raid_mode(int mode)
+{
+ if (mode == RAID_MODE_VANDERMONDE) {
+ raid_gen_ptr[2] = raid_genz_ptr;
+ raid_gfgen = gfvandermonde;
+ } else {
+ raid_gen_ptr[2] = raid_gen3_ptr;
+ raid_gfgen = gfcauchy;
+ }
+}
+
+/**
+ * Buffer filled with 0 used in recovering.
+ */
+static void *raid_zero_block;
+
+void raid_zero(void *zero)
+{
+ raid_zero_block = zero;
+}
+
+/*
+ * Forwarders for parity computation.
+ *
+ * These functions compute the parity blocks from the provided data.
+ *
+ * The number of parities to compute is implicit in the position in the
+ * forwarder vector: the function at index #i computes (#i+1) parities.
+ *
+ * All these functions guarantee that parities are written
+ * in order: first parity P, then parity Q, and so on.
+ * This allows specifying the same memory buffer for multiple parities,
+ * knowing that you'll get the latest written one.
+ * This property is used by the raid_delta_gen() function to
+ * avoid damaging unused parities while recovering.
+ *
+ * @nd Number of data blocks
+ * @size Size of the blocks pointed by @v. It must be a multiple of 64.
+ * @v Vector of pointers to the blocks of data and parity.
+ * It has (@nd + #parities) elements. The starting elements are the blocks
+ * for data, following with the parity blocks.
+ * Each block has @size bytes.
+ */
+void (*raid_gen_ptr[RAID_PARITY_MAX])(int nd, size_t size, void **vv);
+void (*raid_gen3_ptr)(int nd, size_t size, void **vv);
+void (*raid_genz_ptr)(int nd, size_t size, void **vv);
+
+void raid_gen(int nd, int np, size_t size, void **v)
+{
+ /* enforce limit on size */
+ BUG_ON(size % 64 != 0);
+
+ /* enforce limit on number of failures */
+ BUG_ON(np < 1);
+ BUG_ON(np > RAID_PARITY_MAX);
+
+ raid_gen_ptr[np - 1](nd, size, v);
+}
+
+/**
+ * Inverts the square matrix M of size nxn into V.
+ *
+ * This is not a general matrix inversion, because we assume that all the
+ * square submatrices of M are non-singular.
+ * We use Gauss elimination to invert it.
+ *
+ * @M Matrix to invert with @n rows and @n columns.
+ * @V Destination matrix where the result is put.
+ * @n Number of rows and columns of the matrix.
+ */
+void raid_invert(uint8_t *M, uint8_t *V, int n)
+{
+ int i, j, k;
+
+ /* set the identity matrix in V */
+ for (i = 0; i < n; ++i)
+ for (j = 0; j < n; ++j)
+ V[i * n + j] = i == j;
+
+ /* for each element in the diagonal */
+ for (k = 0; k < n; ++k) {
+ uint8_t f;
+
+ /* the diagonal element cannot be 0 because */
+ /* we are inverting matrices with all the square */
+ /* submatrices not singular */
+ BUG_ON(M[k * n + k] == 0);
+
+ /* make the diagonal element to be 1 */
+ f = inv(M[k * n + k]);
+ for (j = 0; j < n; ++j) {
+ M[k * n + j] = mul(f, M[k * n + j]);
+ V[k * n + j] = mul(f, V[k * n + j]);
+ }
+
+ /* make all the elements over and under the diagonal */
+ /* to be zero */
+ for (i = 0; i < n; ++i) {
+ if (i == k)
+ continue;
+ f = M[i * n + k];
+ for (j = 0; j < n; ++j) {
+ M[i * n + j] ^= mul(f, M[k * n + j]);
+ V[i * n + j] ^= mul(f, V[k * n + j]);
+ }
+ }
+ }
+}
+
+/**
+ * Computes the parity without the missing data blocks
+ * and stores it in the buffers of those data blocks.
+ *
+ * This is the parity expressed as Pa,Qa,Ra,Sa,Ta,Ua in the equations.
+ */
+void raid_delta_gen(int nr, int *id, int *ip, int nd, size_t size, void **v)
+{
+ void *p[RAID_PARITY_MAX];
+ void *pa[RAID_PARITY_MAX];
+ int i, j;
+ int np;
+ void *latest;
+
+ /* total number of parities we are going to process */
+ /* they are both the used and the unused ones */
+ np = ip[nr - 1] + 1;
+
+ /* latest missing data block */
+ latest = v[id[nr - 1]];
+
+ /* setup pointers for delta computation */
+ for (i = 0, j = 0; i < np; ++i) {
+ /* keep a copy of the original parity vector */
+ p[i] = v[nd + i];
+
+ if (ip[j] == i) {
+ /*
+ * Set used parities to point to the missing
+ * data blocks.
+ *
+ * The related data blocks are instead set
+ * to point to the "zero" buffer.
+ */
+
+ /* the last parity to use ends the for loop, so */
+ /* we can never process more than nr of them */
+ BUG_ON(j >= nr);
+
+ /* buffer for missing data blocks */
+ pa[j] = v[id[j]];
+
+ /* set at zero the missing data blocks */
+ v[id[j]] = raid_zero_block;
+
+ /* compute the parity over the missing data blocks */
+ v[nd + i] = pa[j];
+
+ /* check for the next used entry */
+ ++j;
+ } else {
+ /*
+ * Unused parities are going to be rewritten with
+ * meaningless data, because we don't have
+ * functions able to compute only a subset of
+ * the parities.
+ *
+ * To avoid this, we reuse parity buffers,
+ * assuming that all the parity functions write
+ * parities in order.
+ *
+ * We assign the unused parity block to the same
+ * buffer as the latest used parity, which we know
+ * will be written.
+ *
+ * This means that this block will be written
+ * multiple times and only the latest write will
+ * contain the correct data.
+ */
+ v[nd + i] = latest;
+ }
+ }
+
+ /* all the parities have to be processed */
+ BUG_ON(j != nr);
+
+ /* recompute the parity, note that np may be smaller than the */
+ /* total number of parities available */
+ raid_gen(nd, np, size, v);
+
+ /* restore data buffers as before */
+ for (j = 0; j < nr; ++j)
+ v[id[j]] = pa[j];
+
+ /* restore parity buffers as before */
+ for (i = 0; i < np; ++i)
+ v[nd + i] = p[i];
+}
+
+/**
+ * Recover failure of one data block for PAR1.
+ *
+ * Starting from the equation:
+ *
+ * Pd = Dx
+ *
+ * and solving we get:
+ *
+ * Dx = Pd
+ */
+void raid_rec1of1(int *id, int nd, size_t size, void **v)
+{
+ void *p;
+ void *pa;
+
+ /* for PAR1 we can directly compute the missing block */
+ /* and we don't need to use the zero buffer */
+ p = v[nd];
+ pa = v[id[0]];
+
+ /* use the parity as missing data block */
+ v[id[0]] = p;
+
+ /* compute the parity over the missing data block */
+ v[nd] = pa;
+
+ /* compute */
+ raid_gen(nd, 1, size, v);
+
+ /* restore as before */
+ v[id[0]] = pa;
+ v[nd] = p;
+}
+
+/**
+ * Recover failure of two data blocks for PAR2.
+ *
+ * Starting from the equations:
+ *
+ * Pd = Dx + Dy
+ * Qd = 2^id[0] * Dx + 2^id[1] * Dy
+ *
+ * and solving we get:
+ *
+ * 1 2^(-id[0])
+ * Dy = ------------------- * Pd + ------------------- * Qd
+ * 2^(id[1]-id[0]) + 1 2^(id[1]-id[0]) + 1
+ *
+ * Dx = Dy + Pd
+ *
+ * with conditions:
+ *
+ * 2^id[0] != 0
+ * 2^(id[1]-id[0]) + 1 != 0
+ *
+ * These are always satisfied for any 0<=id[0]<id[1]<255.
+ */
+void raid_rec2of2_int8(int *id, int *ip, int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ size_t i;
+ uint8_t *p;
+ uint8_t *pa;
+ uint8_t *q;
+ uint8_t *qa;
+ const uint8_t *T[2];
+
+ /* get multiplication tables */
+ T[0] = table(inv(pow2(id[1] - id[0]) ^ 1));
+ T[1] = table(inv(pow2(id[0]) ^ pow2(id[1])));
+
+ /* compute delta parity */
+ raid_delta_gen(2, id, ip, nd, size, vv);
+
+ p = v[nd];
+ q = v[nd + 1];
+ pa = v[id[0]];
+ qa = v[id[1]];
+
+ for (i = 0; i < size; ++i) {
+ /* delta */
+ uint8_t Pd = p[i] ^ pa[i];
+ uint8_t Qd = q[i] ^ qa[i];
+
+ /* reconstruct */
+ uint8_t Dy = T[0][Pd] ^ T[1][Qd];
+ uint8_t Dx = Pd ^ Dy;
+
+ /* set */
+ pa[i] = Dx;
+ qa[i] = Dy;
+ }
+}
+
+/*
+ * Forwarders for data recovery.
+ *
+ * These functions recover data blocks using the specified parity
+ * to recompute the missing data.
+ *
+ * Note that the format of the vectors @id/@ip is different from the one
+ * used by raid_rec(). For example, in the vector @ip the first parity is
+ * represented with the value 0 and not @nd.
+ *
+ * @nr Number of failed data blocks to recover.
+ * @id[] Vector of @nr indexes of the data blocks to recover.
+ * The indexes start from 0. They must be in order.
+ * @ip[] Vector of @nr indexes of the parity blocks to use in the recovering.
+ * The indexes start from 0. They must be in order.
+ * @nd Number of data blocks.
+ * @np Number of parity blocks.
+ * @size Size of the blocks pointed by @v. It must be a multiple of 64.
+ * @v Vector of pointers to the blocks of data and parity.
+ * It has (@nd + @np) elements. The starting elements are the blocks
+ * for data, following with the parity blocks.
+ * Each block has @size bytes.
+ */
+void (*raid_rec_ptr[RAID_PARITY_MAX])(
+ int nr, int *id, int *ip, int nd, size_t size, void **vv);
+
+void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
+{
+ int nrd; /* number of data blocks to recover */
+ int nrp; /* number of parity blocks to recover */
+
+ /* enforce limit on size */
+ BUG_ON(size % 64 != 0);
+
+ /* enforce limit on number of failures */
+ BUG_ON(nr > np);
+ BUG_ON(np > RAID_PARITY_MAX);
+
+ /* enforce order in index vector */
+ BUG_ON(nr >= 2 && ir[0] >= ir[1]);
+ BUG_ON(nr >= 3 && ir[1] >= ir[2]);
+ BUG_ON(nr >= 4 && ir[2] >= ir[3]);
+ BUG_ON(nr >= 5 && ir[3] >= ir[4]);
+ BUG_ON(nr >= 6 && ir[4] >= ir[5]);
+
+ /* enforce limit on index vector */
+ BUG_ON(nr > 0 && ir[nr-1] >= nd + np);
+
+ /* count the number of data blocks to recover */
+ nrd = 0;
+ while (nrd < nr && ir[nrd] < nd)
+ ++nrd;
+
+ /* all the remaining are parity */
+ nrp = nr - nrd;
+
+ /* enforce limit on number of failures */
+ BUG_ON(nrd > nd);
+ BUG_ON(nrp > np);
+
+ /* if failed data is present */
+ if (nrd != 0) {
+ int ip[RAID_PARITY_MAX];
+ int i, j, k;
+
+ /* setup the vector of parities to use */
+ for (i = 0, j = 0, k = 0; i < np; ++i) {
+ if (j < nrp && ir[nrd + j] == nd + i) {
+ /* this parity has to be recovered */
+ ++j;
+ } else {
+ /* this parity is used for recovering */
+ ip[k] = i;
+ ++k;
+ }
+ }
+
+ /* recover the nrd data blocks specified in ir[], */
+ /* using the first nrd parity in ip[] for recovering */
+ raid_rec_ptr[nrd - 1](nrd, ir, ip, nd, size, v);
+ }
+
+ /* recompute all the parities up to the last bad one */
+ if (nrp != 0)
+ raid_gen(nd, ir[nr - 1] - nd + 1, size, v);
+}
+
+void raid_data(int nr, int *id, int *ip, int nd, size_t size, void **v)
+{
+ /* enforce limit on size */
+ BUG_ON(size % 64 != 0);
+
+ /* enforce limit on number of failures */
+ BUG_ON(nr > nd);
+ BUG_ON(nr > RAID_PARITY_MAX);
+
+ /* enforce order in index vector for data */
+ BUG_ON(nr >= 2 && id[0] >= id[1]);
+ BUG_ON(nr >= 3 && id[1] >= id[2]);
+ BUG_ON(nr >= 4 && id[2] >= id[3]);
+ BUG_ON(nr >= 5 && id[3] >= id[4]);
+ BUG_ON(nr >= 6 && id[4] >= id[5]);
+
+ /* enforce limit on index vector for data */
+ BUG_ON(nr > 0 && id[nr-1] >= nd);
+
+ /* enforce order in index vector for parity */
+ BUG_ON(nr >= 2 && ip[0] >= ip[1]);
+ BUG_ON(nr >= 3 && ip[1] >= ip[2]);
+ BUG_ON(nr >= 4 && ip[2] >= ip[3]);
+ BUG_ON(nr >= 5 && ip[3] >= ip[4]);
+ BUG_ON(nr >= 6 && ip[4] >= ip[5]);
+
+ /* if failed data is present */
+ if (nr != 0)
+ raid_rec_ptr[nr - 1](nr, id, ip, nd, size, v);
+}
+
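For reference, the gfmul[][] lookups used throughout raid.c and int8.c are a full multiplication table for GF(2^8) over the polynomial x^8 + x^4 + x^3 + x^2 + 1 (0x11d) described in the header comment above. A scalar sketch of the same product, which should agree with a raid_gfmul[a][b] lookup (illustrative only, not part of the patch):

#include <stdint.h>

/* carry-less "Russian peasant" multiplication in GF(2^8), reducing
 * modulo 0x11d whenever the intermediate term overflows 8 bits */
static uint8_t gf_mul(uint8_t a, uint8_t b)
{
	uint8_t p = 0;

	while (b) {
		if (b & 1)
			p ^= a;  /* add the current term */
		a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
		b >>= 1;
	}
	return p;
}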
diff --git a/raid/raid.h b/raid/raid.h
new file mode 100644
index 00000000..aeeb39f3
--- /dev/null
+++ b/raid/raid.h
@@ -0,0 +1,229 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __RAID_H
+#define __RAID_H
+
+/**
+ * RAID mode supporting up to 6 parities.
+ *
+ * It requires SSSE3 to get good performance with triple or more parities.
+ *
+ * This is the default mode set after calling raid_init().
+ */
+#define RAID_MODE_CAUCHY 0
+
+/**
+ * RAID mode supporting up to 3 parities.
+ *
+ * It has a fast triple parity implementation without SSSE3, but it cannot
+ * go beyond triple parity.
+ *
+ * This is mostly intended for low end CPUs like ARM and AMD Athlon.
+ */
+#define RAID_MODE_VANDERMONDE 1
+
+/**
+ * Maximum number of parity disks supported.
+ */
+#define RAID_PARITY_MAX 6
+
+/**
+ * Maximum number of data disks supported.
+ */
+#define RAID_DATA_MAX 251
+
+/**
+ * Initializes the RAID system.
+ *
+ * You must call this function before any other.
+ *
+ * The RAID system is initialized in the RAID_MODE_CAUCHY mode.
+ */
+void raid_init(void);
+
+/**
+ * Runs a basic functionality self test.
+ *
+ * The test is immediate, and it's intended to be run at application
+ * startup to check the integrity of the RAID system.
+ *
+ * It returns 0 on success.
+ */
+int raid_selftest(void);
+
+/**
+ * Sets the mode to use. One of RAID_MODE_*.
+ *
+ * You can change mode at any time, and it will affect next calls to raid_gen(),
+ * raid_rec() and raid_data().
+ *
+ * The two modes are compatible for the first two levels of parity.
+ * The third one is different.
+ */
+void raid_mode(int mode);
+
+/**
+ * Sets the zero buffer to use in recovering.
+ *
+ * Before calling raid_rec() and raid_data() you must provide a memory
+ * buffer filled with zeros, of the same size as the blocks to recover.
+ *
+ * This buffer is only read and never written.
+ */
+void raid_zero(void *zero);
+
+/**
+ * Computes parity blocks.
+ *
+ * This function computes the specified number of parity blocks of the
+ * provided set of data blocks.
+ *
+ * Each parity block allows recovering one data block.
+ *
+ * @nd Number of data blocks.
+ * @np Number of parity blocks to compute.
+ * @size Size of the blocks pointed by @v. It must be a multiple of 64.
+ * @v Vector of pointers to the blocks of data and parity.
+ * It has (@nd + @np) elements. The starting elements are the blocks for
+ * data, following with the parity blocks.
+ * Data blocks are only read and not modified. Parity blocks are written.
+ * Each block has @size bytes.
+ */
+void raid_gen(int nd, int np, size_t size, void **v);
+
+/**
+ * Recovers failures in data and parity blocks.
+ *
+ * This function recovers all the data and parity blocks marked as bad
+ * in the @ir vector.
+ *
+ * Ensure that @nr <= @np, otherwise recovering is not possible.
+ *
+ * The parity blocks used for recovering are automatically selected from
+ * the ones NOT present in the @ir vector.
+ *
+ * In case there are more parity blocks than needed, the parities at lower
+ * indexes are used for recovering, and the others are ignored.
+ *
+ * Note that no internal integrity check is done when recovering. If the
+ * provided parities are correct, the resulting data will be correct.
+ * If parities are wrong, the resulting recovered data will be wrong.
+ * This happens even when you have more parity blocks than needed and
+ * some form of integrity verification would be possible.
+ *
+ * @nr Number of failed data and parity blocks to recover.
+ * @ir[] Vector of @nr indexes of the failed data and parity blocks.
+ * The indexes start from 0. They must be in order.
+ * The first parity is represented with value @nd, the second with value
+ * @nd + 1, just like positions in the @v vector.
+ * @nd Number of data blocks.
+ * @np Number of parity blocks.
+ * @size Size of the blocks pointed by @v. It must be a multiple of 64.
+ * @v Vector of pointers to the blocks of data and parity.
+ * It has (@nd + @np) elements. The starting elements are the blocks
+ * for data, following with the parity blocks.
+ * Each block has @size bytes.
+ */
+void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v);
+
+/**
+ * Recovers failures in data blocks only.
+ *
+ * This function recovers all the data blocks marked as bad in the @id vector.
+ * The parity blocks are not modified.
+ *
+ * @nr Number of failed data blocks to recover.
+ * @id[] Vector of @nr indexes of the data blocks to recover.
+ * The indexes start from 0. They must be in order.
+ * @ip[] Vector of @nr indexes of the parity blocks to use for recovering.
+ * The indexes start from 0. They must be in order.
+ * @nd Number of data blocks.
+ * @size Size of the blocks pointed by @v. It must be a multiple of 64.
+ * @v Vector of pointers to the blocks of data and parity.
+ * It has (@nd + @ip[@nr - 1] + 1) elements. The starting elements are the
+ * blocks for data, following with the parity blocks.
+ * Each block has @size bytes.
+ */
+void raid_data(int nr, int *id, int *ip, int nd, size_t size, void **v);
+
+/**
+ * Check the provided failed blocks combination.
+ *
+ * This function checks if the specified combination of failed blocks
+ * satisfies the redundancy information. A combination is assumed to match
+ * if the remaining valid parity matches the expected value after recovering.
+ *
+ * The number of failed blocks @nr must be strictly less than the number of
+ * parities @np, because you need one more parity to validate the recovering.
+ *
+ * No data or parity blocks are modified.
+ *
+ * @nr Number of failed data and parity blocks.
+ * @ir[] Vector of @nr indexes of the failed data and parity blocks.
+ * The indexes start from 0. They must be in order.
+ * The first parity is represented with value @nd, the second with value
+ * @nd + 1, just like positions in the @v vector.
+ * @nd Number of data blocks.
+ * @np Number of parity blocks.
+ * @size Size of the blocks pointed by @v. It must be a multiple of 64.
+ * @v Vector of pointers to the blocks of data and parity.
+ * It has (@nd + @np) elements. The starting elements are the blocks
+ * for data, following with the parity blocks.
+ * Each block has @size bytes.
+ * @return 0 if the check is satisfied. -1 otherwise.
+ */
+int raid_check(int nr, int *ir, int nd, int np, size_t size, void **v);
+
+/**
+ * Scan for failed blocks.
+ *
+ * This function identifies the failed data and parity blocks using the
+ * available redundancy.
+ *
+ * It uses a brute force method, so the call can be expensive.
+ * The expected execution time is proportional to the binomial coefficient
+ * (@np + @nd) choose (@np - 1), usually written as:
+ *
+ * ( @np + @nd )
+ * ( )
+ * ( @np - 1 )
+ *
+ * No data or parity blocks are modified.
+ *
+ * The failed block indexes are returned in the @ir vector.
+ * It must have space for at least @np - 1 values.
+ *
+ * The returned @ir vector can then be used in a raid_rec() call to recover
+ * the failed data and parity blocks.
+ *
+ * @ir[] Vector filled with the indexes of the failed data and parity blocks.
+ * The indexes start from 0 and they are in order.
+ * The first parity is represented with value @nd, the second with value
+ * @nd + 1, just like positions in the @v vector.
+ * @nd Number of data blocks.
+ * @np Number of parity blocks.
+ * @size Size of the blocks pointed by @v. It must be a multiple of 64.
+ * @v Vector of pointers to the blocks of data and parity.
+ * It has (@nd + @np) elements. The starting elements are the blocks
+ * for data, following with the parity blocks.
+ * Each block has @size bytes.
+ * @return Number of block indexes returned in the @ir vector.
+ * 0 if no error is detected.
+ * -1 if it's not possible to identify the failed disks.
+ */
+int raid_scan(int *ir, int nd, int np, size_t size, void **v);
+
+#endif
+
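A hypothetical end-to-end sketch of the public API declared above, loosely following the steps of raid_selftest(): initialize, provide a zero buffer, compute parity, then recover a lost data block. The allocation helpers come from raid/memory.h; the block counts and sizes are arbitrary examples, not part of the patch:

#include <stdlib.h>
#include <string.h>
/* assumes the raid/raid.h and raid/memory.h prototypes are in scope */

int roundtrip_example(void)
{
	enum { ND = 4, NP = 2 };
	size_t size = 4096;          /* must be a multiple of 64 */
	void *v_alloc, *zero_alloc;
	void **v;
	void *zero;
	int ir[1];

	raid_init();
	if (raid_selftest() != 0)
		return -1;

	v = raid_malloc_vector(ND, ND + NP, size, &v_alloc);
	zero = raid_malloc(size, &zero_alloc);
	if (!v || !zero)
		return -1;

	memset(zero, 0, size);
	raid_zero(zero);             /* zero buffer required for recovering */

	/* ... fill v[0..ND-1] with data ... */

	raid_gen(ND, NP, size, v);   /* parity written to v[ND..ND+NP-1] */

	/* pretend data block 2 was lost: recover it from the parity */
	ir[0] = 2;
	raid_rec(1, ir, ND, NP, size, v);

	free(v);
	free(v_alloc);
	free(zero_alloc);
	return 0;
}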
diff --git a/raid/tables.c b/raid/tables.c
new file mode 100644
index 00000000..49035022
--- /dev/null
+++ b/raid/tables.c
@@ -0,0 +1,14696 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "internal.h"
+
+const uint8_t __aligned(256) raid_gfmul[256][256] =
+{
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ },
+ {
+ 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e,
+ 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e,
+ 0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e,
+ 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e,
+ 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e,
+ 0x50, 0x52, 0x54, 0x56, 0x58, 0x5a, 0x5c, 0x5e,
+ 0x60, 0x62, 0x64, 0x66, 0x68, 0x6a, 0x6c, 0x6e,
+ 0x70, 0x72, 0x74, 0x76, 0x78, 0x7a, 0x7c, 0x7e,
+ 0x80, 0x82, 0x84, 0x86, 0x88, 0x8a, 0x8c, 0x8e,
+ 0x90, 0x92, 0x94, 0x96, 0x98, 0x9a, 0x9c, 0x9e,
+ 0xa0, 0xa2, 0xa4, 0xa6, 0xa8, 0xaa, 0xac, 0xae,
+ 0xb0, 0xb2, 0xb4, 0xb6, 0xb8, 0xba, 0xbc, 0xbe,
+ 0xc0, 0xc2, 0xc4, 0xc6, 0xc8, 0xca, 0xcc, 0xce,
+ 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, 0xda, 0xdc, 0xde,
+ 0xe0, 0xe2, 0xe4, 0xe6, 0xe8, 0xea, 0xec, 0xee,
+ 0xf0, 0xf2, 0xf4, 0xf6, 0xf8, 0xfa, 0xfc, 0xfe,
+ 0x1d, 0x1f, 0x19, 0x1b, 0x15, 0x17, 0x11, 0x13,
+ 0x0d, 0x0f, 0x09, 0x0b, 0x05, 0x07, 0x01, 0x03,
+ 0x3d, 0x3f, 0x39, 0x3b, 0x35, 0x37, 0x31, 0x33,
+ 0x2d, 0x2f, 0x29, 0x2b, 0x25, 0x27, 0x21, 0x23,
+ 0x5d, 0x5f, 0x59, 0x5b, 0x55, 0x57, 0x51, 0x53,
+ 0x4d, 0x4f, 0x49, 0x4b, 0x45, 0x47, 0x41, 0x43,
+ 0x7d, 0x7f, 0x79, 0x7b, 0x75, 0x77, 0x71, 0x73,
+ 0x6d, 0x6f, 0x69, 0x6b, 0x65, 0x67, 0x61, 0x63,
+ 0x9d, 0x9f, 0x99, 0x9b, 0x95, 0x97, 0x91, 0x93,
+ 0x8d, 0x8f, 0x89, 0x8b, 0x85, 0x87, 0x81, 0x83,
+ 0xbd, 0xbf, 0xb9, 0xbb, 0xb5, 0xb7, 0xb1, 0xb3,
+ 0xad, 0xaf, 0xa9, 0xab, 0xa5, 0xa7, 0xa1, 0xa3,
+ 0xdd, 0xdf, 0xd9, 0xdb, 0xd5, 0xd7, 0xd1, 0xd3,
+ 0xcd, 0xcf, 0xc9, 0xcb, 0xc5, 0xc7, 0xc1, 0xc3,
+ 0xfd, 0xff, 0xf9, 0xfb, 0xf5, 0xf7, 0xf1, 0xf3,
+ 0xed, 0xef, 0xe9, 0xeb, 0xe5, 0xe7, 0xe1, 0xe3,
+ },
+ {
+ 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09,
+ 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11,
+ 0x30, 0x33, 0x36, 0x35, 0x3c, 0x3f, 0x3a, 0x39,
+ 0x28, 0x2b, 0x2e, 0x2d, 0x24, 0x27, 0x22, 0x21,
+ 0x60, 0x63, 0x66, 0x65, 0x6c, 0x6f, 0x6a, 0x69,
+ 0x78, 0x7b, 0x7e, 0x7d, 0x74, 0x77, 0x72, 0x71,
+ 0x50, 0x53, 0x56, 0x55, 0x5c, 0x5f, 0x5a, 0x59,
+ 0x48, 0x4b, 0x4e, 0x4d, 0x44, 0x47, 0x42, 0x41,
+ 0xc0, 0xc3, 0xc6, 0xc5, 0xcc, 0xcf, 0xca, 0xc9,
+ 0xd8, 0xdb, 0xde, 0xdd, 0xd4, 0xd7, 0xd2, 0xd1,
+ 0xf0, 0xf3, 0xf6, 0xf5, 0xfc, 0xff, 0xfa, 0xf9,
+ 0xe8, 0xeb, 0xee, 0xed, 0xe4, 0xe7, 0xe2, 0xe1,
+ 0xa0, 0xa3, 0xa6, 0xa5, 0xac, 0xaf, 0xaa, 0xa9,
+ 0xb8, 0xbb, 0xbe, 0xbd, 0xb4, 0xb7, 0xb2, 0xb1,
+ 0x90, 0x93, 0x96, 0x95, 0x9c, 0x9f, 0x9a, 0x99,
+ 0x88, 0x8b, 0x8e, 0x8d, 0x84, 0x87, 0x82, 0x81,
+ 0x9d, 0x9e, 0x9b, 0x98, 0x91, 0x92, 0x97, 0x94,
+ 0x85, 0x86, 0x83, 0x80, 0x89, 0x8a, 0x8f, 0x8c,
+ 0xad, 0xae, 0xab, 0xa8, 0xa1, 0xa2, 0xa7, 0xa4,
+ 0xb5, 0xb6, 0xb3, 0xb0, 0xb9, 0xba, 0xbf, 0xbc,
+ 0xfd, 0xfe, 0xfb, 0xf8, 0xf1, 0xf2, 0xf7, 0xf4,
+ 0xe5, 0xe6, 0xe3, 0xe0, 0xe9, 0xea, 0xef, 0xec,
+ 0xcd, 0xce, 0xcb, 0xc8, 0xc1, 0xc2, 0xc7, 0xc4,
+ 0xd5, 0xd6, 0xd3, 0xd0, 0xd9, 0xda, 0xdf, 0xdc,
+ 0x5d, 0x5e, 0x5b, 0x58, 0x51, 0x52, 0x57, 0x54,
+ 0x45, 0x46, 0x43, 0x40, 0x49, 0x4a, 0x4f, 0x4c,
+ 0x6d, 0x6e, 0x6b, 0x68, 0x61, 0x62, 0x67, 0x64,
+ 0x75, 0x76, 0x73, 0x70, 0x79, 0x7a, 0x7f, 0x7c,
+ 0x3d, 0x3e, 0x3b, 0x38, 0x31, 0x32, 0x37, 0x34,
+ 0x25, 0x26, 0x23, 0x20, 0x29, 0x2a, 0x2f, 0x2c,
+ 0x0d, 0x0e, 0x0b, 0x08, 0x01, 0x02, 0x07, 0x04,
+ 0x15, 0x16, 0x13, 0x10, 0x19, 0x1a, 0x1f, 0x1c,
+ },
+ {
+ 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c,
+ 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c,
+ 0x40, 0x44, 0x48, 0x4c, 0x50, 0x54, 0x58, 0x5c,
+ 0x60, 0x64, 0x68, 0x6c, 0x70, 0x74, 0x78, 0x7c,
+ 0x80, 0x84, 0x88, 0x8c, 0x90, 0x94, 0x98, 0x9c,
+ 0xa0, 0xa4, 0xa8, 0xac, 0xb0, 0xb4, 0xb8, 0xbc,
+ 0xc0, 0xc4, 0xc8, 0xcc, 0xd0, 0xd4, 0xd8, 0xdc,
+ 0xe0, 0xe4, 0xe8, 0xec, 0xf0, 0xf4, 0xf8, 0xfc,
+ 0x1d, 0x19, 0x15, 0x11, 0x0d, 0x09, 0x05, 0x01,
+ 0x3d, 0x39, 0x35, 0x31, 0x2d, 0x29, 0x25, 0x21,
+ 0x5d, 0x59, 0x55, 0x51, 0x4d, 0x49, 0x45, 0x41,
+ 0x7d, 0x79, 0x75, 0x71, 0x6d, 0x69, 0x65, 0x61,
+ 0x9d, 0x99, 0x95, 0x91, 0x8d, 0x89, 0x85, 0x81,
+ 0xbd, 0xb9, 0xb5, 0xb1, 0xad, 0xa9, 0xa5, 0xa1,
+ 0xdd, 0xd9, 0xd5, 0xd1, 0xcd, 0xc9, 0xc5, 0xc1,
+ 0xfd, 0xf9, 0xf5, 0xf1, 0xed, 0xe9, 0xe5, 0xe1,
+ 0x3a, 0x3e, 0x32, 0x36, 0x2a, 0x2e, 0x22, 0x26,
+ 0x1a, 0x1e, 0x12, 0x16, 0x0a, 0x0e, 0x02, 0x06,
+ 0x7a, 0x7e, 0x72, 0x76, 0x6a, 0x6e, 0x62, 0x66,
+ 0x5a, 0x5e, 0x52, 0x56, 0x4a, 0x4e, 0x42, 0x46,
+ 0xba, 0xbe, 0xb2, 0xb6, 0xaa, 0xae, 0xa2, 0xa6,
+ 0x9a, 0x9e, 0x92, 0x96, 0x8a, 0x8e, 0x82, 0x86,
+ 0xfa, 0xfe, 0xf2, 0xf6, 0xea, 0xee, 0xe2, 0xe6,
+ 0xda, 0xde, 0xd2, 0xd6, 0xca, 0xce, 0xc2, 0xc6,
+ 0x27, 0x23, 0x2f, 0x2b, 0x37, 0x33, 0x3f, 0x3b,
+ 0x07, 0x03, 0x0f, 0x0b, 0x17, 0x13, 0x1f, 0x1b,
+ 0x67, 0x63, 0x6f, 0x6b, 0x77, 0x73, 0x7f, 0x7b,
+ 0x47, 0x43, 0x4f, 0x4b, 0x57, 0x53, 0x5f, 0x5b,
+ 0xa7, 0xa3, 0xaf, 0xab, 0xb7, 0xb3, 0xbf, 0xbb,
+ 0x87, 0x83, 0x8f, 0x8b, 0x97, 0x93, 0x9f, 0x9b,
+ 0xe7, 0xe3, 0xef, 0xeb, 0xf7, 0xf3, 0xff, 0xfb,
+ 0xc7, 0xc3, 0xcf, 0xcb, 0xd7, 0xd3, 0xdf, 0xdb,
+ },
+ {
+ 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b,
+ 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33,
+ 0x50, 0x55, 0x5a, 0x5f, 0x44, 0x41, 0x4e, 0x4b,
+ 0x78, 0x7d, 0x72, 0x77, 0x6c, 0x69, 0x66, 0x63,
+ 0xa0, 0xa5, 0xaa, 0xaf, 0xb4, 0xb1, 0xbe, 0xbb,
+ 0x88, 0x8d, 0x82, 0x87, 0x9c, 0x99, 0x96, 0x93,
+ 0xf0, 0xf5, 0xfa, 0xff, 0xe4, 0xe1, 0xee, 0xeb,
+ 0xd8, 0xdd, 0xd2, 0xd7, 0xcc, 0xc9, 0xc6, 0xc3,
+ 0x5d, 0x58, 0x57, 0x52, 0x49, 0x4c, 0x43, 0x46,
+ 0x75, 0x70, 0x7f, 0x7a, 0x61, 0x64, 0x6b, 0x6e,
+ 0x0d, 0x08, 0x07, 0x02, 0x19, 0x1c, 0x13, 0x16,
+ 0x25, 0x20, 0x2f, 0x2a, 0x31, 0x34, 0x3b, 0x3e,
+ 0xfd, 0xf8, 0xf7, 0xf2, 0xe9, 0xec, 0xe3, 0xe6,
+ 0xd5, 0xd0, 0xdf, 0xda, 0xc1, 0xc4, 0xcb, 0xce,
+ 0xad, 0xa8, 0xa7, 0xa2, 0xb9, 0xbc, 0xb3, 0xb6,
+ 0x85, 0x80, 0x8f, 0x8a, 0x91, 0x94, 0x9b, 0x9e,
+ 0xba, 0xbf, 0xb0, 0xb5, 0xae, 0xab, 0xa4, 0xa1,
+ 0x92, 0x97, 0x98, 0x9d, 0x86, 0x83, 0x8c, 0x89,
+ 0xea, 0xef, 0xe0, 0xe5, 0xfe, 0xfb, 0xf4, 0xf1,
+ 0xc2, 0xc7, 0xc8, 0xcd, 0xd6, 0xd3, 0xdc, 0xd9,
+ 0x1a, 0x1f, 0x10, 0x15, 0x0e, 0x0b, 0x04, 0x01,
+ 0x32, 0x37, 0x38, 0x3d, 0x26, 0x23, 0x2c, 0x29,
+ 0x4a, 0x4f, 0x40, 0x45, 0x5e, 0x5b, 0x54, 0x51,
+ 0x62, 0x67, 0x68, 0x6d, 0x76, 0x73, 0x7c, 0x79,
+ 0xe7, 0xe2, 0xed, 0xe8, 0xf3, 0xf6, 0xf9, 0xfc,
+ 0xcf, 0xca, 0xc5, 0xc0, 0xdb, 0xde, 0xd1, 0xd4,
+ 0xb7, 0xb2, 0xbd, 0xb8, 0xa3, 0xa6, 0xa9, 0xac,
+ 0x9f, 0x9a, 0x95, 0x90, 0x8b, 0x8e, 0x81, 0x84,
+ 0x47, 0x42, 0x4d, 0x48, 0x53, 0x56, 0x59, 0x5c,
+ 0x6f, 0x6a, 0x65, 0x60, 0x7b, 0x7e, 0x71, 0x74,
+ 0x17, 0x12, 0x1d, 0x18, 0x03, 0x06, 0x09, 0x0c,
+ 0x3f, 0x3a, 0x35, 0x30, 0x2b, 0x2e, 0x21, 0x24,
+ },
+ {
+ 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12,
+ 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22,
+ 0x60, 0x66, 0x6c, 0x6a, 0x78, 0x7e, 0x74, 0x72,
+ 0x50, 0x56, 0x5c, 0x5a, 0x48, 0x4e, 0x44, 0x42,
+ 0xc0, 0xc6, 0xcc, 0xca, 0xd8, 0xde, 0xd4, 0xd2,
+ 0xf0, 0xf6, 0xfc, 0xfa, 0xe8, 0xee, 0xe4, 0xe2,
+ 0xa0, 0xa6, 0xac, 0xaa, 0xb8, 0xbe, 0xb4, 0xb2,
+ 0x90, 0x96, 0x9c, 0x9a, 0x88, 0x8e, 0x84, 0x82,
+ 0x9d, 0x9b, 0x91, 0x97, 0x85, 0x83, 0x89, 0x8f,
+ 0xad, 0xab, 0xa1, 0xa7, 0xb5, 0xb3, 0xb9, 0xbf,
+ 0xfd, 0xfb, 0xf1, 0xf7, 0xe5, 0xe3, 0xe9, 0xef,
+ 0xcd, 0xcb, 0xc1, 0xc7, 0xd5, 0xd3, 0xd9, 0xdf,
+ 0x5d, 0x5b, 0x51, 0x57, 0x45, 0x43, 0x49, 0x4f,
+ 0x6d, 0x6b, 0x61, 0x67, 0x75, 0x73, 0x79, 0x7f,
+ 0x3d, 0x3b, 0x31, 0x37, 0x25, 0x23, 0x29, 0x2f,
+ 0x0d, 0x0b, 0x01, 0x07, 0x15, 0x13, 0x19, 0x1f,
+ 0x27, 0x21, 0x2b, 0x2d, 0x3f, 0x39, 0x33, 0x35,
+ 0x17, 0x11, 0x1b, 0x1d, 0x0f, 0x09, 0x03, 0x05,
+ 0x47, 0x41, 0x4b, 0x4d, 0x5f, 0x59, 0x53, 0x55,
+ 0x77, 0x71, 0x7b, 0x7d, 0x6f, 0x69, 0x63, 0x65,
+ 0xe7, 0xe1, 0xeb, 0xed, 0xff, 0xf9, 0xf3, 0xf5,
+ 0xd7, 0xd1, 0xdb, 0xdd, 0xcf, 0xc9, 0xc3, 0xc5,
+ 0x87, 0x81, 0x8b, 0x8d, 0x9f, 0x99, 0x93, 0x95,
+ 0xb7, 0xb1, 0xbb, 0xbd, 0xaf, 0xa9, 0xa3, 0xa5,
+ 0xba, 0xbc, 0xb6, 0xb0, 0xa2, 0xa4, 0xae, 0xa8,
+ 0x8a, 0x8c, 0x86, 0x80, 0x92, 0x94, 0x9e, 0x98,
+ 0xda, 0xdc, 0xd6, 0xd0, 0xc2, 0xc4, 0xce, 0xc8,
+ 0xea, 0xec, 0xe6, 0xe0, 0xf2, 0xf4, 0xfe, 0xf8,
+ 0x7a, 0x7c, 0x76, 0x70, 0x62, 0x64, 0x6e, 0x68,
+ 0x4a, 0x4c, 0x46, 0x40, 0x52, 0x54, 0x5e, 0x58,
+ 0x1a, 0x1c, 0x16, 0x10, 0x02, 0x04, 0x0e, 0x08,
+ 0x2a, 0x2c, 0x26, 0x20, 0x32, 0x34, 0x3e, 0x38,
+ },
+ {
+ 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15,
+ 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d,
+ 0x70, 0x77, 0x7e, 0x79, 0x6c, 0x6b, 0x62, 0x65,
+ 0x48, 0x4f, 0x46, 0x41, 0x54, 0x53, 0x5a, 0x5d,
+ 0xe0, 0xe7, 0xee, 0xe9, 0xfc, 0xfb, 0xf2, 0xf5,
+ 0xd8, 0xdf, 0xd6, 0xd1, 0xc4, 0xc3, 0xca, 0xcd,
+ 0x90, 0x97, 0x9e, 0x99, 0x8c, 0x8b, 0x82, 0x85,
+ 0xa8, 0xaf, 0xa6, 0xa1, 0xb4, 0xb3, 0xba, 0xbd,
+ 0xdd, 0xda, 0xd3, 0xd4, 0xc1, 0xc6, 0xcf, 0xc8,
+ 0xe5, 0xe2, 0xeb, 0xec, 0xf9, 0xfe, 0xf7, 0xf0,
+ 0xad, 0xaa, 0xa3, 0xa4, 0xb1, 0xb6, 0xbf, 0xb8,
+ 0x95, 0x92, 0x9b, 0x9c, 0x89, 0x8e, 0x87, 0x80,
+ 0x3d, 0x3a, 0x33, 0x34, 0x21, 0x26, 0x2f, 0x28,
+ 0x05, 0x02, 0x0b, 0x0c, 0x19, 0x1e, 0x17, 0x10,
+ 0x4d, 0x4a, 0x43, 0x44, 0x51, 0x56, 0x5f, 0x58,
+ 0x75, 0x72, 0x7b, 0x7c, 0x69, 0x6e, 0x67, 0x60,
+ 0xa7, 0xa0, 0xa9, 0xae, 0xbb, 0xbc, 0xb5, 0xb2,
+ 0x9f, 0x98, 0x91, 0x96, 0x83, 0x84, 0x8d, 0x8a,
+ 0xd7, 0xd0, 0xd9, 0xde, 0xcb, 0xcc, 0xc5, 0xc2,
+ 0xef, 0xe8, 0xe1, 0xe6, 0xf3, 0xf4, 0xfd, 0xfa,
+ 0x47, 0x40, 0x49, 0x4e, 0x5b, 0x5c, 0x55, 0x52,
+ 0x7f, 0x78, 0x71, 0x76, 0x63, 0x64, 0x6d, 0x6a,
+ 0x37, 0x30, 0x39, 0x3e, 0x2b, 0x2c, 0x25, 0x22,
+ 0x0f, 0x08, 0x01, 0x06, 0x13, 0x14, 0x1d, 0x1a,
+ 0x7a, 0x7d, 0x74, 0x73, 0x66, 0x61, 0x68, 0x6f,
+ 0x42, 0x45, 0x4c, 0x4b, 0x5e, 0x59, 0x50, 0x57,
+ 0x0a, 0x0d, 0x04, 0x03, 0x16, 0x11, 0x18, 0x1f,
+ 0x32, 0x35, 0x3c, 0x3b, 0x2e, 0x29, 0x20, 0x27,
+ 0x9a, 0x9d, 0x94, 0x93, 0x86, 0x81, 0x88, 0x8f,
+ 0xa2, 0xa5, 0xac, 0xab, 0xbe, 0xb9, 0xb0, 0xb7,
+ 0xea, 0xed, 0xe4, 0xe3, 0xf6, 0xf1, 0xf8, 0xff,
+ 0xd2, 0xd5, 0xdc, 0xdb, 0xce, 0xc9, 0xc0, 0xc7,
+ },
+ {
+ 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38,
+ 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78,
+ 0x80, 0x88, 0x90, 0x98, 0xa0, 0xa8, 0xb0, 0xb8,
+ 0xc0, 0xc8, 0xd0, 0xd8, 0xe0, 0xe8, 0xf0, 0xf8,
+ 0x1d, 0x15, 0x0d, 0x05, 0x3d, 0x35, 0x2d, 0x25,
+ 0x5d, 0x55, 0x4d, 0x45, 0x7d, 0x75, 0x6d, 0x65,
+ 0x9d, 0x95, 0x8d, 0x85, 0xbd, 0xb5, 0xad, 0xa5,
+ 0xdd, 0xd5, 0xcd, 0xc5, 0xfd, 0xf5, 0xed, 0xe5,
+ 0x3a, 0x32, 0x2a, 0x22, 0x1a, 0x12, 0x0a, 0x02,
+ 0x7a, 0x72, 0x6a, 0x62, 0x5a, 0x52, 0x4a, 0x42,
+ 0xba, 0xb2, 0xaa, 0xa2, 0x9a, 0x92, 0x8a, 0x82,
+ 0xfa, 0xf2, 0xea, 0xe2, 0xda, 0xd2, 0xca, 0xc2,
+ 0x27, 0x2f, 0x37, 0x3f, 0x07, 0x0f, 0x17, 0x1f,
+ 0x67, 0x6f, 0x77, 0x7f, 0x47, 0x4f, 0x57, 0x5f,
+ 0xa7, 0xaf, 0xb7, 0xbf, 0x87, 0x8f, 0x97, 0x9f,
+ 0xe7, 0xef, 0xf7, 0xff, 0xc7, 0xcf, 0xd7, 0xdf,
+ 0x74, 0x7c, 0x64, 0x6c, 0x54, 0x5c, 0x44, 0x4c,
+ 0x34, 0x3c, 0x24, 0x2c, 0x14, 0x1c, 0x04, 0x0c,
+ 0xf4, 0xfc, 0xe4, 0xec, 0xd4, 0xdc, 0xc4, 0xcc,
+ 0xb4, 0xbc, 0xa4, 0xac, 0x94, 0x9c, 0x84, 0x8c,
+ 0x69, 0x61, 0x79, 0x71, 0x49, 0x41, 0x59, 0x51,
+ 0x29, 0x21, 0x39, 0x31, 0x09, 0x01, 0x19, 0x11,
+ 0xe9, 0xe1, 0xf9, 0xf1, 0xc9, 0xc1, 0xd9, 0xd1,
+ 0xa9, 0xa1, 0xb9, 0xb1, 0x89, 0x81, 0x99, 0x91,
+ 0x4e, 0x46, 0x5e, 0x56, 0x6e, 0x66, 0x7e, 0x76,
+ 0x0e, 0x06, 0x1e, 0x16, 0x2e, 0x26, 0x3e, 0x36,
+ 0xce, 0xc6, 0xde, 0xd6, 0xee, 0xe6, 0xfe, 0xf6,
+ 0x8e, 0x86, 0x9e, 0x96, 0xae, 0xa6, 0xbe, 0xb6,
+ 0x53, 0x5b, 0x43, 0x4b, 0x73, 0x7b, 0x63, 0x6b,
+ 0x13, 0x1b, 0x03, 0x0b, 0x33, 0x3b, 0x23, 0x2b,
+ 0xd3, 0xdb, 0xc3, 0xcb, 0xf3, 0xfb, 0xe3, 0xeb,
+ 0x93, 0x9b, 0x83, 0x8b, 0xb3, 0xbb, 0xa3, 0xab,
+ },
+ {
+ 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f,
+ 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77,
+ 0x90, 0x99, 0x82, 0x8b, 0xb4, 0xbd, 0xa6, 0xaf,
+ 0xd8, 0xd1, 0xca, 0xc3, 0xfc, 0xf5, 0xee, 0xe7,
+ 0x3d, 0x34, 0x2f, 0x26, 0x19, 0x10, 0x0b, 0x02,
+ 0x75, 0x7c, 0x67, 0x6e, 0x51, 0x58, 0x43, 0x4a,
+ 0xad, 0xa4, 0xbf, 0xb6, 0x89, 0x80, 0x9b, 0x92,
+ 0xe5, 0xec, 0xf7, 0xfe, 0xc1, 0xc8, 0xd3, 0xda,
+ 0x7a, 0x73, 0x68, 0x61, 0x5e, 0x57, 0x4c, 0x45,
+ 0x32, 0x3b, 0x20, 0x29, 0x16, 0x1f, 0x04, 0x0d,
+ 0xea, 0xe3, 0xf8, 0xf1, 0xce, 0xc7, 0xdc, 0xd5,
+ 0xa2, 0xab, 0xb0, 0xb9, 0x86, 0x8f, 0x94, 0x9d,
+ 0x47, 0x4e, 0x55, 0x5c, 0x63, 0x6a, 0x71, 0x78,
+ 0x0f, 0x06, 0x1d, 0x14, 0x2b, 0x22, 0x39, 0x30,
+ 0xd7, 0xde, 0xc5, 0xcc, 0xf3, 0xfa, 0xe1, 0xe8,
+ 0x9f, 0x96, 0x8d, 0x84, 0xbb, 0xb2, 0xa9, 0xa0,
+ 0xf4, 0xfd, 0xe6, 0xef, 0xd0, 0xd9, 0xc2, 0xcb,
+ 0xbc, 0xb5, 0xae, 0xa7, 0x98, 0x91, 0x8a, 0x83,
+ 0x64, 0x6d, 0x76, 0x7f, 0x40, 0x49, 0x52, 0x5b,
+ 0x2c, 0x25, 0x3e, 0x37, 0x08, 0x01, 0x1a, 0x13,
+ 0xc9, 0xc0, 0xdb, 0xd2, 0xed, 0xe4, 0xff, 0xf6,
+ 0x81, 0x88, 0x93, 0x9a, 0xa5, 0xac, 0xb7, 0xbe,
+ 0x59, 0x50, 0x4b, 0x42, 0x7d, 0x74, 0x6f, 0x66,
+ 0x11, 0x18, 0x03, 0x0a, 0x35, 0x3c, 0x27, 0x2e,
+ 0x8e, 0x87, 0x9c, 0x95, 0xaa, 0xa3, 0xb8, 0xb1,
+ 0xc6, 0xcf, 0xd4, 0xdd, 0xe2, 0xeb, 0xf0, 0xf9,
+ 0x1e, 0x17, 0x0c, 0x05, 0x3a, 0x33, 0x28, 0x21,
+ 0x56, 0x5f, 0x44, 0x4d, 0x72, 0x7b, 0x60, 0x69,
+ 0xb3, 0xba, 0xa1, 0xa8, 0x97, 0x9e, 0x85, 0x8c,
+ 0xfb, 0xf2, 0xe9, 0xe0, 0xdf, 0xd6, 0xcd, 0xc4,
+ 0x23, 0x2a, 0x31, 0x38, 0x07, 0x0e, 0x15, 0x1c,
+ 0x6b, 0x62, 0x79, 0x70, 0x4f, 0x46, 0x5d, 0x54,
+ },
+ {
+ 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36,
+ 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66,
+ 0xa0, 0xaa, 0xb4, 0xbe, 0x88, 0x82, 0x9c, 0x96,
+ 0xf0, 0xfa, 0xe4, 0xee, 0xd8, 0xd2, 0xcc, 0xc6,
+ 0x5d, 0x57, 0x49, 0x43, 0x75, 0x7f, 0x61, 0x6b,
+ 0x0d, 0x07, 0x19, 0x13, 0x25, 0x2f, 0x31, 0x3b,
+ 0xfd, 0xf7, 0xe9, 0xe3, 0xd5, 0xdf, 0xc1, 0xcb,
+ 0xad, 0xa7, 0xb9, 0xb3, 0x85, 0x8f, 0x91, 0x9b,
+ 0xba, 0xb0, 0xae, 0xa4, 0x92, 0x98, 0x86, 0x8c,
+ 0xea, 0xe0, 0xfe, 0xf4, 0xc2, 0xc8, 0xd6, 0xdc,
+ 0x1a, 0x10, 0x0e, 0x04, 0x32, 0x38, 0x26, 0x2c,
+ 0x4a, 0x40, 0x5e, 0x54, 0x62, 0x68, 0x76, 0x7c,
+ 0xe7, 0xed, 0xf3, 0xf9, 0xcf, 0xc5, 0xdb, 0xd1,
+ 0xb7, 0xbd, 0xa3, 0xa9, 0x9f, 0x95, 0x8b, 0x81,
+ 0x47, 0x4d, 0x53, 0x59, 0x6f, 0x65, 0x7b, 0x71,
+ 0x17, 0x1d, 0x03, 0x09, 0x3f, 0x35, 0x2b, 0x21,
+ 0x69, 0x63, 0x7d, 0x77, 0x41, 0x4b, 0x55, 0x5f,
+ 0x39, 0x33, 0x2d, 0x27, 0x11, 0x1b, 0x05, 0x0f,
+ 0xc9, 0xc3, 0xdd, 0xd7, 0xe1, 0xeb, 0xf5, 0xff,
+ 0x99, 0x93, 0x8d, 0x87, 0xb1, 0xbb, 0xa5, 0xaf,
+ 0x34, 0x3e, 0x20, 0x2a, 0x1c, 0x16, 0x08, 0x02,
+ 0x64, 0x6e, 0x70, 0x7a, 0x4c, 0x46, 0x58, 0x52,
+ 0x94, 0x9e, 0x80, 0x8a, 0xbc, 0xb6, 0xa8, 0xa2,
+ 0xc4, 0xce, 0xd0, 0xda, 0xec, 0xe6, 0xf8, 0xf2,
+ 0xd3, 0xd9, 0xc7, 0xcd, 0xfb, 0xf1, 0xef, 0xe5,
+ 0x83, 0x89, 0x97, 0x9d, 0xab, 0xa1, 0xbf, 0xb5,
+ 0x73, 0x79, 0x67, 0x6d, 0x5b, 0x51, 0x4f, 0x45,
+ 0x23, 0x29, 0x37, 0x3d, 0x0b, 0x01, 0x1f, 0x15,
+ 0x8e, 0x84, 0x9a, 0x90, 0xa6, 0xac, 0xb2, 0xb8,
+ 0xde, 0xd4, 0xca, 0xc0, 0xf6, 0xfc, 0xe2, 0xe8,
+ 0x2e, 0x24, 0x3a, 0x30, 0x06, 0x0c, 0x12, 0x18,
+ 0x7e, 0x74, 0x6a, 0x60, 0x56, 0x5c, 0x42, 0x48,
+ },
+ {
+ 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31,
+ 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69,
+ 0xb0, 0xbb, 0xa6, 0xad, 0x9c, 0x97, 0x8a, 0x81,
+ 0xe8, 0xe3, 0xfe, 0xf5, 0xc4, 0xcf, 0xd2, 0xd9,
+ 0x7d, 0x76, 0x6b, 0x60, 0x51, 0x5a, 0x47, 0x4c,
+ 0x25, 0x2e, 0x33, 0x38, 0x09, 0x02, 0x1f, 0x14,
+ 0xcd, 0xc6, 0xdb, 0xd0, 0xe1, 0xea, 0xf7, 0xfc,
+ 0x95, 0x9e, 0x83, 0x88, 0xb9, 0xb2, 0xaf, 0xa4,
+ 0xfa, 0xf1, 0xec, 0xe7, 0xd6, 0xdd, 0xc0, 0xcb,
+ 0xa2, 0xa9, 0xb4, 0xbf, 0x8e, 0x85, 0x98, 0x93,
+ 0x4a, 0x41, 0x5c, 0x57, 0x66, 0x6d, 0x70, 0x7b,
+ 0x12, 0x19, 0x04, 0x0f, 0x3e, 0x35, 0x28, 0x23,
+ 0x87, 0x8c, 0x91, 0x9a, 0xab, 0xa0, 0xbd, 0xb6,
+ 0xdf, 0xd4, 0xc9, 0xc2, 0xf3, 0xf8, 0xe5, 0xee,
+ 0x37, 0x3c, 0x21, 0x2a, 0x1b, 0x10, 0x0d, 0x06,
+ 0x6f, 0x64, 0x79, 0x72, 0x43, 0x48, 0x55, 0x5e,
+ 0xe9, 0xe2, 0xff, 0xf4, 0xc5, 0xce, 0xd3, 0xd8,
+ 0xb1, 0xba, 0xa7, 0xac, 0x9d, 0x96, 0x8b, 0x80,
+ 0x59, 0x52, 0x4f, 0x44, 0x75, 0x7e, 0x63, 0x68,
+ 0x01, 0x0a, 0x17, 0x1c, 0x2d, 0x26, 0x3b, 0x30,
+ 0x94, 0x9f, 0x82, 0x89, 0xb8, 0xb3, 0xae, 0xa5,
+ 0xcc, 0xc7, 0xda, 0xd1, 0xe0, 0xeb, 0xf6, 0xfd,
+ 0x24, 0x2f, 0x32, 0x39, 0x08, 0x03, 0x1e, 0x15,
+ 0x7c, 0x77, 0x6a, 0x61, 0x50, 0x5b, 0x46, 0x4d,
+ 0x13, 0x18, 0x05, 0x0e, 0x3f, 0x34, 0x29, 0x22,
+ 0x4b, 0x40, 0x5d, 0x56, 0x67, 0x6c, 0x71, 0x7a,
+ 0xa3, 0xa8, 0xb5, 0xbe, 0x8f, 0x84, 0x99, 0x92,
+ 0xfb, 0xf0, 0xed, 0xe6, 0xd7, 0xdc, 0xc1, 0xca,
+ 0x6e, 0x65, 0x78, 0x73, 0x42, 0x49, 0x54, 0x5f,
+ 0x36, 0x3d, 0x20, 0x2b, 0x1a, 0x11, 0x0c, 0x07,
+ 0xde, 0xd5, 0xc8, 0xc3, 0xf2, 0xf9, 0xe4, 0xef,
+ 0x86, 0x8d, 0x90, 0x9b, 0xaa, 0xa1, 0xbc, 0xb7,
+ },
+ {
+ 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24,
+ 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44,
+ 0xc0, 0xcc, 0xd8, 0xd4, 0xf0, 0xfc, 0xe8, 0xe4,
+ 0xa0, 0xac, 0xb8, 0xb4, 0x90, 0x9c, 0x88, 0x84,
+ 0x9d, 0x91, 0x85, 0x89, 0xad, 0xa1, 0xb5, 0xb9,
+ 0xfd, 0xf1, 0xe5, 0xe9, 0xcd, 0xc1, 0xd5, 0xd9,
+ 0x5d, 0x51, 0x45, 0x49, 0x6d, 0x61, 0x75, 0x79,
+ 0x3d, 0x31, 0x25, 0x29, 0x0d, 0x01, 0x15, 0x19,
+ 0x27, 0x2b, 0x3f, 0x33, 0x17, 0x1b, 0x0f, 0x03,
+ 0x47, 0x4b, 0x5f, 0x53, 0x77, 0x7b, 0x6f, 0x63,
+ 0xe7, 0xeb, 0xff, 0xf3, 0xd7, 0xdb, 0xcf, 0xc3,
+ 0x87, 0x8b, 0x9f, 0x93, 0xb7, 0xbb, 0xaf, 0xa3,
+ 0xba, 0xb6, 0xa2, 0xae, 0x8a, 0x86, 0x92, 0x9e,
+ 0xda, 0xd6, 0xc2, 0xce, 0xea, 0xe6, 0xf2, 0xfe,
+ 0x7a, 0x76, 0x62, 0x6e, 0x4a, 0x46, 0x52, 0x5e,
+ 0x1a, 0x16, 0x02, 0x0e, 0x2a, 0x26, 0x32, 0x3e,
+ 0x4e, 0x42, 0x56, 0x5a, 0x7e, 0x72, 0x66, 0x6a,
+ 0x2e, 0x22, 0x36, 0x3a, 0x1e, 0x12, 0x06, 0x0a,
+ 0x8e, 0x82, 0x96, 0x9a, 0xbe, 0xb2, 0xa6, 0xaa,
+ 0xee, 0xe2, 0xf6, 0xfa, 0xde, 0xd2, 0xc6, 0xca,
+ 0xd3, 0xdf, 0xcb, 0xc7, 0xe3, 0xef, 0xfb, 0xf7,
+ 0xb3, 0xbf, 0xab, 0xa7, 0x83, 0x8f, 0x9b, 0x97,
+ 0x13, 0x1f, 0x0b, 0x07, 0x23, 0x2f, 0x3b, 0x37,
+ 0x73, 0x7f, 0x6b, 0x67, 0x43, 0x4f, 0x5b, 0x57,
+ 0x69, 0x65, 0x71, 0x7d, 0x59, 0x55, 0x41, 0x4d,
+ 0x09, 0x05, 0x11, 0x1d, 0x39, 0x35, 0x21, 0x2d,
+ 0xa9, 0xa5, 0xb1, 0xbd, 0x99, 0x95, 0x81, 0x8d,
+ 0xc9, 0xc5, 0xd1, 0xdd, 0xf9, 0xf5, 0xe1, 0xed,
+ 0xf4, 0xf8, 0xec, 0xe0, 0xc4, 0xc8, 0xdc, 0xd0,
+ 0x94, 0x98, 0x8c, 0x80, 0xa4, 0xa8, 0xbc, 0xb0,
+ 0x34, 0x38, 0x2c, 0x20, 0x04, 0x08, 0x1c, 0x10,
+ 0x54, 0x58, 0x4c, 0x40, 0x64, 0x68, 0x7c, 0x70,
+ },
+ {
+ 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23,
+ 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b,
+ 0xd0, 0xdd, 0xca, 0xc7, 0xe4, 0xe9, 0xfe, 0xf3,
+ 0xb8, 0xb5, 0xa2, 0xaf, 0x8c, 0x81, 0x96, 0x9b,
+ 0xbd, 0xb0, 0xa7, 0xaa, 0x89, 0x84, 0x93, 0x9e,
+ 0xd5, 0xd8, 0xcf, 0xc2, 0xe1, 0xec, 0xfb, 0xf6,
+ 0x6d, 0x60, 0x77, 0x7a, 0x59, 0x54, 0x43, 0x4e,
+ 0x05, 0x08, 0x1f, 0x12, 0x31, 0x3c, 0x2b, 0x26,
+ 0x67, 0x6a, 0x7d, 0x70, 0x53, 0x5e, 0x49, 0x44,
+ 0x0f, 0x02, 0x15, 0x18, 0x3b, 0x36, 0x21, 0x2c,
+ 0xb7, 0xba, 0xad, 0xa0, 0x83, 0x8e, 0x99, 0x94,
+ 0xdf, 0xd2, 0xc5, 0xc8, 0xeb, 0xe6, 0xf1, 0xfc,
+ 0xda, 0xd7, 0xc0, 0xcd, 0xee, 0xe3, 0xf4, 0xf9,
+ 0xb2, 0xbf, 0xa8, 0xa5, 0x86, 0x8b, 0x9c, 0x91,
+ 0x0a, 0x07, 0x10, 0x1d, 0x3e, 0x33, 0x24, 0x29,
+ 0x62, 0x6f, 0x78, 0x75, 0x56, 0x5b, 0x4c, 0x41,
+ 0xce, 0xc3, 0xd4, 0xd9, 0xfa, 0xf7, 0xe0, 0xed,
+ 0xa6, 0xab, 0xbc, 0xb1, 0x92, 0x9f, 0x88, 0x85,
+ 0x1e, 0x13, 0x04, 0x09, 0x2a, 0x27, 0x30, 0x3d,
+ 0x76, 0x7b, 0x6c, 0x61, 0x42, 0x4f, 0x58, 0x55,
+ 0x73, 0x7e, 0x69, 0x64, 0x47, 0x4a, 0x5d, 0x50,
+ 0x1b, 0x16, 0x01, 0x0c, 0x2f, 0x22, 0x35, 0x38,
+ 0xa3, 0xae, 0xb9, 0xb4, 0x97, 0x9a, 0x8d, 0x80,
+ 0xcb, 0xc6, 0xd1, 0xdc, 0xff, 0xf2, 0xe5, 0xe8,
+ 0xa9, 0xa4, 0xb3, 0xbe, 0x9d, 0x90, 0x87, 0x8a,
+ 0xc1, 0xcc, 0xdb, 0xd6, 0xf5, 0xf8, 0xef, 0xe2,
+ 0x79, 0x74, 0x63, 0x6e, 0x4d, 0x40, 0x57, 0x5a,
+ 0x11, 0x1c, 0x0b, 0x06, 0x25, 0x28, 0x3f, 0x32,
+ 0x14, 0x19, 0x0e, 0x03, 0x20, 0x2d, 0x3a, 0x37,
+ 0x7c, 0x71, 0x66, 0x6b, 0x48, 0x45, 0x52, 0x5f,
+ 0xc4, 0xc9, 0xde, 0xd3, 0xf0, 0xfd, 0xea, 0xe7,
+ 0xac, 0xa1, 0xb6, 0xbb, 0x98, 0x95, 0x82, 0x8f,
+ },
+ {
+ 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a,
+ 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a,
+ 0xe0, 0xee, 0xfc, 0xf2, 0xd8, 0xd6, 0xc4, 0xca,
+ 0x90, 0x9e, 0x8c, 0x82, 0xa8, 0xa6, 0xb4, 0xba,
+ 0xdd, 0xd3, 0xc1, 0xcf, 0xe5, 0xeb, 0xf9, 0xf7,
+ 0xad, 0xa3, 0xb1, 0xbf, 0x95, 0x9b, 0x89, 0x87,
+ 0x3d, 0x33, 0x21, 0x2f, 0x05, 0x0b, 0x19, 0x17,
+ 0x4d, 0x43, 0x51, 0x5f, 0x75, 0x7b, 0x69, 0x67,
+ 0xa7, 0xa9, 0xbb, 0xb5, 0x9f, 0x91, 0x83, 0x8d,
+ 0xd7, 0xd9, 0xcb, 0xc5, 0xef, 0xe1, 0xf3, 0xfd,
+ 0x47, 0x49, 0x5b, 0x55, 0x7f, 0x71, 0x63, 0x6d,
+ 0x37, 0x39, 0x2b, 0x25, 0x0f, 0x01, 0x13, 0x1d,
+ 0x7a, 0x74, 0x66, 0x68, 0x42, 0x4c, 0x5e, 0x50,
+ 0x0a, 0x04, 0x16, 0x18, 0x32, 0x3c, 0x2e, 0x20,
+ 0x9a, 0x94, 0x86, 0x88, 0xa2, 0xac, 0xbe, 0xb0,
+ 0xea, 0xe4, 0xf6, 0xf8, 0xd2, 0xdc, 0xce, 0xc0,
+ 0x53, 0x5d, 0x4f, 0x41, 0x6b, 0x65, 0x77, 0x79,
+ 0x23, 0x2d, 0x3f, 0x31, 0x1b, 0x15, 0x07, 0x09,
+ 0xb3, 0xbd, 0xaf, 0xa1, 0x8b, 0x85, 0x97, 0x99,
+ 0xc3, 0xcd, 0xdf, 0xd1, 0xfb, 0xf5, 0xe7, 0xe9,
+ 0x8e, 0x80, 0x92, 0x9c, 0xb6, 0xb8, 0xaa, 0xa4,
+ 0xfe, 0xf0, 0xe2, 0xec, 0xc6, 0xc8, 0xda, 0xd4,
+ 0x6e, 0x60, 0x72, 0x7c, 0x56, 0x58, 0x4a, 0x44,
+ 0x1e, 0x10, 0x02, 0x0c, 0x26, 0x28, 0x3a, 0x34,
+ 0xf4, 0xfa, 0xe8, 0xe6, 0xcc, 0xc2, 0xd0, 0xde,
+ 0x84, 0x8a, 0x98, 0x96, 0xbc, 0xb2, 0xa0, 0xae,
+ 0x14, 0x1a, 0x08, 0x06, 0x2c, 0x22, 0x30, 0x3e,
+ 0x64, 0x6a, 0x78, 0x76, 0x5c, 0x52, 0x40, 0x4e,
+ 0x29, 0x27, 0x35, 0x3b, 0x11, 0x1f, 0x0d, 0x03,
+ 0x59, 0x57, 0x45, 0x4b, 0x61, 0x6f, 0x7d, 0x73,
+ 0xc9, 0xc7, 0xd5, 0xdb, 0xf1, 0xff, 0xed, 0xe3,
+ 0xb9, 0xb7, 0xa5, 0xab, 0x81, 0x8f, 0x9d, 0x93,
+ },
+ {
+ 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d,
+ 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55,
+ 0xf0, 0xff, 0xee, 0xe1, 0xcc, 0xc3, 0xd2, 0xdd,
+ 0x88, 0x87, 0x96, 0x99, 0xb4, 0xbb, 0xaa, 0xa5,
+ 0xfd, 0xf2, 0xe3, 0xec, 0xc1, 0xce, 0xdf, 0xd0,
+ 0x85, 0x8a, 0x9b, 0x94, 0xb9, 0xb6, 0xa7, 0xa8,
+ 0x0d, 0x02, 0x13, 0x1c, 0x31, 0x3e, 0x2f, 0x20,
+ 0x75, 0x7a, 0x6b, 0x64, 0x49, 0x46, 0x57, 0x58,
+ 0xe7, 0xe8, 0xf9, 0xf6, 0xdb, 0xd4, 0xc5, 0xca,
+ 0x9f, 0x90, 0x81, 0x8e, 0xa3, 0xac, 0xbd, 0xb2,
+ 0x17, 0x18, 0x09, 0x06, 0x2b, 0x24, 0x35, 0x3a,
+ 0x6f, 0x60, 0x71, 0x7e, 0x53, 0x5c, 0x4d, 0x42,
+ 0x1a, 0x15, 0x04, 0x0b, 0x26, 0x29, 0x38, 0x37,
+ 0x62, 0x6d, 0x7c, 0x73, 0x5e, 0x51, 0x40, 0x4f,
+ 0xea, 0xe5, 0xf4, 0xfb, 0xd6, 0xd9, 0xc8, 0xc7,
+ 0x92, 0x9d, 0x8c, 0x83, 0xae, 0xa1, 0xb0, 0xbf,
+ 0xd3, 0xdc, 0xcd, 0xc2, 0xef, 0xe0, 0xf1, 0xfe,
+ 0xab, 0xa4, 0xb5, 0xba, 0x97, 0x98, 0x89, 0x86,
+ 0x23, 0x2c, 0x3d, 0x32, 0x1f, 0x10, 0x01, 0x0e,
+ 0x5b, 0x54, 0x45, 0x4a, 0x67, 0x68, 0x79, 0x76,
+ 0x2e, 0x21, 0x30, 0x3f, 0x12, 0x1d, 0x0c, 0x03,
+ 0x56, 0x59, 0x48, 0x47, 0x6a, 0x65, 0x74, 0x7b,
+ 0xde, 0xd1, 0xc0, 0xcf, 0xe2, 0xed, 0xfc, 0xf3,
+ 0xa6, 0xa9, 0xb8, 0xb7, 0x9a, 0x95, 0x84, 0x8b,
+ 0x34, 0x3b, 0x2a, 0x25, 0x08, 0x07, 0x16, 0x19,
+ 0x4c, 0x43, 0x52, 0x5d, 0x70, 0x7f, 0x6e, 0x61,
+ 0xc4, 0xcb, 0xda, 0xd5, 0xf8, 0xf7, 0xe6, 0xe9,
+ 0xbc, 0xb3, 0xa2, 0xad, 0x80, 0x8f, 0x9e, 0x91,
+ 0xc9, 0xc6, 0xd7, 0xd8, 0xf5, 0xfa, 0xeb, 0xe4,
+ 0xb1, 0xbe, 0xaf, 0xa0, 0x8d, 0x82, 0x93, 0x9c,
+ 0x39, 0x36, 0x27, 0x28, 0x05, 0x0a, 0x1b, 0x14,
+ 0x41, 0x4e, 0x5f, 0x50, 0x7d, 0x72, 0x63, 0x6c,
+ },
+ {
+ 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0,
+ 0x1d, 0x0d, 0x3d, 0x2d, 0x5d, 0x4d, 0x7d, 0x6d,
+ 0x9d, 0x8d, 0xbd, 0xad, 0xdd, 0xcd, 0xfd, 0xed,
+ 0x3a, 0x2a, 0x1a, 0x0a, 0x7a, 0x6a, 0x5a, 0x4a,
+ 0xba, 0xaa, 0x9a, 0x8a, 0xfa, 0xea, 0xda, 0xca,
+ 0x27, 0x37, 0x07, 0x17, 0x67, 0x77, 0x47, 0x57,
+ 0xa7, 0xb7, 0x87, 0x97, 0xe7, 0xf7, 0xc7, 0xd7,
+ 0x74, 0x64, 0x54, 0x44, 0x34, 0x24, 0x14, 0x04,
+ 0xf4, 0xe4, 0xd4, 0xc4, 0xb4, 0xa4, 0x94, 0x84,
+ 0x69, 0x79, 0x49, 0x59, 0x29, 0x39, 0x09, 0x19,
+ 0xe9, 0xf9, 0xc9, 0xd9, 0xa9, 0xb9, 0x89, 0x99,
+ 0x4e, 0x5e, 0x6e, 0x7e, 0x0e, 0x1e, 0x2e, 0x3e,
+ 0xce, 0xde, 0xee, 0xfe, 0x8e, 0x9e, 0xae, 0xbe,
+ 0x53, 0x43, 0x73, 0x63, 0x13, 0x03, 0x33, 0x23,
+ 0xd3, 0xc3, 0xf3, 0xe3, 0x93, 0x83, 0xb3, 0xa3,
+ 0xe8, 0xf8, 0xc8, 0xd8, 0xa8, 0xb8, 0x88, 0x98,
+ 0x68, 0x78, 0x48, 0x58, 0x28, 0x38, 0x08, 0x18,
+ 0xf5, 0xe5, 0xd5, 0xc5, 0xb5, 0xa5, 0x95, 0x85,
+ 0x75, 0x65, 0x55, 0x45, 0x35, 0x25, 0x15, 0x05,
+ 0xd2, 0xc2, 0xf2, 0xe2, 0x92, 0x82, 0xb2, 0xa2,
+ 0x52, 0x42, 0x72, 0x62, 0x12, 0x02, 0x32, 0x22,
+ 0xcf, 0xdf, 0xef, 0xff, 0x8f, 0x9f, 0xaf, 0xbf,
+ 0x4f, 0x5f, 0x6f, 0x7f, 0x0f, 0x1f, 0x2f, 0x3f,
+ 0x9c, 0x8c, 0xbc, 0xac, 0xdc, 0xcc, 0xfc, 0xec,
+ 0x1c, 0x0c, 0x3c, 0x2c, 0x5c, 0x4c, 0x7c, 0x6c,
+ 0x81, 0x91, 0xa1, 0xb1, 0xc1, 0xd1, 0xe1, 0xf1,
+ 0x01, 0x11, 0x21, 0x31, 0x41, 0x51, 0x61, 0x71,
+ 0xa6, 0xb6, 0x86, 0x96, 0xe6, 0xf6, 0xc6, 0xd6,
+ 0x26, 0x36, 0x06, 0x16, 0x66, 0x76, 0x46, 0x56,
+ 0xbb, 0xab, 0x9b, 0x8b, 0xfb, 0xeb, 0xdb, 0xcb,
+ 0x3b, 0x2b, 0x1b, 0x0b, 0x7b, 0x6b, 0x5b, 0x4b,
+ },
+ {
+ 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
+ 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
+ 0x0d, 0x1c, 0x2f, 0x3e, 0x49, 0x58, 0x6b, 0x7a,
+ 0x85, 0x94, 0xa7, 0xb6, 0xc1, 0xd0, 0xe3, 0xf2,
+ 0x1a, 0x0b, 0x38, 0x29, 0x5e, 0x4f, 0x7c, 0x6d,
+ 0x92, 0x83, 0xb0, 0xa1, 0xd6, 0xc7, 0xf4, 0xe5,
+ 0x17, 0x06, 0x35, 0x24, 0x53, 0x42, 0x71, 0x60,
+ 0x9f, 0x8e, 0xbd, 0xac, 0xdb, 0xca, 0xf9, 0xe8,
+ 0x34, 0x25, 0x16, 0x07, 0x70, 0x61, 0x52, 0x43,
+ 0xbc, 0xad, 0x9e, 0x8f, 0xf8, 0xe9, 0xda, 0xcb,
+ 0x39, 0x28, 0x1b, 0x0a, 0x7d, 0x6c, 0x5f, 0x4e,
+ 0xb1, 0xa0, 0x93, 0x82, 0xf5, 0xe4, 0xd7, 0xc6,
+ 0x2e, 0x3f, 0x0c, 0x1d, 0x6a, 0x7b, 0x48, 0x59,
+ 0xa6, 0xb7, 0x84, 0x95, 0xe2, 0xf3, 0xc0, 0xd1,
+ 0x23, 0x32, 0x01, 0x10, 0x67, 0x76, 0x45, 0x54,
+ 0xab, 0xba, 0x89, 0x98, 0xef, 0xfe, 0xcd, 0xdc,
+ 0x68, 0x79, 0x4a, 0x5b, 0x2c, 0x3d, 0x0e, 0x1f,
+ 0xe0, 0xf1, 0xc2, 0xd3, 0xa4, 0xb5, 0x86, 0x97,
+ 0x65, 0x74, 0x47, 0x56, 0x21, 0x30, 0x03, 0x12,
+ 0xed, 0xfc, 0xcf, 0xde, 0xa9, 0xb8, 0x8b, 0x9a,
+ 0x72, 0x63, 0x50, 0x41, 0x36, 0x27, 0x14, 0x05,
+ 0xfa, 0xeb, 0xd8, 0xc9, 0xbe, 0xaf, 0x9c, 0x8d,
+ 0x7f, 0x6e, 0x5d, 0x4c, 0x3b, 0x2a, 0x19, 0x08,
+ 0xf7, 0xe6, 0xd5, 0xc4, 0xb3, 0xa2, 0x91, 0x80,
+ 0x5c, 0x4d, 0x7e, 0x6f, 0x18, 0x09, 0x3a, 0x2b,
+ 0xd4, 0xc5, 0xf6, 0xe7, 0x90, 0x81, 0xb2, 0xa3,
+ 0x51, 0x40, 0x73, 0x62, 0x15, 0x04, 0x37, 0x26,
+ 0xd9, 0xc8, 0xfb, 0xea, 0x9d, 0x8c, 0xbf, 0xae,
+ 0x46, 0x57, 0x64, 0x75, 0x02, 0x13, 0x20, 0x31,
+ 0xce, 0xdf, 0xec, 0xfd, 0x8a, 0x9b, 0xa8, 0xb9,
+ 0x4b, 0x5a, 0x69, 0x78, 0x0f, 0x1e, 0x2d, 0x3c,
+ 0xc3, 0xd2, 0xe1, 0xf0, 0x87, 0x96, 0xa5, 0xb4,
+ },
+ {
+ 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e,
+ 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee,
+ 0x3d, 0x2f, 0x19, 0x0b, 0x75, 0x67, 0x51, 0x43,
+ 0xad, 0xbf, 0x89, 0x9b, 0xe5, 0xf7, 0xc1, 0xd3,
+ 0x7a, 0x68, 0x5e, 0x4c, 0x32, 0x20, 0x16, 0x04,
+ 0xea, 0xf8, 0xce, 0xdc, 0xa2, 0xb0, 0x86, 0x94,
+ 0x47, 0x55, 0x63, 0x71, 0x0f, 0x1d, 0x2b, 0x39,
+ 0xd7, 0xc5, 0xf3, 0xe1, 0x9f, 0x8d, 0xbb, 0xa9,
+ 0xf4, 0xe6, 0xd0, 0xc2, 0xbc, 0xae, 0x98, 0x8a,
+ 0x64, 0x76, 0x40, 0x52, 0x2c, 0x3e, 0x08, 0x1a,
+ 0xc9, 0xdb, 0xed, 0xff, 0x81, 0x93, 0xa5, 0xb7,
+ 0x59, 0x4b, 0x7d, 0x6f, 0x11, 0x03, 0x35, 0x27,
+ 0x8e, 0x9c, 0xaa, 0xb8, 0xc6, 0xd4, 0xe2, 0xf0,
+ 0x1e, 0x0c, 0x3a, 0x28, 0x56, 0x44, 0x72, 0x60,
+ 0xb3, 0xa1, 0x97, 0x85, 0xfb, 0xe9, 0xdf, 0xcd,
+ 0x23, 0x31, 0x07, 0x15, 0x6b, 0x79, 0x4f, 0x5d,
+ 0xf5, 0xe7, 0xd1, 0xc3, 0xbd, 0xaf, 0x99, 0x8b,
+ 0x65, 0x77, 0x41, 0x53, 0x2d, 0x3f, 0x09, 0x1b,
+ 0xc8, 0xda, 0xec, 0xfe, 0x80, 0x92, 0xa4, 0xb6,
+ 0x58, 0x4a, 0x7c, 0x6e, 0x10, 0x02, 0x34, 0x26,
+ 0x8f, 0x9d, 0xab, 0xb9, 0xc7, 0xd5, 0xe3, 0xf1,
+ 0x1f, 0x0d, 0x3b, 0x29, 0x57, 0x45, 0x73, 0x61,
+ 0xb2, 0xa0, 0x96, 0x84, 0xfa, 0xe8, 0xde, 0xcc,
+ 0x22, 0x30, 0x06, 0x14, 0x6a, 0x78, 0x4e, 0x5c,
+ 0x01, 0x13, 0x25, 0x37, 0x49, 0x5b, 0x6d, 0x7f,
+ 0x91, 0x83, 0xb5, 0xa7, 0xd9, 0xcb, 0xfd, 0xef,
+ 0x3c, 0x2e, 0x18, 0x0a, 0x74, 0x66, 0x50, 0x42,
+ 0xac, 0xbe, 0x88, 0x9a, 0xe4, 0xf6, 0xc0, 0xd2,
+ 0x7b, 0x69, 0x5f, 0x4d, 0x33, 0x21, 0x17, 0x05,
+ 0xeb, 0xf9, 0xcf, 0xdd, 0xa3, 0xb1, 0x87, 0x95,
+ 0x46, 0x54, 0x62, 0x70, 0x0e, 0x1c, 0x2a, 0x38,
+ 0xd6, 0xc4, 0xf2, 0xe0, 0x9e, 0x8c, 0xba, 0xa8,
+ },
+ {
+ 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79,
+ 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1,
+ 0x2d, 0x3e, 0x0b, 0x18, 0x61, 0x72, 0x47, 0x54,
+ 0xb5, 0xa6, 0x93, 0x80, 0xf9, 0xea, 0xdf, 0xcc,
+ 0x5a, 0x49, 0x7c, 0x6f, 0x16, 0x05, 0x30, 0x23,
+ 0xc2, 0xd1, 0xe4, 0xf7, 0x8e, 0x9d, 0xa8, 0xbb,
+ 0x77, 0x64, 0x51, 0x42, 0x3b, 0x28, 0x1d, 0x0e,
+ 0xef, 0xfc, 0xc9, 0xda, 0xa3, 0xb0, 0x85, 0x96,
+ 0xb4, 0xa7, 0x92, 0x81, 0xf8, 0xeb, 0xde, 0xcd,
+ 0x2c, 0x3f, 0x0a, 0x19, 0x60, 0x73, 0x46, 0x55,
+ 0x99, 0x8a, 0xbf, 0xac, 0xd5, 0xc6, 0xf3, 0xe0,
+ 0x01, 0x12, 0x27, 0x34, 0x4d, 0x5e, 0x6b, 0x78,
+ 0xee, 0xfd, 0xc8, 0xdb, 0xa2, 0xb1, 0x84, 0x97,
+ 0x76, 0x65, 0x50, 0x43, 0x3a, 0x29, 0x1c, 0x0f,
+ 0xc3, 0xd0, 0xe5, 0xf6, 0x8f, 0x9c, 0xa9, 0xba,
+ 0x5b, 0x48, 0x7d, 0x6e, 0x17, 0x04, 0x31, 0x22,
+ 0x75, 0x66, 0x53, 0x40, 0x39, 0x2a, 0x1f, 0x0c,
+ 0xed, 0xfe, 0xcb, 0xd8, 0xa1, 0xb2, 0x87, 0x94,
+ 0x58, 0x4b, 0x7e, 0x6d, 0x14, 0x07, 0x32, 0x21,
+ 0xc0, 0xd3, 0xe6, 0xf5, 0x8c, 0x9f, 0xaa, 0xb9,
+ 0x2f, 0x3c, 0x09, 0x1a, 0x63, 0x70, 0x45, 0x56,
+ 0xb7, 0xa4, 0x91, 0x82, 0xfb, 0xe8, 0xdd, 0xce,
+ 0x02, 0x11, 0x24, 0x37, 0x4e, 0x5d, 0x68, 0x7b,
+ 0x9a, 0x89, 0xbc, 0xaf, 0xd6, 0xc5, 0xf0, 0xe3,
+ 0xc1, 0xd2, 0xe7, 0xf4, 0x8d, 0x9e, 0xab, 0xb8,
+ 0x59, 0x4a, 0x7f, 0x6c, 0x15, 0x06, 0x33, 0x20,
+ 0xec, 0xff, 0xca, 0xd9, 0xa0, 0xb3, 0x86, 0x95,
+ 0x74, 0x67, 0x52, 0x41, 0x38, 0x2b, 0x1e, 0x0d,
+ 0x9b, 0x88, 0xbd, 0xae, 0xd7, 0xc4, 0xf1, 0xe2,
+ 0x03, 0x10, 0x25, 0x36, 0x4f, 0x5c, 0x69, 0x7a,
+ 0xb6, 0xa5, 0x90, 0x83, 0xfa, 0xe9, 0xdc, 0xcf,
+ 0x2e, 0x3d, 0x08, 0x1b, 0x62, 0x71, 0x44, 0x57,
+ },
+ {
+ 0x00, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c,
+ 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc,
+ 0x5d, 0x49, 0x75, 0x61, 0x0d, 0x19, 0x25, 0x31,
+ 0xfd, 0xe9, 0xd5, 0xc1, 0xad, 0xb9, 0x85, 0x91,
+ 0xba, 0xae, 0x92, 0x86, 0xea, 0xfe, 0xc2, 0xd6,
+ 0x1a, 0x0e, 0x32, 0x26, 0x4a, 0x5e, 0x62, 0x76,
+ 0xe7, 0xf3, 0xcf, 0xdb, 0xb7, 0xa3, 0x9f, 0x8b,
+ 0x47, 0x53, 0x6f, 0x7b, 0x17, 0x03, 0x3f, 0x2b,
+ 0x69, 0x7d, 0x41, 0x55, 0x39, 0x2d, 0x11, 0x05,
+ 0xc9, 0xdd, 0xe1, 0xf5, 0x99, 0x8d, 0xb1, 0xa5,
+ 0x34, 0x20, 0x1c, 0x08, 0x64, 0x70, 0x4c, 0x58,
+ 0x94, 0x80, 0xbc, 0xa8, 0xc4, 0xd0, 0xec, 0xf8,
+ 0xd3, 0xc7, 0xfb, 0xef, 0x83, 0x97, 0xab, 0xbf,
+ 0x73, 0x67, 0x5b, 0x4f, 0x23, 0x37, 0x0b, 0x1f,
+ 0x8e, 0x9a, 0xa6, 0xb2, 0xde, 0xca, 0xf6, 0xe2,
+ 0x2e, 0x3a, 0x06, 0x12, 0x7e, 0x6a, 0x56, 0x42,
+ 0xd2, 0xc6, 0xfa, 0xee, 0x82, 0x96, 0xaa, 0xbe,
+ 0x72, 0x66, 0x5a, 0x4e, 0x22, 0x36, 0x0a, 0x1e,
+ 0x8f, 0x9b, 0xa7, 0xb3, 0xdf, 0xcb, 0xf7, 0xe3,
+ 0x2f, 0x3b, 0x07, 0x13, 0x7f, 0x6b, 0x57, 0x43,
+ 0x68, 0x7c, 0x40, 0x54, 0x38, 0x2c, 0x10, 0x04,
+ 0xc8, 0xdc, 0xe0, 0xf4, 0x98, 0x8c, 0xb0, 0xa4,
+ 0x35, 0x21, 0x1d, 0x09, 0x65, 0x71, 0x4d, 0x59,
+ 0x95, 0x81, 0xbd, 0xa9, 0xc5, 0xd1, 0xed, 0xf9,
+ 0xbb, 0xaf, 0x93, 0x87, 0xeb, 0xff, 0xc3, 0xd7,
+ 0x1b, 0x0f, 0x33, 0x27, 0x4b, 0x5f, 0x63, 0x77,
+ 0xe6, 0xf2, 0xce, 0xda, 0xb6, 0xa2, 0x9e, 0x8a,
+ 0x46, 0x52, 0x6e, 0x7a, 0x16, 0x02, 0x3e, 0x2a,
+ 0x01, 0x15, 0x29, 0x3d, 0x51, 0x45, 0x79, 0x6d,
+ 0xa1, 0xb5, 0x89, 0x9d, 0xf1, 0xe5, 0xd9, 0xcd,
+ 0x5c, 0x48, 0x74, 0x60, 0x0c, 0x18, 0x24, 0x30,
+ 0xfc, 0xe8, 0xd4, 0xc0, 0xac, 0xb8, 0x84, 0x90,
+ },
+ {
+ 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b,
+ 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3,
+ 0x4d, 0x58, 0x67, 0x72, 0x19, 0x0c, 0x33, 0x26,
+ 0xe5, 0xf0, 0xcf, 0xda, 0xb1, 0xa4, 0x9b, 0x8e,
+ 0x9a, 0x8f, 0xb0, 0xa5, 0xce, 0xdb, 0xe4, 0xf1,
+ 0x32, 0x27, 0x18, 0x0d, 0x66, 0x73, 0x4c, 0x59,
+ 0xd7, 0xc2, 0xfd, 0xe8, 0x83, 0x96, 0xa9, 0xbc,
+ 0x7f, 0x6a, 0x55, 0x40, 0x2b, 0x3e, 0x01, 0x14,
+ 0x29, 0x3c, 0x03, 0x16, 0x7d, 0x68, 0x57, 0x42,
+ 0x81, 0x94, 0xab, 0xbe, 0xd5, 0xc0, 0xff, 0xea,
+ 0x64, 0x71, 0x4e, 0x5b, 0x30, 0x25, 0x1a, 0x0f,
+ 0xcc, 0xd9, 0xe6, 0xf3, 0x98, 0x8d, 0xb2, 0xa7,
+ 0xb3, 0xa6, 0x99, 0x8c, 0xe7, 0xf2, 0xcd, 0xd8,
+ 0x1b, 0x0e, 0x31, 0x24, 0x4f, 0x5a, 0x65, 0x70,
+ 0xfe, 0xeb, 0xd4, 0xc1, 0xaa, 0xbf, 0x80, 0x95,
+ 0x56, 0x43, 0x7c, 0x69, 0x02, 0x17, 0x28, 0x3d,
+ 0x52, 0x47, 0x78, 0x6d, 0x06, 0x13, 0x2c, 0x39,
+ 0xfa, 0xef, 0xd0, 0xc5, 0xae, 0xbb, 0x84, 0x91,
+ 0x1f, 0x0a, 0x35, 0x20, 0x4b, 0x5e, 0x61, 0x74,
+ 0xb7, 0xa2, 0x9d, 0x88, 0xe3, 0xf6, 0xc9, 0xdc,
+ 0xc8, 0xdd, 0xe2, 0xf7, 0x9c, 0x89, 0xb6, 0xa3,
+ 0x60, 0x75, 0x4a, 0x5f, 0x34, 0x21, 0x1e, 0x0b,
+ 0x85, 0x90, 0xaf, 0xba, 0xd1, 0xc4, 0xfb, 0xee,
+ 0x2d, 0x38, 0x07, 0x12, 0x79, 0x6c, 0x53, 0x46,
+ 0x7b, 0x6e, 0x51, 0x44, 0x2f, 0x3a, 0x05, 0x10,
+ 0xd3, 0xc6, 0xf9, 0xec, 0x87, 0x92, 0xad, 0xb8,
+ 0x36, 0x23, 0x1c, 0x09, 0x62, 0x77, 0x48, 0x5d,
+ 0x9e, 0x8b, 0xb4, 0xa1, 0xca, 0xdf, 0xe0, 0xf5,
+ 0xe1, 0xf4, 0xcb, 0xde, 0xb5, 0xa0, 0x9f, 0x8a,
+ 0x49, 0x5c, 0x63, 0x76, 0x1d, 0x08, 0x37, 0x22,
+ 0xac, 0xb9, 0x86, 0x93, 0xf8, 0xed, 0xd2, 0xc7,
+ 0x04, 0x11, 0x2e, 0x3b, 0x50, 0x45, 0x7a, 0x6f,
+ },
+ {
+ 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62,
+ 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2,
+ 0x7d, 0x6b, 0x51, 0x47, 0x25, 0x33, 0x09, 0x1f,
+ 0xcd, 0xdb, 0xe1, 0xf7, 0x95, 0x83, 0xb9, 0xaf,
+ 0xfa, 0xec, 0xd6, 0xc0, 0xa2, 0xb4, 0x8e, 0x98,
+ 0x4a, 0x5c, 0x66, 0x70, 0x12, 0x04, 0x3e, 0x28,
+ 0x87, 0x91, 0xab, 0xbd, 0xdf, 0xc9, 0xf3, 0xe5,
+ 0x37, 0x21, 0x1b, 0x0d, 0x6f, 0x79, 0x43, 0x55,
+ 0xe9, 0xff, 0xc5, 0xd3, 0xb1, 0xa7, 0x9d, 0x8b,
+ 0x59, 0x4f, 0x75, 0x63, 0x01, 0x17, 0x2d, 0x3b,
+ 0x94, 0x82, 0xb8, 0xae, 0xcc, 0xda, 0xe0, 0xf6,
+ 0x24, 0x32, 0x08, 0x1e, 0x7c, 0x6a, 0x50, 0x46,
+ 0x13, 0x05, 0x3f, 0x29, 0x4b, 0x5d, 0x67, 0x71,
+ 0xa3, 0xb5, 0x8f, 0x99, 0xfb, 0xed, 0xd7, 0xc1,
+ 0x6e, 0x78, 0x42, 0x54, 0x36, 0x20, 0x1a, 0x0c,
+ 0xde, 0xc8, 0xf2, 0xe4, 0x86, 0x90, 0xaa, 0xbc,
+ 0xcf, 0xd9, 0xe3, 0xf5, 0x97, 0x81, 0xbb, 0xad,
+ 0x7f, 0x69, 0x53, 0x45, 0x27, 0x31, 0x0b, 0x1d,
+ 0xb2, 0xa4, 0x9e, 0x88, 0xea, 0xfc, 0xc6, 0xd0,
+ 0x02, 0x14, 0x2e, 0x38, 0x5a, 0x4c, 0x76, 0x60,
+ 0x35, 0x23, 0x19, 0x0f, 0x6d, 0x7b, 0x41, 0x57,
+ 0x85, 0x93, 0xa9, 0xbf, 0xdd, 0xcb, 0xf1, 0xe7,
+ 0x48, 0x5e, 0x64, 0x72, 0x10, 0x06, 0x3c, 0x2a,
+ 0xf8, 0xee, 0xd4, 0xc2, 0xa0, 0xb6, 0x8c, 0x9a,
+ 0x26, 0x30, 0x0a, 0x1c, 0x7e, 0x68, 0x52, 0x44,
+ 0x96, 0x80, 0xba, 0xac, 0xce, 0xd8, 0xe2, 0xf4,
+ 0x5b, 0x4d, 0x77, 0x61, 0x03, 0x15, 0x2f, 0x39,
+ 0xeb, 0xfd, 0xc7, 0xd1, 0xb3, 0xa5, 0x9f, 0x89,
+ 0xdc, 0xca, 0xf0, 0xe6, 0x84, 0x92, 0xa8, 0xbe,
+ 0x6c, 0x7a, 0x40, 0x56, 0x34, 0x22, 0x18, 0x0e,
+ 0xa1, 0xb7, 0x8d, 0x9b, 0xf9, 0xef, 0xd5, 0xc3,
+ 0x11, 0x07, 0x3d, 0x2b, 0x49, 0x5f, 0x65, 0x73,
+ },
+ {
+ 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65,
+ 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd,
+ 0x6d, 0x7a, 0x43, 0x54, 0x31, 0x26, 0x1f, 0x08,
+ 0xd5, 0xc2, 0xfb, 0xec, 0x89, 0x9e, 0xa7, 0xb0,
+ 0xda, 0xcd, 0xf4, 0xe3, 0x86, 0x91, 0xa8, 0xbf,
+ 0x62, 0x75, 0x4c, 0x5b, 0x3e, 0x29, 0x10, 0x07,
+ 0xb7, 0xa0, 0x99, 0x8e, 0xeb, 0xfc, 0xc5, 0xd2,
+ 0x0f, 0x18, 0x21, 0x36, 0x53, 0x44, 0x7d, 0x6a,
+ 0xa9, 0xbe, 0x87, 0x90, 0xf5, 0xe2, 0xdb, 0xcc,
+ 0x11, 0x06, 0x3f, 0x28, 0x4d, 0x5a, 0x63, 0x74,
+ 0xc4, 0xd3, 0xea, 0xfd, 0x98, 0x8f, 0xb6, 0xa1,
+ 0x7c, 0x6b, 0x52, 0x45, 0x20, 0x37, 0x0e, 0x19,
+ 0x73, 0x64, 0x5d, 0x4a, 0x2f, 0x38, 0x01, 0x16,
+ 0xcb, 0xdc, 0xe5, 0xf2, 0x97, 0x80, 0xb9, 0xae,
+ 0x1e, 0x09, 0x30, 0x27, 0x42, 0x55, 0x6c, 0x7b,
+ 0xa6, 0xb1, 0x88, 0x9f, 0xfa, 0xed, 0xd4, 0xc3,
+ 0x4f, 0x58, 0x61, 0x76, 0x13, 0x04, 0x3d, 0x2a,
+ 0xf7, 0xe0, 0xd9, 0xce, 0xab, 0xbc, 0x85, 0x92,
+ 0x22, 0x35, 0x0c, 0x1b, 0x7e, 0x69, 0x50, 0x47,
+ 0x9a, 0x8d, 0xb4, 0xa3, 0xc6, 0xd1, 0xe8, 0xff,
+ 0x95, 0x82, 0xbb, 0xac, 0xc9, 0xde, 0xe7, 0xf0,
+ 0x2d, 0x3a, 0x03, 0x14, 0x71, 0x66, 0x5f, 0x48,
+ 0xf8, 0xef, 0xd6, 0xc1, 0xa4, 0xb3, 0x8a, 0x9d,
+ 0x40, 0x57, 0x6e, 0x79, 0x1c, 0x0b, 0x32, 0x25,
+ 0xe6, 0xf1, 0xc8, 0xdf, 0xba, 0xad, 0x94, 0x83,
+ 0x5e, 0x49, 0x70, 0x67, 0x02, 0x15, 0x2c, 0x3b,
+ 0x8b, 0x9c, 0xa5, 0xb2, 0xd7, 0xc0, 0xf9, 0xee,
+ 0x33, 0x24, 0x1d, 0x0a, 0x6f, 0x78, 0x41, 0x56,
+ 0x3c, 0x2b, 0x12, 0x05, 0x60, 0x77, 0x4e, 0x59,
+ 0x84, 0x93, 0xaa, 0xbd, 0xd8, 0xcf, 0xf6, 0xe1,
+ 0x51, 0x46, 0x7f, 0x68, 0x0d, 0x1a, 0x23, 0x34,
+ 0xe9, 0xfe, 0xc7, 0xd0, 0xb5, 0xa2, 0x9b, 0x8c,
+ },
+ {
+ 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48,
+ 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88,
+ 0x9d, 0x85, 0xad, 0xb5, 0xfd, 0xe5, 0xcd, 0xd5,
+ 0x5d, 0x45, 0x6d, 0x75, 0x3d, 0x25, 0x0d, 0x15,
+ 0x27, 0x3f, 0x17, 0x0f, 0x47, 0x5f, 0x77, 0x6f,
+ 0xe7, 0xff, 0xd7, 0xcf, 0x87, 0x9f, 0xb7, 0xaf,
+ 0xba, 0xa2, 0x8a, 0x92, 0xda, 0xc2, 0xea, 0xf2,
+ 0x7a, 0x62, 0x4a, 0x52, 0x1a, 0x02, 0x2a, 0x32,
+ 0x4e, 0x56, 0x7e, 0x66, 0x2e, 0x36, 0x1e, 0x06,
+ 0x8e, 0x96, 0xbe, 0xa6, 0xee, 0xf6, 0xde, 0xc6,
+ 0xd3, 0xcb, 0xe3, 0xfb, 0xb3, 0xab, 0x83, 0x9b,
+ 0x13, 0x0b, 0x23, 0x3b, 0x73, 0x6b, 0x43, 0x5b,
+ 0x69, 0x71, 0x59, 0x41, 0x09, 0x11, 0x39, 0x21,
+ 0xa9, 0xb1, 0x99, 0x81, 0xc9, 0xd1, 0xf9, 0xe1,
+ 0xf4, 0xec, 0xc4, 0xdc, 0x94, 0x8c, 0xa4, 0xbc,
+ 0x34, 0x2c, 0x04, 0x1c, 0x54, 0x4c, 0x64, 0x7c,
+ 0x9c, 0x84, 0xac, 0xb4, 0xfc, 0xe4, 0xcc, 0xd4,
+ 0x5c, 0x44, 0x6c, 0x74, 0x3c, 0x24, 0x0c, 0x14,
+ 0x01, 0x19, 0x31, 0x29, 0x61, 0x79, 0x51, 0x49,
+ 0xc1, 0xd9, 0xf1, 0xe9, 0xa1, 0xb9, 0x91, 0x89,
+ 0xbb, 0xa3, 0x8b, 0x93, 0xdb, 0xc3, 0xeb, 0xf3,
+ 0x7b, 0x63, 0x4b, 0x53, 0x1b, 0x03, 0x2b, 0x33,
+ 0x26, 0x3e, 0x16, 0x0e, 0x46, 0x5e, 0x76, 0x6e,
+ 0xe6, 0xfe, 0xd6, 0xce, 0x86, 0x9e, 0xb6, 0xae,
+ 0xd2, 0xca, 0xe2, 0xfa, 0xb2, 0xaa, 0x82, 0x9a,
+ 0x12, 0x0a, 0x22, 0x3a, 0x72, 0x6a, 0x42, 0x5a,
+ 0x4f, 0x57, 0x7f, 0x67, 0x2f, 0x37, 0x1f, 0x07,
+ 0x8f, 0x97, 0xbf, 0xa7, 0xef, 0xf7, 0xdf, 0xc7,
+ 0xf5, 0xed, 0xc5, 0xdd, 0x95, 0x8d, 0xa5, 0xbd,
+ 0x35, 0x2d, 0x05, 0x1d, 0x55, 0x4d, 0x65, 0x7d,
+ 0x68, 0x70, 0x58, 0x40, 0x08, 0x10, 0x38, 0x20,
+ 0xa8, 0xb0, 0x98, 0x80, 0xc8, 0xd0, 0xf8, 0xe0,
+ },
+ {
+ 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f,
+ 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87,
+ 0x8d, 0x94, 0xbf, 0xa6, 0xe9, 0xf0, 0xdb, 0xc2,
+ 0x45, 0x5c, 0x77, 0x6e, 0x21, 0x38, 0x13, 0x0a,
+ 0x07, 0x1e, 0x35, 0x2c, 0x63, 0x7a, 0x51, 0x48,
+ 0xcf, 0xd6, 0xfd, 0xe4, 0xab, 0xb2, 0x99, 0x80,
+ 0x8a, 0x93, 0xb8, 0xa1, 0xee, 0xf7, 0xdc, 0xc5,
+ 0x42, 0x5b, 0x70, 0x69, 0x26, 0x3f, 0x14, 0x0d,
+ 0x0e, 0x17, 0x3c, 0x25, 0x6a, 0x73, 0x58, 0x41,
+ 0xc6, 0xdf, 0xf4, 0xed, 0xa2, 0xbb, 0x90, 0x89,
+ 0x83, 0x9a, 0xb1, 0xa8, 0xe7, 0xfe, 0xd5, 0xcc,
+ 0x4b, 0x52, 0x79, 0x60, 0x2f, 0x36, 0x1d, 0x04,
+ 0x09, 0x10, 0x3b, 0x22, 0x6d, 0x74, 0x5f, 0x46,
+ 0xc1, 0xd8, 0xf3, 0xea, 0xa5, 0xbc, 0x97, 0x8e,
+ 0x84, 0x9d, 0xb6, 0xaf, 0xe0, 0xf9, 0xd2, 0xcb,
+ 0x4c, 0x55, 0x7e, 0x67, 0x28, 0x31, 0x1a, 0x03,
+ 0x1c, 0x05, 0x2e, 0x37, 0x78, 0x61, 0x4a, 0x53,
+ 0xd4, 0xcd, 0xe6, 0xff, 0xb0, 0xa9, 0x82, 0x9b,
+ 0x91, 0x88, 0xa3, 0xba, 0xf5, 0xec, 0xc7, 0xde,
+ 0x59, 0x40, 0x6b, 0x72, 0x3d, 0x24, 0x0f, 0x16,
+ 0x1b, 0x02, 0x29, 0x30, 0x7f, 0x66, 0x4d, 0x54,
+ 0xd3, 0xca, 0xe1, 0xf8, 0xb7, 0xae, 0x85, 0x9c,
+ 0x96, 0x8f, 0xa4, 0xbd, 0xf2, 0xeb, 0xc0, 0xd9,
+ 0x5e, 0x47, 0x6c, 0x75, 0x3a, 0x23, 0x08, 0x11,
+ 0x12, 0x0b, 0x20, 0x39, 0x76, 0x6f, 0x44, 0x5d,
+ 0xda, 0xc3, 0xe8, 0xf1, 0xbe, 0xa7, 0x8c, 0x95,
+ 0x9f, 0x86, 0xad, 0xb4, 0xfb, 0xe2, 0xc9, 0xd0,
+ 0x57, 0x4e, 0x65, 0x7c, 0x33, 0x2a, 0x01, 0x18,
+ 0x15, 0x0c, 0x27, 0x3e, 0x71, 0x68, 0x43, 0x5a,
+ 0xdd, 0xc4, 0xef, 0xf6, 0xb9, 0xa0, 0x8b, 0x92,
+ 0x98, 0x81, 0xaa, 0xb3, 0xfc, 0xe5, 0xce, 0xd7,
+ 0x50, 0x49, 0x62, 0x7b, 0x34, 0x2d, 0x06, 0x1f,
+ },
+ {
+ 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46,
+ 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96,
+ 0xbd, 0xa7, 0x89, 0x93, 0xd5, 0xcf, 0xe1, 0xfb,
+ 0x6d, 0x77, 0x59, 0x43, 0x05, 0x1f, 0x31, 0x2b,
+ 0x67, 0x7d, 0x53, 0x49, 0x0f, 0x15, 0x3b, 0x21,
+ 0xb7, 0xad, 0x83, 0x99, 0xdf, 0xc5, 0xeb, 0xf1,
+ 0xda, 0xc0, 0xee, 0xf4, 0xb2, 0xa8, 0x86, 0x9c,
+ 0x0a, 0x10, 0x3e, 0x24, 0x62, 0x78, 0x56, 0x4c,
+ 0xce, 0xd4, 0xfa, 0xe0, 0xa6, 0xbc, 0x92, 0x88,
+ 0x1e, 0x04, 0x2a, 0x30, 0x76, 0x6c, 0x42, 0x58,
+ 0x73, 0x69, 0x47, 0x5d, 0x1b, 0x01, 0x2f, 0x35,
+ 0xa3, 0xb9, 0x97, 0x8d, 0xcb, 0xd1, 0xff, 0xe5,
+ 0xa9, 0xb3, 0x9d, 0x87, 0xc1, 0xdb, 0xf5, 0xef,
+ 0x79, 0x63, 0x4d, 0x57, 0x11, 0x0b, 0x25, 0x3f,
+ 0x14, 0x0e, 0x20, 0x3a, 0x7c, 0x66, 0x48, 0x52,
+ 0xc4, 0xde, 0xf0, 0xea, 0xac, 0xb6, 0x98, 0x82,
+ 0x81, 0x9b, 0xb5, 0xaf, 0xe9, 0xf3, 0xdd, 0xc7,
+ 0x51, 0x4b, 0x65, 0x7f, 0x39, 0x23, 0x0d, 0x17,
+ 0x3c, 0x26, 0x08, 0x12, 0x54, 0x4e, 0x60, 0x7a,
+ 0xec, 0xf6, 0xd8, 0xc2, 0x84, 0x9e, 0xb0, 0xaa,
+ 0xe6, 0xfc, 0xd2, 0xc8, 0x8e, 0x94, 0xba, 0xa0,
+ 0x36, 0x2c, 0x02, 0x18, 0x5e, 0x44, 0x6a, 0x70,
+ 0x5b, 0x41, 0x6f, 0x75, 0x33, 0x29, 0x07, 0x1d,
+ 0x8b, 0x91, 0xbf, 0xa5, 0xe3, 0xf9, 0xd7, 0xcd,
+ 0x4f, 0x55, 0x7b, 0x61, 0x27, 0x3d, 0x13, 0x09,
+ 0x9f, 0x85, 0xab, 0xb1, 0xf7, 0xed, 0xc3, 0xd9,
+ 0xf2, 0xe8, 0xc6, 0xdc, 0x9a, 0x80, 0xae, 0xb4,
+ 0x22, 0x38, 0x16, 0x0c, 0x4a, 0x50, 0x7e, 0x64,
+ 0x28, 0x32, 0x1c, 0x06, 0x40, 0x5a, 0x74, 0x6e,
+ 0xf8, 0xe2, 0xcc, 0xd6, 0x90, 0x8a, 0xa4, 0xbe,
+ 0x95, 0x8f, 0xa1, 0xbb, 0xfd, 0xe7, 0xc9, 0xd3,
+ 0x45, 0x5f, 0x71, 0x6b, 0x2d, 0x37, 0x19, 0x03,
+ },
+ {
+ 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41,
+ 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99,
+ 0xad, 0xb6, 0x9b, 0x80, 0xc1, 0xda, 0xf7, 0xec,
+ 0x75, 0x6e, 0x43, 0x58, 0x19, 0x02, 0x2f, 0x34,
+ 0x47, 0x5c, 0x71, 0x6a, 0x2b, 0x30, 0x1d, 0x06,
+ 0x9f, 0x84, 0xa9, 0xb2, 0xf3, 0xe8, 0xc5, 0xde,
+ 0xea, 0xf1, 0xdc, 0xc7, 0x86, 0x9d, 0xb0, 0xab,
+ 0x32, 0x29, 0x04, 0x1f, 0x5e, 0x45, 0x68, 0x73,
+ 0x8e, 0x95, 0xb8, 0xa3, 0xe2, 0xf9, 0xd4, 0xcf,
+ 0x56, 0x4d, 0x60, 0x7b, 0x3a, 0x21, 0x0c, 0x17,
+ 0x23, 0x38, 0x15, 0x0e, 0x4f, 0x54, 0x79, 0x62,
+ 0xfb, 0xe0, 0xcd, 0xd6, 0x97, 0x8c, 0xa1, 0xba,
+ 0xc9, 0xd2, 0xff, 0xe4, 0xa5, 0xbe, 0x93, 0x88,
+ 0x11, 0x0a, 0x27, 0x3c, 0x7d, 0x66, 0x4b, 0x50,
+ 0x64, 0x7f, 0x52, 0x49, 0x08, 0x13, 0x3e, 0x25,
+ 0xbc, 0xa7, 0x8a, 0x91, 0xd0, 0xcb, 0xe6, 0xfd,
+ 0x01, 0x1a, 0x37, 0x2c, 0x6d, 0x76, 0x5b, 0x40,
+ 0xd9, 0xc2, 0xef, 0xf4, 0xb5, 0xae, 0x83, 0x98,
+ 0xac, 0xb7, 0x9a, 0x81, 0xc0, 0xdb, 0xf6, 0xed,
+ 0x74, 0x6f, 0x42, 0x59, 0x18, 0x03, 0x2e, 0x35,
+ 0x46, 0x5d, 0x70, 0x6b, 0x2a, 0x31, 0x1c, 0x07,
+ 0x9e, 0x85, 0xa8, 0xb3, 0xf2, 0xe9, 0xc4, 0xdf,
+ 0xeb, 0xf0, 0xdd, 0xc6, 0x87, 0x9c, 0xb1, 0xaa,
+ 0x33, 0x28, 0x05, 0x1e, 0x5f, 0x44, 0x69, 0x72,
+ 0x8f, 0x94, 0xb9, 0xa2, 0xe3, 0xf8, 0xd5, 0xce,
+ 0x57, 0x4c, 0x61, 0x7a, 0x3b, 0x20, 0x0d, 0x16,
+ 0x22, 0x39, 0x14, 0x0f, 0x4e, 0x55, 0x78, 0x63,
+ 0xfa, 0xe1, 0xcc, 0xd7, 0x96, 0x8d, 0xa0, 0xbb,
+ 0xc8, 0xd3, 0xfe, 0xe5, 0xa4, 0xbf, 0x92, 0x89,
+ 0x10, 0x0b, 0x26, 0x3d, 0x7c, 0x67, 0x4a, 0x51,
+ 0x65, 0x7e, 0x53, 0x48, 0x09, 0x12, 0x3f, 0x24,
+ 0xbd, 0xa6, 0x8b, 0x90, 0xd1, 0xca, 0xe7, 0xfc,
+ },
+ {
+ 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54,
+ 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4,
+ 0xdd, 0xc1, 0xe5, 0xf9, 0xad, 0xb1, 0x95, 0x89,
+ 0x3d, 0x21, 0x05, 0x19, 0x4d, 0x51, 0x75, 0x69,
+ 0xa7, 0xbb, 0x9f, 0x83, 0xd7, 0xcb, 0xef, 0xf3,
+ 0x47, 0x5b, 0x7f, 0x63, 0x37, 0x2b, 0x0f, 0x13,
+ 0x7a, 0x66, 0x42, 0x5e, 0x0a, 0x16, 0x32, 0x2e,
+ 0x9a, 0x86, 0xa2, 0xbe, 0xea, 0xf6, 0xd2, 0xce,
+ 0x53, 0x4f, 0x6b, 0x77, 0x23, 0x3f, 0x1b, 0x07,
+ 0xb3, 0xaf, 0x8b, 0x97, 0xc3, 0xdf, 0xfb, 0xe7,
+ 0x8e, 0x92, 0xb6, 0xaa, 0xfe, 0xe2, 0xc6, 0xda,
+ 0x6e, 0x72, 0x56, 0x4a, 0x1e, 0x02, 0x26, 0x3a,
+ 0xf4, 0xe8, 0xcc, 0xd0, 0x84, 0x98, 0xbc, 0xa0,
+ 0x14, 0x08, 0x2c, 0x30, 0x64, 0x78, 0x5c, 0x40,
+ 0x29, 0x35, 0x11, 0x0d, 0x59, 0x45, 0x61, 0x7d,
+ 0xc9, 0xd5, 0xf1, 0xed, 0xb9, 0xa5, 0x81, 0x9d,
+ 0xa6, 0xba, 0x9e, 0x82, 0xd6, 0xca, 0xee, 0xf2,
+ 0x46, 0x5a, 0x7e, 0x62, 0x36, 0x2a, 0x0e, 0x12,
+ 0x7b, 0x67, 0x43, 0x5f, 0x0b, 0x17, 0x33, 0x2f,
+ 0x9b, 0x87, 0xa3, 0xbf, 0xeb, 0xf7, 0xd3, 0xcf,
+ 0x01, 0x1d, 0x39, 0x25, 0x71, 0x6d, 0x49, 0x55,
+ 0xe1, 0xfd, 0xd9, 0xc5, 0x91, 0x8d, 0xa9, 0xb5,
+ 0xdc, 0xc0, 0xe4, 0xf8, 0xac, 0xb0, 0x94, 0x88,
+ 0x3c, 0x20, 0x04, 0x18, 0x4c, 0x50, 0x74, 0x68,
+ 0xf5, 0xe9, 0xcd, 0xd1, 0x85, 0x99, 0xbd, 0xa1,
+ 0x15, 0x09, 0x2d, 0x31, 0x65, 0x79, 0x5d, 0x41,
+ 0x28, 0x34, 0x10, 0x0c, 0x58, 0x44, 0x60, 0x7c,
+ 0xc8, 0xd4, 0xf0, 0xec, 0xb8, 0xa4, 0x80, 0x9c,
+ 0x52, 0x4e, 0x6a, 0x76, 0x22, 0x3e, 0x1a, 0x06,
+ 0xb2, 0xae, 0x8a, 0x96, 0xc2, 0xde, 0xfa, 0xe6,
+ 0x8f, 0x93, 0xb7, 0xab, 0xff, 0xe3, 0xc7, 0xdb,
+ 0x6f, 0x73, 0x57, 0x4b, 0x1f, 0x03, 0x27, 0x3b,
+ },
+ {
+ 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb,
+ 0xcd, 0xd0, 0xf7, 0xea, 0xb9, 0xa4, 0x83, 0x9e,
+ 0x25, 0x38, 0x1f, 0x02, 0x51, 0x4c, 0x6b, 0x76,
+ 0x87, 0x9a, 0xbd, 0xa0, 0xf3, 0xee, 0xc9, 0xd4,
+ 0x6f, 0x72, 0x55, 0x48, 0x1b, 0x06, 0x21, 0x3c,
+ 0x4a, 0x57, 0x70, 0x6d, 0x3e, 0x23, 0x04, 0x19,
+ 0xa2, 0xbf, 0x98, 0x85, 0xd6, 0xcb, 0xec, 0xf1,
+ 0x13, 0x0e, 0x29, 0x34, 0x67, 0x7a, 0x5d, 0x40,
+ 0xfb, 0xe6, 0xc1, 0xdc, 0x8f, 0x92, 0xb5, 0xa8,
+ 0xde, 0xc3, 0xe4, 0xf9, 0xaa, 0xb7, 0x90, 0x8d,
+ 0x36, 0x2b, 0x0c, 0x11, 0x42, 0x5f, 0x78, 0x65,
+ 0x94, 0x89, 0xae, 0xb3, 0xe0, 0xfd, 0xda, 0xc7,
+ 0x7c, 0x61, 0x46, 0x5b, 0x08, 0x15, 0x32, 0x2f,
+ 0x59, 0x44, 0x63, 0x7e, 0x2d, 0x30, 0x17, 0x0a,
+ 0xb1, 0xac, 0x8b, 0x96, 0xc5, 0xd8, 0xff, 0xe2,
+ 0x26, 0x3b, 0x1c, 0x01, 0x52, 0x4f, 0x68, 0x75,
+ 0xce, 0xd3, 0xf4, 0xe9, 0xba, 0xa7, 0x80, 0x9d,
+ 0xeb, 0xf6, 0xd1, 0xcc, 0x9f, 0x82, 0xa5, 0xb8,
+ 0x03, 0x1e, 0x39, 0x24, 0x77, 0x6a, 0x4d, 0x50,
+ 0xa1, 0xbc, 0x9b, 0x86, 0xd5, 0xc8, 0xef, 0xf2,
+ 0x49, 0x54, 0x73, 0x6e, 0x3d, 0x20, 0x07, 0x1a,
+ 0x6c, 0x71, 0x56, 0x4b, 0x18, 0x05, 0x22, 0x3f,
+ 0x84, 0x99, 0xbe, 0xa3, 0xf0, 0xed, 0xca, 0xd7,
+ 0x35, 0x28, 0x0f, 0x12, 0x41, 0x5c, 0x7b, 0x66,
+ 0xdd, 0xc0, 0xe7, 0xfa, 0xa9, 0xb4, 0x93, 0x8e,
+ 0xf8, 0xe5, 0xc2, 0xdf, 0x8c, 0x91, 0xb6, 0xab,
+ 0x10, 0x0d, 0x2a, 0x37, 0x64, 0x79, 0x5e, 0x43,
+ 0xb2, 0xaf, 0x88, 0x95, 0xc6, 0xdb, 0xfc, 0xe1,
+ 0x5a, 0x47, 0x60, 0x7d, 0x2e, 0x33, 0x14, 0x09,
+ 0x7f, 0x62, 0x45, 0x58, 0x0b, 0x16, 0x31, 0x2c,
+ 0x97, 0x8a, 0xad, 0xb0, 0xe3, 0xfe, 0xd9, 0xc4,
+ },
+ {
+ 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a,
+ 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa,
+ 0xfd, 0xe3, 0xc1, 0xdf, 0x85, 0x9b, 0xb9, 0xa7,
+ 0x0d, 0x13, 0x31, 0x2f, 0x75, 0x6b, 0x49, 0x57,
+ 0xe7, 0xf9, 0xdb, 0xc5, 0x9f, 0x81, 0xa3, 0xbd,
+ 0x17, 0x09, 0x2b, 0x35, 0x6f, 0x71, 0x53, 0x4d,
+ 0x1a, 0x04, 0x26, 0x38, 0x62, 0x7c, 0x5e, 0x40,
+ 0xea, 0xf4, 0xd6, 0xc8, 0x92, 0x8c, 0xae, 0xb0,
+ 0xd3, 0xcd, 0xef, 0xf1, 0xab, 0xb5, 0x97, 0x89,
+ 0x23, 0x3d, 0x1f, 0x01, 0x5b, 0x45, 0x67, 0x79,
+ 0x2e, 0x30, 0x12, 0x0c, 0x56, 0x48, 0x6a, 0x74,
+ 0xde, 0xc0, 0xe2, 0xfc, 0xa6, 0xb8, 0x9a, 0x84,
+ 0x34, 0x2a, 0x08, 0x16, 0x4c, 0x52, 0x70, 0x6e,
+ 0xc4, 0xda, 0xf8, 0xe6, 0xbc, 0xa2, 0x80, 0x9e,
+ 0xc9, 0xd7, 0xf5, 0xeb, 0xb1, 0xaf, 0x8d, 0x93,
+ 0x39, 0x27, 0x05, 0x1b, 0x41, 0x5f, 0x7d, 0x63,
+ 0xbb, 0xa5, 0x87, 0x99, 0xc3, 0xdd, 0xff, 0xe1,
+ 0x4b, 0x55, 0x77, 0x69, 0x33, 0x2d, 0x0f, 0x11,
+ 0x46, 0x58, 0x7a, 0x64, 0x3e, 0x20, 0x02, 0x1c,
+ 0xb6, 0xa8, 0x8a, 0x94, 0xce, 0xd0, 0xf2, 0xec,
+ 0x5c, 0x42, 0x60, 0x7e, 0x24, 0x3a, 0x18, 0x06,
+ 0xac, 0xb2, 0x90, 0x8e, 0xd4, 0xca, 0xe8, 0xf6,
+ 0xa1, 0xbf, 0x9d, 0x83, 0xd9, 0xc7, 0xe5, 0xfb,
+ 0x51, 0x4f, 0x6d, 0x73, 0x29, 0x37, 0x15, 0x0b,
+ 0x68, 0x76, 0x54, 0x4a, 0x10, 0x0e, 0x2c, 0x32,
+ 0x98, 0x86, 0xa4, 0xba, 0xe0, 0xfe, 0xdc, 0xc2,
+ 0x95, 0x8b, 0xa9, 0xb7, 0xed, 0xf3, 0xd1, 0xcf,
+ 0x65, 0x7b, 0x59, 0x47, 0x1d, 0x03, 0x21, 0x3f,
+ 0x8f, 0x91, 0xb3, 0xad, 0xf7, 0xe9, 0xcb, 0xd5,
+ 0x7f, 0x61, 0x43, 0x5d, 0x07, 0x19, 0x3b, 0x25,
+ 0x72, 0x6c, 0x4e, 0x50, 0x0a, 0x14, 0x36, 0x28,
+ 0x82, 0x9c, 0xbe, 0xa0, 0xfa, 0xe4, 0xc6, 0xd8,
+ },
+ {
+ 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d,
+ 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5,
+ 0xed, 0xf2, 0xd3, 0xcc, 0x91, 0x8e, 0xaf, 0xb0,
+ 0x15, 0x0a, 0x2b, 0x34, 0x69, 0x76, 0x57, 0x48,
+ 0xc7, 0xd8, 0xf9, 0xe6, 0xbb, 0xa4, 0x85, 0x9a,
+ 0x3f, 0x20, 0x01, 0x1e, 0x43, 0x5c, 0x7d, 0x62,
+ 0x2a, 0x35, 0x14, 0x0b, 0x56, 0x49, 0x68, 0x77,
+ 0xd2, 0xcd, 0xec, 0xf3, 0xae, 0xb1, 0x90, 0x8f,
+ 0x93, 0x8c, 0xad, 0xb2, 0xef, 0xf0, 0xd1, 0xce,
+ 0x6b, 0x74, 0x55, 0x4a, 0x17, 0x08, 0x29, 0x36,
+ 0x7e, 0x61, 0x40, 0x5f, 0x02, 0x1d, 0x3c, 0x23,
+ 0x86, 0x99, 0xb8, 0xa7, 0xfa, 0xe5, 0xc4, 0xdb,
+ 0x54, 0x4b, 0x6a, 0x75, 0x28, 0x37, 0x16, 0x09,
+ 0xac, 0xb3, 0x92, 0x8d, 0xd0, 0xcf, 0xee, 0xf1,
+ 0xb9, 0xa6, 0x87, 0x98, 0xc5, 0xda, 0xfb, 0xe4,
+ 0x41, 0x5e, 0x7f, 0x60, 0x3d, 0x22, 0x03, 0x1c,
+ 0x3b, 0x24, 0x05, 0x1a, 0x47, 0x58, 0x79, 0x66,
+ 0xc3, 0xdc, 0xfd, 0xe2, 0xbf, 0xa0, 0x81, 0x9e,
+ 0xd6, 0xc9, 0xe8, 0xf7, 0xaa, 0xb5, 0x94, 0x8b,
+ 0x2e, 0x31, 0x10, 0x0f, 0x52, 0x4d, 0x6c, 0x73,
+ 0xfc, 0xe3, 0xc2, 0xdd, 0x80, 0x9f, 0xbe, 0xa1,
+ 0x04, 0x1b, 0x3a, 0x25, 0x78, 0x67, 0x46, 0x59,
+ 0x11, 0x0e, 0x2f, 0x30, 0x6d, 0x72, 0x53, 0x4c,
+ 0xe9, 0xf6, 0xd7, 0xc8, 0x95, 0x8a, 0xab, 0xb4,
+ 0xa8, 0xb7, 0x96, 0x89, 0xd4, 0xcb, 0xea, 0xf5,
+ 0x50, 0x4f, 0x6e, 0x71, 0x2c, 0x33, 0x12, 0x0d,
+ 0x45, 0x5a, 0x7b, 0x64, 0x39, 0x26, 0x07, 0x18,
+ 0xbd, 0xa2, 0x83, 0x9c, 0xc1, 0xde, 0xff, 0xe0,
+ 0x6f, 0x70, 0x51, 0x4e, 0x13, 0x0c, 0x2d, 0x32,
+ 0x97, 0x88, 0xa9, 0xb6, 0xeb, 0xf4, 0xd5, 0xca,
+ 0x82, 0x9d, 0xbc, 0xa3, 0xfe, 0xe1, 0xc0, 0xdf,
+ 0x7a, 0x65, 0x44, 0x5b, 0x06, 0x19, 0x38, 0x27,
+ },
+ {
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x1d, 0x3d, 0x5d, 0x7d, 0x9d, 0xbd, 0xdd, 0xfd,
+ 0x3a, 0x1a, 0x7a, 0x5a, 0xba, 0x9a, 0xfa, 0xda,
+ 0x27, 0x07, 0x67, 0x47, 0xa7, 0x87, 0xe7, 0xc7,
+ 0x74, 0x54, 0x34, 0x14, 0xf4, 0xd4, 0xb4, 0x94,
+ 0x69, 0x49, 0x29, 0x09, 0xe9, 0xc9, 0xa9, 0x89,
+ 0x4e, 0x6e, 0x0e, 0x2e, 0xce, 0xee, 0x8e, 0xae,
+ 0x53, 0x73, 0x13, 0x33, 0xd3, 0xf3, 0x93, 0xb3,
+ 0xe8, 0xc8, 0xa8, 0x88, 0x68, 0x48, 0x28, 0x08,
+ 0xf5, 0xd5, 0xb5, 0x95, 0x75, 0x55, 0x35, 0x15,
+ 0xd2, 0xf2, 0x92, 0xb2, 0x52, 0x72, 0x12, 0x32,
+ 0xcf, 0xef, 0x8f, 0xaf, 0x4f, 0x6f, 0x0f, 0x2f,
+ 0x9c, 0xbc, 0xdc, 0xfc, 0x1c, 0x3c, 0x5c, 0x7c,
+ 0x81, 0xa1, 0xc1, 0xe1, 0x01, 0x21, 0x41, 0x61,
+ 0xa6, 0x86, 0xe6, 0xc6, 0x26, 0x06, 0x66, 0x46,
+ 0xbb, 0x9b, 0xfb, 0xdb, 0x3b, 0x1b, 0x7b, 0x5b,
+ 0xcd, 0xed, 0x8d, 0xad, 0x4d, 0x6d, 0x0d, 0x2d,
+ 0xd0, 0xf0, 0x90, 0xb0, 0x50, 0x70, 0x10, 0x30,
+ 0xf7, 0xd7, 0xb7, 0x97, 0x77, 0x57, 0x37, 0x17,
+ 0xea, 0xca, 0xaa, 0x8a, 0x6a, 0x4a, 0x2a, 0x0a,
+ 0xb9, 0x99, 0xf9, 0xd9, 0x39, 0x19, 0x79, 0x59,
+ 0xa4, 0x84, 0xe4, 0xc4, 0x24, 0x04, 0x64, 0x44,
+ 0x83, 0xa3, 0xc3, 0xe3, 0x03, 0x23, 0x43, 0x63,
+ 0x9e, 0xbe, 0xde, 0xfe, 0x1e, 0x3e, 0x5e, 0x7e,
+ 0x25, 0x05, 0x65, 0x45, 0xa5, 0x85, 0xe5, 0xc5,
+ 0x38, 0x18, 0x78, 0x58, 0xb8, 0x98, 0xf8, 0xd8,
+ 0x1f, 0x3f, 0x5f, 0x7f, 0x9f, 0xbf, 0xdf, 0xff,
+ 0x02, 0x22, 0x42, 0x62, 0x82, 0xa2, 0xc2, 0xe2,
+ 0x51, 0x71, 0x11, 0x31, 0xd1, 0xf1, 0x91, 0xb1,
+ 0x4c, 0x6c, 0x0c, 0x2c, 0xcc, 0xec, 0x8c, 0xac,
+ 0x6b, 0x4b, 0x2b, 0x0b, 0xeb, 0xcb, 0xab, 0x8b,
+ 0x76, 0x56, 0x36, 0x16, 0xf6, 0xd6, 0xb6, 0x96,
+ },
+ {
+ 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7,
+ 0x15, 0x34, 0x57, 0x76, 0x91, 0xb0, 0xd3, 0xf2,
+ 0x2a, 0x0b, 0x68, 0x49, 0xae, 0x8f, 0xec, 0xcd,
+ 0x3f, 0x1e, 0x7d, 0x5c, 0xbb, 0x9a, 0xf9, 0xd8,
+ 0x54, 0x75, 0x16, 0x37, 0xd0, 0xf1, 0x92, 0xb3,
+ 0x41, 0x60, 0x03, 0x22, 0xc5, 0xe4, 0x87, 0xa6,
+ 0x7e, 0x5f, 0x3c, 0x1d, 0xfa, 0xdb, 0xb8, 0x99,
+ 0x6b, 0x4a, 0x29, 0x08, 0xef, 0xce, 0xad, 0x8c,
+ 0xa8, 0x89, 0xea, 0xcb, 0x2c, 0x0d, 0x6e, 0x4f,
+ 0xbd, 0x9c, 0xff, 0xde, 0x39, 0x18, 0x7b, 0x5a,
+ 0x82, 0xa3, 0xc0, 0xe1, 0x06, 0x27, 0x44, 0x65,
+ 0x97, 0xb6, 0xd5, 0xf4, 0x13, 0x32, 0x51, 0x70,
+ 0xfc, 0xdd, 0xbe, 0x9f, 0x78, 0x59, 0x3a, 0x1b,
+ 0xe9, 0xc8, 0xab, 0x8a, 0x6d, 0x4c, 0x2f, 0x0e,
+ 0xd6, 0xf7, 0x94, 0xb5, 0x52, 0x73, 0x10, 0x31,
+ 0xc3, 0xe2, 0x81, 0xa0, 0x47, 0x66, 0x05, 0x24,
+ 0x4d, 0x6c, 0x0f, 0x2e, 0xc9, 0xe8, 0x8b, 0xaa,
+ 0x58, 0x79, 0x1a, 0x3b, 0xdc, 0xfd, 0x9e, 0xbf,
+ 0x67, 0x46, 0x25, 0x04, 0xe3, 0xc2, 0xa1, 0x80,
+ 0x72, 0x53, 0x30, 0x11, 0xf6, 0xd7, 0xb4, 0x95,
+ 0x19, 0x38, 0x5b, 0x7a, 0x9d, 0xbc, 0xdf, 0xfe,
+ 0x0c, 0x2d, 0x4e, 0x6f, 0x88, 0xa9, 0xca, 0xeb,
+ 0x33, 0x12, 0x71, 0x50, 0xb7, 0x96, 0xf5, 0xd4,
+ 0x26, 0x07, 0x64, 0x45, 0xa2, 0x83, 0xe0, 0xc1,
+ 0xe5, 0xc4, 0xa7, 0x86, 0x61, 0x40, 0x23, 0x02,
+ 0xf0, 0xd1, 0xb2, 0x93, 0x74, 0x55, 0x36, 0x17,
+ 0xcf, 0xee, 0x8d, 0xac, 0x4b, 0x6a, 0x09, 0x28,
+ 0xda, 0xfb, 0x98, 0xb9, 0x5e, 0x7f, 0x1c, 0x3d,
+ 0xb1, 0x90, 0xf3, 0xd2, 0x35, 0x14, 0x77, 0x56,
+ 0xa4, 0x85, 0xe6, 0xc7, 0x20, 0x01, 0x62, 0x43,
+ 0x9b, 0xba, 0xd9, 0xf8, 0x1f, 0x3e, 0x5d, 0x7c,
+ 0x8e, 0xaf, 0xcc, 0xed, 0x0a, 0x2b, 0x48, 0x69,
+ },
+ {
+ 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee,
+ 0x0d, 0x2f, 0x49, 0x6b, 0x85, 0xa7, 0xc1, 0xe3,
+ 0x1a, 0x38, 0x5e, 0x7c, 0x92, 0xb0, 0xd6, 0xf4,
+ 0x17, 0x35, 0x53, 0x71, 0x9f, 0xbd, 0xdb, 0xf9,
+ 0x34, 0x16, 0x70, 0x52, 0xbc, 0x9e, 0xf8, 0xda,
+ 0x39, 0x1b, 0x7d, 0x5f, 0xb1, 0x93, 0xf5, 0xd7,
+ 0x2e, 0x0c, 0x6a, 0x48, 0xa6, 0x84, 0xe2, 0xc0,
+ 0x23, 0x01, 0x67, 0x45, 0xab, 0x89, 0xef, 0xcd,
+ 0x68, 0x4a, 0x2c, 0x0e, 0xe0, 0xc2, 0xa4, 0x86,
+ 0x65, 0x47, 0x21, 0x03, 0xed, 0xcf, 0xa9, 0x8b,
+ 0x72, 0x50, 0x36, 0x14, 0xfa, 0xd8, 0xbe, 0x9c,
+ 0x7f, 0x5d, 0x3b, 0x19, 0xf7, 0xd5, 0xb3, 0x91,
+ 0x5c, 0x7e, 0x18, 0x3a, 0xd4, 0xf6, 0x90, 0xb2,
+ 0x51, 0x73, 0x15, 0x37, 0xd9, 0xfb, 0x9d, 0xbf,
+ 0x46, 0x64, 0x02, 0x20, 0xce, 0xec, 0x8a, 0xa8,
+ 0x4b, 0x69, 0x0f, 0x2d, 0xc3, 0xe1, 0x87, 0xa5,
+ 0xd0, 0xf2, 0x94, 0xb6, 0x58, 0x7a, 0x1c, 0x3e,
+ 0xdd, 0xff, 0x99, 0xbb, 0x55, 0x77, 0x11, 0x33,
+ 0xca, 0xe8, 0x8e, 0xac, 0x42, 0x60, 0x06, 0x24,
+ 0xc7, 0xe5, 0x83, 0xa1, 0x4f, 0x6d, 0x0b, 0x29,
+ 0xe4, 0xc6, 0xa0, 0x82, 0x6c, 0x4e, 0x28, 0x0a,
+ 0xe9, 0xcb, 0xad, 0x8f, 0x61, 0x43, 0x25, 0x07,
+ 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10,
+ 0xf3, 0xd1, 0xb7, 0x95, 0x7b, 0x59, 0x3f, 0x1d,
+ 0xb8, 0x9a, 0xfc, 0xde, 0x30, 0x12, 0x74, 0x56,
+ 0xb5, 0x97, 0xf1, 0xd3, 0x3d, 0x1f, 0x79, 0x5b,
+ 0xa2, 0x80, 0xe6, 0xc4, 0x2a, 0x08, 0x6e, 0x4c,
+ 0xaf, 0x8d, 0xeb, 0xc9, 0x27, 0x05, 0x63, 0x41,
+ 0x8c, 0xae, 0xc8, 0xea, 0x04, 0x26, 0x40, 0x62,
+ 0x81, 0xa3, 0xc5, 0xe7, 0x09, 0x2b, 0x4d, 0x6f,
+ 0x96, 0xb4, 0xd2, 0xf0, 0x1e, 0x3c, 0x5a, 0x78,
+ 0x9b, 0xb9, 0xdf, 0xfd, 0x13, 0x31, 0x57, 0x75,
+ },
+ {
+ 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9,
+ 0x05, 0x26, 0x43, 0x60, 0x89, 0xaa, 0xcf, 0xec,
+ 0x0a, 0x29, 0x4c, 0x6f, 0x86, 0xa5, 0xc0, 0xe3,
+ 0x0f, 0x2c, 0x49, 0x6a, 0x83, 0xa0, 0xc5, 0xe6,
+ 0x14, 0x37, 0x52, 0x71, 0x98, 0xbb, 0xde, 0xfd,
+ 0x11, 0x32, 0x57, 0x74, 0x9d, 0xbe, 0xdb, 0xf8,
+ 0x1e, 0x3d, 0x58, 0x7b, 0x92, 0xb1, 0xd4, 0xf7,
+ 0x1b, 0x38, 0x5d, 0x7e, 0x97, 0xb4, 0xd1, 0xf2,
+ 0x28, 0x0b, 0x6e, 0x4d, 0xa4, 0x87, 0xe2, 0xc1,
+ 0x2d, 0x0e, 0x6b, 0x48, 0xa1, 0x82, 0xe7, 0xc4,
+ 0x22, 0x01, 0x64, 0x47, 0xae, 0x8d, 0xe8, 0xcb,
+ 0x27, 0x04, 0x61, 0x42, 0xab, 0x88, 0xed, 0xce,
+ 0x3c, 0x1f, 0x7a, 0x59, 0xb0, 0x93, 0xf6, 0xd5,
+ 0x39, 0x1a, 0x7f, 0x5c, 0xb5, 0x96, 0xf3, 0xd0,
+ 0x36, 0x15, 0x70, 0x53, 0xba, 0x99, 0xfc, 0xdf,
+ 0x33, 0x10, 0x75, 0x56, 0xbf, 0x9c, 0xf9, 0xda,
+ 0x50, 0x73, 0x16, 0x35, 0xdc, 0xff, 0x9a, 0xb9,
+ 0x55, 0x76, 0x13, 0x30, 0xd9, 0xfa, 0x9f, 0xbc,
+ 0x5a, 0x79, 0x1c, 0x3f, 0xd6, 0xf5, 0x90, 0xb3,
+ 0x5f, 0x7c, 0x19, 0x3a, 0xd3, 0xf0, 0x95, 0xb6,
+ 0x44, 0x67, 0x02, 0x21, 0xc8, 0xeb, 0x8e, 0xad,
+ 0x41, 0x62, 0x07, 0x24, 0xcd, 0xee, 0x8b, 0xa8,
+ 0x4e, 0x6d, 0x08, 0x2b, 0xc2, 0xe1, 0x84, 0xa7,
+ 0x4b, 0x68, 0x0d, 0x2e, 0xc7, 0xe4, 0x81, 0xa2,
+ 0x78, 0x5b, 0x3e, 0x1d, 0xf4, 0xd7, 0xb2, 0x91,
+ 0x7d, 0x5e, 0x3b, 0x18, 0xf1, 0xd2, 0xb7, 0x94,
+ 0x72, 0x51, 0x34, 0x17, 0xfe, 0xdd, 0xb8, 0x9b,
+ 0x77, 0x54, 0x31, 0x12, 0xfb, 0xd8, 0xbd, 0x9e,
+ 0x6c, 0x4f, 0x2a, 0x09, 0xe0, 0xc3, 0xa6, 0x85,
+ 0x69, 0x4a, 0x2f, 0x0c, 0xe5, 0xc6, 0xa3, 0x80,
+ 0x66, 0x45, 0x20, 0x03, 0xea, 0xc9, 0xac, 0x8f,
+ 0x63, 0x40, 0x25, 0x06, 0xef, 0xcc, 0xa9, 0x8a,
+ },
+ {
+ 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc,
+ 0x3d, 0x19, 0x75, 0x51, 0xad, 0x89, 0xe5, 0xc1,
+ 0x7a, 0x5e, 0x32, 0x16, 0xea, 0xce, 0xa2, 0x86,
+ 0x47, 0x63, 0x0f, 0x2b, 0xd7, 0xf3, 0x9f, 0xbb,
+ 0xf4, 0xd0, 0xbc, 0x98, 0x64, 0x40, 0x2c, 0x08,
+ 0xc9, 0xed, 0x81, 0xa5, 0x59, 0x7d, 0x11, 0x35,
+ 0x8e, 0xaa, 0xc6, 0xe2, 0x1e, 0x3a, 0x56, 0x72,
+ 0xb3, 0x97, 0xfb, 0xdf, 0x23, 0x07, 0x6b, 0x4f,
+ 0xf5, 0xd1, 0xbd, 0x99, 0x65, 0x41, 0x2d, 0x09,
+ 0xc8, 0xec, 0x80, 0xa4, 0x58, 0x7c, 0x10, 0x34,
+ 0x8f, 0xab, 0xc7, 0xe3, 0x1f, 0x3b, 0x57, 0x73,
+ 0xb2, 0x96, 0xfa, 0xde, 0x22, 0x06, 0x6a, 0x4e,
+ 0x01, 0x25, 0x49, 0x6d, 0x91, 0xb5, 0xd9, 0xfd,
+ 0x3c, 0x18, 0x74, 0x50, 0xac, 0x88, 0xe4, 0xc0,
+ 0x7b, 0x5f, 0x33, 0x17, 0xeb, 0xcf, 0xa3, 0x87,
+ 0x46, 0x62, 0x0e, 0x2a, 0xd6, 0xf2, 0x9e, 0xba,
+ 0xf7, 0xd3, 0xbf, 0x9b, 0x67, 0x43, 0x2f, 0x0b,
+ 0xca, 0xee, 0x82, 0xa6, 0x5a, 0x7e, 0x12, 0x36,
+ 0x8d, 0xa9, 0xc5, 0xe1, 0x1d, 0x39, 0x55, 0x71,
+ 0xb0, 0x94, 0xf8, 0xdc, 0x20, 0x04, 0x68, 0x4c,
+ 0x03, 0x27, 0x4b, 0x6f, 0x93, 0xb7, 0xdb, 0xff,
+ 0x3e, 0x1a, 0x76, 0x52, 0xae, 0x8a, 0xe6, 0xc2,
+ 0x79, 0x5d, 0x31, 0x15, 0xe9, 0xcd, 0xa1, 0x85,
+ 0x44, 0x60, 0x0c, 0x28, 0xd4, 0xf0, 0x9c, 0xb8,
+ 0x02, 0x26, 0x4a, 0x6e, 0x92, 0xb6, 0xda, 0xfe,
+ 0x3f, 0x1b, 0x77, 0x53, 0xaf, 0x8b, 0xe7, 0xc3,
+ 0x78, 0x5c, 0x30, 0x14, 0xe8, 0xcc, 0xa0, 0x84,
+ 0x45, 0x61, 0x0d, 0x29, 0xd5, 0xf1, 0x9d, 0xb9,
+ 0xf6, 0xd2, 0xbe, 0x9a, 0x66, 0x42, 0x2e, 0x0a,
+ 0xcb, 0xef, 0x83, 0xa7, 0x5b, 0x7f, 0x13, 0x37,
+ 0x8c, 0xa8, 0xc4, 0xe0, 0x1c, 0x38, 0x54, 0x70,
+ 0xb1, 0x95, 0xf9, 0xdd, 0x21, 0x05, 0x69, 0x4d,
+ },
+ {
+ 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb,
+ 0x35, 0x10, 0x7f, 0x5a, 0xa1, 0x84, 0xeb, 0xce,
+ 0x6a, 0x4f, 0x20, 0x05, 0xfe, 0xdb, 0xb4, 0x91,
+ 0x5f, 0x7a, 0x15, 0x30, 0xcb, 0xee, 0x81, 0xa4,
+ 0xd4, 0xf1, 0x9e, 0xbb, 0x40, 0x65, 0x0a, 0x2f,
+ 0xe1, 0xc4, 0xab, 0x8e, 0x75, 0x50, 0x3f, 0x1a,
+ 0xbe, 0x9b, 0xf4, 0xd1, 0x2a, 0x0f, 0x60, 0x45,
+ 0x8b, 0xae, 0xc1, 0xe4, 0x1f, 0x3a, 0x55, 0x70,
+ 0xb5, 0x90, 0xff, 0xda, 0x21, 0x04, 0x6b, 0x4e,
+ 0x80, 0xa5, 0xca, 0xef, 0x14, 0x31, 0x5e, 0x7b,
+ 0xdf, 0xfa, 0x95, 0xb0, 0x4b, 0x6e, 0x01, 0x24,
+ 0xea, 0xcf, 0xa0, 0x85, 0x7e, 0x5b, 0x34, 0x11,
+ 0x61, 0x44, 0x2b, 0x0e, 0xf5, 0xd0, 0xbf, 0x9a,
+ 0x54, 0x71, 0x1e, 0x3b, 0xc0, 0xe5, 0x8a, 0xaf,
+ 0x0b, 0x2e, 0x41, 0x64, 0x9f, 0xba, 0xd5, 0xf0,
+ 0x3e, 0x1b, 0x74, 0x51, 0xaa, 0x8f, 0xe0, 0xc5,
+ 0x77, 0x52, 0x3d, 0x18, 0xe3, 0xc6, 0xa9, 0x8c,
+ 0x42, 0x67, 0x08, 0x2d, 0xd6, 0xf3, 0x9c, 0xb9,
+ 0x1d, 0x38, 0x57, 0x72, 0x89, 0xac, 0xc3, 0xe6,
+ 0x28, 0x0d, 0x62, 0x47, 0xbc, 0x99, 0xf6, 0xd3,
+ 0xa3, 0x86, 0xe9, 0xcc, 0x37, 0x12, 0x7d, 0x58,
+ 0x96, 0xb3, 0xdc, 0xf9, 0x02, 0x27, 0x48, 0x6d,
+ 0xc9, 0xec, 0x83, 0xa6, 0x5d, 0x78, 0x17, 0x32,
+ 0xfc, 0xd9, 0xb6, 0x93, 0x68, 0x4d, 0x22, 0x07,
+ 0xc2, 0xe7, 0x88, 0xad, 0x56, 0x73, 0x1c, 0x39,
+ 0xf7, 0xd2, 0xbd, 0x98, 0x63, 0x46, 0x29, 0x0c,
+ 0xa8, 0x8d, 0xe2, 0xc7, 0x3c, 0x19, 0x76, 0x53,
+ 0x9d, 0xb8, 0xd7, 0xf2, 0x09, 0x2c, 0x43, 0x66,
+ 0x16, 0x33, 0x5c, 0x79, 0x82, 0xa7, 0xc8, 0xed,
+ 0x23, 0x06, 0x69, 0x4c, 0xb7, 0x92, 0xfd, 0xd8,
+ 0x7c, 0x59, 0x36, 0x13, 0xe8, 0xcd, 0xa2, 0x87,
+ 0x49, 0x6c, 0x03, 0x26, 0xdd, 0xf8, 0x97, 0xb2,
+ },
+ {
+ 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2,
+ 0x2d, 0x0b, 0x61, 0x47, 0xb5, 0x93, 0xf9, 0xdf,
+ 0x5a, 0x7c, 0x16, 0x30, 0xc2, 0xe4, 0x8e, 0xa8,
+ 0x77, 0x51, 0x3b, 0x1d, 0xef, 0xc9, 0xa3, 0x85,
+ 0xb4, 0x92, 0xf8, 0xde, 0x2c, 0x0a, 0x60, 0x46,
+ 0x99, 0xbf, 0xd5, 0xf3, 0x01, 0x27, 0x4d, 0x6b,
+ 0xee, 0xc8, 0xa2, 0x84, 0x76, 0x50, 0x3a, 0x1c,
+ 0xc3, 0xe5, 0x8f, 0xa9, 0x5b, 0x7d, 0x17, 0x31,
+ 0x75, 0x53, 0x39, 0x1f, 0xed, 0xcb, 0xa1, 0x87,
+ 0x58, 0x7e, 0x14, 0x32, 0xc0, 0xe6, 0x8c, 0xaa,
+ 0x2f, 0x09, 0x63, 0x45, 0xb7, 0x91, 0xfb, 0xdd,
+ 0x02, 0x24, 0x4e, 0x68, 0x9a, 0xbc, 0xd6, 0xf0,
+ 0xc1, 0xe7, 0x8d, 0xab, 0x59, 0x7f, 0x15, 0x33,
+ 0xec, 0xca, 0xa0, 0x86, 0x74, 0x52, 0x38, 0x1e,
+ 0x9b, 0xbd, 0xd7, 0xf1, 0x03, 0x25, 0x4f, 0x69,
+ 0xb6, 0x90, 0xfa, 0xdc, 0x2e, 0x08, 0x62, 0x44,
+ 0xea, 0xcc, 0xa6, 0x80, 0x72, 0x54, 0x3e, 0x18,
+ 0xc7, 0xe1, 0x8b, 0xad, 0x5f, 0x79, 0x13, 0x35,
+ 0xb0, 0x96, 0xfc, 0xda, 0x28, 0x0e, 0x64, 0x42,
+ 0x9d, 0xbb, 0xd1, 0xf7, 0x05, 0x23, 0x49, 0x6f,
+ 0x5e, 0x78, 0x12, 0x34, 0xc6, 0xe0, 0x8a, 0xac,
+ 0x73, 0x55, 0x3f, 0x19, 0xeb, 0xcd, 0xa7, 0x81,
+ 0x04, 0x22, 0x48, 0x6e, 0x9c, 0xba, 0xd0, 0xf6,
+ 0x29, 0x0f, 0x65, 0x43, 0xb1, 0x97, 0xfd, 0xdb,
+ 0x9f, 0xb9, 0xd3, 0xf5, 0x07, 0x21, 0x4b, 0x6d,
+ 0xb2, 0x94, 0xfe, 0xd8, 0x2a, 0x0c, 0x66, 0x40,
+ 0xc5, 0xe3, 0x89, 0xaf, 0x5d, 0x7b, 0x11, 0x37,
+ 0xe8, 0xce, 0xa4, 0x82, 0x70, 0x56, 0x3c, 0x1a,
+ 0x2b, 0x0d, 0x67, 0x41, 0xb3, 0x95, 0xff, 0xd9,
+ 0x06, 0x20, 0x4a, 0x6c, 0x9e, 0xb8, 0xd2, 0xf4,
+ 0x71, 0x57, 0x3d, 0x1b, 0xe9, 0xcf, 0xa5, 0x83,
+ 0x5c, 0x7a, 0x10, 0x36, 0xc4, 0xe2, 0x88, 0xae,
+ },
+ {
+ 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0,
+ 0x4a, 0x6d, 0x04, 0x23, 0xd6, 0xf1, 0x98, 0xbf,
+ 0x6f, 0x48, 0x21, 0x06, 0xf3, 0xd4, 0xbd, 0x9a,
+ 0x94, 0xb3, 0xda, 0xfd, 0x08, 0x2f, 0x46, 0x61,
+ 0xb1, 0x96, 0xff, 0xd8, 0x2d, 0x0a, 0x63, 0x44,
+ 0xde, 0xf9, 0x90, 0xb7, 0x42, 0x65, 0x0c, 0x2b,
+ 0xfb, 0xdc, 0xb5, 0x92, 0x67, 0x40, 0x29, 0x0e,
+ 0x35, 0x12, 0x7b, 0x5c, 0xa9, 0x8e, 0xe7, 0xc0,
+ 0x10, 0x37, 0x5e, 0x79, 0x8c, 0xab, 0xc2, 0xe5,
+ 0x7f, 0x58, 0x31, 0x16, 0xe3, 0xc4, 0xad, 0x8a,
+ 0x5a, 0x7d, 0x14, 0x33, 0xc6, 0xe1, 0x88, 0xaf,
+ 0xa1, 0x86, 0xef, 0xc8, 0x3d, 0x1a, 0x73, 0x54,
+ 0x84, 0xa3, 0xca, 0xed, 0x18, 0x3f, 0x56, 0x71,
+ 0xeb, 0xcc, 0xa5, 0x82, 0x77, 0x50, 0x39, 0x1e,
+ 0xce, 0xe9, 0x80, 0xa7, 0x52, 0x75, 0x1c, 0x3b,
+ 0x6a, 0x4d, 0x24, 0x03, 0xf6, 0xd1, 0xb8, 0x9f,
+ 0x4f, 0x68, 0x01, 0x26, 0xd3, 0xf4, 0x9d, 0xba,
+ 0x20, 0x07, 0x6e, 0x49, 0xbc, 0x9b, 0xf2, 0xd5,
+ 0x05, 0x22, 0x4b, 0x6c, 0x99, 0xbe, 0xd7, 0xf0,
+ 0xfe, 0xd9, 0xb0, 0x97, 0x62, 0x45, 0x2c, 0x0b,
+ 0xdb, 0xfc, 0x95, 0xb2, 0x47, 0x60, 0x09, 0x2e,
+ 0xb4, 0x93, 0xfa, 0xdd, 0x28, 0x0f, 0x66, 0x41,
+ 0x91, 0xb6, 0xdf, 0xf8, 0x0d, 0x2a, 0x43, 0x64,
+ 0x5f, 0x78, 0x11, 0x36, 0xc3, 0xe4, 0x8d, 0xaa,
+ 0x7a, 0x5d, 0x34, 0x13, 0xe6, 0xc1, 0xa8, 0x8f,
+ 0x15, 0x32, 0x5b, 0x7c, 0x89, 0xae, 0xc7, 0xe0,
+ 0x30, 0x17, 0x7e, 0x59, 0xac, 0x8b, 0xe2, 0xc5,
+ 0xcb, 0xec, 0x85, 0xa2, 0x57, 0x70, 0x19, 0x3e,
+ 0xee, 0xc9, 0xa0, 0x87, 0x72, 0x55, 0x3c, 0x1b,
+ 0x81, 0xa6, 0xcf, 0xe8, 0x1d, 0x3a, 0x53, 0x74,
+ 0xa4, 0x83, 0xea, 0xcd, 0x38, 0x1f, 0x76, 0x51,
+ },
+ {
+ 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8,
+ 0x5d, 0x75, 0x0d, 0x25, 0xfd, 0xd5, 0xad, 0x85,
+ 0xba, 0x92, 0xea, 0xc2, 0x1a, 0x32, 0x4a, 0x62,
+ 0xe7, 0xcf, 0xb7, 0x9f, 0x47, 0x6f, 0x17, 0x3f,
+ 0x69, 0x41, 0x39, 0x11, 0xc9, 0xe1, 0x99, 0xb1,
+ 0x34, 0x1c, 0x64, 0x4c, 0x94, 0xbc, 0xc4, 0xec,
+ 0xd3, 0xfb, 0x83, 0xab, 0x73, 0x5b, 0x23, 0x0b,
+ 0x8e, 0xa6, 0xde, 0xf6, 0x2e, 0x06, 0x7e, 0x56,
+ 0xd2, 0xfa, 0x82, 0xaa, 0x72, 0x5a, 0x22, 0x0a,
+ 0x8f, 0xa7, 0xdf, 0xf7, 0x2f, 0x07, 0x7f, 0x57,
+ 0x68, 0x40, 0x38, 0x10, 0xc8, 0xe0, 0x98, 0xb0,
+ 0x35, 0x1d, 0x65, 0x4d, 0x95, 0xbd, 0xc5, 0xed,
+ 0xbb, 0x93, 0xeb, 0xc3, 0x1b, 0x33, 0x4b, 0x63,
+ 0xe6, 0xce, 0xb6, 0x9e, 0x46, 0x6e, 0x16, 0x3e,
+ 0x01, 0x29, 0x51, 0x79, 0xa1, 0x89, 0xf1, 0xd9,
+ 0x5c, 0x74, 0x0c, 0x24, 0xfc, 0xd4, 0xac, 0x84,
+ 0xb9, 0x91, 0xe9, 0xc1, 0x19, 0x31, 0x49, 0x61,
+ 0xe4, 0xcc, 0xb4, 0x9c, 0x44, 0x6c, 0x14, 0x3c,
+ 0x03, 0x2b, 0x53, 0x7b, 0xa3, 0x8b, 0xf3, 0xdb,
+ 0x5e, 0x76, 0x0e, 0x26, 0xfe, 0xd6, 0xae, 0x86,
+ 0xd0, 0xf8, 0x80, 0xa8, 0x70, 0x58, 0x20, 0x08,
+ 0x8d, 0xa5, 0xdd, 0xf5, 0x2d, 0x05, 0x7d, 0x55,
+ 0x6a, 0x42, 0x3a, 0x12, 0xca, 0xe2, 0x9a, 0xb2,
+ 0x37, 0x1f, 0x67, 0x4f, 0x97, 0xbf, 0xc7, 0xef,
+ 0x6b, 0x43, 0x3b, 0x13, 0xcb, 0xe3, 0x9b, 0xb3,
+ 0x36, 0x1e, 0x66, 0x4e, 0x96, 0xbe, 0xc6, 0xee,
+ 0xd1, 0xf9, 0x81, 0xa9, 0x71, 0x59, 0x21, 0x09,
+ 0x8c, 0xa4, 0xdc, 0xf4, 0x2c, 0x04, 0x7c, 0x54,
+ 0x02, 0x2a, 0x52, 0x7a, 0xa2, 0x8a, 0xf2, 0xda,
+ 0x5f, 0x77, 0x0f, 0x27, 0xff, 0xd7, 0xaf, 0x87,
+ 0xb8, 0x90, 0xe8, 0xc0, 0x18, 0x30, 0x48, 0x60,
+ 0xe5, 0xcd, 0xb5, 0x9d, 0x45, 0x6d, 0x15, 0x3d,
+ },
+ {
+ 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf,
+ 0x55, 0x7c, 0x07, 0x2e, 0xf1, 0xd8, 0xa3, 0x8a,
+ 0xaa, 0x83, 0xf8, 0xd1, 0x0e, 0x27, 0x5c, 0x75,
+ 0xff, 0xd6, 0xad, 0x84, 0x5b, 0x72, 0x09, 0x20,
+ 0x49, 0x60, 0x1b, 0x32, 0xed, 0xc4, 0xbf, 0x96,
+ 0x1c, 0x35, 0x4e, 0x67, 0xb8, 0x91, 0xea, 0xc3,
+ 0xe3, 0xca, 0xb1, 0x98, 0x47, 0x6e, 0x15, 0x3c,
+ 0xb6, 0x9f, 0xe4, 0xcd, 0x12, 0x3b, 0x40, 0x69,
+ 0x92, 0xbb, 0xc0, 0xe9, 0x36, 0x1f, 0x64, 0x4d,
+ 0xc7, 0xee, 0x95, 0xbc, 0x63, 0x4a, 0x31, 0x18,
+ 0x38, 0x11, 0x6a, 0x43, 0x9c, 0xb5, 0xce, 0xe7,
+ 0x6d, 0x44, 0x3f, 0x16, 0xc9, 0xe0, 0x9b, 0xb2,
+ 0xdb, 0xf2, 0x89, 0xa0, 0x7f, 0x56, 0x2d, 0x04,
+ 0x8e, 0xa7, 0xdc, 0xf5, 0x2a, 0x03, 0x78, 0x51,
+ 0x71, 0x58, 0x23, 0x0a, 0xd5, 0xfc, 0x87, 0xae,
+ 0x24, 0x0d, 0x76, 0x5f, 0x80, 0xa9, 0xd2, 0xfb,
+ 0x39, 0x10, 0x6b, 0x42, 0x9d, 0xb4, 0xcf, 0xe6,
+ 0x6c, 0x45, 0x3e, 0x17, 0xc8, 0xe1, 0x9a, 0xb3,
+ 0x93, 0xba, 0xc1, 0xe8, 0x37, 0x1e, 0x65, 0x4c,
+ 0xc6, 0xef, 0x94, 0xbd, 0x62, 0x4b, 0x30, 0x19,
+ 0x70, 0x59, 0x22, 0x0b, 0xd4, 0xfd, 0x86, 0xaf,
+ 0x25, 0x0c, 0x77, 0x5e, 0x81, 0xa8, 0xd3, 0xfa,
+ 0xda, 0xf3, 0x88, 0xa1, 0x7e, 0x57, 0x2c, 0x05,
+ 0x8f, 0xa6, 0xdd, 0xf4, 0x2b, 0x02, 0x79, 0x50,
+ 0xab, 0x82, 0xf9, 0xd0, 0x0f, 0x26, 0x5d, 0x74,
+ 0xfe, 0xd7, 0xac, 0x85, 0x5a, 0x73, 0x08, 0x21,
+ 0x01, 0x28, 0x53, 0x7a, 0xa5, 0x8c, 0xf7, 0xde,
+ 0x54, 0x7d, 0x06, 0x2f, 0xf0, 0xd9, 0xa2, 0x8b,
+ 0xe2, 0xcb, 0xb0, 0x99, 0x46, 0x6f, 0x14, 0x3d,
+ 0xb7, 0x9e, 0xe5, 0xcc, 0x13, 0x3a, 0x41, 0x68,
+ 0x48, 0x61, 0x1a, 0x33, 0xec, 0xc5, 0xbe, 0x97,
+ 0x1d, 0x34, 0x4f, 0x66, 0xb9, 0x90, 0xeb, 0xc2,
+ },
+ {
+ 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6,
+ 0x4d, 0x67, 0x19, 0x33, 0xe5, 0xcf, 0xb1, 0x9b,
+ 0x9a, 0xb0, 0xce, 0xe4, 0x32, 0x18, 0x66, 0x4c,
+ 0xd7, 0xfd, 0x83, 0xa9, 0x7f, 0x55, 0x2b, 0x01,
+ 0x29, 0x03, 0x7d, 0x57, 0x81, 0xab, 0xd5, 0xff,
+ 0x64, 0x4e, 0x30, 0x1a, 0xcc, 0xe6, 0x98, 0xb2,
+ 0xb3, 0x99, 0xe7, 0xcd, 0x1b, 0x31, 0x4f, 0x65,
+ 0xfe, 0xd4, 0xaa, 0x80, 0x56, 0x7c, 0x02, 0x28,
+ 0x52, 0x78, 0x06, 0x2c, 0xfa, 0xd0, 0xae, 0x84,
+ 0x1f, 0x35, 0x4b, 0x61, 0xb7, 0x9d, 0xe3, 0xc9,
+ 0xc8, 0xe2, 0x9c, 0xb6, 0x60, 0x4a, 0x34, 0x1e,
+ 0x85, 0xaf, 0xd1, 0xfb, 0x2d, 0x07, 0x79, 0x53,
+ 0x7b, 0x51, 0x2f, 0x05, 0xd3, 0xf9, 0x87, 0xad,
+ 0x36, 0x1c, 0x62, 0x48, 0x9e, 0xb4, 0xca, 0xe0,
+ 0xe1, 0xcb, 0xb5, 0x9f, 0x49, 0x63, 0x1d, 0x37,
+ 0xac, 0x86, 0xf8, 0xd2, 0x04, 0x2e, 0x50, 0x7a,
+ 0xa4, 0x8e, 0xf0, 0xda, 0x0c, 0x26, 0x58, 0x72,
+ 0xe9, 0xc3, 0xbd, 0x97, 0x41, 0x6b, 0x15, 0x3f,
+ 0x3e, 0x14, 0x6a, 0x40, 0x96, 0xbc, 0xc2, 0xe8,
+ 0x73, 0x59, 0x27, 0x0d, 0xdb, 0xf1, 0x8f, 0xa5,
+ 0x8d, 0xa7, 0xd9, 0xf3, 0x25, 0x0f, 0x71, 0x5b,
+ 0xc0, 0xea, 0x94, 0xbe, 0x68, 0x42, 0x3c, 0x16,
+ 0x17, 0x3d, 0x43, 0x69, 0xbf, 0x95, 0xeb, 0xc1,
+ 0x5a, 0x70, 0x0e, 0x24, 0xf2, 0xd8, 0xa6, 0x8c,
+ 0xf6, 0xdc, 0xa2, 0x88, 0x5e, 0x74, 0x0a, 0x20,
+ 0xbb, 0x91, 0xef, 0xc5, 0x13, 0x39, 0x47, 0x6d,
+ 0x6c, 0x46, 0x38, 0x12, 0xc4, 0xee, 0x90, 0xba,
+ 0x21, 0x0b, 0x75, 0x5f, 0x89, 0xa3, 0xdd, 0xf7,
+ 0xdf, 0xf5, 0x8b, 0xa1, 0x77, 0x5d, 0x23, 0x09,
+ 0x92, 0xb8, 0xc6, 0xec, 0x3a, 0x10, 0x6e, 0x44,
+ 0x45, 0x6f, 0x11, 0x3b, 0xed, 0xc7, 0xb9, 0x93,
+ 0x08, 0x22, 0x5c, 0x76, 0xa0, 0x8a, 0xf4, 0xde,
+ },
+ {
+ 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1,
+ 0x45, 0x6e, 0x13, 0x38, 0xe9, 0xc2, 0xbf, 0x94,
+ 0x8a, 0xa1, 0xdc, 0xf7, 0x26, 0x0d, 0x70, 0x5b,
+ 0xcf, 0xe4, 0x99, 0xb2, 0x63, 0x48, 0x35, 0x1e,
+ 0x09, 0x22, 0x5f, 0x74, 0xa5, 0x8e, 0xf3, 0xd8,
+ 0x4c, 0x67, 0x1a, 0x31, 0xe0, 0xcb, 0xb6, 0x9d,
+ 0x83, 0xa8, 0xd5, 0xfe, 0x2f, 0x04, 0x79, 0x52,
+ 0xc6, 0xed, 0x90, 0xbb, 0x6a, 0x41, 0x3c, 0x17,
+ 0x12, 0x39, 0x44, 0x6f, 0xbe, 0x95, 0xe8, 0xc3,
+ 0x57, 0x7c, 0x01, 0x2a, 0xfb, 0xd0, 0xad, 0x86,
+ 0x98, 0xb3, 0xce, 0xe5, 0x34, 0x1f, 0x62, 0x49,
+ 0xdd, 0xf6, 0x8b, 0xa0, 0x71, 0x5a, 0x27, 0x0c,
+ 0x1b, 0x30, 0x4d, 0x66, 0xb7, 0x9c, 0xe1, 0xca,
+ 0x5e, 0x75, 0x08, 0x23, 0xf2, 0xd9, 0xa4, 0x8f,
+ 0x91, 0xba, 0xc7, 0xec, 0x3d, 0x16, 0x6b, 0x40,
+ 0xd4, 0xff, 0x82, 0xa9, 0x78, 0x53, 0x2e, 0x05,
+ 0x24, 0x0f, 0x72, 0x59, 0x88, 0xa3, 0xde, 0xf5,
+ 0x61, 0x4a, 0x37, 0x1c, 0xcd, 0xe6, 0x9b, 0xb0,
+ 0xae, 0x85, 0xf8, 0xd3, 0x02, 0x29, 0x54, 0x7f,
+ 0xeb, 0xc0, 0xbd, 0x96, 0x47, 0x6c, 0x11, 0x3a,
+ 0x2d, 0x06, 0x7b, 0x50, 0x81, 0xaa, 0xd7, 0xfc,
+ 0x68, 0x43, 0x3e, 0x15, 0xc4, 0xef, 0x92, 0xb9,
+ 0xa7, 0x8c, 0xf1, 0xda, 0x0b, 0x20, 0x5d, 0x76,
+ 0xe2, 0xc9, 0xb4, 0x9f, 0x4e, 0x65, 0x18, 0x33,
+ 0x36, 0x1d, 0x60, 0x4b, 0x9a, 0xb1, 0xcc, 0xe7,
+ 0x73, 0x58, 0x25, 0x0e, 0xdf, 0xf4, 0x89, 0xa2,
+ 0xbc, 0x97, 0xea, 0xc1, 0x10, 0x3b, 0x46, 0x6d,
+ 0xf9, 0xd2, 0xaf, 0x84, 0x55, 0x7e, 0x03, 0x28,
+ 0x3f, 0x14, 0x69, 0x42, 0x93, 0xb8, 0xc5, 0xee,
+ 0x7a, 0x51, 0x2c, 0x07, 0xd6, 0xfd, 0x80, 0xab,
+ 0xb5, 0x9e, 0xe3, 0xc8, 0x19, 0x32, 0x4f, 0x64,
+ 0xf0, 0xdb, 0xa6, 0x8d, 0x5c, 0x77, 0x0a, 0x21,
+ },
+ {
+ 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4,
+ 0x7d, 0x51, 0x25, 0x09, 0xcd, 0xe1, 0x95, 0xb9,
+ 0xfa, 0xd6, 0xa2, 0x8e, 0x4a, 0x66, 0x12, 0x3e,
+ 0x87, 0xab, 0xdf, 0xf3, 0x37, 0x1b, 0x6f, 0x43,
+ 0xe9, 0xc5, 0xb1, 0x9d, 0x59, 0x75, 0x01, 0x2d,
+ 0x94, 0xb8, 0xcc, 0xe0, 0x24, 0x08, 0x7c, 0x50,
+ 0x13, 0x3f, 0x4b, 0x67, 0xa3, 0x8f, 0xfb, 0xd7,
+ 0x6e, 0x42, 0x36, 0x1a, 0xde, 0xf2, 0x86, 0xaa,
+ 0xcf, 0xe3, 0x97, 0xbb, 0x7f, 0x53, 0x27, 0x0b,
+ 0xb2, 0x9e, 0xea, 0xc6, 0x02, 0x2e, 0x5a, 0x76,
+ 0x35, 0x19, 0x6d, 0x41, 0x85, 0xa9, 0xdd, 0xf1,
+ 0x48, 0x64, 0x10, 0x3c, 0xf8, 0xd4, 0xa0, 0x8c,
+ 0x26, 0x0a, 0x7e, 0x52, 0x96, 0xba, 0xce, 0xe2,
+ 0x5b, 0x77, 0x03, 0x2f, 0xeb, 0xc7, 0xb3, 0x9f,
+ 0xdc, 0xf0, 0x84, 0xa8, 0x6c, 0x40, 0x34, 0x18,
+ 0xa1, 0x8d, 0xf9, 0xd5, 0x11, 0x3d, 0x49, 0x65,
+ 0x83, 0xaf, 0xdb, 0xf7, 0x33, 0x1f, 0x6b, 0x47,
+ 0xfe, 0xd2, 0xa6, 0x8a, 0x4e, 0x62, 0x16, 0x3a,
+ 0x79, 0x55, 0x21, 0x0d, 0xc9, 0xe5, 0x91, 0xbd,
+ 0x04, 0x28, 0x5c, 0x70, 0xb4, 0x98, 0xec, 0xc0,
+ 0x6a, 0x46, 0x32, 0x1e, 0xda, 0xf6, 0x82, 0xae,
+ 0x17, 0x3b, 0x4f, 0x63, 0xa7, 0x8b, 0xff, 0xd3,
+ 0x90, 0xbc, 0xc8, 0xe4, 0x20, 0x0c, 0x78, 0x54,
+ 0xed, 0xc1, 0xb5, 0x99, 0x5d, 0x71, 0x05, 0x29,
+ 0x4c, 0x60, 0x14, 0x38, 0xfc, 0xd0, 0xa4, 0x88,
+ 0x31, 0x1d, 0x69, 0x45, 0x81, 0xad, 0xd9, 0xf5,
+ 0xb6, 0x9a, 0xee, 0xc2, 0x06, 0x2a, 0x5e, 0x72,
+ 0xcb, 0xe7, 0x93, 0xbf, 0x7b, 0x57, 0x23, 0x0f,
+ 0xa5, 0x89, 0xfd, 0xd1, 0x15, 0x39, 0x4d, 0x61,
+ 0xd8, 0xf4, 0x80, 0xac, 0x68, 0x44, 0x30, 0x1c,
+ 0x5f, 0x73, 0x07, 0x2b, 0xef, 0xc3, 0xb7, 0x9b,
+ 0x22, 0x0e, 0x7a, 0x56, 0x92, 0xbe, 0xca, 0xe6,
+ },
+ {
+ 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3,
+ 0x75, 0x58, 0x2f, 0x02, 0xc1, 0xec, 0x9b, 0xb6,
+ 0xea, 0xc7, 0xb0, 0x9d, 0x5e, 0x73, 0x04, 0x29,
+ 0x9f, 0xb2, 0xc5, 0xe8, 0x2b, 0x06, 0x71, 0x5c,
+ 0xc9, 0xe4, 0x93, 0xbe, 0x7d, 0x50, 0x27, 0x0a,
+ 0xbc, 0x91, 0xe6, 0xcb, 0x08, 0x25, 0x52, 0x7f,
+ 0x23, 0x0e, 0x79, 0x54, 0x97, 0xba, 0xcd, 0xe0,
+ 0x56, 0x7b, 0x0c, 0x21, 0xe2, 0xcf, 0xb8, 0x95,
+ 0x8f, 0xa2, 0xd5, 0xf8, 0x3b, 0x16, 0x61, 0x4c,
+ 0xfa, 0xd7, 0xa0, 0x8d, 0x4e, 0x63, 0x14, 0x39,
+ 0x65, 0x48, 0x3f, 0x12, 0xd1, 0xfc, 0x8b, 0xa6,
+ 0x10, 0x3d, 0x4a, 0x67, 0xa4, 0x89, 0xfe, 0xd3,
+ 0x46, 0x6b, 0x1c, 0x31, 0xf2, 0xdf, 0xa8, 0x85,
+ 0x33, 0x1e, 0x69, 0x44, 0x87, 0xaa, 0xdd, 0xf0,
+ 0xac, 0x81, 0xf6, 0xdb, 0x18, 0x35, 0x42, 0x6f,
+ 0xd9, 0xf4, 0x83, 0xae, 0x6d, 0x40, 0x37, 0x1a,
+ 0x03, 0x2e, 0x59, 0x74, 0xb7, 0x9a, 0xed, 0xc0,
+ 0x76, 0x5b, 0x2c, 0x01, 0xc2, 0xef, 0x98, 0xb5,
+ 0xe9, 0xc4, 0xb3, 0x9e, 0x5d, 0x70, 0x07, 0x2a,
+ 0x9c, 0xb1, 0xc6, 0xeb, 0x28, 0x05, 0x72, 0x5f,
+ 0xca, 0xe7, 0x90, 0xbd, 0x7e, 0x53, 0x24, 0x09,
+ 0xbf, 0x92, 0xe5, 0xc8, 0x0b, 0x26, 0x51, 0x7c,
+ 0x20, 0x0d, 0x7a, 0x57, 0x94, 0xb9, 0xce, 0xe3,
+ 0x55, 0x78, 0x0f, 0x22, 0xe1, 0xcc, 0xbb, 0x96,
+ 0x8c, 0xa1, 0xd6, 0xfb, 0x38, 0x15, 0x62, 0x4f,
+ 0xf9, 0xd4, 0xa3, 0x8e, 0x4d, 0x60, 0x17, 0x3a,
+ 0x66, 0x4b, 0x3c, 0x11, 0xd2, 0xff, 0x88, 0xa5,
+ 0x13, 0x3e, 0x49, 0x64, 0xa7, 0x8a, 0xfd, 0xd0,
+ 0x45, 0x68, 0x1f, 0x32, 0xf1, 0xdc, 0xab, 0x86,
+ 0x30, 0x1d, 0x6a, 0x47, 0x84, 0xa9, 0xde, 0xf3,
+ 0xaf, 0x82, 0xf5, 0xd8, 0x1b, 0x36, 0x41, 0x6c,
+ 0xda, 0xf7, 0x80, 0xad, 0x6e, 0x43, 0x34, 0x19,
+ },
+ {
+ 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca,
+ 0x6d, 0x43, 0x31, 0x1f, 0xd5, 0xfb, 0x89, 0xa7,
+ 0xda, 0xf4, 0x86, 0xa8, 0x62, 0x4c, 0x3e, 0x10,
+ 0xb7, 0x99, 0xeb, 0xc5, 0x0f, 0x21, 0x53, 0x7d,
+ 0xa9, 0x87, 0xf5, 0xdb, 0x11, 0x3f, 0x4d, 0x63,
+ 0xc4, 0xea, 0x98, 0xb6, 0x7c, 0x52, 0x20, 0x0e,
+ 0x73, 0x5d, 0x2f, 0x01, 0xcb, 0xe5, 0x97, 0xb9,
+ 0x1e, 0x30, 0x42, 0x6c, 0xa6, 0x88, 0xfa, 0xd4,
+ 0x4f, 0x61, 0x13, 0x3d, 0xf7, 0xd9, 0xab, 0x85,
+ 0x22, 0x0c, 0x7e, 0x50, 0x9a, 0xb4, 0xc6, 0xe8,
+ 0x95, 0xbb, 0xc9, 0xe7, 0x2d, 0x03, 0x71, 0x5f,
+ 0xf8, 0xd6, 0xa4, 0x8a, 0x40, 0x6e, 0x1c, 0x32,
+ 0xe6, 0xc8, 0xba, 0x94, 0x5e, 0x70, 0x02, 0x2c,
+ 0x8b, 0xa5, 0xd7, 0xf9, 0x33, 0x1d, 0x6f, 0x41,
+ 0x3c, 0x12, 0x60, 0x4e, 0x84, 0xaa, 0xd8, 0xf6,
+ 0x51, 0x7f, 0x0d, 0x23, 0xe9, 0xc7, 0xb5, 0x9b,
+ 0x9e, 0xb0, 0xc2, 0xec, 0x26, 0x08, 0x7a, 0x54,
+ 0xf3, 0xdd, 0xaf, 0x81, 0x4b, 0x65, 0x17, 0x39,
+ 0x44, 0x6a, 0x18, 0x36, 0xfc, 0xd2, 0xa0, 0x8e,
+ 0x29, 0x07, 0x75, 0x5b, 0x91, 0xbf, 0xcd, 0xe3,
+ 0x37, 0x19, 0x6b, 0x45, 0x8f, 0xa1, 0xd3, 0xfd,
+ 0x5a, 0x74, 0x06, 0x28, 0xe2, 0xcc, 0xbe, 0x90,
+ 0xed, 0xc3, 0xb1, 0x9f, 0x55, 0x7b, 0x09, 0x27,
+ 0x80, 0xae, 0xdc, 0xf2, 0x38, 0x16, 0x64, 0x4a,
+ 0xd1, 0xff, 0x8d, 0xa3, 0x69, 0x47, 0x35, 0x1b,
+ 0xbc, 0x92, 0xe0, 0xce, 0x04, 0x2a, 0x58, 0x76,
+ 0x0b, 0x25, 0x57, 0x79, 0xb3, 0x9d, 0xef, 0xc1,
+ 0x66, 0x48, 0x3a, 0x14, 0xde, 0xf0, 0x82, 0xac,
+ 0x78, 0x56, 0x24, 0x0a, 0xc0, 0xee, 0x9c, 0xb2,
+ 0x15, 0x3b, 0x49, 0x67, 0xad, 0x83, 0xf1, 0xdf,
+ 0xa2, 0x8c, 0xfe, 0xd0, 0x1a, 0x34, 0x46, 0x68,
+ 0xcf, 0xe1, 0x93, 0xbd, 0x77, 0x59, 0x2b, 0x05,
+ },
+ {
+ 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd,
+ 0x65, 0x4a, 0x3b, 0x14, 0xd9, 0xf6, 0x87, 0xa8,
+ 0xca, 0xe5, 0x94, 0xbb, 0x76, 0x59, 0x28, 0x07,
+ 0xaf, 0x80, 0xf1, 0xde, 0x13, 0x3c, 0x4d, 0x62,
+ 0x89, 0xa6, 0xd7, 0xf8, 0x35, 0x1a, 0x6b, 0x44,
+ 0xec, 0xc3, 0xb2, 0x9d, 0x50, 0x7f, 0x0e, 0x21,
+ 0x43, 0x6c, 0x1d, 0x32, 0xff, 0xd0, 0xa1, 0x8e,
+ 0x26, 0x09, 0x78, 0x57, 0x9a, 0xb5, 0xc4, 0xeb,
+ 0x0f, 0x20, 0x51, 0x7e, 0xb3, 0x9c, 0xed, 0xc2,
+ 0x6a, 0x45, 0x34, 0x1b, 0xd6, 0xf9, 0x88, 0xa7,
+ 0xc5, 0xea, 0x9b, 0xb4, 0x79, 0x56, 0x27, 0x08,
+ 0xa0, 0x8f, 0xfe, 0xd1, 0x1c, 0x33, 0x42, 0x6d,
+ 0x86, 0xa9, 0xd8, 0xf7, 0x3a, 0x15, 0x64, 0x4b,
+ 0xe3, 0xcc, 0xbd, 0x92, 0x5f, 0x70, 0x01, 0x2e,
+ 0x4c, 0x63, 0x12, 0x3d, 0xf0, 0xdf, 0xae, 0x81,
+ 0x29, 0x06, 0x77, 0x58, 0x95, 0xba, 0xcb, 0xe4,
+ 0x1e, 0x31, 0x40, 0x6f, 0xa2, 0x8d, 0xfc, 0xd3,
+ 0x7b, 0x54, 0x25, 0x0a, 0xc7, 0xe8, 0x99, 0xb6,
+ 0xd4, 0xfb, 0x8a, 0xa5, 0x68, 0x47, 0x36, 0x19,
+ 0xb1, 0x9e, 0xef, 0xc0, 0x0d, 0x22, 0x53, 0x7c,
+ 0x97, 0xb8, 0xc9, 0xe6, 0x2b, 0x04, 0x75, 0x5a,
+ 0xf2, 0xdd, 0xac, 0x83, 0x4e, 0x61, 0x10, 0x3f,
+ 0x5d, 0x72, 0x03, 0x2c, 0xe1, 0xce, 0xbf, 0x90,
+ 0x38, 0x17, 0x66, 0x49, 0x84, 0xab, 0xda, 0xf5,
+ 0x11, 0x3e, 0x4f, 0x60, 0xad, 0x82, 0xf3, 0xdc,
+ 0x74, 0x5b, 0x2a, 0x05, 0xc8, 0xe7, 0x96, 0xb9,
+ 0xdb, 0xf4, 0x85, 0xaa, 0x67, 0x48, 0x39, 0x16,
+ 0xbe, 0x91, 0xe0, 0xcf, 0x02, 0x2d, 0x5c, 0x73,
+ 0x98, 0xb7, 0xc6, 0xe9, 0x24, 0x0b, 0x7a, 0x55,
+ 0xfd, 0xd2, 0xa3, 0x8c, 0x41, 0x6e, 0x1f, 0x30,
+ 0x52, 0x7d, 0x0c, 0x23, 0xee, 0xc1, 0xb0, 0x9f,
+ 0x37, 0x18, 0x69, 0x46, 0x8b, 0xa4, 0xd5, 0xfa,
+ },
+ {
+ 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x9d, 0xad, 0xfd, 0xcd, 0x5d, 0x6d, 0x3d, 0x0d,
+ 0x27, 0x17, 0x47, 0x77, 0xe7, 0xd7, 0x87, 0xb7,
+ 0xba, 0x8a, 0xda, 0xea, 0x7a, 0x4a, 0x1a, 0x2a,
+ 0x4e, 0x7e, 0x2e, 0x1e, 0x8e, 0xbe, 0xee, 0xde,
+ 0xd3, 0xe3, 0xb3, 0x83, 0x13, 0x23, 0x73, 0x43,
+ 0x69, 0x59, 0x09, 0x39, 0xa9, 0x99, 0xc9, 0xf9,
+ 0xf4, 0xc4, 0x94, 0xa4, 0x34, 0x04, 0x54, 0x64,
+ 0x9c, 0xac, 0xfc, 0xcc, 0x5c, 0x6c, 0x3c, 0x0c,
+ 0x01, 0x31, 0x61, 0x51, 0xc1, 0xf1, 0xa1, 0x91,
+ 0xbb, 0x8b, 0xdb, 0xeb, 0x7b, 0x4b, 0x1b, 0x2b,
+ 0x26, 0x16, 0x46, 0x76, 0xe6, 0xd6, 0x86, 0xb6,
+ 0xd2, 0xe2, 0xb2, 0x82, 0x12, 0x22, 0x72, 0x42,
+ 0x4f, 0x7f, 0x2f, 0x1f, 0x8f, 0xbf, 0xef, 0xdf,
+ 0xf5, 0xc5, 0x95, 0xa5, 0x35, 0x05, 0x55, 0x65,
+ 0x68, 0x58, 0x08, 0x38, 0xa8, 0x98, 0xc8, 0xf8,
+ 0x25, 0x15, 0x45, 0x75, 0xe5, 0xd5, 0x85, 0xb5,
+ 0xb8, 0x88, 0xd8, 0xe8, 0x78, 0x48, 0x18, 0x28,
+ 0x02, 0x32, 0x62, 0x52, 0xc2, 0xf2, 0xa2, 0x92,
+ 0x9f, 0xaf, 0xff, 0xcf, 0x5f, 0x6f, 0x3f, 0x0f,
+ 0x6b, 0x5b, 0x0b, 0x3b, 0xab, 0x9b, 0xcb, 0xfb,
+ 0xf6, 0xc6, 0x96, 0xa6, 0x36, 0x06, 0x56, 0x66,
+ 0x4c, 0x7c, 0x2c, 0x1c, 0x8c, 0xbc, 0xec, 0xdc,
+ 0xd1, 0xe1, 0xb1, 0x81, 0x11, 0x21, 0x71, 0x41,
+ 0xb9, 0x89, 0xd9, 0xe9, 0x79, 0x49, 0x19, 0x29,
+ 0x24, 0x14, 0x44, 0x74, 0xe4, 0xd4, 0x84, 0xb4,
+ 0x9e, 0xae, 0xfe, 0xce, 0x5e, 0x6e, 0x3e, 0x0e,
+ 0x03, 0x33, 0x63, 0x53, 0xc3, 0xf3, 0xa3, 0x93,
+ 0xf7, 0xc7, 0x97, 0xa7, 0x37, 0x07, 0x57, 0x67,
+ 0x6a, 0x5a, 0x0a, 0x3a, 0xaa, 0x9a, 0xca, 0xfa,
+ 0xd0, 0xe0, 0xb0, 0x80, 0x10, 0x20, 0x70, 0x40,
+ 0x4d, 0x7d, 0x2d, 0x1d, 0x8d, 0xbd, 0xed, 0xdd,
+ },
+ {
+ 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97,
+ 0x95, 0xa4, 0xf7, 0xc6, 0x51, 0x60, 0x33, 0x02,
+ 0x37, 0x06, 0x55, 0x64, 0xf3, 0xc2, 0x91, 0xa0,
+ 0xa2, 0x93, 0xc0, 0xf1, 0x66, 0x57, 0x04, 0x35,
+ 0x6e, 0x5f, 0x0c, 0x3d, 0xaa, 0x9b, 0xc8, 0xf9,
+ 0xfb, 0xca, 0x99, 0xa8, 0x3f, 0x0e, 0x5d, 0x6c,
+ 0x59, 0x68, 0x3b, 0x0a, 0x9d, 0xac, 0xff, 0xce,
+ 0xcc, 0xfd, 0xae, 0x9f, 0x08, 0x39, 0x6a, 0x5b,
+ 0xdc, 0xed, 0xbe, 0x8f, 0x18, 0x29, 0x7a, 0x4b,
+ 0x49, 0x78, 0x2b, 0x1a, 0x8d, 0xbc, 0xef, 0xde,
+ 0xeb, 0xda, 0x89, 0xb8, 0x2f, 0x1e, 0x4d, 0x7c,
+ 0x7e, 0x4f, 0x1c, 0x2d, 0xba, 0x8b, 0xd8, 0xe9,
+ 0xb2, 0x83, 0xd0, 0xe1, 0x76, 0x47, 0x14, 0x25,
+ 0x27, 0x16, 0x45, 0x74, 0xe3, 0xd2, 0x81, 0xb0,
+ 0x85, 0xb4, 0xe7, 0xd6, 0x41, 0x70, 0x23, 0x12,
+ 0x10, 0x21, 0x72, 0x43, 0xd4, 0xe5, 0xb6, 0x87,
+ 0xa5, 0x94, 0xc7, 0xf6, 0x61, 0x50, 0x03, 0x32,
+ 0x30, 0x01, 0x52, 0x63, 0xf4, 0xc5, 0x96, 0xa7,
+ 0x92, 0xa3, 0xf0, 0xc1, 0x56, 0x67, 0x34, 0x05,
+ 0x07, 0x36, 0x65, 0x54, 0xc3, 0xf2, 0xa1, 0x90,
+ 0xcb, 0xfa, 0xa9, 0x98, 0x0f, 0x3e, 0x6d, 0x5c,
+ 0x5e, 0x6f, 0x3c, 0x0d, 0x9a, 0xab, 0xf8, 0xc9,
+ 0xfc, 0xcd, 0x9e, 0xaf, 0x38, 0x09, 0x5a, 0x6b,
+ 0x69, 0x58, 0x0b, 0x3a, 0xad, 0x9c, 0xcf, 0xfe,
+ 0x79, 0x48, 0x1b, 0x2a, 0xbd, 0x8c, 0xdf, 0xee,
+ 0xec, 0xdd, 0x8e, 0xbf, 0x28, 0x19, 0x4a, 0x7b,
+ 0x4e, 0x7f, 0x2c, 0x1d, 0x8a, 0xbb, 0xe8, 0xd9,
+ 0xdb, 0xea, 0xb9, 0x88, 0x1f, 0x2e, 0x7d, 0x4c,
+ 0x17, 0x26, 0x75, 0x44, 0xd3, 0xe2, 0xb1, 0x80,
+ 0x82, 0xb3, 0xe0, 0xd1, 0x46, 0x77, 0x24, 0x15,
+ 0x20, 0x11, 0x42, 0x73, 0xe4, 0xd5, 0x86, 0xb7,
+ 0xb5, 0x84, 0xd7, 0xe6, 0x71, 0x40, 0x13, 0x22,
+ },
+ {
+ 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e,
+ 0x8d, 0xbf, 0xe9, 0xdb, 0x45, 0x77, 0x21, 0x13,
+ 0x07, 0x35, 0x63, 0x51, 0xcf, 0xfd, 0xab, 0x99,
+ 0x8a, 0xb8, 0xee, 0xdc, 0x42, 0x70, 0x26, 0x14,
+ 0x0e, 0x3c, 0x6a, 0x58, 0xc6, 0xf4, 0xa2, 0x90,
+ 0x83, 0xb1, 0xe7, 0xd5, 0x4b, 0x79, 0x2f, 0x1d,
+ 0x09, 0x3b, 0x6d, 0x5f, 0xc1, 0xf3, 0xa5, 0x97,
+ 0x84, 0xb6, 0xe0, 0xd2, 0x4c, 0x7e, 0x28, 0x1a,
+ 0x1c, 0x2e, 0x78, 0x4a, 0xd4, 0xe6, 0xb0, 0x82,
+ 0x91, 0xa3, 0xf5, 0xc7, 0x59, 0x6b, 0x3d, 0x0f,
+ 0x1b, 0x29, 0x7f, 0x4d, 0xd3, 0xe1, 0xb7, 0x85,
+ 0x96, 0xa4, 0xf2, 0xc0, 0x5e, 0x6c, 0x3a, 0x08,
+ 0x12, 0x20, 0x76, 0x44, 0xda, 0xe8, 0xbe, 0x8c,
+ 0x9f, 0xad, 0xfb, 0xc9, 0x57, 0x65, 0x33, 0x01,
+ 0x15, 0x27, 0x71, 0x43, 0xdd, 0xef, 0xb9, 0x8b,
+ 0x98, 0xaa, 0xfc, 0xce, 0x50, 0x62, 0x34, 0x06,
+ 0x38, 0x0a, 0x5c, 0x6e, 0xf0, 0xc2, 0x94, 0xa6,
+ 0xb5, 0x87, 0xd1, 0xe3, 0x7d, 0x4f, 0x19, 0x2b,
+ 0x3f, 0x0d, 0x5b, 0x69, 0xf7, 0xc5, 0x93, 0xa1,
+ 0xb2, 0x80, 0xd6, 0xe4, 0x7a, 0x48, 0x1e, 0x2c,
+ 0x36, 0x04, 0x52, 0x60, 0xfe, 0xcc, 0x9a, 0xa8,
+ 0xbb, 0x89, 0xdf, 0xed, 0x73, 0x41, 0x17, 0x25,
+ 0x31, 0x03, 0x55, 0x67, 0xf9, 0xcb, 0x9d, 0xaf,
+ 0xbc, 0x8e, 0xd8, 0xea, 0x74, 0x46, 0x10, 0x22,
+ 0x24, 0x16, 0x40, 0x72, 0xec, 0xde, 0x88, 0xba,
+ 0xa9, 0x9b, 0xcd, 0xff, 0x61, 0x53, 0x05, 0x37,
+ 0x23, 0x11, 0x47, 0x75, 0xeb, 0xd9, 0x8f, 0xbd,
+ 0xae, 0x9c, 0xca, 0xf8, 0x66, 0x54, 0x02, 0x30,
+ 0x2a, 0x18, 0x4e, 0x7c, 0xe2, 0xd0, 0x86, 0xb4,
+ 0xa7, 0x95, 0xc3, 0xf1, 0x6f, 0x5d, 0x0b, 0x39,
+ 0x2d, 0x1f, 0x49, 0x7b, 0xe5, 0xd7, 0x81, 0xb3,
+ 0xa0, 0x92, 0xc4, 0xf6, 0x68, 0x5a, 0x0c, 0x3e,
+ },
+ {
+ 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99,
+ 0x85, 0xb6, 0xe3, 0xd0, 0x49, 0x7a, 0x2f, 0x1c,
+ 0x17, 0x24, 0x71, 0x42, 0xdb, 0xe8, 0xbd, 0x8e,
+ 0x92, 0xa1, 0xf4, 0xc7, 0x5e, 0x6d, 0x38, 0x0b,
+ 0x2e, 0x1d, 0x48, 0x7b, 0xe2, 0xd1, 0x84, 0xb7,
+ 0xab, 0x98, 0xcd, 0xfe, 0x67, 0x54, 0x01, 0x32,
+ 0x39, 0x0a, 0x5f, 0x6c, 0xf5, 0xc6, 0x93, 0xa0,
+ 0xbc, 0x8f, 0xda, 0xe9, 0x70, 0x43, 0x16, 0x25,
+ 0x5c, 0x6f, 0x3a, 0x09, 0x90, 0xa3, 0xf6, 0xc5,
+ 0xd9, 0xea, 0xbf, 0x8c, 0x15, 0x26, 0x73, 0x40,
+ 0x4b, 0x78, 0x2d, 0x1e, 0x87, 0xb4, 0xe1, 0xd2,
+ 0xce, 0xfd, 0xa8, 0x9b, 0x02, 0x31, 0x64, 0x57,
+ 0x72, 0x41, 0x14, 0x27, 0xbe, 0x8d, 0xd8, 0xeb,
+ 0xf7, 0xc4, 0x91, 0xa2, 0x3b, 0x08, 0x5d, 0x6e,
+ 0x65, 0x56, 0x03, 0x30, 0xa9, 0x9a, 0xcf, 0xfc,
+ 0xe0, 0xd3, 0x86, 0xb5, 0x2c, 0x1f, 0x4a, 0x79,
+ 0xb8, 0x8b, 0xde, 0xed, 0x74, 0x47, 0x12, 0x21,
+ 0x3d, 0x0e, 0x5b, 0x68, 0xf1, 0xc2, 0x97, 0xa4,
+ 0xaf, 0x9c, 0xc9, 0xfa, 0x63, 0x50, 0x05, 0x36,
+ 0x2a, 0x19, 0x4c, 0x7f, 0xe6, 0xd5, 0x80, 0xb3,
+ 0x96, 0xa5, 0xf0, 0xc3, 0x5a, 0x69, 0x3c, 0x0f,
+ 0x13, 0x20, 0x75, 0x46, 0xdf, 0xec, 0xb9, 0x8a,
+ 0x81, 0xb2, 0xe7, 0xd4, 0x4d, 0x7e, 0x2b, 0x18,
+ 0x04, 0x37, 0x62, 0x51, 0xc8, 0xfb, 0xae, 0x9d,
+ 0xe4, 0xd7, 0x82, 0xb1, 0x28, 0x1b, 0x4e, 0x7d,
+ 0x61, 0x52, 0x07, 0x34, 0xad, 0x9e, 0xcb, 0xf8,
+ 0xf3, 0xc0, 0x95, 0xa6, 0x3f, 0x0c, 0x59, 0x6a,
+ 0x76, 0x45, 0x10, 0x23, 0xba, 0x89, 0xdc, 0xef,
+ 0xca, 0xf9, 0xac, 0x9f, 0x06, 0x35, 0x60, 0x53,
+ 0x4f, 0x7c, 0x29, 0x1a, 0x83, 0xb0, 0xe5, 0xd6,
+ 0xdd, 0xee, 0xbb, 0x88, 0x11, 0x22, 0x77, 0x44,
+ 0x58, 0x6b, 0x3e, 0x0d, 0x94, 0xa7, 0xf2, 0xc1,
+ },
+ {
+ 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c,
+ 0xbd, 0x89, 0xd5, 0xe1, 0x6d, 0x59, 0x05, 0x31,
+ 0x67, 0x53, 0x0f, 0x3b, 0xb7, 0x83, 0xdf, 0xeb,
+ 0xda, 0xee, 0xb2, 0x86, 0x0a, 0x3e, 0x62, 0x56,
+ 0xce, 0xfa, 0xa6, 0x92, 0x1e, 0x2a, 0x76, 0x42,
+ 0x73, 0x47, 0x1b, 0x2f, 0xa3, 0x97, 0xcb, 0xff,
+ 0xa9, 0x9d, 0xc1, 0xf5, 0x79, 0x4d, 0x11, 0x25,
+ 0x14, 0x20, 0x7c, 0x48, 0xc4, 0xf0, 0xac, 0x98,
+ 0x81, 0xb5, 0xe9, 0xdd, 0x51, 0x65, 0x39, 0x0d,
+ 0x3c, 0x08, 0x54, 0x60, 0xec, 0xd8, 0x84, 0xb0,
+ 0xe6, 0xd2, 0x8e, 0xba, 0x36, 0x02, 0x5e, 0x6a,
+ 0x5b, 0x6f, 0x33, 0x07, 0x8b, 0xbf, 0xe3, 0xd7,
+ 0x4f, 0x7b, 0x27, 0x13, 0x9f, 0xab, 0xf7, 0xc3,
+ 0xf2, 0xc6, 0x9a, 0xae, 0x22, 0x16, 0x4a, 0x7e,
+ 0x28, 0x1c, 0x40, 0x74, 0xf8, 0xcc, 0x90, 0xa4,
+ 0x95, 0xa1, 0xfd, 0xc9, 0x45, 0x71, 0x2d, 0x19,
+ 0x1f, 0x2b, 0x77, 0x43, 0xcf, 0xfb, 0xa7, 0x93,
+ 0xa2, 0x96, 0xca, 0xfe, 0x72, 0x46, 0x1a, 0x2e,
+ 0x78, 0x4c, 0x10, 0x24, 0xa8, 0x9c, 0xc0, 0xf4,
+ 0xc5, 0xf1, 0xad, 0x99, 0x15, 0x21, 0x7d, 0x49,
+ 0xd1, 0xe5, 0xb9, 0x8d, 0x01, 0x35, 0x69, 0x5d,
+ 0x6c, 0x58, 0x04, 0x30, 0xbc, 0x88, 0xd4, 0xe0,
+ 0xb6, 0x82, 0xde, 0xea, 0x66, 0x52, 0x0e, 0x3a,
+ 0x0b, 0x3f, 0x63, 0x57, 0xdb, 0xef, 0xb3, 0x87,
+ 0x9e, 0xaa, 0xf6, 0xc2, 0x4e, 0x7a, 0x26, 0x12,
+ 0x23, 0x17, 0x4b, 0x7f, 0xf3, 0xc7, 0x9b, 0xaf,
+ 0xf9, 0xcd, 0x91, 0xa5, 0x29, 0x1d, 0x41, 0x75,
+ 0x44, 0x70, 0x2c, 0x18, 0x94, 0xa0, 0xfc, 0xc8,
+ 0x50, 0x64, 0x38, 0x0c, 0x80, 0xb4, 0xe8, 0xdc,
+ 0xed, 0xd9, 0x85, 0xb1, 0x3d, 0x09, 0x55, 0x61,
+ 0x37, 0x03, 0x5f, 0x6b, 0xe7, 0xd3, 0x8f, 0xbb,
+ 0x8a, 0xbe, 0xe2, 0xd6, 0x5a, 0x6e, 0x32, 0x06,
+ },
+ {
+ 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b,
+ 0xb5, 0x80, 0xdf, 0xea, 0x61, 0x54, 0x0b, 0x3e,
+ 0x77, 0x42, 0x1d, 0x28, 0xa3, 0x96, 0xc9, 0xfc,
+ 0xc2, 0xf7, 0xa8, 0x9d, 0x16, 0x23, 0x7c, 0x49,
+ 0xee, 0xdb, 0x84, 0xb1, 0x3a, 0x0f, 0x50, 0x65,
+ 0x5b, 0x6e, 0x31, 0x04, 0x8f, 0xba, 0xe5, 0xd0,
+ 0x99, 0xac, 0xf3, 0xc6, 0x4d, 0x78, 0x27, 0x12,
+ 0x2c, 0x19, 0x46, 0x73, 0xf8, 0xcd, 0x92, 0xa7,
+ 0xc1, 0xf4, 0xab, 0x9e, 0x15, 0x20, 0x7f, 0x4a,
+ 0x74, 0x41, 0x1e, 0x2b, 0xa0, 0x95, 0xca, 0xff,
+ 0xb6, 0x83, 0xdc, 0xe9, 0x62, 0x57, 0x08, 0x3d,
+ 0x03, 0x36, 0x69, 0x5c, 0xd7, 0xe2, 0xbd, 0x88,
+ 0x2f, 0x1a, 0x45, 0x70, 0xfb, 0xce, 0x91, 0xa4,
+ 0x9a, 0xaf, 0xf0, 0xc5, 0x4e, 0x7b, 0x24, 0x11,
+ 0x58, 0x6d, 0x32, 0x07, 0x8c, 0xb9, 0xe6, 0xd3,
+ 0xed, 0xd8, 0x87, 0xb2, 0x39, 0x0c, 0x53, 0x66,
+ 0x9f, 0xaa, 0xf5, 0xc0, 0x4b, 0x7e, 0x21, 0x14,
+ 0x2a, 0x1f, 0x40, 0x75, 0xfe, 0xcb, 0x94, 0xa1,
+ 0xe8, 0xdd, 0x82, 0xb7, 0x3c, 0x09, 0x56, 0x63,
+ 0x5d, 0x68, 0x37, 0x02, 0x89, 0xbc, 0xe3, 0xd6,
+ 0x71, 0x44, 0x1b, 0x2e, 0xa5, 0x90, 0xcf, 0xfa,
+ 0xc4, 0xf1, 0xae, 0x9b, 0x10, 0x25, 0x7a, 0x4f,
+ 0x06, 0x33, 0x6c, 0x59, 0xd2, 0xe7, 0xb8, 0x8d,
+ 0xb3, 0x86, 0xd9, 0xec, 0x67, 0x52, 0x0d, 0x38,
+ 0x5e, 0x6b, 0x34, 0x01, 0x8a, 0xbf, 0xe0, 0xd5,
+ 0xeb, 0xde, 0x81, 0xb4, 0x3f, 0x0a, 0x55, 0x60,
+ 0x29, 0x1c, 0x43, 0x76, 0xfd, 0xc8, 0x97, 0xa2,
+ 0x9c, 0xa9, 0xf6, 0xc3, 0x48, 0x7d, 0x22, 0x17,
+ 0xb0, 0x85, 0xda, 0xef, 0x64, 0x51, 0x0e, 0x3b,
+ 0x05, 0x30, 0x6f, 0x5a, 0xd1, 0xe4, 0xbb, 0x8e,
+ 0xc7, 0xf2, 0xad, 0x98, 0x13, 0x26, 0x79, 0x4c,
+ 0x72, 0x47, 0x18, 0x2d, 0xa6, 0x93, 0xcc, 0xf9,
+ },
+ {
+ 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82,
+ 0xad, 0x9b, 0xc1, 0xf7, 0x75, 0x43, 0x19, 0x2f,
+ 0x47, 0x71, 0x2b, 0x1d, 0x9f, 0xa9, 0xf3, 0xc5,
+ 0xea, 0xdc, 0x86, 0xb0, 0x32, 0x04, 0x5e, 0x68,
+ 0x8e, 0xb8, 0xe2, 0xd4, 0x56, 0x60, 0x3a, 0x0c,
+ 0x23, 0x15, 0x4f, 0x79, 0xfb, 0xcd, 0x97, 0xa1,
+ 0xc9, 0xff, 0xa5, 0x93, 0x11, 0x27, 0x7d, 0x4b,
+ 0x64, 0x52, 0x08, 0x3e, 0xbc, 0x8a, 0xd0, 0xe6,
+ 0x01, 0x37, 0x6d, 0x5b, 0xd9, 0xef, 0xb5, 0x83,
+ 0xac, 0x9a, 0xc0, 0xf6, 0x74, 0x42, 0x18, 0x2e,
+ 0x46, 0x70, 0x2a, 0x1c, 0x9e, 0xa8, 0xf2, 0xc4,
+ 0xeb, 0xdd, 0x87, 0xb1, 0x33, 0x05, 0x5f, 0x69,
+ 0x8f, 0xb9, 0xe3, 0xd5, 0x57, 0x61, 0x3b, 0x0d,
+ 0x22, 0x14, 0x4e, 0x78, 0xfa, 0xcc, 0x96, 0xa0,
+ 0xc8, 0xfe, 0xa4, 0x92, 0x10, 0x26, 0x7c, 0x4a,
+ 0x65, 0x53, 0x09, 0x3f, 0xbd, 0x8b, 0xd1, 0xe7,
+ 0x02, 0x34, 0x6e, 0x58, 0xda, 0xec, 0xb6, 0x80,
+ 0xaf, 0x99, 0xc3, 0xf5, 0x77, 0x41, 0x1b, 0x2d,
+ 0x45, 0x73, 0x29, 0x1f, 0x9d, 0xab, 0xf1, 0xc7,
+ 0xe8, 0xde, 0x84, 0xb2, 0x30, 0x06, 0x5c, 0x6a,
+ 0x8c, 0xba, 0xe0, 0xd6, 0x54, 0x62, 0x38, 0x0e,
+ 0x21, 0x17, 0x4d, 0x7b, 0xf9, 0xcf, 0x95, 0xa3,
+ 0xcb, 0xfd, 0xa7, 0x91, 0x13, 0x25, 0x7f, 0x49,
+ 0x66, 0x50, 0x0a, 0x3c, 0xbe, 0x88, 0xd2, 0xe4,
+ 0x03, 0x35, 0x6f, 0x59, 0xdb, 0xed, 0xb7, 0x81,
+ 0xae, 0x98, 0xc2, 0xf4, 0x76, 0x40, 0x1a, 0x2c,
+ 0x44, 0x72, 0x28, 0x1e, 0x9c, 0xaa, 0xf0, 0xc6,
+ 0xe9, 0xdf, 0x85, 0xb3, 0x31, 0x07, 0x5d, 0x6b,
+ 0x8d, 0xbb, 0xe1, 0xd7, 0x55, 0x63, 0x39, 0x0f,
+ 0x20, 0x16, 0x4c, 0x7a, 0xf8, 0xce, 0x94, 0xa2,
+ 0xca, 0xfc, 0xa6, 0x90, 0x12, 0x24, 0x7e, 0x48,
+ 0x67, 0x51, 0x0b, 0x3d, 0xbf, 0x89, 0xd3, 0xe5,
+ },
+ {
+ 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85,
+ 0xa5, 0x92, 0xcb, 0xfc, 0x79, 0x4e, 0x17, 0x20,
+ 0x57, 0x60, 0x39, 0x0e, 0x8b, 0xbc, 0xe5, 0xd2,
+ 0xf2, 0xc5, 0x9c, 0xab, 0x2e, 0x19, 0x40, 0x77,
+ 0xae, 0x99, 0xc0, 0xf7, 0x72, 0x45, 0x1c, 0x2b,
+ 0x0b, 0x3c, 0x65, 0x52, 0xd7, 0xe0, 0xb9, 0x8e,
+ 0xf9, 0xce, 0x97, 0xa0, 0x25, 0x12, 0x4b, 0x7c,
+ 0x5c, 0x6b, 0x32, 0x05, 0x80, 0xb7, 0xee, 0xd9,
+ 0x41, 0x76, 0x2f, 0x18, 0x9d, 0xaa, 0xf3, 0xc4,
+ 0xe4, 0xd3, 0x8a, 0xbd, 0x38, 0x0f, 0x56, 0x61,
+ 0x16, 0x21, 0x78, 0x4f, 0xca, 0xfd, 0xa4, 0x93,
+ 0xb3, 0x84, 0xdd, 0xea, 0x6f, 0x58, 0x01, 0x36,
+ 0xef, 0xd8, 0x81, 0xb6, 0x33, 0x04, 0x5d, 0x6a,
+ 0x4a, 0x7d, 0x24, 0x13, 0x96, 0xa1, 0xf8, 0xcf,
+ 0xb8, 0x8f, 0xd6, 0xe1, 0x64, 0x53, 0x0a, 0x3d,
+ 0x1d, 0x2a, 0x73, 0x44, 0xc1, 0xf6, 0xaf, 0x98,
+ 0x82, 0xb5, 0xec, 0xdb, 0x5e, 0x69, 0x30, 0x07,
+ 0x27, 0x10, 0x49, 0x7e, 0xfb, 0xcc, 0x95, 0xa2,
+ 0xd5, 0xe2, 0xbb, 0x8c, 0x09, 0x3e, 0x67, 0x50,
+ 0x70, 0x47, 0x1e, 0x29, 0xac, 0x9b, 0xc2, 0xf5,
+ 0x2c, 0x1b, 0x42, 0x75, 0xf0, 0xc7, 0x9e, 0xa9,
+ 0x89, 0xbe, 0xe7, 0xd0, 0x55, 0x62, 0x3b, 0x0c,
+ 0x7b, 0x4c, 0x15, 0x22, 0xa7, 0x90, 0xc9, 0xfe,
+ 0xde, 0xe9, 0xb0, 0x87, 0x02, 0x35, 0x6c, 0x5b,
+ 0xc3, 0xf4, 0xad, 0x9a, 0x1f, 0x28, 0x71, 0x46,
+ 0x66, 0x51, 0x08, 0x3f, 0xba, 0x8d, 0xd4, 0xe3,
+ 0x94, 0xa3, 0xfa, 0xcd, 0x48, 0x7f, 0x26, 0x11,
+ 0x31, 0x06, 0x5f, 0x68, 0xed, 0xda, 0x83, 0xb4,
+ 0x6d, 0x5a, 0x03, 0x34, 0xb1, 0x86, 0xdf, 0xe8,
+ 0xc8, 0xff, 0xa6, 0x91, 0x14, 0x23, 0x7a, 0x4d,
+ 0x3a, 0x0d, 0x54, 0x63, 0xe6, 0xd1, 0x88, 0xbf,
+ 0x9f, 0xa8, 0xf1, 0xc6, 0x43, 0x74, 0x2d, 0x1a,
+ },
+ {
+ 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8,
+ 0xdd, 0xe5, 0xad, 0x95, 0x3d, 0x05, 0x4d, 0x75,
+ 0xa7, 0x9f, 0xd7, 0xef, 0x47, 0x7f, 0x37, 0x0f,
+ 0x7a, 0x42, 0x0a, 0x32, 0x9a, 0xa2, 0xea, 0xd2,
+ 0x53, 0x6b, 0x23, 0x1b, 0xb3, 0x8b, 0xc3, 0xfb,
+ 0x8e, 0xb6, 0xfe, 0xc6, 0x6e, 0x56, 0x1e, 0x26,
+ 0xf4, 0xcc, 0x84, 0xbc, 0x14, 0x2c, 0x64, 0x5c,
+ 0x29, 0x11, 0x59, 0x61, 0xc9, 0xf1, 0xb9, 0x81,
+ 0xa6, 0x9e, 0xd6, 0xee, 0x46, 0x7e, 0x36, 0x0e,
+ 0x7b, 0x43, 0x0b, 0x33, 0x9b, 0xa3, 0xeb, 0xd3,
+ 0x01, 0x39, 0x71, 0x49, 0xe1, 0xd9, 0x91, 0xa9,
+ 0xdc, 0xe4, 0xac, 0x94, 0x3c, 0x04, 0x4c, 0x74,
+ 0xf5, 0xcd, 0x85, 0xbd, 0x15, 0x2d, 0x65, 0x5d,
+ 0x28, 0x10, 0x58, 0x60, 0xc8, 0xf0, 0xb8, 0x80,
+ 0x52, 0x6a, 0x22, 0x1a, 0xb2, 0x8a, 0xc2, 0xfa,
+ 0x8f, 0xb7, 0xff, 0xc7, 0x6f, 0x57, 0x1f, 0x27,
+ 0x51, 0x69, 0x21, 0x19, 0xb1, 0x89, 0xc1, 0xf9,
+ 0x8c, 0xb4, 0xfc, 0xc4, 0x6c, 0x54, 0x1c, 0x24,
+ 0xf6, 0xce, 0x86, 0xbe, 0x16, 0x2e, 0x66, 0x5e,
+ 0x2b, 0x13, 0x5b, 0x63, 0xcb, 0xf3, 0xbb, 0x83,
+ 0x02, 0x3a, 0x72, 0x4a, 0xe2, 0xda, 0x92, 0xaa,
+ 0xdf, 0xe7, 0xaf, 0x97, 0x3f, 0x07, 0x4f, 0x77,
+ 0xa5, 0x9d, 0xd5, 0xed, 0x45, 0x7d, 0x35, 0x0d,
+ 0x78, 0x40, 0x08, 0x30, 0x98, 0xa0, 0xe8, 0xd0,
+ 0xf7, 0xcf, 0x87, 0xbf, 0x17, 0x2f, 0x67, 0x5f,
+ 0x2a, 0x12, 0x5a, 0x62, 0xca, 0xf2, 0xba, 0x82,
+ 0x50, 0x68, 0x20, 0x18, 0xb0, 0x88, 0xc0, 0xf8,
+ 0x8d, 0xb5, 0xfd, 0xc5, 0x6d, 0x55, 0x1d, 0x25,
+ 0xa4, 0x9c, 0xd4, 0xec, 0x44, 0x7c, 0x34, 0x0c,
+ 0x79, 0x41, 0x09, 0x31, 0x99, 0xa1, 0xe9, 0xd1,
+ 0x03, 0x3b, 0x73, 0x4b, 0xe3, 0xdb, 0x93, 0xab,
+ 0xde, 0xe6, 0xae, 0x96, 0x3e, 0x06, 0x4e, 0x76,
+ },
+ {
+ 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf,
+ 0xd5, 0xec, 0xa7, 0x9e, 0x31, 0x08, 0x43, 0x7a,
+ 0xb7, 0x8e, 0xc5, 0xfc, 0x53, 0x6a, 0x21, 0x18,
+ 0x62, 0x5b, 0x10, 0x29, 0x86, 0xbf, 0xf4, 0xcd,
+ 0x73, 0x4a, 0x01, 0x38, 0x97, 0xae, 0xe5, 0xdc,
+ 0xa6, 0x9f, 0xd4, 0xed, 0x42, 0x7b, 0x30, 0x09,
+ 0xc4, 0xfd, 0xb6, 0x8f, 0x20, 0x19, 0x52, 0x6b,
+ 0x11, 0x28, 0x63, 0x5a, 0xf5, 0xcc, 0x87, 0xbe,
+ 0xe6, 0xdf, 0x94, 0xad, 0x02, 0x3b, 0x70, 0x49,
+ 0x33, 0x0a, 0x41, 0x78, 0xd7, 0xee, 0xa5, 0x9c,
+ 0x51, 0x68, 0x23, 0x1a, 0xb5, 0x8c, 0xc7, 0xfe,
+ 0x84, 0xbd, 0xf6, 0xcf, 0x60, 0x59, 0x12, 0x2b,
+ 0x95, 0xac, 0xe7, 0xde, 0x71, 0x48, 0x03, 0x3a,
+ 0x40, 0x79, 0x32, 0x0b, 0xa4, 0x9d, 0xd6, 0xef,
+ 0x22, 0x1b, 0x50, 0x69, 0xc6, 0xff, 0xb4, 0x8d,
+ 0xf7, 0xce, 0x85, 0xbc, 0x13, 0x2a, 0x61, 0x58,
+ 0xd1, 0xe8, 0xa3, 0x9a, 0x35, 0x0c, 0x47, 0x7e,
+ 0x04, 0x3d, 0x76, 0x4f, 0xe0, 0xd9, 0x92, 0xab,
+ 0x66, 0x5f, 0x14, 0x2d, 0x82, 0xbb, 0xf0, 0xc9,
+ 0xb3, 0x8a, 0xc1, 0xf8, 0x57, 0x6e, 0x25, 0x1c,
+ 0xa2, 0x9b, 0xd0, 0xe9, 0x46, 0x7f, 0x34, 0x0d,
+ 0x77, 0x4e, 0x05, 0x3c, 0x93, 0xaa, 0xe1, 0xd8,
+ 0x15, 0x2c, 0x67, 0x5e, 0xf1, 0xc8, 0x83, 0xba,
+ 0xc0, 0xf9, 0xb2, 0x8b, 0x24, 0x1d, 0x56, 0x6f,
+ 0x37, 0x0e, 0x45, 0x7c, 0xd3, 0xea, 0xa1, 0x98,
+ 0xe2, 0xdb, 0x90, 0xa9, 0x06, 0x3f, 0x74, 0x4d,
+ 0x80, 0xb9, 0xf2, 0xcb, 0x64, 0x5d, 0x16, 0x2f,
+ 0x55, 0x6c, 0x27, 0x1e, 0xb1, 0x88, 0xc3, 0xfa,
+ 0x44, 0x7d, 0x36, 0x0f, 0xa0, 0x99, 0xd2, 0xeb,
+ 0x91, 0xa8, 0xe3, 0xda, 0x75, 0x4c, 0x07, 0x3e,
+ 0xf3, 0xca, 0x81, 0xb8, 0x17, 0x2e, 0x65, 0x5c,
+ 0x26, 0x1f, 0x54, 0x6d, 0xc2, 0xfb, 0xb0, 0x89,
+ },
+ {
+ 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b,
+ 0x87, 0xbd, 0xf3, 0xc9, 0x6f, 0x55, 0x1b, 0x21,
+ 0x4a, 0x70, 0x3e, 0x04, 0xa2, 0x98, 0xd6, 0xec,
+ 0x13, 0x29, 0x67, 0x5d, 0xfb, 0xc1, 0x8f, 0xb5,
+ 0xde, 0xe4, 0xaa, 0x90, 0x36, 0x0c, 0x42, 0x78,
+ 0x94, 0xae, 0xe0, 0xda, 0x7c, 0x46, 0x08, 0x32,
+ 0x59, 0x63, 0x2d, 0x17, 0xb1, 0x8b, 0xc5, 0xff,
+ 0x26, 0x1c, 0x52, 0x68, 0xce, 0xf4, 0xba, 0x80,
+ 0xeb, 0xd1, 0x9f, 0xa5, 0x03, 0x39, 0x77, 0x4d,
+ 0xa1, 0x9b, 0xd5, 0xef, 0x49, 0x73, 0x3d, 0x07,
+ 0x6c, 0x56, 0x18, 0x22, 0x84, 0xbe, 0xf0, 0xca,
+ 0x35, 0x0f, 0x41, 0x7b, 0xdd, 0xe7, 0xa9, 0x93,
+ 0xf8, 0xc2, 0x8c, 0xb6, 0x10, 0x2a, 0x64, 0x5e,
+ 0xb2, 0x88, 0xc6, 0xfc, 0x5a, 0x60, 0x2e, 0x14,
+ 0x7f, 0x45, 0x0b, 0x31, 0x97, 0xad, 0xe3, 0xd9,
+ 0x4c, 0x76, 0x38, 0x02, 0xa4, 0x9e, 0xd0, 0xea,
+ 0x81, 0xbb, 0xf5, 0xcf, 0x69, 0x53, 0x1d, 0x27,
+ 0xcb, 0xf1, 0xbf, 0x85, 0x23, 0x19, 0x57, 0x6d,
+ 0x06, 0x3c, 0x72, 0x48, 0xee, 0xd4, 0x9a, 0xa0,
+ 0x5f, 0x65, 0x2b, 0x11, 0xb7, 0x8d, 0xc3, 0xf9,
+ 0x92, 0xa8, 0xe6, 0xdc, 0x7a, 0x40, 0x0e, 0x34,
+ 0xd8, 0xe2, 0xac, 0x96, 0x30, 0x0a, 0x44, 0x7e,
+ 0x15, 0x2f, 0x61, 0x5b, 0xfd, 0xc7, 0x89, 0xb3,
+ 0x6a, 0x50, 0x1e, 0x24, 0x82, 0xb8, 0xf6, 0xcc,
+ 0xa7, 0x9d, 0xd3, 0xe9, 0x4f, 0x75, 0x3b, 0x01,
+ 0xed, 0xd7, 0x99, 0xa3, 0x05, 0x3f, 0x71, 0x4b,
+ 0x20, 0x1a, 0x54, 0x6e, 0xc8, 0xf2, 0xbc, 0x86,
+ 0x79, 0x43, 0x0d, 0x37, 0x91, 0xab, 0xe5, 0xdf,
+ 0xb4, 0x8e, 0xc0, 0xfa, 0x5c, 0x66, 0x28, 0x12,
+ 0xfe, 0xc4, 0x8a, 0xb0, 0x16, 0x2c, 0x62, 0x58,
+ 0x33, 0x09, 0x47, 0x7d, 0xdb, 0xe1, 0xaf, 0x95,
+ },
+ {
+ 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1,
+ 0xc5, 0xfe, 0xb3, 0x88, 0x29, 0x12, 0x5f, 0x64,
+ 0x97, 0xac, 0xe1, 0xda, 0x7b, 0x40, 0x0d, 0x36,
+ 0x52, 0x69, 0x24, 0x1f, 0xbe, 0x85, 0xc8, 0xf3,
+ 0x33, 0x08, 0x45, 0x7e, 0xdf, 0xe4, 0xa9, 0x92,
+ 0xf6, 0xcd, 0x80, 0xbb, 0x1a, 0x21, 0x6c, 0x57,
+ 0xa4, 0x9f, 0xd2, 0xe9, 0x48, 0x73, 0x3e, 0x05,
+ 0x61, 0x5a, 0x17, 0x2c, 0x8d, 0xb6, 0xfb, 0xc0,
+ 0x66, 0x5d, 0x10, 0x2b, 0x8a, 0xb1, 0xfc, 0xc7,
+ 0xa3, 0x98, 0xd5, 0xee, 0x4f, 0x74, 0x39, 0x02,
+ 0xf1, 0xca, 0x87, 0xbc, 0x1d, 0x26, 0x6b, 0x50,
+ 0x34, 0x0f, 0x42, 0x79, 0xd8, 0xe3, 0xae, 0x95,
+ 0x55, 0x6e, 0x23, 0x18, 0xb9, 0x82, 0xcf, 0xf4,
+ 0x90, 0xab, 0xe6, 0xdd, 0x7c, 0x47, 0x0a, 0x31,
+ 0xc2, 0xf9, 0xb4, 0x8f, 0x2e, 0x15, 0x58, 0x63,
+ 0x07, 0x3c, 0x71, 0x4a, 0xeb, 0xd0, 0x9d, 0xa6,
+ 0xcc, 0xf7, 0xba, 0x81, 0x20, 0x1b, 0x56, 0x6d,
+ 0x09, 0x32, 0x7f, 0x44, 0xe5, 0xde, 0x93, 0xa8,
+ 0x5b, 0x60, 0x2d, 0x16, 0xb7, 0x8c, 0xc1, 0xfa,
+ 0x9e, 0xa5, 0xe8, 0xd3, 0x72, 0x49, 0x04, 0x3f,
+ 0xff, 0xc4, 0x89, 0xb2, 0x13, 0x28, 0x65, 0x5e,
+ 0x3a, 0x01, 0x4c, 0x77, 0xd6, 0xed, 0xa0, 0x9b,
+ 0x68, 0x53, 0x1e, 0x25, 0x84, 0xbf, 0xf2, 0xc9,
+ 0xad, 0x96, 0xdb, 0xe0, 0x41, 0x7a, 0x37, 0x0c,
+ 0xaa, 0x91, 0xdc, 0xe7, 0x46, 0x7d, 0x30, 0x0b,
+ 0x6f, 0x54, 0x19, 0x22, 0x83, 0xb8, 0xf5, 0xce,
+ 0x3d, 0x06, 0x4b, 0x70, 0xd1, 0xea, 0xa7, 0x9c,
+ 0xf8, 0xc3, 0x8e, 0xb5, 0x14, 0x2f, 0x62, 0x59,
+ 0x99, 0xa2, 0xef, 0xd4, 0x75, 0x4e, 0x03, 0x38,
+ 0x5c, 0x67, 0x2a, 0x11, 0xb0, 0x8b, 0xc6, 0xfd,
+ 0x0e, 0x35, 0x78, 0x43, 0xe2, 0xd9, 0x94, 0xaf,
+ 0xcb, 0xf0, 0xbd, 0x86, 0x27, 0x1c, 0x51, 0x6a,
+ },
+ {
+ 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4,
+ 0xfd, 0xc1, 0x85, 0xb9, 0x0d, 0x31, 0x75, 0x49,
+ 0xe7, 0xdb, 0x9f, 0xa3, 0x17, 0x2b, 0x6f, 0x53,
+ 0x1a, 0x26, 0x62, 0x5e, 0xea, 0xd6, 0x92, 0xae,
+ 0xd3, 0xef, 0xab, 0x97, 0x23, 0x1f, 0x5b, 0x67,
+ 0x2e, 0x12, 0x56, 0x6a, 0xde, 0xe2, 0xa6, 0x9a,
+ 0x34, 0x08, 0x4c, 0x70, 0xc4, 0xf8, 0xbc, 0x80,
+ 0xc9, 0xf5, 0xb1, 0x8d, 0x39, 0x05, 0x41, 0x7d,
+ 0xbb, 0x87, 0xc3, 0xff, 0x4b, 0x77, 0x33, 0x0f,
+ 0x46, 0x7a, 0x3e, 0x02, 0xb6, 0x8a, 0xce, 0xf2,
+ 0x5c, 0x60, 0x24, 0x18, 0xac, 0x90, 0xd4, 0xe8,
+ 0xa1, 0x9d, 0xd9, 0xe5, 0x51, 0x6d, 0x29, 0x15,
+ 0x68, 0x54, 0x10, 0x2c, 0x98, 0xa4, 0xe0, 0xdc,
+ 0x95, 0xa9, 0xed, 0xd1, 0x65, 0x59, 0x1d, 0x21,
+ 0x8f, 0xb3, 0xf7, 0xcb, 0x7f, 0x43, 0x07, 0x3b,
+ 0x72, 0x4e, 0x0a, 0x36, 0x82, 0xbe, 0xfa, 0xc6,
+ 0x6b, 0x57, 0x13, 0x2f, 0x9b, 0xa7, 0xe3, 0xdf,
+ 0x96, 0xaa, 0xee, 0xd2, 0x66, 0x5a, 0x1e, 0x22,
+ 0x8c, 0xb0, 0xf4, 0xc8, 0x7c, 0x40, 0x04, 0x38,
+ 0x71, 0x4d, 0x09, 0x35, 0x81, 0xbd, 0xf9, 0xc5,
+ 0xb8, 0x84, 0xc0, 0xfc, 0x48, 0x74, 0x30, 0x0c,
+ 0x45, 0x79, 0x3d, 0x01, 0xb5, 0x89, 0xcd, 0xf1,
+ 0x5f, 0x63, 0x27, 0x1b, 0xaf, 0x93, 0xd7, 0xeb,
+ 0xa2, 0x9e, 0xda, 0xe6, 0x52, 0x6e, 0x2a, 0x16,
+ 0xd0, 0xec, 0xa8, 0x94, 0x20, 0x1c, 0x58, 0x64,
+ 0x2d, 0x11, 0x55, 0x69, 0xdd, 0xe1, 0xa5, 0x99,
+ 0x37, 0x0b, 0x4f, 0x73, 0xc7, 0xfb, 0xbf, 0x83,
+ 0xca, 0xf6, 0xb2, 0x8e, 0x3a, 0x06, 0x42, 0x7e,
+ 0x03, 0x3f, 0x7b, 0x47, 0xf3, 0xcf, 0x8b, 0xb7,
+ 0xfe, 0xc2, 0x86, 0xba, 0x0e, 0x32, 0x76, 0x4a,
+ 0xe4, 0xd8, 0x9c, 0xa0, 0x14, 0x28, 0x6c, 0x50,
+ 0x19, 0x25, 0x61, 0x5d, 0xe9, 0xd5, 0x91, 0xad,
+ },
+ {
+ 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3,
+ 0xf5, 0xc8, 0x8f, 0xb2, 0x01, 0x3c, 0x7b, 0x46,
+ 0xf7, 0xca, 0x8d, 0xb0, 0x03, 0x3e, 0x79, 0x44,
+ 0x02, 0x3f, 0x78, 0x45, 0xf6, 0xcb, 0x8c, 0xb1,
+ 0xf3, 0xce, 0x89, 0xb4, 0x07, 0x3a, 0x7d, 0x40,
+ 0x06, 0x3b, 0x7c, 0x41, 0xf2, 0xcf, 0x88, 0xb5,
+ 0x04, 0x39, 0x7e, 0x43, 0xf0, 0xcd, 0x8a, 0xb7,
+ 0xf1, 0xcc, 0x8b, 0xb6, 0x05, 0x38, 0x7f, 0x42,
+ 0xfb, 0xc6, 0x81, 0xbc, 0x0f, 0x32, 0x75, 0x48,
+ 0x0e, 0x33, 0x74, 0x49, 0xfa, 0xc7, 0x80, 0xbd,
+ 0x0c, 0x31, 0x76, 0x4b, 0xf8, 0xc5, 0x82, 0xbf,
+ 0xf9, 0xc4, 0x83, 0xbe, 0x0d, 0x30, 0x77, 0x4a,
+ 0x08, 0x35, 0x72, 0x4f, 0xfc, 0xc1, 0x86, 0xbb,
+ 0xfd, 0xc0, 0x87, 0xba, 0x09, 0x34, 0x73, 0x4e,
+ 0xff, 0xc2, 0x85, 0xb8, 0x0b, 0x36, 0x71, 0x4c,
+ 0x0a, 0x37, 0x70, 0x4d, 0xfe, 0xc3, 0x84, 0xb9,
+ 0xeb, 0xd6, 0x91, 0xac, 0x1f, 0x22, 0x65, 0x58,
+ 0x1e, 0x23, 0x64, 0x59, 0xea, 0xd7, 0x90, 0xad,
+ 0x1c, 0x21, 0x66, 0x5b, 0xe8, 0xd5, 0x92, 0xaf,
+ 0xe9, 0xd4, 0x93, 0xae, 0x1d, 0x20, 0x67, 0x5a,
+ 0x18, 0x25, 0x62, 0x5f, 0xec, 0xd1, 0x96, 0xab,
+ 0xed, 0xd0, 0x97, 0xaa, 0x19, 0x24, 0x63, 0x5e,
+ 0xef, 0xd2, 0x95, 0xa8, 0x1b, 0x26, 0x61, 0x5c,
+ 0x1a, 0x27, 0x60, 0x5d, 0xee, 0xd3, 0x94, 0xa9,
+ 0x10, 0x2d, 0x6a, 0x57, 0xe4, 0xd9, 0x9e, 0xa3,
+ 0xe5, 0xd8, 0x9f, 0xa2, 0x11, 0x2c, 0x6b, 0x56,
+ 0xe7, 0xda, 0x9d, 0xa0, 0x13, 0x2e, 0x69, 0x54,
+ 0x12, 0x2f, 0x68, 0x55, 0xe6, 0xdb, 0x9c, 0xa1,
+ 0xe3, 0xde, 0x99, 0xa4, 0x17, 0x2a, 0x6d, 0x50,
+ 0x16, 0x2b, 0x6c, 0x51, 0xe2, 0xdf, 0x98, 0xa5,
+ 0x14, 0x29, 0x6e, 0x53, 0xe0, 0xdd, 0x9a, 0xa7,
+ 0xe1, 0xdc, 0x9b, 0xa6, 0x15, 0x28, 0x6f, 0x52,
+ },
+ {
+ 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba,
+ 0xed, 0xd3, 0x91, 0xaf, 0x15, 0x2b, 0x69, 0x57,
+ 0xc7, 0xf9, 0xbb, 0x85, 0x3f, 0x01, 0x43, 0x7d,
+ 0x2a, 0x14, 0x56, 0x68, 0xd2, 0xec, 0xae, 0x90,
+ 0x93, 0xad, 0xef, 0xd1, 0x6b, 0x55, 0x17, 0x29,
+ 0x7e, 0x40, 0x02, 0x3c, 0x86, 0xb8, 0xfa, 0xc4,
+ 0x54, 0x6a, 0x28, 0x16, 0xac, 0x92, 0xd0, 0xee,
+ 0xb9, 0x87, 0xc5, 0xfb, 0x41, 0x7f, 0x3d, 0x03,
+ 0x3b, 0x05, 0x47, 0x79, 0xc3, 0xfd, 0xbf, 0x81,
+ 0xd6, 0xe8, 0xaa, 0x94, 0x2e, 0x10, 0x52, 0x6c,
+ 0xfc, 0xc2, 0x80, 0xbe, 0x04, 0x3a, 0x78, 0x46,
+ 0x11, 0x2f, 0x6d, 0x53, 0xe9, 0xd7, 0x95, 0xab,
+ 0xa8, 0x96, 0xd4, 0xea, 0x50, 0x6e, 0x2c, 0x12,
+ 0x45, 0x7b, 0x39, 0x07, 0xbd, 0x83, 0xc1, 0xff,
+ 0x6f, 0x51, 0x13, 0x2d, 0x97, 0xa9, 0xeb, 0xd5,
+ 0x82, 0xbc, 0xfe, 0xc0, 0x7a, 0x44, 0x06, 0x38,
+ 0x76, 0x48, 0x0a, 0x34, 0x8e, 0xb0, 0xf2, 0xcc,
+ 0x9b, 0xa5, 0xe7, 0xd9, 0x63, 0x5d, 0x1f, 0x21,
+ 0xb1, 0x8f, 0xcd, 0xf3, 0x49, 0x77, 0x35, 0x0b,
+ 0x5c, 0x62, 0x20, 0x1e, 0xa4, 0x9a, 0xd8, 0xe6,
+ 0xe5, 0xdb, 0x99, 0xa7, 0x1d, 0x23, 0x61, 0x5f,
+ 0x08, 0x36, 0x74, 0x4a, 0xf0, 0xce, 0x8c, 0xb2,
+ 0x22, 0x1c, 0x5e, 0x60, 0xda, 0xe4, 0xa6, 0x98,
+ 0xcf, 0xf1, 0xb3, 0x8d, 0x37, 0x09, 0x4b, 0x75,
+ 0x4d, 0x73, 0x31, 0x0f, 0xb5, 0x8b, 0xc9, 0xf7,
+ 0xa0, 0x9e, 0xdc, 0xe2, 0x58, 0x66, 0x24, 0x1a,
+ 0x8a, 0xb4, 0xf6, 0xc8, 0x72, 0x4c, 0x0e, 0x30,
+ 0x67, 0x59, 0x1b, 0x25, 0x9f, 0xa1, 0xe3, 0xdd,
+ 0xde, 0xe0, 0xa2, 0x9c, 0x26, 0x18, 0x5a, 0x64,
+ 0x33, 0x0d, 0x4f, 0x71, 0xcb, 0xf5, 0xb7, 0x89,
+ 0x19, 0x27, 0x65, 0x5b, 0xe1, 0xdf, 0x9d, 0xa3,
+ 0xf4, 0xca, 0x88, 0xb6, 0x0c, 0x32, 0x70, 0x4e,
+ },
+ {
+ 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd,
+ 0xe5, 0xda, 0x9b, 0xa4, 0x19, 0x26, 0x67, 0x58,
+ 0xd7, 0xe8, 0xa9, 0x96, 0x2b, 0x14, 0x55, 0x6a,
+ 0x32, 0x0d, 0x4c, 0x73, 0xce, 0xf1, 0xb0, 0x8f,
+ 0xb3, 0x8c, 0xcd, 0xf2, 0x4f, 0x70, 0x31, 0x0e,
+ 0x56, 0x69, 0x28, 0x17, 0xaa, 0x95, 0xd4, 0xeb,
+ 0x64, 0x5b, 0x1a, 0x25, 0x98, 0xa7, 0xe6, 0xd9,
+ 0x81, 0xbe, 0xff, 0xc0, 0x7d, 0x42, 0x03, 0x3c,
+ 0x7b, 0x44, 0x05, 0x3a, 0x87, 0xb8, 0xf9, 0xc6,
+ 0x9e, 0xa1, 0xe0, 0xdf, 0x62, 0x5d, 0x1c, 0x23,
+ 0xac, 0x93, 0xd2, 0xed, 0x50, 0x6f, 0x2e, 0x11,
+ 0x49, 0x76, 0x37, 0x08, 0xb5, 0x8a, 0xcb, 0xf4,
+ 0xc8, 0xf7, 0xb6, 0x89, 0x34, 0x0b, 0x4a, 0x75,
+ 0x2d, 0x12, 0x53, 0x6c, 0xd1, 0xee, 0xaf, 0x90,
+ 0x1f, 0x20, 0x61, 0x5e, 0xe3, 0xdc, 0x9d, 0xa2,
+ 0xfa, 0xc5, 0x84, 0xbb, 0x06, 0x39, 0x78, 0x47,
+ 0xf6, 0xc9, 0x88, 0xb7, 0x0a, 0x35, 0x74, 0x4b,
+ 0x13, 0x2c, 0x6d, 0x52, 0xef, 0xd0, 0x91, 0xae,
+ 0x21, 0x1e, 0x5f, 0x60, 0xdd, 0xe2, 0xa3, 0x9c,
+ 0xc4, 0xfb, 0xba, 0x85, 0x38, 0x07, 0x46, 0x79,
+ 0x45, 0x7a, 0x3b, 0x04, 0xb9, 0x86, 0xc7, 0xf8,
+ 0xa0, 0x9f, 0xde, 0xe1, 0x5c, 0x63, 0x22, 0x1d,
+ 0x92, 0xad, 0xec, 0xd3, 0x6e, 0x51, 0x10, 0x2f,
+ 0x77, 0x48, 0x09, 0x36, 0x8b, 0xb4, 0xf5, 0xca,
+ 0x8d, 0xb2, 0xf3, 0xcc, 0x71, 0x4e, 0x0f, 0x30,
+ 0x68, 0x57, 0x16, 0x29, 0x94, 0xab, 0xea, 0xd5,
+ 0x5a, 0x65, 0x24, 0x1b, 0xa6, 0x99, 0xd8, 0xe7,
+ 0xbf, 0x80, 0xc1, 0xfe, 0x43, 0x7c, 0x3d, 0x02,
+ 0x3e, 0x01, 0x40, 0x7f, 0xc2, 0xfd, 0xbc, 0x83,
+ 0xdb, 0xe4, 0xa5, 0x9a, 0x27, 0x18, 0x59, 0x66,
+ 0xe9, 0xd6, 0x97, 0xa8, 0x15, 0x2a, 0x6b, 0x54,
+ 0x0c, 0x33, 0x72, 0x4d, 0xf0, 0xcf, 0x8e, 0xb1,
+ },
+ {
+ 0x00, 0x40, 0x80, 0xc0, 0x1d, 0x5d, 0x9d, 0xdd,
+ 0x3a, 0x7a, 0xba, 0xfa, 0x27, 0x67, 0xa7, 0xe7,
+ 0x74, 0x34, 0xf4, 0xb4, 0x69, 0x29, 0xe9, 0xa9,
+ 0x4e, 0x0e, 0xce, 0x8e, 0x53, 0x13, 0xd3, 0x93,
+ 0xe8, 0xa8, 0x68, 0x28, 0xf5, 0xb5, 0x75, 0x35,
+ 0xd2, 0x92, 0x52, 0x12, 0xcf, 0x8f, 0x4f, 0x0f,
+ 0x9c, 0xdc, 0x1c, 0x5c, 0x81, 0xc1, 0x01, 0x41,
+ 0xa6, 0xe6, 0x26, 0x66, 0xbb, 0xfb, 0x3b, 0x7b,
+ 0xcd, 0x8d, 0x4d, 0x0d, 0xd0, 0x90, 0x50, 0x10,
+ 0xf7, 0xb7, 0x77, 0x37, 0xea, 0xaa, 0x6a, 0x2a,
+ 0xb9, 0xf9, 0x39, 0x79, 0xa4, 0xe4, 0x24, 0x64,
+ 0x83, 0xc3, 0x03, 0x43, 0x9e, 0xde, 0x1e, 0x5e,
+ 0x25, 0x65, 0xa5, 0xe5, 0x38, 0x78, 0xb8, 0xf8,
+ 0x1f, 0x5f, 0x9f, 0xdf, 0x02, 0x42, 0x82, 0xc2,
+ 0x51, 0x11, 0xd1, 0x91, 0x4c, 0x0c, 0xcc, 0x8c,
+ 0x6b, 0x2b, 0xeb, 0xab, 0x76, 0x36, 0xf6, 0xb6,
+ 0x87, 0xc7, 0x07, 0x47, 0x9a, 0xda, 0x1a, 0x5a,
+ 0xbd, 0xfd, 0x3d, 0x7d, 0xa0, 0xe0, 0x20, 0x60,
+ 0xf3, 0xb3, 0x73, 0x33, 0xee, 0xae, 0x6e, 0x2e,
+ 0xc9, 0x89, 0x49, 0x09, 0xd4, 0x94, 0x54, 0x14,
+ 0x6f, 0x2f, 0xef, 0xaf, 0x72, 0x32, 0xf2, 0xb2,
+ 0x55, 0x15, 0xd5, 0x95, 0x48, 0x08, 0xc8, 0x88,
+ 0x1b, 0x5b, 0x9b, 0xdb, 0x06, 0x46, 0x86, 0xc6,
+ 0x21, 0x61, 0xa1, 0xe1, 0x3c, 0x7c, 0xbc, 0xfc,
+ 0x4a, 0x0a, 0xca, 0x8a, 0x57, 0x17, 0xd7, 0x97,
+ 0x70, 0x30, 0xf0, 0xb0, 0x6d, 0x2d, 0xed, 0xad,
+ 0x3e, 0x7e, 0xbe, 0xfe, 0x23, 0x63, 0xa3, 0xe3,
+ 0x04, 0x44, 0x84, 0xc4, 0x19, 0x59, 0x99, 0xd9,
+ 0xa2, 0xe2, 0x22, 0x62, 0xbf, 0xff, 0x3f, 0x7f,
+ 0x98, 0xd8, 0x18, 0x58, 0x85, 0xc5, 0x05, 0x45,
+ 0xd6, 0x96, 0x56, 0x16, 0xcb, 0x8b, 0x4b, 0x0b,
+ 0xec, 0xac, 0x6c, 0x2c, 0xf1, 0xb1, 0x71, 0x31,
+ },
+ {
+ 0x00, 0x41, 0x82, 0xc3, 0x19, 0x58, 0x9b, 0xda,
+ 0x32, 0x73, 0xb0, 0xf1, 0x2b, 0x6a, 0xa9, 0xe8,
+ 0x64, 0x25, 0xe6, 0xa7, 0x7d, 0x3c, 0xff, 0xbe,
+ 0x56, 0x17, 0xd4, 0x95, 0x4f, 0x0e, 0xcd, 0x8c,
+ 0xc8, 0x89, 0x4a, 0x0b, 0xd1, 0x90, 0x53, 0x12,
+ 0xfa, 0xbb, 0x78, 0x39, 0xe3, 0xa2, 0x61, 0x20,
+ 0xac, 0xed, 0x2e, 0x6f, 0xb5, 0xf4, 0x37, 0x76,
+ 0x9e, 0xdf, 0x1c, 0x5d, 0x87, 0xc6, 0x05, 0x44,
+ 0x8d, 0xcc, 0x0f, 0x4e, 0x94, 0xd5, 0x16, 0x57,
+ 0xbf, 0xfe, 0x3d, 0x7c, 0xa6, 0xe7, 0x24, 0x65,
+ 0xe9, 0xa8, 0x6b, 0x2a, 0xf0, 0xb1, 0x72, 0x33,
+ 0xdb, 0x9a, 0x59, 0x18, 0xc2, 0x83, 0x40, 0x01,
+ 0x45, 0x04, 0xc7, 0x86, 0x5c, 0x1d, 0xde, 0x9f,
+ 0x77, 0x36, 0xf5, 0xb4, 0x6e, 0x2f, 0xec, 0xad,
+ 0x21, 0x60, 0xa3, 0xe2, 0x38, 0x79, 0xba, 0xfb,
+ 0x13, 0x52, 0x91, 0xd0, 0x0a, 0x4b, 0x88, 0xc9,
+ 0x07, 0x46, 0x85, 0xc4, 0x1e, 0x5f, 0x9c, 0xdd,
+ 0x35, 0x74, 0xb7, 0xf6, 0x2c, 0x6d, 0xae, 0xef,
+ 0x63, 0x22, 0xe1, 0xa0, 0x7a, 0x3b, 0xf8, 0xb9,
+ 0x51, 0x10, 0xd3, 0x92, 0x48, 0x09, 0xca, 0x8b,
+ 0xcf, 0x8e, 0x4d, 0x0c, 0xd6, 0x97, 0x54, 0x15,
+ 0xfd, 0xbc, 0x7f, 0x3e, 0xe4, 0xa5, 0x66, 0x27,
+ 0xab, 0xea, 0x29, 0x68, 0xb2, 0xf3, 0x30, 0x71,
+ 0x99, 0xd8, 0x1b, 0x5a, 0x80, 0xc1, 0x02, 0x43,
+ 0x8a, 0xcb, 0x08, 0x49, 0x93, 0xd2, 0x11, 0x50,
+ 0xb8, 0xf9, 0x3a, 0x7b, 0xa1, 0xe0, 0x23, 0x62,
+ 0xee, 0xaf, 0x6c, 0x2d, 0xf7, 0xb6, 0x75, 0x34,
+ 0xdc, 0x9d, 0x5e, 0x1f, 0xc5, 0x84, 0x47, 0x06,
+ 0x42, 0x03, 0xc0, 0x81, 0x5b, 0x1a, 0xd9, 0x98,
+ 0x70, 0x31, 0xf2, 0xb3, 0x69, 0x28, 0xeb, 0xaa,
+ 0x26, 0x67, 0xa4, 0xe5, 0x3f, 0x7e, 0xbd, 0xfc,
+ 0x14, 0x55, 0x96, 0xd7, 0x0d, 0x4c, 0x8f, 0xce,
+ },
+ {
+ 0x00, 0x42, 0x84, 0xc6, 0x15, 0x57, 0x91, 0xd3,
+ 0x2a, 0x68, 0xae, 0xec, 0x3f, 0x7d, 0xbb, 0xf9,
+ 0x54, 0x16, 0xd0, 0x92, 0x41, 0x03, 0xc5, 0x87,
+ 0x7e, 0x3c, 0xfa, 0xb8, 0x6b, 0x29, 0xef, 0xad,
+ 0xa8, 0xea, 0x2c, 0x6e, 0xbd, 0xff, 0x39, 0x7b,
+ 0x82, 0xc0, 0x06, 0x44, 0x97, 0xd5, 0x13, 0x51,
+ 0xfc, 0xbe, 0x78, 0x3a, 0xe9, 0xab, 0x6d, 0x2f,
+ 0xd6, 0x94, 0x52, 0x10, 0xc3, 0x81, 0x47, 0x05,
+ 0x4d, 0x0f, 0xc9, 0x8b, 0x58, 0x1a, 0xdc, 0x9e,
+ 0x67, 0x25, 0xe3, 0xa1, 0x72, 0x30, 0xf6, 0xb4,
+ 0x19, 0x5b, 0x9d, 0xdf, 0x0c, 0x4e, 0x88, 0xca,
+ 0x33, 0x71, 0xb7, 0xf5, 0x26, 0x64, 0xa2, 0xe0,
+ 0xe5, 0xa7, 0x61, 0x23, 0xf0, 0xb2, 0x74, 0x36,
+ 0xcf, 0x8d, 0x4b, 0x09, 0xda, 0x98, 0x5e, 0x1c,
+ 0xb1, 0xf3, 0x35, 0x77, 0xa4, 0xe6, 0x20, 0x62,
+ 0x9b, 0xd9, 0x1f, 0x5d, 0x8e, 0xcc, 0x0a, 0x48,
+ 0x9a, 0xd8, 0x1e, 0x5c, 0x8f, 0xcd, 0x0b, 0x49,
+ 0xb0, 0xf2, 0x34, 0x76, 0xa5, 0xe7, 0x21, 0x63,
+ 0xce, 0x8c, 0x4a, 0x08, 0xdb, 0x99, 0x5f, 0x1d,
+ 0xe4, 0xa6, 0x60, 0x22, 0xf1, 0xb3, 0x75, 0x37,
+ 0x32, 0x70, 0xb6, 0xf4, 0x27, 0x65, 0xa3, 0xe1,
+ 0x18, 0x5a, 0x9c, 0xde, 0x0d, 0x4f, 0x89, 0xcb,
+ 0x66, 0x24, 0xe2, 0xa0, 0x73, 0x31, 0xf7, 0xb5,
+ 0x4c, 0x0e, 0xc8, 0x8a, 0x59, 0x1b, 0xdd, 0x9f,
+ 0xd7, 0x95, 0x53, 0x11, 0xc2, 0x80, 0x46, 0x04,
+ 0xfd, 0xbf, 0x79, 0x3b, 0xe8, 0xaa, 0x6c, 0x2e,
+ 0x83, 0xc1, 0x07, 0x45, 0x96, 0xd4, 0x12, 0x50,
+ 0xa9, 0xeb, 0x2d, 0x6f, 0xbc, 0xfe, 0x38, 0x7a,
+ 0x7f, 0x3d, 0xfb, 0xb9, 0x6a, 0x28, 0xee, 0xac,
+ 0x55, 0x17, 0xd1, 0x93, 0x40, 0x02, 0xc4, 0x86,
+ 0x2b, 0x69, 0xaf, 0xed, 0x3e, 0x7c, 0xba, 0xf8,
+ 0x01, 0x43, 0x85, 0xc7, 0x14, 0x56, 0x90, 0xd2,
+ },
+ {
+ 0x00, 0x43, 0x86, 0xc5, 0x11, 0x52, 0x97, 0xd4,
+ 0x22, 0x61, 0xa4, 0xe7, 0x33, 0x70, 0xb5, 0xf6,
+ 0x44, 0x07, 0xc2, 0x81, 0x55, 0x16, 0xd3, 0x90,
+ 0x66, 0x25, 0xe0, 0xa3, 0x77, 0x34, 0xf1, 0xb2,
+ 0x88, 0xcb, 0x0e, 0x4d, 0x99, 0xda, 0x1f, 0x5c,
+ 0xaa, 0xe9, 0x2c, 0x6f, 0xbb, 0xf8, 0x3d, 0x7e,
+ 0xcc, 0x8f, 0x4a, 0x09, 0xdd, 0x9e, 0x5b, 0x18,
+ 0xee, 0xad, 0x68, 0x2b, 0xff, 0xbc, 0x79, 0x3a,
+ 0x0d, 0x4e, 0x8b, 0xc8, 0x1c, 0x5f, 0x9a, 0xd9,
+ 0x2f, 0x6c, 0xa9, 0xea, 0x3e, 0x7d, 0xb8, 0xfb,
+ 0x49, 0x0a, 0xcf, 0x8c, 0x58, 0x1b, 0xde, 0x9d,
+ 0x6b, 0x28, 0xed, 0xae, 0x7a, 0x39, 0xfc, 0xbf,
+ 0x85, 0xc6, 0x03, 0x40, 0x94, 0xd7, 0x12, 0x51,
+ 0xa7, 0xe4, 0x21, 0x62, 0xb6, 0xf5, 0x30, 0x73,
+ 0xc1, 0x82, 0x47, 0x04, 0xd0, 0x93, 0x56, 0x15,
+ 0xe3, 0xa0, 0x65, 0x26, 0xf2, 0xb1, 0x74, 0x37,
+ 0x1a, 0x59, 0x9c, 0xdf, 0x0b, 0x48, 0x8d, 0xce,
+ 0x38, 0x7b, 0xbe, 0xfd, 0x29, 0x6a, 0xaf, 0xec,
+ 0x5e, 0x1d, 0xd8, 0x9b, 0x4f, 0x0c, 0xc9, 0x8a,
+ 0x7c, 0x3f, 0xfa, 0xb9, 0x6d, 0x2e, 0xeb, 0xa8,
+ 0x92, 0xd1, 0x14, 0x57, 0x83, 0xc0, 0x05, 0x46,
+ 0xb0, 0xf3, 0x36, 0x75, 0xa1, 0xe2, 0x27, 0x64,
+ 0xd6, 0x95, 0x50, 0x13, 0xc7, 0x84, 0x41, 0x02,
+ 0xf4, 0xb7, 0x72, 0x31, 0xe5, 0xa6, 0x63, 0x20,
+ 0x17, 0x54, 0x91, 0xd2, 0x06, 0x45, 0x80, 0xc3,
+ 0x35, 0x76, 0xb3, 0xf0, 0x24, 0x67, 0xa2, 0xe1,
+ 0x53, 0x10, 0xd5, 0x96, 0x42, 0x01, 0xc4, 0x87,
+ 0x71, 0x32, 0xf7, 0xb4, 0x60, 0x23, 0xe6, 0xa5,
+ 0x9f, 0xdc, 0x19, 0x5a, 0x8e, 0xcd, 0x08, 0x4b,
+ 0xbd, 0xfe, 0x3b, 0x78, 0xac, 0xef, 0x2a, 0x69,
+ 0xdb, 0x98, 0x5d, 0x1e, 0xca, 0x89, 0x4c, 0x0f,
+ 0xf9, 0xba, 0x7f, 0x3c, 0xe8, 0xab, 0x6e, 0x2d,
+ },
+ {
+ 0x00, 0x44, 0x88, 0xcc, 0x0d, 0x49, 0x85, 0xc1,
+ 0x1a, 0x5e, 0x92, 0xd6, 0x17, 0x53, 0x9f, 0xdb,
+ 0x34, 0x70, 0xbc, 0xf8, 0x39, 0x7d, 0xb1, 0xf5,
+ 0x2e, 0x6a, 0xa6, 0xe2, 0x23, 0x67, 0xab, 0xef,
+ 0x68, 0x2c, 0xe0, 0xa4, 0x65, 0x21, 0xed, 0xa9,
+ 0x72, 0x36, 0xfa, 0xbe, 0x7f, 0x3b, 0xf7, 0xb3,
+ 0x5c, 0x18, 0xd4, 0x90, 0x51, 0x15, 0xd9, 0x9d,
+ 0x46, 0x02, 0xce, 0x8a, 0x4b, 0x0f, 0xc3, 0x87,
+ 0xd0, 0x94, 0x58, 0x1c, 0xdd, 0x99, 0x55, 0x11,
+ 0xca, 0x8e, 0x42, 0x06, 0xc7, 0x83, 0x4f, 0x0b,
+ 0xe4, 0xa0, 0x6c, 0x28, 0xe9, 0xad, 0x61, 0x25,
+ 0xfe, 0xba, 0x76, 0x32, 0xf3, 0xb7, 0x7b, 0x3f,
+ 0xb8, 0xfc, 0x30, 0x74, 0xb5, 0xf1, 0x3d, 0x79,
+ 0xa2, 0xe6, 0x2a, 0x6e, 0xaf, 0xeb, 0x27, 0x63,
+ 0x8c, 0xc8, 0x04, 0x40, 0x81, 0xc5, 0x09, 0x4d,
+ 0x96, 0xd2, 0x1e, 0x5a, 0x9b, 0xdf, 0x13, 0x57,
+ 0xbd, 0xf9, 0x35, 0x71, 0xb0, 0xf4, 0x38, 0x7c,
+ 0xa7, 0xe3, 0x2f, 0x6b, 0xaa, 0xee, 0x22, 0x66,
+ 0x89, 0xcd, 0x01, 0x45, 0x84, 0xc0, 0x0c, 0x48,
+ 0x93, 0xd7, 0x1b, 0x5f, 0x9e, 0xda, 0x16, 0x52,
+ 0xd5, 0x91, 0x5d, 0x19, 0xd8, 0x9c, 0x50, 0x14,
+ 0xcf, 0x8b, 0x47, 0x03, 0xc2, 0x86, 0x4a, 0x0e,
+ 0xe1, 0xa5, 0x69, 0x2d, 0xec, 0xa8, 0x64, 0x20,
+ 0xfb, 0xbf, 0x73, 0x37, 0xf6, 0xb2, 0x7e, 0x3a,
+ 0x6d, 0x29, 0xe5, 0xa1, 0x60, 0x24, 0xe8, 0xac,
+ 0x77, 0x33, 0xff, 0xbb, 0x7a, 0x3e, 0xf2, 0xb6,
+ 0x59, 0x1d, 0xd1, 0x95, 0x54, 0x10, 0xdc, 0x98,
+ 0x43, 0x07, 0xcb, 0x8f, 0x4e, 0x0a, 0xc6, 0x82,
+ 0x05, 0x41, 0x8d, 0xc9, 0x08, 0x4c, 0x80, 0xc4,
+ 0x1f, 0x5b, 0x97, 0xd3, 0x12, 0x56, 0x9a, 0xde,
+ 0x31, 0x75, 0xb9, 0xfd, 0x3c, 0x78, 0xb4, 0xf0,
+ 0x2b, 0x6f, 0xa3, 0xe7, 0x26, 0x62, 0xae, 0xea,
+ },
+ {
+ 0x00, 0x45, 0x8a, 0xcf, 0x09, 0x4c, 0x83, 0xc6,
+ 0x12, 0x57, 0x98, 0xdd, 0x1b, 0x5e, 0x91, 0xd4,
+ 0x24, 0x61, 0xae, 0xeb, 0x2d, 0x68, 0xa7, 0xe2,
+ 0x36, 0x73, 0xbc, 0xf9, 0x3f, 0x7a, 0xb5, 0xf0,
+ 0x48, 0x0d, 0xc2, 0x87, 0x41, 0x04, 0xcb, 0x8e,
+ 0x5a, 0x1f, 0xd0, 0x95, 0x53, 0x16, 0xd9, 0x9c,
+ 0x6c, 0x29, 0xe6, 0xa3, 0x65, 0x20, 0xef, 0xaa,
+ 0x7e, 0x3b, 0xf4, 0xb1, 0x77, 0x32, 0xfd, 0xb8,
+ 0x90, 0xd5, 0x1a, 0x5f, 0x99, 0xdc, 0x13, 0x56,
+ 0x82, 0xc7, 0x08, 0x4d, 0x8b, 0xce, 0x01, 0x44,
+ 0xb4, 0xf1, 0x3e, 0x7b, 0xbd, 0xf8, 0x37, 0x72,
+ 0xa6, 0xe3, 0x2c, 0x69, 0xaf, 0xea, 0x25, 0x60,
+ 0xd8, 0x9d, 0x52, 0x17, 0xd1, 0x94, 0x5b, 0x1e,
+ 0xca, 0x8f, 0x40, 0x05, 0xc3, 0x86, 0x49, 0x0c,
+ 0xfc, 0xb9, 0x76, 0x33, 0xf5, 0xb0, 0x7f, 0x3a,
+ 0xee, 0xab, 0x64, 0x21, 0xe7, 0xa2, 0x6d, 0x28,
+ 0x3d, 0x78, 0xb7, 0xf2, 0x34, 0x71, 0xbe, 0xfb,
+ 0x2f, 0x6a, 0xa5, 0xe0, 0x26, 0x63, 0xac, 0xe9,
+ 0x19, 0x5c, 0x93, 0xd6, 0x10, 0x55, 0x9a, 0xdf,
+ 0x0b, 0x4e, 0x81, 0xc4, 0x02, 0x47, 0x88, 0xcd,
+ 0x75, 0x30, 0xff, 0xba, 0x7c, 0x39, 0xf6, 0xb3,
+ 0x67, 0x22, 0xed, 0xa8, 0x6e, 0x2b, 0xe4, 0xa1,
+ 0x51, 0x14, 0xdb, 0x9e, 0x58, 0x1d, 0xd2, 0x97,
+ 0x43, 0x06, 0xc9, 0x8c, 0x4a, 0x0f, 0xc0, 0x85,
+ 0xad, 0xe8, 0x27, 0x62, 0xa4, 0xe1, 0x2e, 0x6b,
+ 0xbf, 0xfa, 0x35, 0x70, 0xb6, 0xf3, 0x3c, 0x79,
+ 0x89, 0xcc, 0x03, 0x46, 0x80, 0xc5, 0x0a, 0x4f,
+ 0x9b, 0xde, 0x11, 0x54, 0x92, 0xd7, 0x18, 0x5d,
+ 0xe5, 0xa0, 0x6f, 0x2a, 0xec, 0xa9, 0x66, 0x23,
+ 0xf7, 0xb2, 0x7d, 0x38, 0xfe, 0xbb, 0x74, 0x31,
+ 0xc1, 0x84, 0x4b, 0x0e, 0xc8, 0x8d, 0x42, 0x07,
+ 0xd3, 0x96, 0x59, 0x1c, 0xda, 0x9f, 0x50, 0x15,
+ },
+ {
+ 0x00, 0x46, 0x8c, 0xca, 0x05, 0x43, 0x89, 0xcf,
+ 0x0a, 0x4c, 0x86, 0xc0, 0x0f, 0x49, 0x83, 0xc5,
+ 0x14, 0x52, 0x98, 0xde, 0x11, 0x57, 0x9d, 0xdb,
+ 0x1e, 0x58, 0x92, 0xd4, 0x1b, 0x5d, 0x97, 0xd1,
+ 0x28, 0x6e, 0xa4, 0xe2, 0x2d, 0x6b, 0xa1, 0xe7,
+ 0x22, 0x64, 0xae, 0xe8, 0x27, 0x61, 0xab, 0xed,
+ 0x3c, 0x7a, 0xb0, 0xf6, 0x39, 0x7f, 0xb5, 0xf3,
+ 0x36, 0x70, 0xba, 0xfc, 0x33, 0x75, 0xbf, 0xf9,
+ 0x50, 0x16, 0xdc, 0x9a, 0x55, 0x13, 0xd9, 0x9f,
+ 0x5a, 0x1c, 0xd6, 0x90, 0x5f, 0x19, 0xd3, 0x95,
+ 0x44, 0x02, 0xc8, 0x8e, 0x41, 0x07, 0xcd, 0x8b,
+ 0x4e, 0x08, 0xc2, 0x84, 0x4b, 0x0d, 0xc7, 0x81,
+ 0x78, 0x3e, 0xf4, 0xb2, 0x7d, 0x3b, 0xf1, 0xb7,
+ 0x72, 0x34, 0xfe, 0xb8, 0x77, 0x31, 0xfb, 0xbd,
+ 0x6c, 0x2a, 0xe0, 0xa6, 0x69, 0x2f, 0xe5, 0xa3,
+ 0x66, 0x20, 0xea, 0xac, 0x63, 0x25, 0xef, 0xa9,
+ 0xa0, 0xe6, 0x2c, 0x6a, 0xa5, 0xe3, 0x29, 0x6f,
+ 0xaa, 0xec, 0x26, 0x60, 0xaf, 0xe9, 0x23, 0x65,
+ 0xb4, 0xf2, 0x38, 0x7e, 0xb1, 0xf7, 0x3d, 0x7b,
+ 0xbe, 0xf8, 0x32, 0x74, 0xbb, 0xfd, 0x37, 0x71,
+ 0x88, 0xce, 0x04, 0x42, 0x8d, 0xcb, 0x01, 0x47,
+ 0x82, 0xc4, 0x0e, 0x48, 0x87, 0xc1, 0x0b, 0x4d,
+ 0x9c, 0xda, 0x10, 0x56, 0x99, 0xdf, 0x15, 0x53,
+ 0x96, 0xd0, 0x1a, 0x5c, 0x93, 0xd5, 0x1f, 0x59,
+ 0xf0, 0xb6, 0x7c, 0x3a, 0xf5, 0xb3, 0x79, 0x3f,
+ 0xfa, 0xbc, 0x76, 0x30, 0xff, 0xb9, 0x73, 0x35,
+ 0xe4, 0xa2, 0x68, 0x2e, 0xe1, 0xa7, 0x6d, 0x2b,
+ 0xee, 0xa8, 0x62, 0x24, 0xeb, 0xad, 0x67, 0x21,
+ 0xd8, 0x9e, 0x54, 0x12, 0xdd, 0x9b, 0x51, 0x17,
+ 0xd2, 0x94, 0x5e, 0x18, 0xd7, 0x91, 0x5b, 0x1d,
+ 0xcc, 0x8a, 0x40, 0x06, 0xc9, 0x8f, 0x45, 0x03,
+ 0xc6, 0x80, 0x4a, 0x0c, 0xc3, 0x85, 0x4f, 0x09,
+ },
+ {
+ 0x00, 0x47, 0x8e, 0xc9, 0x01, 0x46, 0x8f, 0xc8,
+ 0x02, 0x45, 0x8c, 0xcb, 0x03, 0x44, 0x8d, 0xca,
+ 0x04, 0x43, 0x8a, 0xcd, 0x05, 0x42, 0x8b, 0xcc,
+ 0x06, 0x41, 0x88, 0xcf, 0x07, 0x40, 0x89, 0xce,
+ 0x08, 0x4f, 0x86, 0xc1, 0x09, 0x4e, 0x87, 0xc0,
+ 0x0a, 0x4d, 0x84, 0xc3, 0x0b, 0x4c, 0x85, 0xc2,
+ 0x0c, 0x4b, 0x82, 0xc5, 0x0d, 0x4a, 0x83, 0xc4,
+ 0x0e, 0x49, 0x80, 0xc7, 0x0f, 0x48, 0x81, 0xc6,
+ 0x10, 0x57, 0x9e, 0xd9, 0x11, 0x56, 0x9f, 0xd8,
+ 0x12, 0x55, 0x9c, 0xdb, 0x13, 0x54, 0x9d, 0xda,
+ 0x14, 0x53, 0x9a, 0xdd, 0x15, 0x52, 0x9b, 0xdc,
+ 0x16, 0x51, 0x98, 0xdf, 0x17, 0x50, 0x99, 0xde,
+ 0x18, 0x5f, 0x96, 0xd1, 0x19, 0x5e, 0x97, 0xd0,
+ 0x1a, 0x5d, 0x94, 0xd3, 0x1b, 0x5c, 0x95, 0xd2,
+ 0x1c, 0x5b, 0x92, 0xd5, 0x1d, 0x5a, 0x93, 0xd4,
+ 0x1e, 0x59, 0x90, 0xd7, 0x1f, 0x58, 0x91, 0xd6,
+ 0x20, 0x67, 0xae, 0xe9, 0x21, 0x66, 0xaf, 0xe8,
+ 0x22, 0x65, 0xac, 0xeb, 0x23, 0x64, 0xad, 0xea,
+ 0x24, 0x63, 0xaa, 0xed, 0x25, 0x62, 0xab, 0xec,
+ 0x26, 0x61, 0xa8, 0xef, 0x27, 0x60, 0xa9, 0xee,
+ 0x28, 0x6f, 0xa6, 0xe1, 0x29, 0x6e, 0xa7, 0xe0,
+ 0x2a, 0x6d, 0xa4, 0xe3, 0x2b, 0x6c, 0xa5, 0xe2,
+ 0x2c, 0x6b, 0xa2, 0xe5, 0x2d, 0x6a, 0xa3, 0xe4,
+ 0x2e, 0x69, 0xa0, 0xe7, 0x2f, 0x68, 0xa1, 0xe6,
+ 0x30, 0x77, 0xbe, 0xf9, 0x31, 0x76, 0xbf, 0xf8,
+ 0x32, 0x75, 0xbc, 0xfb, 0x33, 0x74, 0xbd, 0xfa,
+ 0x34, 0x73, 0xba, 0xfd, 0x35, 0x72, 0xbb, 0xfc,
+ 0x36, 0x71, 0xb8, 0xff, 0x37, 0x70, 0xb9, 0xfe,
+ 0x38, 0x7f, 0xb6, 0xf1, 0x39, 0x7e, 0xb7, 0xf0,
+ 0x3a, 0x7d, 0xb4, 0xf3, 0x3b, 0x7c, 0xb5, 0xf2,
+ 0x3c, 0x7b, 0xb2, 0xf5, 0x3d, 0x7a, 0xb3, 0xf4,
+ 0x3e, 0x79, 0xb0, 0xf7, 0x3f, 0x78, 0xb1, 0xf6,
+ },
+ {
+ 0x00, 0x48, 0x90, 0xd8, 0x3d, 0x75, 0xad, 0xe5,
+ 0x7a, 0x32, 0xea, 0xa2, 0x47, 0x0f, 0xd7, 0x9f,
+ 0xf4, 0xbc, 0x64, 0x2c, 0xc9, 0x81, 0x59, 0x11,
+ 0x8e, 0xc6, 0x1e, 0x56, 0xb3, 0xfb, 0x23, 0x6b,
+ 0xf5, 0xbd, 0x65, 0x2d, 0xc8, 0x80, 0x58, 0x10,
+ 0x8f, 0xc7, 0x1f, 0x57, 0xb2, 0xfa, 0x22, 0x6a,
+ 0x01, 0x49, 0x91, 0xd9, 0x3c, 0x74, 0xac, 0xe4,
+ 0x7b, 0x33, 0xeb, 0xa3, 0x46, 0x0e, 0xd6, 0x9e,
+ 0xf7, 0xbf, 0x67, 0x2f, 0xca, 0x82, 0x5a, 0x12,
+ 0x8d, 0xc5, 0x1d, 0x55, 0xb0, 0xf8, 0x20, 0x68,
+ 0x03, 0x4b, 0x93, 0xdb, 0x3e, 0x76, 0xae, 0xe6,
+ 0x79, 0x31, 0xe9, 0xa1, 0x44, 0x0c, 0xd4, 0x9c,
+ 0x02, 0x4a, 0x92, 0xda, 0x3f, 0x77, 0xaf, 0xe7,
+ 0x78, 0x30, 0xe8, 0xa0, 0x45, 0x0d, 0xd5, 0x9d,
+ 0xf6, 0xbe, 0x66, 0x2e, 0xcb, 0x83, 0x5b, 0x13,
+ 0x8c, 0xc4, 0x1c, 0x54, 0xb1, 0xf9, 0x21, 0x69,
+ 0xf3, 0xbb, 0x63, 0x2b, 0xce, 0x86, 0x5e, 0x16,
+ 0x89, 0xc1, 0x19, 0x51, 0xb4, 0xfc, 0x24, 0x6c,
+ 0x07, 0x4f, 0x97, 0xdf, 0x3a, 0x72, 0xaa, 0xe2,
+ 0x7d, 0x35, 0xed, 0xa5, 0x40, 0x08, 0xd0, 0x98,
+ 0x06, 0x4e, 0x96, 0xde, 0x3b, 0x73, 0xab, 0xe3,
+ 0x7c, 0x34, 0xec, 0xa4, 0x41, 0x09, 0xd1, 0x99,
+ 0xf2, 0xba, 0x62, 0x2a, 0xcf, 0x87, 0x5f, 0x17,
+ 0x88, 0xc0, 0x18, 0x50, 0xb5, 0xfd, 0x25, 0x6d,
+ 0x04, 0x4c, 0x94, 0xdc, 0x39, 0x71, 0xa9, 0xe1,
+ 0x7e, 0x36, 0xee, 0xa6, 0x43, 0x0b, 0xd3, 0x9b,
+ 0xf0, 0xb8, 0x60, 0x28, 0xcd, 0x85, 0x5d, 0x15,
+ 0x8a, 0xc2, 0x1a, 0x52, 0xb7, 0xff, 0x27, 0x6f,
+ 0xf1, 0xb9, 0x61, 0x29, 0xcc, 0x84, 0x5c, 0x14,
+ 0x8b, 0xc3, 0x1b, 0x53, 0xb6, 0xfe, 0x26, 0x6e,
+ 0x05, 0x4d, 0x95, 0xdd, 0x38, 0x70, 0xa8, 0xe0,
+ 0x7f, 0x37, 0xef, 0xa7, 0x42, 0x0a, 0xd2, 0x9a,
+ },
+ {
+ 0x00, 0x49, 0x92, 0xdb, 0x39, 0x70, 0xab, 0xe2,
+ 0x72, 0x3b, 0xe0, 0xa9, 0x4b, 0x02, 0xd9, 0x90,
+ 0xe4, 0xad, 0x76, 0x3f, 0xdd, 0x94, 0x4f, 0x06,
+ 0x96, 0xdf, 0x04, 0x4d, 0xaf, 0xe6, 0x3d, 0x74,
+ 0xd5, 0x9c, 0x47, 0x0e, 0xec, 0xa5, 0x7e, 0x37,
+ 0xa7, 0xee, 0x35, 0x7c, 0x9e, 0xd7, 0x0c, 0x45,
+ 0x31, 0x78, 0xa3, 0xea, 0x08, 0x41, 0x9a, 0xd3,
+ 0x43, 0x0a, 0xd1, 0x98, 0x7a, 0x33, 0xe8, 0xa1,
+ 0xb7, 0xfe, 0x25, 0x6c, 0x8e, 0xc7, 0x1c, 0x55,
+ 0xc5, 0x8c, 0x57, 0x1e, 0xfc, 0xb5, 0x6e, 0x27,
+ 0x53, 0x1a, 0xc1, 0x88, 0x6a, 0x23, 0xf8, 0xb1,
+ 0x21, 0x68, 0xb3, 0xfa, 0x18, 0x51, 0x8a, 0xc3,
+ 0x62, 0x2b, 0xf0, 0xb9, 0x5b, 0x12, 0xc9, 0x80,
+ 0x10, 0x59, 0x82, 0xcb, 0x29, 0x60, 0xbb, 0xf2,
+ 0x86, 0xcf, 0x14, 0x5d, 0xbf, 0xf6, 0x2d, 0x64,
+ 0xf4, 0xbd, 0x66, 0x2f, 0xcd, 0x84, 0x5f, 0x16,
+ 0x73, 0x3a, 0xe1, 0xa8, 0x4a, 0x03, 0xd8, 0x91,
+ 0x01, 0x48, 0x93, 0xda, 0x38, 0x71, 0xaa, 0xe3,
+ 0x97, 0xde, 0x05, 0x4c, 0xae, 0xe7, 0x3c, 0x75,
+ 0xe5, 0xac, 0x77, 0x3e, 0xdc, 0x95, 0x4e, 0x07,
+ 0xa6, 0xef, 0x34, 0x7d, 0x9f, 0xd6, 0x0d, 0x44,
+ 0xd4, 0x9d, 0x46, 0x0f, 0xed, 0xa4, 0x7f, 0x36,
+ 0x42, 0x0b, 0xd0, 0x99, 0x7b, 0x32, 0xe9, 0xa0,
+ 0x30, 0x79, 0xa2, 0xeb, 0x09, 0x40, 0x9b, 0xd2,
+ 0xc4, 0x8d, 0x56, 0x1f, 0xfd, 0xb4, 0x6f, 0x26,
+ 0xb6, 0xff, 0x24, 0x6d, 0x8f, 0xc6, 0x1d, 0x54,
+ 0x20, 0x69, 0xb2, 0xfb, 0x19, 0x50, 0x8b, 0xc2,
+ 0x52, 0x1b, 0xc0, 0x89, 0x6b, 0x22, 0xf9, 0xb0,
+ 0x11, 0x58, 0x83, 0xca, 0x28, 0x61, 0xba, 0xf3,
+ 0x63, 0x2a, 0xf1, 0xb8, 0x5a, 0x13, 0xc8, 0x81,
+ 0xf5, 0xbc, 0x67, 0x2e, 0xcc, 0x85, 0x5e, 0x17,
+ 0x87, 0xce, 0x15, 0x5c, 0xbe, 0xf7, 0x2c, 0x65,
+ },
+ {
+ 0x00, 0x4a, 0x94, 0xde, 0x35, 0x7f, 0xa1, 0xeb,
+ 0x6a, 0x20, 0xfe, 0xb4, 0x5f, 0x15, 0xcb, 0x81,
+ 0xd4, 0x9e, 0x40, 0x0a, 0xe1, 0xab, 0x75, 0x3f,
+ 0xbe, 0xf4, 0x2a, 0x60, 0x8b, 0xc1, 0x1f, 0x55,
+ 0xb5, 0xff, 0x21, 0x6b, 0x80, 0xca, 0x14, 0x5e,
+ 0xdf, 0x95, 0x4b, 0x01, 0xea, 0xa0, 0x7e, 0x34,
+ 0x61, 0x2b, 0xf5, 0xbf, 0x54, 0x1e, 0xc0, 0x8a,
+ 0x0b, 0x41, 0x9f, 0xd5, 0x3e, 0x74, 0xaa, 0xe0,
+ 0x77, 0x3d, 0xe3, 0xa9, 0x42, 0x08, 0xd6, 0x9c,
+ 0x1d, 0x57, 0x89, 0xc3, 0x28, 0x62, 0xbc, 0xf6,
+ 0xa3, 0xe9, 0x37, 0x7d, 0x96, 0xdc, 0x02, 0x48,
+ 0xc9, 0x83, 0x5d, 0x17, 0xfc, 0xb6, 0x68, 0x22,
+ 0xc2, 0x88, 0x56, 0x1c, 0xf7, 0xbd, 0x63, 0x29,
+ 0xa8, 0xe2, 0x3c, 0x76, 0x9d, 0xd7, 0x09, 0x43,
+ 0x16, 0x5c, 0x82, 0xc8, 0x23, 0x69, 0xb7, 0xfd,
+ 0x7c, 0x36, 0xe8, 0xa2, 0x49, 0x03, 0xdd, 0x97,
+ 0xee, 0xa4, 0x7a, 0x30, 0xdb, 0x91, 0x4f, 0x05,
+ 0x84, 0xce, 0x10, 0x5a, 0xb1, 0xfb, 0x25, 0x6f,
+ 0x3a, 0x70, 0xae, 0xe4, 0x0f, 0x45, 0x9b, 0xd1,
+ 0x50, 0x1a, 0xc4, 0x8e, 0x65, 0x2f, 0xf1, 0xbb,
+ 0x5b, 0x11, 0xcf, 0x85, 0x6e, 0x24, 0xfa, 0xb0,
+ 0x31, 0x7b, 0xa5, 0xef, 0x04, 0x4e, 0x90, 0xda,
+ 0x8f, 0xc5, 0x1b, 0x51, 0xba, 0xf0, 0x2e, 0x64,
+ 0xe5, 0xaf, 0x71, 0x3b, 0xd0, 0x9a, 0x44, 0x0e,
+ 0x99, 0xd3, 0x0d, 0x47, 0xac, 0xe6, 0x38, 0x72,
+ 0xf3, 0xb9, 0x67, 0x2d, 0xc6, 0x8c, 0x52, 0x18,
+ 0x4d, 0x07, 0xd9, 0x93, 0x78, 0x32, 0xec, 0xa6,
+ 0x27, 0x6d, 0xb3, 0xf9, 0x12, 0x58, 0x86, 0xcc,
+ 0x2c, 0x66, 0xb8, 0xf2, 0x19, 0x53, 0x8d, 0xc7,
+ 0x46, 0x0c, 0xd2, 0x98, 0x73, 0x39, 0xe7, 0xad,
+ 0xf8, 0xb2, 0x6c, 0x26, 0xcd, 0x87, 0x59, 0x13,
+ 0x92, 0xd8, 0x06, 0x4c, 0xa7, 0xed, 0x33, 0x79,
+ },
+ {
+ 0x00, 0x4b, 0x96, 0xdd, 0x31, 0x7a, 0xa7, 0xec,
+ 0x62, 0x29, 0xf4, 0xbf, 0x53, 0x18, 0xc5, 0x8e,
+ 0xc4, 0x8f, 0x52, 0x19, 0xf5, 0xbe, 0x63, 0x28,
+ 0xa6, 0xed, 0x30, 0x7b, 0x97, 0xdc, 0x01, 0x4a,
+ 0x95, 0xde, 0x03, 0x48, 0xa4, 0xef, 0x32, 0x79,
+ 0xf7, 0xbc, 0x61, 0x2a, 0xc6, 0x8d, 0x50, 0x1b,
+ 0x51, 0x1a, 0xc7, 0x8c, 0x60, 0x2b, 0xf6, 0xbd,
+ 0x33, 0x78, 0xa5, 0xee, 0x02, 0x49, 0x94, 0xdf,
+ 0x37, 0x7c, 0xa1, 0xea, 0x06, 0x4d, 0x90, 0xdb,
+ 0x55, 0x1e, 0xc3, 0x88, 0x64, 0x2f, 0xf2, 0xb9,
+ 0xf3, 0xb8, 0x65, 0x2e, 0xc2, 0x89, 0x54, 0x1f,
+ 0x91, 0xda, 0x07, 0x4c, 0xa0, 0xeb, 0x36, 0x7d,
+ 0xa2, 0xe9, 0x34, 0x7f, 0x93, 0xd8, 0x05, 0x4e,
+ 0xc0, 0x8b, 0x56, 0x1d, 0xf1, 0xba, 0x67, 0x2c,
+ 0x66, 0x2d, 0xf0, 0xbb, 0x57, 0x1c, 0xc1, 0x8a,
+ 0x04, 0x4f, 0x92, 0xd9, 0x35, 0x7e, 0xa3, 0xe8,
+ 0x6e, 0x25, 0xf8, 0xb3, 0x5f, 0x14, 0xc9, 0x82,
+ 0x0c, 0x47, 0x9a, 0xd1, 0x3d, 0x76, 0xab, 0xe0,
+ 0xaa, 0xe1, 0x3c, 0x77, 0x9b, 0xd0, 0x0d, 0x46,
+ 0xc8, 0x83, 0x5e, 0x15, 0xf9, 0xb2, 0x6f, 0x24,
+ 0xfb, 0xb0, 0x6d, 0x26, 0xca, 0x81, 0x5c, 0x17,
+ 0x99, 0xd2, 0x0f, 0x44, 0xa8, 0xe3, 0x3e, 0x75,
+ 0x3f, 0x74, 0xa9, 0xe2, 0x0e, 0x45, 0x98, 0xd3,
+ 0x5d, 0x16, 0xcb, 0x80, 0x6c, 0x27, 0xfa, 0xb1,
+ 0x59, 0x12, 0xcf, 0x84, 0x68, 0x23, 0xfe, 0xb5,
+ 0x3b, 0x70, 0xad, 0xe6, 0x0a, 0x41, 0x9c, 0xd7,
+ 0x9d, 0xd6, 0x0b, 0x40, 0xac, 0xe7, 0x3a, 0x71,
+ 0xff, 0xb4, 0x69, 0x22, 0xce, 0x85, 0x58, 0x13,
+ 0xcc, 0x87, 0x5a, 0x11, 0xfd, 0xb6, 0x6b, 0x20,
+ 0xae, 0xe5, 0x38, 0x73, 0x9f, 0xd4, 0x09, 0x42,
+ 0x08, 0x43, 0x9e, 0xd5, 0x39, 0x72, 0xaf, 0xe4,
+ 0x6a, 0x21, 0xfc, 0xb7, 0x5b, 0x10, 0xcd, 0x86,
+ },
+ {
+ 0x00, 0x4c, 0x98, 0xd4, 0x2d, 0x61, 0xb5, 0xf9,
+ 0x5a, 0x16, 0xc2, 0x8e, 0x77, 0x3b, 0xef, 0xa3,
+ 0xb4, 0xf8, 0x2c, 0x60, 0x99, 0xd5, 0x01, 0x4d,
+ 0xee, 0xa2, 0x76, 0x3a, 0xc3, 0x8f, 0x5b, 0x17,
+ 0x75, 0x39, 0xed, 0xa1, 0x58, 0x14, 0xc0, 0x8c,
+ 0x2f, 0x63, 0xb7, 0xfb, 0x02, 0x4e, 0x9a, 0xd6,
+ 0xc1, 0x8d, 0x59, 0x15, 0xec, 0xa0, 0x74, 0x38,
+ 0x9b, 0xd7, 0x03, 0x4f, 0xb6, 0xfa, 0x2e, 0x62,
+ 0xea, 0xa6, 0x72, 0x3e, 0xc7, 0x8b, 0x5f, 0x13,
+ 0xb0, 0xfc, 0x28, 0x64, 0x9d, 0xd1, 0x05, 0x49,
+ 0x5e, 0x12, 0xc6, 0x8a, 0x73, 0x3f, 0xeb, 0xa7,
+ 0x04, 0x48, 0x9c, 0xd0, 0x29, 0x65, 0xb1, 0xfd,
+ 0x9f, 0xd3, 0x07, 0x4b, 0xb2, 0xfe, 0x2a, 0x66,
+ 0xc5, 0x89, 0x5d, 0x11, 0xe8, 0xa4, 0x70, 0x3c,
+ 0x2b, 0x67, 0xb3, 0xff, 0x06, 0x4a, 0x9e, 0xd2,
+ 0x71, 0x3d, 0xe9, 0xa5, 0x5c, 0x10, 0xc4, 0x88,
+ 0xc9, 0x85, 0x51, 0x1d, 0xe4, 0xa8, 0x7c, 0x30,
+ 0x93, 0xdf, 0x0b, 0x47, 0xbe, 0xf2, 0x26, 0x6a,
+ 0x7d, 0x31, 0xe5, 0xa9, 0x50, 0x1c, 0xc8, 0x84,
+ 0x27, 0x6b, 0xbf, 0xf3, 0x0a, 0x46, 0x92, 0xde,
+ 0xbc, 0xf0, 0x24, 0x68, 0x91, 0xdd, 0x09, 0x45,
+ 0xe6, 0xaa, 0x7e, 0x32, 0xcb, 0x87, 0x53, 0x1f,
+ 0x08, 0x44, 0x90, 0xdc, 0x25, 0x69, 0xbd, 0xf1,
+ 0x52, 0x1e, 0xca, 0x86, 0x7f, 0x33, 0xe7, 0xab,
+ 0x23, 0x6f, 0xbb, 0xf7, 0x0e, 0x42, 0x96, 0xda,
+ 0x79, 0x35, 0xe1, 0xad, 0x54, 0x18, 0xcc, 0x80,
+ 0x97, 0xdb, 0x0f, 0x43, 0xba, 0xf6, 0x22, 0x6e,
+ 0xcd, 0x81, 0x55, 0x19, 0xe0, 0xac, 0x78, 0x34,
+ 0x56, 0x1a, 0xce, 0x82, 0x7b, 0x37, 0xe3, 0xaf,
+ 0x0c, 0x40, 0x94, 0xd8, 0x21, 0x6d, 0xb9, 0xf5,
+ 0xe2, 0xae, 0x7a, 0x36, 0xcf, 0x83, 0x57, 0x1b,
+ 0xb8, 0xf4, 0x20, 0x6c, 0x95, 0xd9, 0x0d, 0x41,
+ },
+ {
+ 0x00, 0x4d, 0x9a, 0xd7, 0x29, 0x64, 0xb3, 0xfe,
+ 0x52, 0x1f, 0xc8, 0x85, 0x7b, 0x36, 0xe1, 0xac,
+ 0xa4, 0xe9, 0x3e, 0x73, 0x8d, 0xc0, 0x17, 0x5a,
+ 0xf6, 0xbb, 0x6c, 0x21, 0xdf, 0x92, 0x45, 0x08,
+ 0x55, 0x18, 0xcf, 0x82, 0x7c, 0x31, 0xe6, 0xab,
+ 0x07, 0x4a, 0x9d, 0xd0, 0x2e, 0x63, 0xb4, 0xf9,
+ 0xf1, 0xbc, 0x6b, 0x26, 0xd8, 0x95, 0x42, 0x0f,
+ 0xa3, 0xee, 0x39, 0x74, 0x8a, 0xc7, 0x10, 0x5d,
+ 0xaa, 0xe7, 0x30, 0x7d, 0x83, 0xce, 0x19, 0x54,
+ 0xf8, 0xb5, 0x62, 0x2f, 0xd1, 0x9c, 0x4b, 0x06,
+ 0x0e, 0x43, 0x94, 0xd9, 0x27, 0x6a, 0xbd, 0xf0,
+ 0x5c, 0x11, 0xc6, 0x8b, 0x75, 0x38, 0xef, 0xa2,
+ 0xff, 0xb2, 0x65, 0x28, 0xd6, 0x9b, 0x4c, 0x01,
+ 0xad, 0xe0, 0x37, 0x7a, 0x84, 0xc9, 0x1e, 0x53,
+ 0x5b, 0x16, 0xc1, 0x8c, 0x72, 0x3f, 0xe8, 0xa5,
+ 0x09, 0x44, 0x93, 0xde, 0x20, 0x6d, 0xba, 0xf7,
+ 0x49, 0x04, 0xd3, 0x9e, 0x60, 0x2d, 0xfa, 0xb7,
+ 0x1b, 0x56, 0x81, 0xcc, 0x32, 0x7f, 0xa8, 0xe5,
+ 0xed, 0xa0, 0x77, 0x3a, 0xc4, 0x89, 0x5e, 0x13,
+ 0xbf, 0xf2, 0x25, 0x68, 0x96, 0xdb, 0x0c, 0x41,
+ 0x1c, 0x51, 0x86, 0xcb, 0x35, 0x78, 0xaf, 0xe2,
+ 0x4e, 0x03, 0xd4, 0x99, 0x67, 0x2a, 0xfd, 0xb0,
+ 0xb8, 0xf5, 0x22, 0x6f, 0x91, 0xdc, 0x0b, 0x46,
+ 0xea, 0xa7, 0x70, 0x3d, 0xc3, 0x8e, 0x59, 0x14,
+ 0xe3, 0xae, 0x79, 0x34, 0xca, 0x87, 0x50, 0x1d,
+ 0xb1, 0xfc, 0x2b, 0x66, 0x98, 0xd5, 0x02, 0x4f,
+ 0x47, 0x0a, 0xdd, 0x90, 0x6e, 0x23, 0xf4, 0xb9,
+ 0x15, 0x58, 0x8f, 0xc2, 0x3c, 0x71, 0xa6, 0xeb,
+ 0xb6, 0xfb, 0x2c, 0x61, 0x9f, 0xd2, 0x05, 0x48,
+ 0xe4, 0xa9, 0x7e, 0x33, 0xcd, 0x80, 0x57, 0x1a,
+ 0x12, 0x5f, 0x88, 0xc5, 0x3b, 0x76, 0xa1, 0xec,
+ 0x40, 0x0d, 0xda, 0x97, 0x69, 0x24, 0xf3, 0xbe,
+ },
+ {
+ 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd,
+ 0x94, 0xda, 0x08, 0x46, 0xb1, 0xff, 0x2d, 0x63,
+ 0xde, 0x90, 0x42, 0x0c, 0xfb, 0xb5, 0x67, 0x29,
+ 0x35, 0x7b, 0xa9, 0xe7, 0x10, 0x5e, 0x8c, 0xc2,
+ 0x7f, 0x31, 0xe3, 0xad, 0x5a, 0x14, 0xc6, 0x88,
+ 0xa1, 0xef, 0x3d, 0x73, 0x84, 0xca, 0x18, 0x56,
+ 0xeb, 0xa5, 0x77, 0x39, 0xce, 0x80, 0x52, 0x1c,
+ 0x6a, 0x24, 0xf6, 0xb8, 0x4f, 0x01, 0xd3, 0x9d,
+ 0x20, 0x6e, 0xbc, 0xf2, 0x05, 0x4b, 0x99, 0xd7,
+ 0xfe, 0xb0, 0x62, 0x2c, 0xdb, 0x95, 0x47, 0x09,
+ 0xb4, 0xfa, 0x28, 0x66, 0x91, 0xdf, 0x0d, 0x43,
+ 0x5f, 0x11, 0xc3, 0x8d, 0x7a, 0x34, 0xe6, 0xa8,
+ 0x15, 0x5b, 0x89, 0xc7, 0x30, 0x7e, 0xac, 0xe2,
+ 0xcb, 0x85, 0x57, 0x19, 0xee, 0xa0, 0x72, 0x3c,
+ 0x81, 0xcf, 0x1d, 0x53, 0xa4, 0xea, 0x38, 0x76,
+ 0xd4, 0x9a, 0x48, 0x06, 0xf1, 0xbf, 0x6d, 0x23,
+ 0x9e, 0xd0, 0x02, 0x4c, 0xbb, 0xf5, 0x27, 0x69,
+ 0x40, 0x0e, 0xdc, 0x92, 0x65, 0x2b, 0xf9, 0xb7,
+ 0x0a, 0x44, 0x96, 0xd8, 0x2f, 0x61, 0xb3, 0xfd,
+ 0xe1, 0xaf, 0x7d, 0x33, 0xc4, 0x8a, 0x58, 0x16,
+ 0xab, 0xe5, 0x37, 0x79, 0x8e, 0xc0, 0x12, 0x5c,
+ 0x75, 0x3b, 0xe9, 0xa7, 0x50, 0x1e, 0xcc, 0x82,
+ 0x3f, 0x71, 0xa3, 0xed, 0x1a, 0x54, 0x86, 0xc8,
+ 0xbe, 0xf0, 0x22, 0x6c, 0x9b, 0xd5, 0x07, 0x49,
+ 0xf4, 0xba, 0x68, 0x26, 0xd1, 0x9f, 0x4d, 0x03,
+ 0x2a, 0x64, 0xb6, 0xf8, 0x0f, 0x41, 0x93, 0xdd,
+ 0x60, 0x2e, 0xfc, 0xb2, 0x45, 0x0b, 0xd9, 0x97,
+ 0x8b, 0xc5, 0x17, 0x59, 0xae, 0xe0, 0x32, 0x7c,
+ 0xc1, 0x8f, 0x5d, 0x13, 0xe4, 0xaa, 0x78, 0x36,
+ 0x1f, 0x51, 0x83, 0xcd, 0x3a, 0x74, 0xa6, 0xe8,
+ 0x55, 0x1b, 0xc9, 0x87, 0x70, 0x3e, 0xec, 0xa2,
+ },
+ {
+ 0x00, 0x4f, 0x9e, 0xd1, 0x21, 0x6e, 0xbf, 0xf0,
+ 0x42, 0x0d, 0xdc, 0x93, 0x63, 0x2c, 0xfd, 0xb2,
+ 0x84, 0xcb, 0x1a, 0x55, 0xa5, 0xea, 0x3b, 0x74,
+ 0xc6, 0x89, 0x58, 0x17, 0xe7, 0xa8, 0x79, 0x36,
+ 0x15, 0x5a, 0x8b, 0xc4, 0x34, 0x7b, 0xaa, 0xe5,
+ 0x57, 0x18, 0xc9, 0x86, 0x76, 0x39, 0xe8, 0xa7,
+ 0x91, 0xde, 0x0f, 0x40, 0xb0, 0xff, 0x2e, 0x61,
+ 0xd3, 0x9c, 0x4d, 0x02, 0xf2, 0xbd, 0x6c, 0x23,
+ 0x2a, 0x65, 0xb4, 0xfb, 0x0b, 0x44, 0x95, 0xda,
+ 0x68, 0x27, 0xf6, 0xb9, 0x49, 0x06, 0xd7, 0x98,
+ 0xae, 0xe1, 0x30, 0x7f, 0x8f, 0xc0, 0x11, 0x5e,
+ 0xec, 0xa3, 0x72, 0x3d, 0xcd, 0x82, 0x53, 0x1c,
+ 0x3f, 0x70, 0xa1, 0xee, 0x1e, 0x51, 0x80, 0xcf,
+ 0x7d, 0x32, 0xe3, 0xac, 0x5c, 0x13, 0xc2, 0x8d,
+ 0xbb, 0xf4, 0x25, 0x6a, 0x9a, 0xd5, 0x04, 0x4b,
+ 0xf9, 0xb6, 0x67, 0x28, 0xd8, 0x97, 0x46, 0x09,
+ 0x54, 0x1b, 0xca, 0x85, 0x75, 0x3a, 0xeb, 0xa4,
+ 0x16, 0x59, 0x88, 0xc7, 0x37, 0x78, 0xa9, 0xe6,
+ 0xd0, 0x9f, 0x4e, 0x01, 0xf1, 0xbe, 0x6f, 0x20,
+ 0x92, 0xdd, 0x0c, 0x43, 0xb3, 0xfc, 0x2d, 0x62,
+ 0x41, 0x0e, 0xdf, 0x90, 0x60, 0x2f, 0xfe, 0xb1,
+ 0x03, 0x4c, 0x9d, 0xd2, 0x22, 0x6d, 0xbc, 0xf3,
+ 0xc5, 0x8a, 0x5b, 0x14, 0xe4, 0xab, 0x7a, 0x35,
+ 0x87, 0xc8, 0x19, 0x56, 0xa6, 0xe9, 0x38, 0x77,
+ 0x7e, 0x31, 0xe0, 0xaf, 0x5f, 0x10, 0xc1, 0x8e,
+ 0x3c, 0x73, 0xa2, 0xed, 0x1d, 0x52, 0x83, 0xcc,
+ 0xfa, 0xb5, 0x64, 0x2b, 0xdb, 0x94, 0x45, 0x0a,
+ 0xb8, 0xf7, 0x26, 0x69, 0x99, 0xd6, 0x07, 0x48,
+ 0x6b, 0x24, 0xf5, 0xba, 0x4a, 0x05, 0xd4, 0x9b,
+ 0x29, 0x66, 0xb7, 0xf8, 0x08, 0x47, 0x96, 0xd9,
+ 0xef, 0xa0, 0x71, 0x3e, 0xce, 0x81, 0x50, 0x1f,
+ 0xad, 0xe2, 0x33, 0x7c, 0x8c, 0xc3, 0x12, 0x5d,
+ },
+ {
+ 0x00, 0x50, 0xa0, 0xf0, 0x5d, 0x0d, 0xfd, 0xad,
+ 0xba, 0xea, 0x1a, 0x4a, 0xe7, 0xb7, 0x47, 0x17,
+ 0x69, 0x39, 0xc9, 0x99, 0x34, 0x64, 0x94, 0xc4,
+ 0xd3, 0x83, 0x73, 0x23, 0x8e, 0xde, 0x2e, 0x7e,
+ 0xd2, 0x82, 0x72, 0x22, 0x8f, 0xdf, 0x2f, 0x7f,
+ 0x68, 0x38, 0xc8, 0x98, 0x35, 0x65, 0x95, 0xc5,
+ 0xbb, 0xeb, 0x1b, 0x4b, 0xe6, 0xb6, 0x46, 0x16,
+ 0x01, 0x51, 0xa1, 0xf1, 0x5c, 0x0c, 0xfc, 0xac,
+ 0xb9, 0xe9, 0x19, 0x49, 0xe4, 0xb4, 0x44, 0x14,
+ 0x03, 0x53, 0xa3, 0xf3, 0x5e, 0x0e, 0xfe, 0xae,
+ 0xd0, 0x80, 0x70, 0x20, 0x8d, 0xdd, 0x2d, 0x7d,
+ 0x6a, 0x3a, 0xca, 0x9a, 0x37, 0x67, 0x97, 0xc7,
+ 0x6b, 0x3b, 0xcb, 0x9b, 0x36, 0x66, 0x96, 0xc6,
+ 0xd1, 0x81, 0x71, 0x21, 0x8c, 0xdc, 0x2c, 0x7c,
+ 0x02, 0x52, 0xa2, 0xf2, 0x5f, 0x0f, 0xff, 0xaf,
+ 0xb8, 0xe8, 0x18, 0x48, 0xe5, 0xb5, 0x45, 0x15,
+ 0x6f, 0x3f, 0xcf, 0x9f, 0x32, 0x62, 0x92, 0xc2,
+ 0xd5, 0x85, 0x75, 0x25, 0x88, 0xd8, 0x28, 0x78,
+ 0x06, 0x56, 0xa6, 0xf6, 0x5b, 0x0b, 0xfb, 0xab,
+ 0xbc, 0xec, 0x1c, 0x4c, 0xe1, 0xb1, 0x41, 0x11,
+ 0xbd, 0xed, 0x1d, 0x4d, 0xe0, 0xb0, 0x40, 0x10,
+ 0x07, 0x57, 0xa7, 0xf7, 0x5a, 0x0a, 0xfa, 0xaa,
+ 0xd4, 0x84, 0x74, 0x24, 0x89, 0xd9, 0x29, 0x79,
+ 0x6e, 0x3e, 0xce, 0x9e, 0x33, 0x63, 0x93, 0xc3,
+ 0xd6, 0x86, 0x76, 0x26, 0x8b, 0xdb, 0x2b, 0x7b,
+ 0x6c, 0x3c, 0xcc, 0x9c, 0x31, 0x61, 0x91, 0xc1,
+ 0xbf, 0xef, 0x1f, 0x4f, 0xe2, 0xb2, 0x42, 0x12,
+ 0x05, 0x55, 0xa5, 0xf5, 0x58, 0x08, 0xf8, 0xa8,
+ 0x04, 0x54, 0xa4, 0xf4, 0x59, 0x09, 0xf9, 0xa9,
+ 0xbe, 0xee, 0x1e, 0x4e, 0xe3, 0xb3, 0x43, 0x13,
+ 0x6d, 0x3d, 0xcd, 0x9d, 0x30, 0x60, 0x90, 0xc0,
+ 0xd7, 0x87, 0x77, 0x27, 0x8a, 0xda, 0x2a, 0x7a,
+ },
+ {
+ 0x00, 0x51, 0xa2, 0xf3, 0x59, 0x08, 0xfb, 0xaa,
+ 0xb2, 0xe3, 0x10, 0x41, 0xeb, 0xba, 0x49, 0x18,
+ 0x79, 0x28, 0xdb, 0x8a, 0x20, 0x71, 0x82, 0xd3,
+ 0xcb, 0x9a, 0x69, 0x38, 0x92, 0xc3, 0x30, 0x61,
+ 0xf2, 0xa3, 0x50, 0x01, 0xab, 0xfa, 0x09, 0x58,
+ 0x40, 0x11, 0xe2, 0xb3, 0x19, 0x48, 0xbb, 0xea,
+ 0x8b, 0xda, 0x29, 0x78, 0xd2, 0x83, 0x70, 0x21,
+ 0x39, 0x68, 0x9b, 0xca, 0x60, 0x31, 0xc2, 0x93,
+ 0xf9, 0xa8, 0x5b, 0x0a, 0xa0, 0xf1, 0x02, 0x53,
+ 0x4b, 0x1a, 0xe9, 0xb8, 0x12, 0x43, 0xb0, 0xe1,
+ 0x80, 0xd1, 0x22, 0x73, 0xd9, 0x88, 0x7b, 0x2a,
+ 0x32, 0x63, 0x90, 0xc1, 0x6b, 0x3a, 0xc9, 0x98,
+ 0x0b, 0x5a, 0xa9, 0xf8, 0x52, 0x03, 0xf0, 0xa1,
+ 0xb9, 0xe8, 0x1b, 0x4a, 0xe0, 0xb1, 0x42, 0x13,
+ 0x72, 0x23, 0xd0, 0x81, 0x2b, 0x7a, 0x89, 0xd8,
+ 0xc0, 0x91, 0x62, 0x33, 0x99, 0xc8, 0x3b, 0x6a,
+ 0xef, 0xbe, 0x4d, 0x1c, 0xb6, 0xe7, 0x14, 0x45,
+ 0x5d, 0x0c, 0xff, 0xae, 0x04, 0x55, 0xa6, 0xf7,
+ 0x96, 0xc7, 0x34, 0x65, 0xcf, 0x9e, 0x6d, 0x3c,
+ 0x24, 0x75, 0x86, 0xd7, 0x7d, 0x2c, 0xdf, 0x8e,
+ 0x1d, 0x4c, 0xbf, 0xee, 0x44, 0x15, 0xe6, 0xb7,
+ 0xaf, 0xfe, 0x0d, 0x5c, 0xf6, 0xa7, 0x54, 0x05,
+ 0x64, 0x35, 0xc6, 0x97, 0x3d, 0x6c, 0x9f, 0xce,
+ 0xd6, 0x87, 0x74, 0x25, 0x8f, 0xde, 0x2d, 0x7c,
+ 0x16, 0x47, 0xb4, 0xe5, 0x4f, 0x1e, 0xed, 0xbc,
+ 0xa4, 0xf5, 0x06, 0x57, 0xfd, 0xac, 0x5f, 0x0e,
+ 0x6f, 0x3e, 0xcd, 0x9c, 0x36, 0x67, 0x94, 0xc5,
+ 0xdd, 0x8c, 0x7f, 0x2e, 0x84, 0xd5, 0x26, 0x77,
+ 0xe4, 0xb5, 0x46, 0x17, 0xbd, 0xec, 0x1f, 0x4e,
+ 0x56, 0x07, 0xf4, 0xa5, 0x0f, 0x5e, 0xad, 0xfc,
+ 0x9d, 0xcc, 0x3f, 0x6e, 0xc4, 0x95, 0x66, 0x37,
+ 0x2f, 0x7e, 0x8d, 0xdc, 0x76, 0x27, 0xd4, 0x85,
+ },
+ {
+ 0x00, 0x52, 0xa4, 0xf6, 0x55, 0x07, 0xf1, 0xa3,
+ 0xaa, 0xf8, 0x0e, 0x5c, 0xff, 0xad, 0x5b, 0x09,
+ 0x49, 0x1b, 0xed, 0xbf, 0x1c, 0x4e, 0xb8, 0xea,
+ 0xe3, 0xb1, 0x47, 0x15, 0xb6, 0xe4, 0x12, 0x40,
+ 0x92, 0xc0, 0x36, 0x64, 0xc7, 0x95, 0x63, 0x31,
+ 0x38, 0x6a, 0x9c, 0xce, 0x6d, 0x3f, 0xc9, 0x9b,
+ 0xdb, 0x89, 0x7f, 0x2d, 0x8e, 0xdc, 0x2a, 0x78,
+ 0x71, 0x23, 0xd5, 0x87, 0x24, 0x76, 0x80, 0xd2,
+ 0x39, 0x6b, 0x9d, 0xcf, 0x6c, 0x3e, 0xc8, 0x9a,
+ 0x93, 0xc1, 0x37, 0x65, 0xc6, 0x94, 0x62, 0x30,
+ 0x70, 0x22, 0xd4, 0x86, 0x25, 0x77, 0x81, 0xd3,
+ 0xda, 0x88, 0x7e, 0x2c, 0x8f, 0xdd, 0x2b, 0x79,
+ 0xab, 0xf9, 0x0f, 0x5d, 0xfe, 0xac, 0x5a, 0x08,
+ 0x01, 0x53, 0xa5, 0xf7, 0x54, 0x06, 0xf0, 0xa2,
+ 0xe2, 0xb0, 0x46, 0x14, 0xb7, 0xe5, 0x13, 0x41,
+ 0x48, 0x1a, 0xec, 0xbe, 0x1d, 0x4f, 0xb9, 0xeb,
+ 0x72, 0x20, 0xd6, 0x84, 0x27, 0x75, 0x83, 0xd1,
+ 0xd8, 0x8a, 0x7c, 0x2e, 0x8d, 0xdf, 0x29, 0x7b,
+ 0x3b, 0x69, 0x9f, 0xcd, 0x6e, 0x3c, 0xca, 0x98,
+ 0x91, 0xc3, 0x35, 0x67, 0xc4, 0x96, 0x60, 0x32,
+ 0xe0, 0xb2, 0x44, 0x16, 0xb5, 0xe7, 0x11, 0x43,
+ 0x4a, 0x18, 0xee, 0xbc, 0x1f, 0x4d, 0xbb, 0xe9,
+ 0xa9, 0xfb, 0x0d, 0x5f, 0xfc, 0xae, 0x58, 0x0a,
+ 0x03, 0x51, 0xa7, 0xf5, 0x56, 0x04, 0xf2, 0xa0,
+ 0x4b, 0x19, 0xef, 0xbd, 0x1e, 0x4c, 0xba, 0xe8,
+ 0xe1, 0xb3, 0x45, 0x17, 0xb4, 0xe6, 0x10, 0x42,
+ 0x02, 0x50, 0xa6, 0xf4, 0x57, 0x05, 0xf3, 0xa1,
+ 0xa8, 0xfa, 0x0c, 0x5e, 0xfd, 0xaf, 0x59, 0x0b,
+ 0xd9, 0x8b, 0x7d, 0x2f, 0x8c, 0xde, 0x28, 0x7a,
+ 0x73, 0x21, 0xd7, 0x85, 0x26, 0x74, 0x82, 0xd0,
+ 0x90, 0xc2, 0x34, 0x66, 0xc5, 0x97, 0x61, 0x33,
+ 0x3a, 0x68, 0x9e, 0xcc, 0x6f, 0x3d, 0xcb, 0x99,
+ },
+ {
+ 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06,
+ 0x59, 0x0a, 0xff, 0xac, 0x08, 0x5b, 0xae, 0xfd,
+ 0xfb, 0xa8, 0x5d, 0x0e, 0xaa, 0xf9, 0x0c, 0x5f,
+ 0xb2, 0xe1, 0x14, 0x47, 0xe3, 0xb0, 0x45, 0x16,
+ 0x10, 0x43, 0xb6, 0xe5, 0x41, 0x12, 0xe7, 0xb4,
+ 0xeb, 0xb8, 0x4d, 0x1e, 0xba, 0xe9, 0x1c, 0x4f,
+ 0x49, 0x1a, 0xef, 0xbc, 0x18, 0x4b, 0xbe, 0xed,
+ 0x79, 0x2a, 0xdf, 0x8c, 0x28, 0x7b, 0x8e, 0xdd,
+ 0xdb, 0x88, 0x7d, 0x2e, 0x8a, 0xd9, 0x2c, 0x7f,
+ 0x20, 0x73, 0x86, 0xd5, 0x71, 0x22, 0xd7, 0x84,
+ 0x82, 0xd1, 0x24, 0x77, 0xd3, 0x80, 0x75, 0x26,
+ 0xcb, 0x98, 0x6d, 0x3e, 0x9a, 0xc9, 0x3c, 0x6f,
+ 0x69, 0x3a, 0xcf, 0x9c, 0x38, 0x6b, 0x9e, 0xcd,
+ 0x92, 0xc1, 0x34, 0x67, 0xc3, 0x90, 0x65, 0x36,
+ 0x30, 0x63, 0x96, 0xc5, 0x61, 0x32, 0xc7, 0x94,
+ 0xf2, 0xa1, 0x54, 0x07, 0xa3, 0xf0, 0x05, 0x56,
+ 0x50, 0x03, 0xf6, 0xa5, 0x01, 0x52, 0xa7, 0xf4,
+ 0xab, 0xf8, 0x0d, 0x5e, 0xfa, 0xa9, 0x5c, 0x0f,
+ 0x09, 0x5a, 0xaf, 0xfc, 0x58, 0x0b, 0xfe, 0xad,
+ 0x40, 0x13, 0xe6, 0xb5, 0x11, 0x42, 0xb7, 0xe4,
+ 0xe2, 0xb1, 0x44, 0x17, 0xb3, 0xe0, 0x15, 0x46,
+ 0x19, 0x4a, 0xbf, 0xec, 0x48, 0x1b, 0xee, 0xbd,
+ 0xbb, 0xe8, 0x1d, 0x4e, 0xea, 0xb9, 0x4c, 0x1f,
+ 0x8b, 0xd8, 0x2d, 0x7e, 0xda, 0x89, 0x7c, 0x2f,
+ 0x29, 0x7a, 0x8f, 0xdc, 0x78, 0x2b, 0xde, 0x8d,
+ 0xd2, 0x81, 0x74, 0x27, 0x83, 0xd0, 0x25, 0x76,
+ 0x70, 0x23, 0xd6, 0x85, 0x21, 0x72, 0x87, 0xd4,
+ 0x39, 0x6a, 0x9f, 0xcc, 0x68, 0x3b, 0xce, 0x9d,
+ 0x9b, 0xc8, 0x3d, 0x6e, 0xca, 0x99, 0x6c, 0x3f,
+ 0x60, 0x33, 0xc6, 0x95, 0x31, 0x62, 0x97, 0xc4,
+ 0xc2, 0x91, 0x64, 0x37, 0x93, 0xc0, 0x35, 0x66,
+ },
+ {
+ 0x00, 0x54, 0xa8, 0xfc, 0x4d, 0x19, 0xe5, 0xb1,
+ 0x9a, 0xce, 0x32, 0x66, 0xd7, 0x83, 0x7f, 0x2b,
+ 0x29, 0x7d, 0x81, 0xd5, 0x64, 0x30, 0xcc, 0x98,
+ 0xb3, 0xe7, 0x1b, 0x4f, 0xfe, 0xaa, 0x56, 0x02,
+ 0x52, 0x06, 0xfa, 0xae, 0x1f, 0x4b, 0xb7, 0xe3,
+ 0xc8, 0x9c, 0x60, 0x34, 0x85, 0xd1, 0x2d, 0x79,
+ 0x7b, 0x2f, 0xd3, 0x87, 0x36, 0x62, 0x9e, 0xca,
+ 0xe1, 0xb5, 0x49, 0x1d, 0xac, 0xf8, 0x04, 0x50,
+ 0xa4, 0xf0, 0x0c, 0x58, 0xe9, 0xbd, 0x41, 0x15,
+ 0x3e, 0x6a, 0x96, 0xc2, 0x73, 0x27, 0xdb, 0x8f,
+ 0x8d, 0xd9, 0x25, 0x71, 0xc0, 0x94, 0x68, 0x3c,
+ 0x17, 0x43, 0xbf, 0xeb, 0x5a, 0x0e, 0xf2, 0xa6,
+ 0xf6, 0xa2, 0x5e, 0x0a, 0xbb, 0xef, 0x13, 0x47,
+ 0x6c, 0x38, 0xc4, 0x90, 0x21, 0x75, 0x89, 0xdd,
+ 0xdf, 0x8b, 0x77, 0x23, 0x92, 0xc6, 0x3a, 0x6e,
+ 0x45, 0x11, 0xed, 0xb9, 0x08, 0x5c, 0xa0, 0xf4,
+ 0x55, 0x01, 0xfd, 0xa9, 0x18, 0x4c, 0xb0, 0xe4,
+ 0xcf, 0x9b, 0x67, 0x33, 0x82, 0xd6, 0x2a, 0x7e,
+ 0x7c, 0x28, 0xd4, 0x80, 0x31, 0x65, 0x99, 0xcd,
+ 0xe6, 0xb2, 0x4e, 0x1a, 0xab, 0xff, 0x03, 0x57,
+ 0x07, 0x53, 0xaf, 0xfb, 0x4a, 0x1e, 0xe2, 0xb6,
+ 0x9d, 0xc9, 0x35, 0x61, 0xd0, 0x84, 0x78, 0x2c,
+ 0x2e, 0x7a, 0x86, 0xd2, 0x63, 0x37, 0xcb, 0x9f,
+ 0xb4, 0xe0, 0x1c, 0x48, 0xf9, 0xad, 0x51, 0x05,
+ 0xf1, 0xa5, 0x59, 0x0d, 0xbc, 0xe8, 0x14, 0x40,
+ 0x6b, 0x3f, 0xc3, 0x97, 0x26, 0x72, 0x8e, 0xda,
+ 0xd8, 0x8c, 0x70, 0x24, 0x95, 0xc1, 0x3d, 0x69,
+ 0x42, 0x16, 0xea, 0xbe, 0x0f, 0x5b, 0xa7, 0xf3,
+ 0xa3, 0xf7, 0x0b, 0x5f, 0xee, 0xba, 0x46, 0x12,
+ 0x39, 0x6d, 0x91, 0xc5, 0x74, 0x20, 0xdc, 0x88,
+ 0x8a, 0xde, 0x22, 0x76, 0xc7, 0x93, 0x6f, 0x3b,
+ 0x10, 0x44, 0xb8, 0xec, 0x5d, 0x09, 0xf5, 0xa1,
+ },
+ {
+ 0x00, 0x55, 0xaa, 0xff, 0x49, 0x1c, 0xe3, 0xb6,
+ 0x92, 0xc7, 0x38, 0x6d, 0xdb, 0x8e, 0x71, 0x24,
+ 0x39, 0x6c, 0x93, 0xc6, 0x70, 0x25, 0xda, 0x8f,
+ 0xab, 0xfe, 0x01, 0x54, 0xe2, 0xb7, 0x48, 0x1d,
+ 0x72, 0x27, 0xd8, 0x8d, 0x3b, 0x6e, 0x91, 0xc4,
+ 0xe0, 0xb5, 0x4a, 0x1f, 0xa9, 0xfc, 0x03, 0x56,
+ 0x4b, 0x1e, 0xe1, 0xb4, 0x02, 0x57, 0xa8, 0xfd,
+ 0xd9, 0x8c, 0x73, 0x26, 0x90, 0xc5, 0x3a, 0x6f,
+ 0xe4, 0xb1, 0x4e, 0x1b, 0xad, 0xf8, 0x07, 0x52,
+ 0x76, 0x23, 0xdc, 0x89, 0x3f, 0x6a, 0x95, 0xc0,
+ 0xdd, 0x88, 0x77, 0x22, 0x94, 0xc1, 0x3e, 0x6b,
+ 0x4f, 0x1a, 0xe5, 0xb0, 0x06, 0x53, 0xac, 0xf9,
+ 0x96, 0xc3, 0x3c, 0x69, 0xdf, 0x8a, 0x75, 0x20,
+ 0x04, 0x51, 0xae, 0xfb, 0x4d, 0x18, 0xe7, 0xb2,
+ 0xaf, 0xfa, 0x05, 0x50, 0xe6, 0xb3, 0x4c, 0x19,
+ 0x3d, 0x68, 0x97, 0xc2, 0x74, 0x21, 0xde, 0x8b,
+ 0xd5, 0x80, 0x7f, 0x2a, 0x9c, 0xc9, 0x36, 0x63,
+ 0x47, 0x12, 0xed, 0xb8, 0x0e, 0x5b, 0xa4, 0xf1,
+ 0xec, 0xb9, 0x46, 0x13, 0xa5, 0xf0, 0x0f, 0x5a,
+ 0x7e, 0x2b, 0xd4, 0x81, 0x37, 0x62, 0x9d, 0xc8,
+ 0xa7, 0xf2, 0x0d, 0x58, 0xee, 0xbb, 0x44, 0x11,
+ 0x35, 0x60, 0x9f, 0xca, 0x7c, 0x29, 0xd6, 0x83,
+ 0x9e, 0xcb, 0x34, 0x61, 0xd7, 0x82, 0x7d, 0x28,
+ 0x0c, 0x59, 0xa6, 0xf3, 0x45, 0x10, 0xef, 0xba,
+ 0x31, 0x64, 0x9b, 0xce, 0x78, 0x2d, 0xd2, 0x87,
+ 0xa3, 0xf6, 0x09, 0x5c, 0xea, 0xbf, 0x40, 0x15,
+ 0x08, 0x5d, 0xa2, 0xf7, 0x41, 0x14, 0xeb, 0xbe,
+ 0x9a, 0xcf, 0x30, 0x65, 0xd3, 0x86, 0x79, 0x2c,
+ 0x43, 0x16, 0xe9, 0xbc, 0x0a, 0x5f, 0xa0, 0xf5,
+ 0xd1, 0x84, 0x7b, 0x2e, 0x98, 0xcd, 0x32, 0x67,
+ 0x7a, 0x2f, 0xd0, 0x85, 0x33, 0x66, 0x99, 0xcc,
+ 0xe8, 0xbd, 0x42, 0x17, 0xa1, 0xf4, 0x0b, 0x5e,
+ },
+ {
+ 0x00, 0x56, 0xac, 0xfa, 0x45, 0x13, 0xe9, 0xbf,
+ 0x8a, 0xdc, 0x26, 0x70, 0xcf, 0x99, 0x63, 0x35,
+ 0x09, 0x5f, 0xa5, 0xf3, 0x4c, 0x1a, 0xe0, 0xb6,
+ 0x83, 0xd5, 0x2f, 0x79, 0xc6, 0x90, 0x6a, 0x3c,
+ 0x12, 0x44, 0xbe, 0xe8, 0x57, 0x01, 0xfb, 0xad,
+ 0x98, 0xce, 0x34, 0x62, 0xdd, 0x8b, 0x71, 0x27,
+ 0x1b, 0x4d, 0xb7, 0xe1, 0x5e, 0x08, 0xf2, 0xa4,
+ 0x91, 0xc7, 0x3d, 0x6b, 0xd4, 0x82, 0x78, 0x2e,
+ 0x24, 0x72, 0x88, 0xde, 0x61, 0x37, 0xcd, 0x9b,
+ 0xae, 0xf8, 0x02, 0x54, 0xeb, 0xbd, 0x47, 0x11,
+ 0x2d, 0x7b, 0x81, 0xd7, 0x68, 0x3e, 0xc4, 0x92,
+ 0xa7, 0xf1, 0x0b, 0x5d, 0xe2, 0xb4, 0x4e, 0x18,
+ 0x36, 0x60, 0x9a, 0xcc, 0x73, 0x25, 0xdf, 0x89,
+ 0xbc, 0xea, 0x10, 0x46, 0xf9, 0xaf, 0x55, 0x03,
+ 0x3f, 0x69, 0x93, 0xc5, 0x7a, 0x2c, 0xd6, 0x80,
+ 0xb5, 0xe3, 0x19, 0x4f, 0xf0, 0xa6, 0x5c, 0x0a,
+ 0x48, 0x1e, 0xe4, 0xb2, 0x0d, 0x5b, 0xa1, 0xf7,
+ 0xc2, 0x94, 0x6e, 0x38, 0x87, 0xd1, 0x2b, 0x7d,
+ 0x41, 0x17, 0xed, 0xbb, 0x04, 0x52, 0xa8, 0xfe,
+ 0xcb, 0x9d, 0x67, 0x31, 0x8e, 0xd8, 0x22, 0x74,
+ 0x5a, 0x0c, 0xf6, 0xa0, 0x1f, 0x49, 0xb3, 0xe5,
+ 0xd0, 0x86, 0x7c, 0x2a, 0x95, 0xc3, 0x39, 0x6f,
+ 0x53, 0x05, 0xff, 0xa9, 0x16, 0x40, 0xba, 0xec,
+ 0xd9, 0x8f, 0x75, 0x23, 0x9c, 0xca, 0x30, 0x66,
+ 0x6c, 0x3a, 0xc0, 0x96, 0x29, 0x7f, 0x85, 0xd3,
+ 0xe6, 0xb0, 0x4a, 0x1c, 0xa3, 0xf5, 0x0f, 0x59,
+ 0x65, 0x33, 0xc9, 0x9f, 0x20, 0x76, 0x8c, 0xda,
+ 0xef, 0xb9, 0x43, 0x15, 0xaa, 0xfc, 0x06, 0x50,
+ 0x7e, 0x28, 0xd2, 0x84, 0x3b, 0x6d, 0x97, 0xc1,
+ 0xf4, 0xa2, 0x58, 0x0e, 0xb1, 0xe7, 0x1d, 0x4b,
+ 0x77, 0x21, 0xdb, 0x8d, 0x32, 0x64, 0x9e, 0xc8,
+ 0xfd, 0xab, 0x51, 0x07, 0xb8, 0xee, 0x14, 0x42,
+ },
+ {
+ 0x00, 0x57, 0xae, 0xf9, 0x41, 0x16, 0xef, 0xb8,
+ 0x82, 0xd5, 0x2c, 0x7b, 0xc3, 0x94, 0x6d, 0x3a,
+ 0x19, 0x4e, 0xb7, 0xe0, 0x58, 0x0f, 0xf6, 0xa1,
+ 0x9b, 0xcc, 0x35, 0x62, 0xda, 0x8d, 0x74, 0x23,
+ 0x32, 0x65, 0x9c, 0xcb, 0x73, 0x24, 0xdd, 0x8a,
+ 0xb0, 0xe7, 0x1e, 0x49, 0xf1, 0xa6, 0x5f, 0x08,
+ 0x2b, 0x7c, 0x85, 0xd2, 0x6a, 0x3d, 0xc4, 0x93,
+ 0xa9, 0xfe, 0x07, 0x50, 0xe8, 0xbf, 0x46, 0x11,
+ 0x64, 0x33, 0xca, 0x9d, 0x25, 0x72, 0x8b, 0xdc,
+ 0xe6, 0xb1, 0x48, 0x1f, 0xa7, 0xf0, 0x09, 0x5e,
+ 0x7d, 0x2a, 0xd3, 0x84, 0x3c, 0x6b, 0x92, 0xc5,
+ 0xff, 0xa8, 0x51, 0x06, 0xbe, 0xe9, 0x10, 0x47,
+ 0x56, 0x01, 0xf8, 0xaf, 0x17, 0x40, 0xb9, 0xee,
+ 0xd4, 0x83, 0x7a, 0x2d, 0x95, 0xc2, 0x3b, 0x6c,
+ 0x4f, 0x18, 0xe1, 0xb6, 0x0e, 0x59, 0xa0, 0xf7,
+ 0xcd, 0x9a, 0x63, 0x34, 0x8c, 0xdb, 0x22, 0x75,
+ 0xc8, 0x9f, 0x66, 0x31, 0x89, 0xde, 0x27, 0x70,
+ 0x4a, 0x1d, 0xe4, 0xb3, 0x0b, 0x5c, 0xa5, 0xf2,
+ 0xd1, 0x86, 0x7f, 0x28, 0x90, 0xc7, 0x3e, 0x69,
+ 0x53, 0x04, 0xfd, 0xaa, 0x12, 0x45, 0xbc, 0xeb,
+ 0xfa, 0xad, 0x54, 0x03, 0xbb, 0xec, 0x15, 0x42,
+ 0x78, 0x2f, 0xd6, 0x81, 0x39, 0x6e, 0x97, 0xc0,
+ 0xe3, 0xb4, 0x4d, 0x1a, 0xa2, 0xf5, 0x0c, 0x5b,
+ 0x61, 0x36, 0xcf, 0x98, 0x20, 0x77, 0x8e, 0xd9,
+ 0xac, 0xfb, 0x02, 0x55, 0xed, 0xba, 0x43, 0x14,
+ 0x2e, 0x79, 0x80, 0xd7, 0x6f, 0x38, 0xc1, 0x96,
+ 0xb5, 0xe2, 0x1b, 0x4c, 0xf4, 0xa3, 0x5a, 0x0d,
+ 0x37, 0x60, 0x99, 0xce, 0x76, 0x21, 0xd8, 0x8f,
+ 0x9e, 0xc9, 0x30, 0x67, 0xdf, 0x88, 0x71, 0x26,
+ 0x1c, 0x4b, 0xb2, 0xe5, 0x5d, 0x0a, 0xf3, 0xa4,
+ 0x87, 0xd0, 0x29, 0x7e, 0xc6, 0x91, 0x68, 0x3f,
+ 0x05, 0x52, 0xab, 0xfc, 0x44, 0x13, 0xea, 0xbd,
+ },
+ {
+ 0x00, 0x58, 0xb0, 0xe8, 0x7d, 0x25, 0xcd, 0x95,
+ 0xfa, 0xa2, 0x4a, 0x12, 0x87, 0xdf, 0x37, 0x6f,
+ 0xe9, 0xb1, 0x59, 0x01, 0x94, 0xcc, 0x24, 0x7c,
+ 0x13, 0x4b, 0xa3, 0xfb, 0x6e, 0x36, 0xde, 0x86,
+ 0xcf, 0x97, 0x7f, 0x27, 0xb2, 0xea, 0x02, 0x5a,
+ 0x35, 0x6d, 0x85, 0xdd, 0x48, 0x10, 0xf8, 0xa0,
+ 0x26, 0x7e, 0x96, 0xce, 0x5b, 0x03, 0xeb, 0xb3,
+ 0xdc, 0x84, 0x6c, 0x34, 0xa1, 0xf9, 0x11, 0x49,
+ 0x83, 0xdb, 0x33, 0x6b, 0xfe, 0xa6, 0x4e, 0x16,
+ 0x79, 0x21, 0xc9, 0x91, 0x04, 0x5c, 0xb4, 0xec,
+ 0x6a, 0x32, 0xda, 0x82, 0x17, 0x4f, 0xa7, 0xff,
+ 0x90, 0xc8, 0x20, 0x78, 0xed, 0xb5, 0x5d, 0x05,
+ 0x4c, 0x14, 0xfc, 0xa4, 0x31, 0x69, 0x81, 0xd9,
+ 0xb6, 0xee, 0x06, 0x5e, 0xcb, 0x93, 0x7b, 0x23,
+ 0xa5, 0xfd, 0x15, 0x4d, 0xd8, 0x80, 0x68, 0x30,
+ 0x5f, 0x07, 0xef, 0xb7, 0x22, 0x7a, 0x92, 0xca,
+ 0x1b, 0x43, 0xab, 0xf3, 0x66, 0x3e, 0xd6, 0x8e,
+ 0xe1, 0xb9, 0x51, 0x09, 0x9c, 0xc4, 0x2c, 0x74,
+ 0xf2, 0xaa, 0x42, 0x1a, 0x8f, 0xd7, 0x3f, 0x67,
+ 0x08, 0x50, 0xb8, 0xe0, 0x75, 0x2d, 0xc5, 0x9d,
+ 0xd4, 0x8c, 0x64, 0x3c, 0xa9, 0xf1, 0x19, 0x41,
+ 0x2e, 0x76, 0x9e, 0xc6, 0x53, 0x0b, 0xe3, 0xbb,
+ 0x3d, 0x65, 0x8d, 0xd5, 0x40, 0x18, 0xf0, 0xa8,
+ 0xc7, 0x9f, 0x77, 0x2f, 0xba, 0xe2, 0x0a, 0x52,
+ 0x98, 0xc0, 0x28, 0x70, 0xe5, 0xbd, 0x55, 0x0d,
+ 0x62, 0x3a, 0xd2, 0x8a, 0x1f, 0x47, 0xaf, 0xf7,
+ 0x71, 0x29, 0xc1, 0x99, 0x0c, 0x54, 0xbc, 0xe4,
+ 0x8b, 0xd3, 0x3b, 0x63, 0xf6, 0xae, 0x46, 0x1e,
+ 0x57, 0x0f, 0xe7, 0xbf, 0x2a, 0x72, 0x9a, 0xc2,
+ 0xad, 0xf5, 0x1d, 0x45, 0xd0, 0x88, 0x60, 0x38,
+ 0xbe, 0xe6, 0x0e, 0x56, 0xc3, 0x9b, 0x73, 0x2b,
+ 0x44, 0x1c, 0xf4, 0xac, 0x39, 0x61, 0x89, 0xd1,
+ },
+ {
+ 0x00, 0x59, 0xb2, 0xeb, 0x79, 0x20, 0xcb, 0x92,
+ 0xf2, 0xab, 0x40, 0x19, 0x8b, 0xd2, 0x39, 0x60,
+ 0xf9, 0xa0, 0x4b, 0x12, 0x80, 0xd9, 0x32, 0x6b,
+ 0x0b, 0x52, 0xb9, 0xe0, 0x72, 0x2b, 0xc0, 0x99,
+ 0xef, 0xb6, 0x5d, 0x04, 0x96, 0xcf, 0x24, 0x7d,
+ 0x1d, 0x44, 0xaf, 0xf6, 0x64, 0x3d, 0xd6, 0x8f,
+ 0x16, 0x4f, 0xa4, 0xfd, 0x6f, 0x36, 0xdd, 0x84,
+ 0xe4, 0xbd, 0x56, 0x0f, 0x9d, 0xc4, 0x2f, 0x76,
+ 0xc3, 0x9a, 0x71, 0x28, 0xba, 0xe3, 0x08, 0x51,
+ 0x31, 0x68, 0x83, 0xda, 0x48, 0x11, 0xfa, 0xa3,
+ 0x3a, 0x63, 0x88, 0xd1, 0x43, 0x1a, 0xf1, 0xa8,
+ 0xc8, 0x91, 0x7a, 0x23, 0xb1, 0xe8, 0x03, 0x5a,
+ 0x2c, 0x75, 0x9e, 0xc7, 0x55, 0x0c, 0xe7, 0xbe,
+ 0xde, 0x87, 0x6c, 0x35, 0xa7, 0xfe, 0x15, 0x4c,
+ 0xd5, 0x8c, 0x67, 0x3e, 0xac, 0xf5, 0x1e, 0x47,
+ 0x27, 0x7e, 0x95, 0xcc, 0x5e, 0x07, 0xec, 0xb5,
+ 0x9b, 0xc2, 0x29, 0x70, 0xe2, 0xbb, 0x50, 0x09,
+ 0x69, 0x30, 0xdb, 0x82, 0x10, 0x49, 0xa2, 0xfb,
+ 0x62, 0x3b, 0xd0, 0x89, 0x1b, 0x42, 0xa9, 0xf0,
+ 0x90, 0xc9, 0x22, 0x7b, 0xe9, 0xb0, 0x5b, 0x02,
+ 0x74, 0x2d, 0xc6, 0x9f, 0x0d, 0x54, 0xbf, 0xe6,
+ 0x86, 0xdf, 0x34, 0x6d, 0xff, 0xa6, 0x4d, 0x14,
+ 0x8d, 0xd4, 0x3f, 0x66, 0xf4, 0xad, 0x46, 0x1f,
+ 0x7f, 0x26, 0xcd, 0x94, 0x06, 0x5f, 0xb4, 0xed,
+ 0x58, 0x01, 0xea, 0xb3, 0x21, 0x78, 0x93, 0xca,
+ 0xaa, 0xf3, 0x18, 0x41, 0xd3, 0x8a, 0x61, 0x38,
+ 0xa1, 0xf8, 0x13, 0x4a, 0xd8, 0x81, 0x6a, 0x33,
+ 0x53, 0x0a, 0xe1, 0xb8, 0x2a, 0x73, 0x98, 0xc1,
+ 0xb7, 0xee, 0x05, 0x5c, 0xce, 0x97, 0x7c, 0x25,
+ 0x45, 0x1c, 0xf7, 0xae, 0x3c, 0x65, 0x8e, 0xd7,
+ 0x4e, 0x17, 0xfc, 0xa5, 0x37, 0x6e, 0x85, 0xdc,
+ 0xbc, 0xe5, 0x0e, 0x57, 0xc5, 0x9c, 0x77, 0x2e,
+ },
+ {
+ 0x00, 0x5a, 0xb4, 0xee, 0x75, 0x2f, 0xc1, 0x9b,
+ 0xea, 0xb0, 0x5e, 0x04, 0x9f, 0xc5, 0x2b, 0x71,
+ 0xc9, 0x93, 0x7d, 0x27, 0xbc, 0xe6, 0x08, 0x52,
+ 0x23, 0x79, 0x97, 0xcd, 0x56, 0x0c, 0xe2, 0xb8,
+ 0x8f, 0xd5, 0x3b, 0x61, 0xfa, 0xa0, 0x4e, 0x14,
+ 0x65, 0x3f, 0xd1, 0x8b, 0x10, 0x4a, 0xa4, 0xfe,
+ 0x46, 0x1c, 0xf2, 0xa8, 0x33, 0x69, 0x87, 0xdd,
+ 0xac, 0xf6, 0x18, 0x42, 0xd9, 0x83, 0x6d, 0x37,
+ 0x03, 0x59, 0xb7, 0xed, 0x76, 0x2c, 0xc2, 0x98,
+ 0xe9, 0xb3, 0x5d, 0x07, 0x9c, 0xc6, 0x28, 0x72,
+ 0xca, 0x90, 0x7e, 0x24, 0xbf, 0xe5, 0x0b, 0x51,
+ 0x20, 0x7a, 0x94, 0xce, 0x55, 0x0f, 0xe1, 0xbb,
+ 0x8c, 0xd6, 0x38, 0x62, 0xf9, 0xa3, 0x4d, 0x17,
+ 0x66, 0x3c, 0xd2, 0x88, 0x13, 0x49, 0xa7, 0xfd,
+ 0x45, 0x1f, 0xf1, 0xab, 0x30, 0x6a, 0x84, 0xde,
+ 0xaf, 0xf5, 0x1b, 0x41, 0xda, 0x80, 0x6e, 0x34,
+ 0x06, 0x5c, 0xb2, 0xe8, 0x73, 0x29, 0xc7, 0x9d,
+ 0xec, 0xb6, 0x58, 0x02, 0x99, 0xc3, 0x2d, 0x77,
+ 0xcf, 0x95, 0x7b, 0x21, 0xba, 0xe0, 0x0e, 0x54,
+ 0x25, 0x7f, 0x91, 0xcb, 0x50, 0x0a, 0xe4, 0xbe,
+ 0x89, 0xd3, 0x3d, 0x67, 0xfc, 0xa6, 0x48, 0x12,
+ 0x63, 0x39, 0xd7, 0x8d, 0x16, 0x4c, 0xa2, 0xf8,
+ 0x40, 0x1a, 0xf4, 0xae, 0x35, 0x6f, 0x81, 0xdb,
+ 0xaa, 0xf0, 0x1e, 0x44, 0xdf, 0x85, 0x6b, 0x31,
+ 0x05, 0x5f, 0xb1, 0xeb, 0x70, 0x2a, 0xc4, 0x9e,
+ 0xef, 0xb5, 0x5b, 0x01, 0x9a, 0xc0, 0x2e, 0x74,
+ 0xcc, 0x96, 0x78, 0x22, 0xb9, 0xe3, 0x0d, 0x57,
+ 0x26, 0x7c, 0x92, 0xc8, 0x53, 0x09, 0xe7, 0xbd,
+ 0x8a, 0xd0, 0x3e, 0x64, 0xff, 0xa5, 0x4b, 0x11,
+ 0x60, 0x3a, 0xd4, 0x8e, 0x15, 0x4f, 0xa1, 0xfb,
+ 0x43, 0x19, 0xf7, 0xad, 0x36, 0x6c, 0x82, 0xd8,
+ 0xa9, 0xf3, 0x1d, 0x47, 0xdc, 0x86, 0x68, 0x32,
+ },
+ {
+ 0x00, 0x5b, 0xb6, 0xed, 0x71, 0x2a, 0xc7, 0x9c,
+ 0xe2, 0xb9, 0x54, 0x0f, 0x93, 0xc8, 0x25, 0x7e,
+ 0xd9, 0x82, 0x6f, 0x34, 0xa8, 0xf3, 0x1e, 0x45,
+ 0x3b, 0x60, 0x8d, 0xd6, 0x4a, 0x11, 0xfc, 0xa7,
+ 0xaf, 0xf4, 0x19, 0x42, 0xde, 0x85, 0x68, 0x33,
+ 0x4d, 0x16, 0xfb, 0xa0, 0x3c, 0x67, 0x8a, 0xd1,
+ 0x76, 0x2d, 0xc0, 0x9b, 0x07, 0x5c, 0xb1, 0xea,
+ 0x94, 0xcf, 0x22, 0x79, 0xe5, 0xbe, 0x53, 0x08,
+ 0x43, 0x18, 0xf5, 0xae, 0x32, 0x69, 0x84, 0xdf,
+ 0xa1, 0xfa, 0x17, 0x4c, 0xd0, 0x8b, 0x66, 0x3d,
+ 0x9a, 0xc1, 0x2c, 0x77, 0xeb, 0xb0, 0x5d, 0x06,
+ 0x78, 0x23, 0xce, 0x95, 0x09, 0x52, 0xbf, 0xe4,
+ 0xec, 0xb7, 0x5a, 0x01, 0x9d, 0xc6, 0x2b, 0x70,
+ 0x0e, 0x55, 0xb8, 0xe3, 0x7f, 0x24, 0xc9, 0x92,
+ 0x35, 0x6e, 0x83, 0xd8, 0x44, 0x1f, 0xf2, 0xa9,
+ 0xd7, 0x8c, 0x61, 0x3a, 0xa6, 0xfd, 0x10, 0x4b,
+ 0x86, 0xdd, 0x30, 0x6b, 0xf7, 0xac, 0x41, 0x1a,
+ 0x64, 0x3f, 0xd2, 0x89, 0x15, 0x4e, 0xa3, 0xf8,
+ 0x5f, 0x04, 0xe9, 0xb2, 0x2e, 0x75, 0x98, 0xc3,
+ 0xbd, 0xe6, 0x0b, 0x50, 0xcc, 0x97, 0x7a, 0x21,
+ 0x29, 0x72, 0x9f, 0xc4, 0x58, 0x03, 0xee, 0xb5,
+ 0xcb, 0x90, 0x7d, 0x26, 0xba, 0xe1, 0x0c, 0x57,
+ 0xf0, 0xab, 0x46, 0x1d, 0x81, 0xda, 0x37, 0x6c,
+ 0x12, 0x49, 0xa4, 0xff, 0x63, 0x38, 0xd5, 0x8e,
+ 0xc5, 0x9e, 0x73, 0x28, 0xb4, 0xef, 0x02, 0x59,
+ 0x27, 0x7c, 0x91, 0xca, 0x56, 0x0d, 0xe0, 0xbb,
+ 0x1c, 0x47, 0xaa, 0xf1, 0x6d, 0x36, 0xdb, 0x80,
+ 0xfe, 0xa5, 0x48, 0x13, 0x8f, 0xd4, 0x39, 0x62,
+ 0x6a, 0x31, 0xdc, 0x87, 0x1b, 0x40, 0xad, 0xf6,
+ 0x88, 0xd3, 0x3e, 0x65, 0xf9, 0xa2, 0x4f, 0x14,
+ 0xb3, 0xe8, 0x05, 0x5e, 0xc2, 0x99, 0x74, 0x2f,
+ 0x51, 0x0a, 0xe7, 0xbc, 0x20, 0x7b, 0x96, 0xcd,
+ },
+ {
+ 0x00, 0x5c, 0xb8, 0xe4, 0x6d, 0x31, 0xd5, 0x89,
+ 0xda, 0x86, 0x62, 0x3e, 0xb7, 0xeb, 0x0f, 0x53,
+ 0xa9, 0xf5, 0x11, 0x4d, 0xc4, 0x98, 0x7c, 0x20,
+ 0x73, 0x2f, 0xcb, 0x97, 0x1e, 0x42, 0xa6, 0xfa,
+ 0x4f, 0x13, 0xf7, 0xab, 0x22, 0x7e, 0x9a, 0xc6,
+ 0x95, 0xc9, 0x2d, 0x71, 0xf8, 0xa4, 0x40, 0x1c,
+ 0xe6, 0xba, 0x5e, 0x02, 0x8b, 0xd7, 0x33, 0x6f,
+ 0x3c, 0x60, 0x84, 0xd8, 0x51, 0x0d, 0xe9, 0xb5,
+ 0x9e, 0xc2, 0x26, 0x7a, 0xf3, 0xaf, 0x4b, 0x17,
+ 0x44, 0x18, 0xfc, 0xa0, 0x29, 0x75, 0x91, 0xcd,
+ 0x37, 0x6b, 0x8f, 0xd3, 0x5a, 0x06, 0xe2, 0xbe,
+ 0xed, 0xb1, 0x55, 0x09, 0x80, 0xdc, 0x38, 0x64,
+ 0xd1, 0x8d, 0x69, 0x35, 0xbc, 0xe0, 0x04, 0x58,
+ 0x0b, 0x57, 0xb3, 0xef, 0x66, 0x3a, 0xde, 0x82,
+ 0x78, 0x24, 0xc0, 0x9c, 0x15, 0x49, 0xad, 0xf1,
+ 0xa2, 0xfe, 0x1a, 0x46, 0xcf, 0x93, 0x77, 0x2b,
+ 0x21, 0x7d, 0x99, 0xc5, 0x4c, 0x10, 0xf4, 0xa8,
+ 0xfb, 0xa7, 0x43, 0x1f, 0x96, 0xca, 0x2e, 0x72,
+ 0x88, 0xd4, 0x30, 0x6c, 0xe5, 0xb9, 0x5d, 0x01,
+ 0x52, 0x0e, 0xea, 0xb6, 0x3f, 0x63, 0x87, 0xdb,
+ 0x6e, 0x32, 0xd6, 0x8a, 0x03, 0x5f, 0xbb, 0xe7,
+ 0xb4, 0xe8, 0x0c, 0x50, 0xd9, 0x85, 0x61, 0x3d,
+ 0xc7, 0x9b, 0x7f, 0x23, 0xaa, 0xf6, 0x12, 0x4e,
+ 0x1d, 0x41, 0xa5, 0xf9, 0x70, 0x2c, 0xc8, 0x94,
+ 0xbf, 0xe3, 0x07, 0x5b, 0xd2, 0x8e, 0x6a, 0x36,
+ 0x65, 0x39, 0xdd, 0x81, 0x08, 0x54, 0xb0, 0xec,
+ 0x16, 0x4a, 0xae, 0xf2, 0x7b, 0x27, 0xc3, 0x9f,
+ 0xcc, 0x90, 0x74, 0x28, 0xa1, 0xfd, 0x19, 0x45,
+ 0xf0, 0xac, 0x48, 0x14, 0x9d, 0xc1, 0x25, 0x79,
+ 0x2a, 0x76, 0x92, 0xce, 0x47, 0x1b, 0xff, 0xa3,
+ 0x59, 0x05, 0xe1, 0xbd, 0x34, 0x68, 0x8c, 0xd0,
+ 0x83, 0xdf, 0x3b, 0x67, 0xee, 0xb2, 0x56, 0x0a,
+ },
+ {
+ 0x00, 0x5d, 0xba, 0xe7, 0x69, 0x34, 0xd3, 0x8e,
+ 0xd2, 0x8f, 0x68, 0x35, 0xbb, 0xe6, 0x01, 0x5c,
+ 0xb9, 0xe4, 0x03, 0x5e, 0xd0, 0x8d, 0x6a, 0x37,
+ 0x6b, 0x36, 0xd1, 0x8c, 0x02, 0x5f, 0xb8, 0xe5,
+ 0x6f, 0x32, 0xd5, 0x88, 0x06, 0x5b, 0xbc, 0xe1,
+ 0xbd, 0xe0, 0x07, 0x5a, 0xd4, 0x89, 0x6e, 0x33,
+ 0xd6, 0x8b, 0x6c, 0x31, 0xbf, 0xe2, 0x05, 0x58,
+ 0x04, 0x59, 0xbe, 0xe3, 0x6d, 0x30, 0xd7, 0x8a,
+ 0xde, 0x83, 0x64, 0x39, 0xb7, 0xea, 0x0d, 0x50,
+ 0x0c, 0x51, 0xb6, 0xeb, 0x65, 0x38, 0xdf, 0x82,
+ 0x67, 0x3a, 0xdd, 0x80, 0x0e, 0x53, 0xb4, 0xe9,
+ 0xb5, 0xe8, 0x0f, 0x52, 0xdc, 0x81, 0x66, 0x3b,
+ 0xb1, 0xec, 0x0b, 0x56, 0xd8, 0x85, 0x62, 0x3f,
+ 0x63, 0x3e, 0xd9, 0x84, 0x0a, 0x57, 0xb0, 0xed,
+ 0x08, 0x55, 0xb2, 0xef, 0x61, 0x3c, 0xdb, 0x86,
+ 0xda, 0x87, 0x60, 0x3d, 0xb3, 0xee, 0x09, 0x54,
+ 0xa1, 0xfc, 0x1b, 0x46, 0xc8, 0x95, 0x72, 0x2f,
+ 0x73, 0x2e, 0xc9, 0x94, 0x1a, 0x47, 0xa0, 0xfd,
+ 0x18, 0x45, 0xa2, 0xff, 0x71, 0x2c, 0xcb, 0x96,
+ 0xca, 0x97, 0x70, 0x2d, 0xa3, 0xfe, 0x19, 0x44,
+ 0xce, 0x93, 0x74, 0x29, 0xa7, 0xfa, 0x1d, 0x40,
+ 0x1c, 0x41, 0xa6, 0xfb, 0x75, 0x28, 0xcf, 0x92,
+ 0x77, 0x2a, 0xcd, 0x90, 0x1e, 0x43, 0xa4, 0xf9,
+ 0xa5, 0xf8, 0x1f, 0x42, 0xcc, 0x91, 0x76, 0x2b,
+ 0x7f, 0x22, 0xc5, 0x98, 0x16, 0x4b, 0xac, 0xf1,
+ 0xad, 0xf0, 0x17, 0x4a, 0xc4, 0x99, 0x7e, 0x23,
+ 0xc6, 0x9b, 0x7c, 0x21, 0xaf, 0xf2, 0x15, 0x48,
+ 0x14, 0x49, 0xae, 0xf3, 0x7d, 0x20, 0xc7, 0x9a,
+ 0x10, 0x4d, 0xaa, 0xf7, 0x79, 0x24, 0xc3, 0x9e,
+ 0xc2, 0x9f, 0x78, 0x25, 0xab, 0xf6, 0x11, 0x4c,
+ 0xa9, 0xf4, 0x13, 0x4e, 0xc0, 0x9d, 0x7a, 0x27,
+ 0x7b, 0x26, 0xc1, 0x9c, 0x12, 0x4f, 0xa8, 0xf5,
+ },
+ {
+ 0x00, 0x5e, 0xbc, 0xe2, 0x65, 0x3b, 0xd9, 0x87,
+ 0xca, 0x94, 0x76, 0x28, 0xaf, 0xf1, 0x13, 0x4d,
+ 0x89, 0xd7, 0x35, 0x6b, 0xec, 0xb2, 0x50, 0x0e,
+ 0x43, 0x1d, 0xff, 0xa1, 0x26, 0x78, 0x9a, 0xc4,
+ 0x0f, 0x51, 0xb3, 0xed, 0x6a, 0x34, 0xd6, 0x88,
+ 0xc5, 0x9b, 0x79, 0x27, 0xa0, 0xfe, 0x1c, 0x42,
+ 0x86, 0xd8, 0x3a, 0x64, 0xe3, 0xbd, 0x5f, 0x01,
+ 0x4c, 0x12, 0xf0, 0xae, 0x29, 0x77, 0x95, 0xcb,
+ 0x1e, 0x40, 0xa2, 0xfc, 0x7b, 0x25, 0xc7, 0x99,
+ 0xd4, 0x8a, 0x68, 0x36, 0xb1, 0xef, 0x0d, 0x53,
+ 0x97, 0xc9, 0x2b, 0x75, 0xf2, 0xac, 0x4e, 0x10,
+ 0x5d, 0x03, 0xe1, 0xbf, 0x38, 0x66, 0x84, 0xda,
+ 0x11, 0x4f, 0xad, 0xf3, 0x74, 0x2a, 0xc8, 0x96,
+ 0xdb, 0x85, 0x67, 0x39, 0xbe, 0xe0, 0x02, 0x5c,
+ 0x98, 0xc6, 0x24, 0x7a, 0xfd, 0xa3, 0x41, 0x1f,
+ 0x52, 0x0c, 0xee, 0xb0, 0x37, 0x69, 0x8b, 0xd5,
+ 0x3c, 0x62, 0x80, 0xde, 0x59, 0x07, 0xe5, 0xbb,
+ 0xf6, 0xa8, 0x4a, 0x14, 0x93, 0xcd, 0x2f, 0x71,
+ 0xb5, 0xeb, 0x09, 0x57, 0xd0, 0x8e, 0x6c, 0x32,
+ 0x7f, 0x21, 0xc3, 0x9d, 0x1a, 0x44, 0xa6, 0xf8,
+ 0x33, 0x6d, 0x8f, 0xd1, 0x56, 0x08, 0xea, 0xb4,
+ 0xf9, 0xa7, 0x45, 0x1b, 0x9c, 0xc2, 0x20, 0x7e,
+ 0xba, 0xe4, 0x06, 0x58, 0xdf, 0x81, 0x63, 0x3d,
+ 0x70, 0x2e, 0xcc, 0x92, 0x15, 0x4b, 0xa9, 0xf7,
+ 0x22, 0x7c, 0x9e, 0xc0, 0x47, 0x19, 0xfb, 0xa5,
+ 0xe8, 0xb6, 0x54, 0x0a, 0x8d, 0xd3, 0x31, 0x6f,
+ 0xab, 0xf5, 0x17, 0x49, 0xce, 0x90, 0x72, 0x2c,
+ 0x61, 0x3f, 0xdd, 0x83, 0x04, 0x5a, 0xb8, 0xe6,
+ 0x2d, 0x73, 0x91, 0xcf, 0x48, 0x16, 0xf4, 0xaa,
+ 0xe7, 0xb9, 0x5b, 0x05, 0x82, 0xdc, 0x3e, 0x60,
+ 0xa4, 0xfa, 0x18, 0x46, 0xc1, 0x9f, 0x7d, 0x23,
+ 0x6e, 0x30, 0xd2, 0x8c, 0x0b, 0x55, 0xb7, 0xe9,
+ },
+ {
+ 0x00, 0x5f, 0xbe, 0xe1, 0x61, 0x3e, 0xdf, 0x80,
+ 0xc2, 0x9d, 0x7c, 0x23, 0xa3, 0xfc, 0x1d, 0x42,
+ 0x99, 0xc6, 0x27, 0x78, 0xf8, 0xa7, 0x46, 0x19,
+ 0x5b, 0x04, 0xe5, 0xba, 0x3a, 0x65, 0x84, 0xdb,
+ 0x2f, 0x70, 0x91, 0xce, 0x4e, 0x11, 0xf0, 0xaf,
+ 0xed, 0xb2, 0x53, 0x0c, 0x8c, 0xd3, 0x32, 0x6d,
+ 0xb6, 0xe9, 0x08, 0x57, 0xd7, 0x88, 0x69, 0x36,
+ 0x74, 0x2b, 0xca, 0x95, 0x15, 0x4a, 0xab, 0xf4,
+ 0x5e, 0x01, 0xe0, 0xbf, 0x3f, 0x60, 0x81, 0xde,
+ 0x9c, 0xc3, 0x22, 0x7d, 0xfd, 0xa2, 0x43, 0x1c,
+ 0xc7, 0x98, 0x79, 0x26, 0xa6, 0xf9, 0x18, 0x47,
+ 0x05, 0x5a, 0xbb, 0xe4, 0x64, 0x3b, 0xda, 0x85,
+ 0x71, 0x2e, 0xcf, 0x90, 0x10, 0x4f, 0xae, 0xf1,
+ 0xb3, 0xec, 0x0d, 0x52, 0xd2, 0x8d, 0x6c, 0x33,
+ 0xe8, 0xb7, 0x56, 0x09, 0x89, 0xd6, 0x37, 0x68,
+ 0x2a, 0x75, 0x94, 0xcb, 0x4b, 0x14, 0xf5, 0xaa,
+ 0xbc, 0xe3, 0x02, 0x5d, 0xdd, 0x82, 0x63, 0x3c,
+ 0x7e, 0x21, 0xc0, 0x9f, 0x1f, 0x40, 0xa1, 0xfe,
+ 0x25, 0x7a, 0x9b, 0xc4, 0x44, 0x1b, 0xfa, 0xa5,
+ 0xe7, 0xb8, 0x59, 0x06, 0x86, 0xd9, 0x38, 0x67,
+ 0x93, 0xcc, 0x2d, 0x72, 0xf2, 0xad, 0x4c, 0x13,
+ 0x51, 0x0e, 0xef, 0xb0, 0x30, 0x6f, 0x8e, 0xd1,
+ 0x0a, 0x55, 0xb4, 0xeb, 0x6b, 0x34, 0xd5, 0x8a,
+ 0xc8, 0x97, 0x76, 0x29, 0xa9, 0xf6, 0x17, 0x48,
+ 0xe2, 0xbd, 0x5c, 0x03, 0x83, 0xdc, 0x3d, 0x62,
+ 0x20, 0x7f, 0x9e, 0xc1, 0x41, 0x1e, 0xff, 0xa0,
+ 0x7b, 0x24, 0xc5, 0x9a, 0x1a, 0x45, 0xa4, 0xfb,
+ 0xb9, 0xe6, 0x07, 0x58, 0xd8, 0x87, 0x66, 0x39,
+ 0xcd, 0x92, 0x73, 0x2c, 0xac, 0xf3, 0x12, 0x4d,
+ 0x0f, 0x50, 0xb1, 0xee, 0x6e, 0x31, 0xd0, 0x8f,
+ 0x54, 0x0b, 0xea, 0xb5, 0x35, 0x6a, 0x8b, 0xd4,
+ 0x96, 0xc9, 0x28, 0x77, 0xf7, 0xa8, 0x49, 0x16,
+ },
+ {
+ 0x00, 0x60, 0xc0, 0xa0, 0x9d, 0xfd, 0x5d, 0x3d,
+ 0x27, 0x47, 0xe7, 0x87, 0xba, 0xda, 0x7a, 0x1a,
+ 0x4e, 0x2e, 0x8e, 0xee, 0xd3, 0xb3, 0x13, 0x73,
+ 0x69, 0x09, 0xa9, 0xc9, 0xf4, 0x94, 0x34, 0x54,
+ 0x9c, 0xfc, 0x5c, 0x3c, 0x01, 0x61, 0xc1, 0xa1,
+ 0xbb, 0xdb, 0x7b, 0x1b, 0x26, 0x46, 0xe6, 0x86,
+ 0xd2, 0xb2, 0x12, 0x72, 0x4f, 0x2f, 0x8f, 0xef,
+ 0xf5, 0x95, 0x35, 0x55, 0x68, 0x08, 0xa8, 0xc8,
+ 0x25, 0x45, 0xe5, 0x85, 0xb8, 0xd8, 0x78, 0x18,
+ 0x02, 0x62, 0xc2, 0xa2, 0x9f, 0xff, 0x5f, 0x3f,
+ 0x6b, 0x0b, 0xab, 0xcb, 0xf6, 0x96, 0x36, 0x56,
+ 0x4c, 0x2c, 0x8c, 0xec, 0xd1, 0xb1, 0x11, 0x71,
+ 0xb9, 0xd9, 0x79, 0x19, 0x24, 0x44, 0xe4, 0x84,
+ 0x9e, 0xfe, 0x5e, 0x3e, 0x03, 0x63, 0xc3, 0xa3,
+ 0xf7, 0x97, 0x37, 0x57, 0x6a, 0x0a, 0xaa, 0xca,
+ 0xd0, 0xb0, 0x10, 0x70, 0x4d, 0x2d, 0x8d, 0xed,
+ 0x4a, 0x2a, 0x8a, 0xea, 0xd7, 0xb7, 0x17, 0x77,
+ 0x6d, 0x0d, 0xad, 0xcd, 0xf0, 0x90, 0x30, 0x50,
+ 0x04, 0x64, 0xc4, 0xa4, 0x99, 0xf9, 0x59, 0x39,
+ 0x23, 0x43, 0xe3, 0x83, 0xbe, 0xde, 0x7e, 0x1e,
+ 0xd6, 0xb6, 0x16, 0x76, 0x4b, 0x2b, 0x8b, 0xeb,
+ 0xf1, 0x91, 0x31, 0x51, 0x6c, 0x0c, 0xac, 0xcc,
+ 0x98, 0xf8, 0x58, 0x38, 0x05, 0x65, 0xc5, 0xa5,
+ 0xbf, 0xdf, 0x7f, 0x1f, 0x22, 0x42, 0xe2, 0x82,
+ 0x6f, 0x0f, 0xaf, 0xcf, 0xf2, 0x92, 0x32, 0x52,
+ 0x48, 0x28, 0x88, 0xe8, 0xd5, 0xb5, 0x15, 0x75,
+ 0x21, 0x41, 0xe1, 0x81, 0xbc, 0xdc, 0x7c, 0x1c,
+ 0x06, 0x66, 0xc6, 0xa6, 0x9b, 0xfb, 0x5b, 0x3b,
+ 0xf3, 0x93, 0x33, 0x53, 0x6e, 0x0e, 0xae, 0xce,
+ 0xd4, 0xb4, 0x14, 0x74, 0x49, 0x29, 0x89, 0xe9,
+ 0xbd, 0xdd, 0x7d, 0x1d, 0x20, 0x40, 0xe0, 0x80,
+ 0x9a, 0xfa, 0x5a, 0x3a, 0x07, 0x67, 0xc7, 0xa7,
+ },
+ {
+ 0x00, 0x61, 0xc2, 0xa3, 0x99, 0xf8, 0x5b, 0x3a,
+ 0x2f, 0x4e, 0xed, 0x8c, 0xb6, 0xd7, 0x74, 0x15,
+ 0x5e, 0x3f, 0x9c, 0xfd, 0xc7, 0xa6, 0x05, 0x64,
+ 0x71, 0x10, 0xb3, 0xd2, 0xe8, 0x89, 0x2a, 0x4b,
+ 0xbc, 0xdd, 0x7e, 0x1f, 0x25, 0x44, 0xe7, 0x86,
+ 0x93, 0xf2, 0x51, 0x30, 0x0a, 0x6b, 0xc8, 0xa9,
+ 0xe2, 0x83, 0x20, 0x41, 0x7b, 0x1a, 0xb9, 0xd8,
+ 0xcd, 0xac, 0x0f, 0x6e, 0x54, 0x35, 0x96, 0xf7,
+ 0x65, 0x04, 0xa7, 0xc6, 0xfc, 0x9d, 0x3e, 0x5f,
+ 0x4a, 0x2b, 0x88, 0xe9, 0xd3, 0xb2, 0x11, 0x70,
+ 0x3b, 0x5a, 0xf9, 0x98, 0xa2, 0xc3, 0x60, 0x01,
+ 0x14, 0x75, 0xd6, 0xb7, 0x8d, 0xec, 0x4f, 0x2e,
+ 0xd9, 0xb8, 0x1b, 0x7a, 0x40, 0x21, 0x82, 0xe3,
+ 0xf6, 0x97, 0x34, 0x55, 0x6f, 0x0e, 0xad, 0xcc,
+ 0x87, 0xe6, 0x45, 0x24, 0x1e, 0x7f, 0xdc, 0xbd,
+ 0xa8, 0xc9, 0x6a, 0x0b, 0x31, 0x50, 0xf3, 0x92,
+ 0xca, 0xab, 0x08, 0x69, 0x53, 0x32, 0x91, 0xf0,
+ 0xe5, 0x84, 0x27, 0x46, 0x7c, 0x1d, 0xbe, 0xdf,
+ 0x94, 0xf5, 0x56, 0x37, 0x0d, 0x6c, 0xcf, 0xae,
+ 0xbb, 0xda, 0x79, 0x18, 0x22, 0x43, 0xe0, 0x81,
+ 0x76, 0x17, 0xb4, 0xd5, 0xef, 0x8e, 0x2d, 0x4c,
+ 0x59, 0x38, 0x9b, 0xfa, 0xc0, 0xa1, 0x02, 0x63,
+ 0x28, 0x49, 0xea, 0x8b, 0xb1, 0xd0, 0x73, 0x12,
+ 0x07, 0x66, 0xc5, 0xa4, 0x9e, 0xff, 0x5c, 0x3d,
+ 0xaf, 0xce, 0x6d, 0x0c, 0x36, 0x57, 0xf4, 0x95,
+ 0x80, 0xe1, 0x42, 0x23, 0x19, 0x78, 0xdb, 0xba,
+ 0xf1, 0x90, 0x33, 0x52, 0x68, 0x09, 0xaa, 0xcb,
+ 0xde, 0xbf, 0x1c, 0x7d, 0x47, 0x26, 0x85, 0xe4,
+ 0x13, 0x72, 0xd1, 0xb0, 0x8a, 0xeb, 0x48, 0x29,
+ 0x3c, 0x5d, 0xfe, 0x9f, 0xa5, 0xc4, 0x67, 0x06,
+ 0x4d, 0x2c, 0x8f, 0xee, 0xd4, 0xb5, 0x16, 0x77,
+ 0x62, 0x03, 0xa0, 0xc1, 0xfb, 0x9a, 0x39, 0x58,
+ },
+ {
+ 0x00, 0x62, 0xc4, 0xa6, 0x95, 0xf7, 0x51, 0x33,
+ 0x37, 0x55, 0xf3, 0x91, 0xa2, 0xc0, 0x66, 0x04,
+ 0x6e, 0x0c, 0xaa, 0xc8, 0xfb, 0x99, 0x3f, 0x5d,
+ 0x59, 0x3b, 0x9d, 0xff, 0xcc, 0xae, 0x08, 0x6a,
+ 0xdc, 0xbe, 0x18, 0x7a, 0x49, 0x2b, 0x8d, 0xef,
+ 0xeb, 0x89, 0x2f, 0x4d, 0x7e, 0x1c, 0xba, 0xd8,
+ 0xb2, 0xd0, 0x76, 0x14, 0x27, 0x45, 0xe3, 0x81,
+ 0x85, 0xe7, 0x41, 0x23, 0x10, 0x72, 0xd4, 0xb6,
+ 0xa5, 0xc7, 0x61, 0x03, 0x30, 0x52, 0xf4, 0x96,
+ 0x92, 0xf0, 0x56, 0x34, 0x07, 0x65, 0xc3, 0xa1,
+ 0xcb, 0xa9, 0x0f, 0x6d, 0x5e, 0x3c, 0x9a, 0xf8,
+ 0xfc, 0x9e, 0x38, 0x5a, 0x69, 0x0b, 0xad, 0xcf,
+ 0x79, 0x1b, 0xbd, 0xdf, 0xec, 0x8e, 0x28, 0x4a,
+ 0x4e, 0x2c, 0x8a, 0xe8, 0xdb, 0xb9, 0x1f, 0x7d,
+ 0x17, 0x75, 0xd3, 0xb1, 0x82, 0xe0, 0x46, 0x24,
+ 0x20, 0x42, 0xe4, 0x86, 0xb5, 0xd7, 0x71, 0x13,
+ 0x57, 0x35, 0x93, 0xf1, 0xc2, 0xa0, 0x06, 0x64,
+ 0x60, 0x02, 0xa4, 0xc6, 0xf5, 0x97, 0x31, 0x53,
+ 0x39, 0x5b, 0xfd, 0x9f, 0xac, 0xce, 0x68, 0x0a,
+ 0x0e, 0x6c, 0xca, 0xa8, 0x9b, 0xf9, 0x5f, 0x3d,
+ 0x8b, 0xe9, 0x4f, 0x2d, 0x1e, 0x7c, 0xda, 0xb8,
+ 0xbc, 0xde, 0x78, 0x1a, 0x29, 0x4b, 0xed, 0x8f,
+ 0xe5, 0x87, 0x21, 0x43, 0x70, 0x12, 0xb4, 0xd6,
+ 0xd2, 0xb0, 0x16, 0x74, 0x47, 0x25, 0x83, 0xe1,
+ 0xf2, 0x90, 0x36, 0x54, 0x67, 0x05, 0xa3, 0xc1,
+ 0xc5, 0xa7, 0x01, 0x63, 0x50, 0x32, 0x94, 0xf6,
+ 0x9c, 0xfe, 0x58, 0x3a, 0x09, 0x6b, 0xcd, 0xaf,
+ 0xab, 0xc9, 0x6f, 0x0d, 0x3e, 0x5c, 0xfa, 0x98,
+ 0x2e, 0x4c, 0xea, 0x88, 0xbb, 0xd9, 0x7f, 0x1d,
+ 0x19, 0x7b, 0xdd, 0xbf, 0x8c, 0xee, 0x48, 0x2a,
+ 0x40, 0x22, 0x84, 0xe6, 0xd5, 0xb7, 0x11, 0x73,
+ 0x77, 0x15, 0xb3, 0xd1, 0xe2, 0x80, 0x26, 0x44,
+ },
+ {
+ 0x00, 0x63, 0xc6, 0xa5, 0x91, 0xf2, 0x57, 0x34,
+ 0x3f, 0x5c, 0xf9, 0x9a, 0xae, 0xcd, 0x68, 0x0b,
+ 0x7e, 0x1d, 0xb8, 0xdb, 0xef, 0x8c, 0x29, 0x4a,
+ 0x41, 0x22, 0x87, 0xe4, 0xd0, 0xb3, 0x16, 0x75,
+ 0xfc, 0x9f, 0x3a, 0x59, 0x6d, 0x0e, 0xab, 0xc8,
+ 0xc3, 0xa0, 0x05, 0x66, 0x52, 0x31, 0x94, 0xf7,
+ 0x82, 0xe1, 0x44, 0x27, 0x13, 0x70, 0xd5, 0xb6,
+ 0xbd, 0xde, 0x7b, 0x18, 0x2c, 0x4f, 0xea, 0x89,
+ 0xe5, 0x86, 0x23, 0x40, 0x74, 0x17, 0xb2, 0xd1,
+ 0xda, 0xb9, 0x1c, 0x7f, 0x4b, 0x28, 0x8d, 0xee,
+ 0x9b, 0xf8, 0x5d, 0x3e, 0x0a, 0x69, 0xcc, 0xaf,
+ 0xa4, 0xc7, 0x62, 0x01, 0x35, 0x56, 0xf3, 0x90,
+ 0x19, 0x7a, 0xdf, 0xbc, 0x88, 0xeb, 0x4e, 0x2d,
+ 0x26, 0x45, 0xe0, 0x83, 0xb7, 0xd4, 0x71, 0x12,
+ 0x67, 0x04, 0xa1, 0xc2, 0xf6, 0x95, 0x30, 0x53,
+ 0x58, 0x3b, 0x9e, 0xfd, 0xc9, 0xaa, 0x0f, 0x6c,
+ 0xd7, 0xb4, 0x11, 0x72, 0x46, 0x25, 0x80, 0xe3,
+ 0xe8, 0x8b, 0x2e, 0x4d, 0x79, 0x1a, 0xbf, 0xdc,
+ 0xa9, 0xca, 0x6f, 0x0c, 0x38, 0x5b, 0xfe, 0x9d,
+ 0x96, 0xf5, 0x50, 0x33, 0x07, 0x64, 0xc1, 0xa2,
+ 0x2b, 0x48, 0xed, 0x8e, 0xba, 0xd9, 0x7c, 0x1f,
+ 0x14, 0x77, 0xd2, 0xb1, 0x85, 0xe6, 0x43, 0x20,
+ 0x55, 0x36, 0x93, 0xf0, 0xc4, 0xa7, 0x02, 0x61,
+ 0x6a, 0x09, 0xac, 0xcf, 0xfb, 0x98, 0x3d, 0x5e,
+ 0x32, 0x51, 0xf4, 0x97, 0xa3, 0xc0, 0x65, 0x06,
+ 0x0d, 0x6e, 0xcb, 0xa8, 0x9c, 0xff, 0x5a, 0x39,
+ 0x4c, 0x2f, 0x8a, 0xe9, 0xdd, 0xbe, 0x1b, 0x78,
+ 0x73, 0x10, 0xb5, 0xd6, 0xe2, 0x81, 0x24, 0x47,
+ 0xce, 0xad, 0x08, 0x6b, 0x5f, 0x3c, 0x99, 0xfa,
+ 0xf1, 0x92, 0x37, 0x54, 0x60, 0x03, 0xa6, 0xc5,
+ 0xb0, 0xd3, 0x76, 0x15, 0x21, 0x42, 0xe7, 0x84,
+ 0x8f, 0xec, 0x49, 0x2a, 0x1e, 0x7d, 0xd8, 0xbb,
+ },
+ {
+ 0x00, 0x64, 0xc8, 0xac, 0x8d, 0xe9, 0x45, 0x21,
+ 0x07, 0x63, 0xcf, 0xab, 0x8a, 0xee, 0x42, 0x26,
+ 0x0e, 0x6a, 0xc6, 0xa2, 0x83, 0xe7, 0x4b, 0x2f,
+ 0x09, 0x6d, 0xc1, 0xa5, 0x84, 0xe0, 0x4c, 0x28,
+ 0x1c, 0x78, 0xd4, 0xb0, 0x91, 0xf5, 0x59, 0x3d,
+ 0x1b, 0x7f, 0xd3, 0xb7, 0x96, 0xf2, 0x5e, 0x3a,
+ 0x12, 0x76, 0xda, 0xbe, 0x9f, 0xfb, 0x57, 0x33,
+ 0x15, 0x71, 0xdd, 0xb9, 0x98, 0xfc, 0x50, 0x34,
+ 0x38, 0x5c, 0xf0, 0x94, 0xb5, 0xd1, 0x7d, 0x19,
+ 0x3f, 0x5b, 0xf7, 0x93, 0xb2, 0xd6, 0x7a, 0x1e,
+ 0x36, 0x52, 0xfe, 0x9a, 0xbb, 0xdf, 0x73, 0x17,
+ 0x31, 0x55, 0xf9, 0x9d, 0xbc, 0xd8, 0x74, 0x10,
+ 0x24, 0x40, 0xec, 0x88, 0xa9, 0xcd, 0x61, 0x05,
+ 0x23, 0x47, 0xeb, 0x8f, 0xae, 0xca, 0x66, 0x02,
+ 0x2a, 0x4e, 0xe2, 0x86, 0xa7, 0xc3, 0x6f, 0x0b,
+ 0x2d, 0x49, 0xe5, 0x81, 0xa0, 0xc4, 0x68, 0x0c,
+ 0x70, 0x14, 0xb8, 0xdc, 0xfd, 0x99, 0x35, 0x51,
+ 0x77, 0x13, 0xbf, 0xdb, 0xfa, 0x9e, 0x32, 0x56,
+ 0x7e, 0x1a, 0xb6, 0xd2, 0xf3, 0x97, 0x3b, 0x5f,
+ 0x79, 0x1d, 0xb1, 0xd5, 0xf4, 0x90, 0x3c, 0x58,
+ 0x6c, 0x08, 0xa4, 0xc0, 0xe1, 0x85, 0x29, 0x4d,
+ 0x6b, 0x0f, 0xa3, 0xc7, 0xe6, 0x82, 0x2e, 0x4a,
+ 0x62, 0x06, 0xaa, 0xce, 0xef, 0x8b, 0x27, 0x43,
+ 0x65, 0x01, 0xad, 0xc9, 0xe8, 0x8c, 0x20, 0x44,
+ 0x48, 0x2c, 0x80, 0xe4, 0xc5, 0xa1, 0x0d, 0x69,
+ 0x4f, 0x2b, 0x87, 0xe3, 0xc2, 0xa6, 0x0a, 0x6e,
+ 0x46, 0x22, 0x8e, 0xea, 0xcb, 0xaf, 0x03, 0x67,
+ 0x41, 0x25, 0x89, 0xed, 0xcc, 0xa8, 0x04, 0x60,
+ 0x54, 0x30, 0x9c, 0xf8, 0xd9, 0xbd, 0x11, 0x75,
+ 0x53, 0x37, 0x9b, 0xff, 0xde, 0xba, 0x16, 0x72,
+ 0x5a, 0x3e, 0x92, 0xf6, 0xd7, 0xb3, 0x1f, 0x7b,
+ 0x5d, 0x39, 0x95, 0xf1, 0xd0, 0xb4, 0x18, 0x7c,
+ },
+ {
+ 0x00, 0x65, 0xca, 0xaf, 0x89, 0xec, 0x43, 0x26,
+ 0x0f, 0x6a, 0xc5, 0xa0, 0x86, 0xe3, 0x4c, 0x29,
+ 0x1e, 0x7b, 0xd4, 0xb1, 0x97, 0xf2, 0x5d, 0x38,
+ 0x11, 0x74, 0xdb, 0xbe, 0x98, 0xfd, 0x52, 0x37,
+ 0x3c, 0x59, 0xf6, 0x93, 0xb5, 0xd0, 0x7f, 0x1a,
+ 0x33, 0x56, 0xf9, 0x9c, 0xba, 0xdf, 0x70, 0x15,
+ 0x22, 0x47, 0xe8, 0x8d, 0xab, 0xce, 0x61, 0x04,
+ 0x2d, 0x48, 0xe7, 0x82, 0xa4, 0xc1, 0x6e, 0x0b,
+ 0x78, 0x1d, 0xb2, 0xd7, 0xf1, 0x94, 0x3b, 0x5e,
+ 0x77, 0x12, 0xbd, 0xd8, 0xfe, 0x9b, 0x34, 0x51,
+ 0x66, 0x03, 0xac, 0xc9, 0xef, 0x8a, 0x25, 0x40,
+ 0x69, 0x0c, 0xa3, 0xc6, 0xe0, 0x85, 0x2a, 0x4f,
+ 0x44, 0x21, 0x8e, 0xeb, 0xcd, 0xa8, 0x07, 0x62,
+ 0x4b, 0x2e, 0x81, 0xe4, 0xc2, 0xa7, 0x08, 0x6d,
+ 0x5a, 0x3f, 0x90, 0xf5, 0xd3, 0xb6, 0x19, 0x7c,
+ 0x55, 0x30, 0x9f, 0xfa, 0xdc, 0xb9, 0x16, 0x73,
+ 0xf0, 0x95, 0x3a, 0x5f, 0x79, 0x1c, 0xb3, 0xd6,
+ 0xff, 0x9a, 0x35, 0x50, 0x76, 0x13, 0xbc, 0xd9,
+ 0xee, 0x8b, 0x24, 0x41, 0x67, 0x02, 0xad, 0xc8,
+ 0xe1, 0x84, 0x2b, 0x4e, 0x68, 0x0d, 0xa2, 0xc7,
+ 0xcc, 0xa9, 0x06, 0x63, 0x45, 0x20, 0x8f, 0xea,
+ 0xc3, 0xa6, 0x09, 0x6c, 0x4a, 0x2f, 0x80, 0xe5,
+ 0xd2, 0xb7, 0x18, 0x7d, 0x5b, 0x3e, 0x91, 0xf4,
+ 0xdd, 0xb8, 0x17, 0x72, 0x54, 0x31, 0x9e, 0xfb,
+ 0x88, 0xed, 0x42, 0x27, 0x01, 0x64, 0xcb, 0xae,
+ 0x87, 0xe2, 0x4d, 0x28, 0x0e, 0x6b, 0xc4, 0xa1,
+ 0x96, 0xf3, 0x5c, 0x39, 0x1f, 0x7a, 0xd5, 0xb0,
+ 0x99, 0xfc, 0x53, 0x36, 0x10, 0x75, 0xda, 0xbf,
+ 0xb4, 0xd1, 0x7e, 0x1b, 0x3d, 0x58, 0xf7, 0x92,
+ 0xbb, 0xde, 0x71, 0x14, 0x32, 0x57, 0xf8, 0x9d,
+ 0xaa, 0xcf, 0x60, 0x05, 0x23, 0x46, 0xe9, 0x8c,
+ 0xa5, 0xc0, 0x6f, 0x0a, 0x2c, 0x49, 0xe6, 0x83,
+ },
+ {
+ 0x00, 0x66, 0xcc, 0xaa, 0x85, 0xe3, 0x49, 0x2f,
+ 0x17, 0x71, 0xdb, 0xbd, 0x92, 0xf4, 0x5e, 0x38,
+ 0x2e, 0x48, 0xe2, 0x84, 0xab, 0xcd, 0x67, 0x01,
+ 0x39, 0x5f, 0xf5, 0x93, 0xbc, 0xda, 0x70, 0x16,
+ 0x5c, 0x3a, 0x90, 0xf6, 0xd9, 0xbf, 0x15, 0x73,
+ 0x4b, 0x2d, 0x87, 0xe1, 0xce, 0xa8, 0x02, 0x64,
+ 0x72, 0x14, 0xbe, 0xd8, 0xf7, 0x91, 0x3b, 0x5d,
+ 0x65, 0x03, 0xa9, 0xcf, 0xe0, 0x86, 0x2c, 0x4a,
+ 0xb8, 0xde, 0x74, 0x12, 0x3d, 0x5b, 0xf1, 0x97,
+ 0xaf, 0xc9, 0x63, 0x05, 0x2a, 0x4c, 0xe6, 0x80,
+ 0x96, 0xf0, 0x5a, 0x3c, 0x13, 0x75, 0xdf, 0xb9,
+ 0x81, 0xe7, 0x4d, 0x2b, 0x04, 0x62, 0xc8, 0xae,
+ 0xe4, 0x82, 0x28, 0x4e, 0x61, 0x07, 0xad, 0xcb,
+ 0xf3, 0x95, 0x3f, 0x59, 0x76, 0x10, 0xba, 0xdc,
+ 0xca, 0xac, 0x06, 0x60, 0x4f, 0x29, 0x83, 0xe5,
+ 0xdd, 0xbb, 0x11, 0x77, 0x58, 0x3e, 0x94, 0xf2,
+ 0x6d, 0x0b, 0xa1, 0xc7, 0xe8, 0x8e, 0x24, 0x42,
+ 0x7a, 0x1c, 0xb6, 0xd0, 0xff, 0x99, 0x33, 0x55,
+ 0x43, 0x25, 0x8f, 0xe9, 0xc6, 0xa0, 0x0a, 0x6c,
+ 0x54, 0x32, 0x98, 0xfe, 0xd1, 0xb7, 0x1d, 0x7b,
+ 0x31, 0x57, 0xfd, 0x9b, 0xb4, 0xd2, 0x78, 0x1e,
+ 0x26, 0x40, 0xea, 0x8c, 0xa3, 0xc5, 0x6f, 0x09,
+ 0x1f, 0x79, 0xd3, 0xb5, 0x9a, 0xfc, 0x56, 0x30,
+ 0x08, 0x6e, 0xc4, 0xa2, 0x8d, 0xeb, 0x41, 0x27,
+ 0xd5, 0xb3, 0x19, 0x7f, 0x50, 0x36, 0x9c, 0xfa,
+ 0xc2, 0xa4, 0x0e, 0x68, 0x47, 0x21, 0x8b, 0xed,
+ 0xfb, 0x9d, 0x37, 0x51, 0x7e, 0x18, 0xb2, 0xd4,
+ 0xec, 0x8a, 0x20, 0x46, 0x69, 0x0f, 0xa5, 0xc3,
+ 0x89, 0xef, 0x45, 0x23, 0x0c, 0x6a, 0xc0, 0xa6,
+ 0x9e, 0xf8, 0x52, 0x34, 0x1b, 0x7d, 0xd7, 0xb1,
+ 0xa7, 0xc1, 0x6b, 0x0d, 0x22, 0x44, 0xee, 0x88,
+ 0xb0, 0xd6, 0x7c, 0x1a, 0x35, 0x53, 0xf9, 0x9f,
+ },
+ {
+ 0x00, 0x67, 0xce, 0xa9, 0x81, 0xe6, 0x4f, 0x28,
+ 0x1f, 0x78, 0xd1, 0xb6, 0x9e, 0xf9, 0x50, 0x37,
+ 0x3e, 0x59, 0xf0, 0x97, 0xbf, 0xd8, 0x71, 0x16,
+ 0x21, 0x46, 0xef, 0x88, 0xa0, 0xc7, 0x6e, 0x09,
+ 0x7c, 0x1b, 0xb2, 0xd5, 0xfd, 0x9a, 0x33, 0x54,
+ 0x63, 0x04, 0xad, 0xca, 0xe2, 0x85, 0x2c, 0x4b,
+ 0x42, 0x25, 0x8c, 0xeb, 0xc3, 0xa4, 0x0d, 0x6a,
+ 0x5d, 0x3a, 0x93, 0xf4, 0xdc, 0xbb, 0x12, 0x75,
+ 0xf8, 0x9f, 0x36, 0x51, 0x79, 0x1e, 0xb7, 0xd0,
+ 0xe7, 0x80, 0x29, 0x4e, 0x66, 0x01, 0xa8, 0xcf,
+ 0xc6, 0xa1, 0x08, 0x6f, 0x47, 0x20, 0x89, 0xee,
+ 0xd9, 0xbe, 0x17, 0x70, 0x58, 0x3f, 0x96, 0xf1,
+ 0x84, 0xe3, 0x4a, 0x2d, 0x05, 0x62, 0xcb, 0xac,
+ 0x9b, 0xfc, 0x55, 0x32, 0x1a, 0x7d, 0xd4, 0xb3,
+ 0xba, 0xdd, 0x74, 0x13, 0x3b, 0x5c, 0xf5, 0x92,
+ 0xa5, 0xc2, 0x6b, 0x0c, 0x24, 0x43, 0xea, 0x8d,
+ 0xed, 0x8a, 0x23, 0x44, 0x6c, 0x0b, 0xa2, 0xc5,
+ 0xf2, 0x95, 0x3c, 0x5b, 0x73, 0x14, 0xbd, 0xda,
+ 0xd3, 0xb4, 0x1d, 0x7a, 0x52, 0x35, 0x9c, 0xfb,
+ 0xcc, 0xab, 0x02, 0x65, 0x4d, 0x2a, 0x83, 0xe4,
+ 0x91, 0xf6, 0x5f, 0x38, 0x10, 0x77, 0xde, 0xb9,
+ 0x8e, 0xe9, 0x40, 0x27, 0x0f, 0x68, 0xc1, 0xa6,
+ 0xaf, 0xc8, 0x61, 0x06, 0x2e, 0x49, 0xe0, 0x87,
+ 0xb0, 0xd7, 0x7e, 0x19, 0x31, 0x56, 0xff, 0x98,
+ 0x15, 0x72, 0xdb, 0xbc, 0x94, 0xf3, 0x5a, 0x3d,
+ 0x0a, 0x6d, 0xc4, 0xa3, 0x8b, 0xec, 0x45, 0x22,
+ 0x2b, 0x4c, 0xe5, 0x82, 0xaa, 0xcd, 0x64, 0x03,
+ 0x34, 0x53, 0xfa, 0x9d, 0xb5, 0xd2, 0x7b, 0x1c,
+ 0x69, 0x0e, 0xa7, 0xc0, 0xe8, 0x8f, 0x26, 0x41,
+ 0x76, 0x11, 0xb8, 0xdf, 0xf7, 0x90, 0x39, 0x5e,
+ 0x57, 0x30, 0x99, 0xfe, 0xd6, 0xb1, 0x18, 0x7f,
+ 0x48, 0x2f, 0x86, 0xe1, 0xc9, 0xae, 0x07, 0x60,
+ },
+ {
+ 0x00, 0x68, 0xd0, 0xb8, 0xbd, 0xd5, 0x6d, 0x05,
+ 0x67, 0x0f, 0xb7, 0xdf, 0xda, 0xb2, 0x0a, 0x62,
+ 0xce, 0xa6, 0x1e, 0x76, 0x73, 0x1b, 0xa3, 0xcb,
+ 0xa9, 0xc1, 0x79, 0x11, 0x14, 0x7c, 0xc4, 0xac,
+ 0x81, 0xe9, 0x51, 0x39, 0x3c, 0x54, 0xec, 0x84,
+ 0xe6, 0x8e, 0x36, 0x5e, 0x5b, 0x33, 0x8b, 0xe3,
+ 0x4f, 0x27, 0x9f, 0xf7, 0xf2, 0x9a, 0x22, 0x4a,
+ 0x28, 0x40, 0xf8, 0x90, 0x95, 0xfd, 0x45, 0x2d,
+ 0x1f, 0x77, 0xcf, 0xa7, 0xa2, 0xca, 0x72, 0x1a,
+ 0x78, 0x10, 0xa8, 0xc0, 0xc5, 0xad, 0x15, 0x7d,
+ 0xd1, 0xb9, 0x01, 0x69, 0x6c, 0x04, 0xbc, 0xd4,
+ 0xb6, 0xde, 0x66, 0x0e, 0x0b, 0x63, 0xdb, 0xb3,
+ 0x9e, 0xf6, 0x4e, 0x26, 0x23, 0x4b, 0xf3, 0x9b,
+ 0xf9, 0x91, 0x29, 0x41, 0x44, 0x2c, 0x94, 0xfc,
+ 0x50, 0x38, 0x80, 0xe8, 0xed, 0x85, 0x3d, 0x55,
+ 0x37, 0x5f, 0xe7, 0x8f, 0x8a, 0xe2, 0x5a, 0x32,
+ 0x3e, 0x56, 0xee, 0x86, 0x83, 0xeb, 0x53, 0x3b,
+ 0x59, 0x31, 0x89, 0xe1, 0xe4, 0x8c, 0x34, 0x5c,
+ 0xf0, 0x98, 0x20, 0x48, 0x4d, 0x25, 0x9d, 0xf5,
+ 0x97, 0xff, 0x47, 0x2f, 0x2a, 0x42, 0xfa, 0x92,
+ 0xbf, 0xd7, 0x6f, 0x07, 0x02, 0x6a, 0xd2, 0xba,
+ 0xd8, 0xb0, 0x08, 0x60, 0x65, 0x0d, 0xb5, 0xdd,
+ 0x71, 0x19, 0xa1, 0xc9, 0xcc, 0xa4, 0x1c, 0x74,
+ 0x16, 0x7e, 0xc6, 0xae, 0xab, 0xc3, 0x7b, 0x13,
+ 0x21, 0x49, 0xf1, 0x99, 0x9c, 0xf4, 0x4c, 0x24,
+ 0x46, 0x2e, 0x96, 0xfe, 0xfb, 0x93, 0x2b, 0x43,
+ 0xef, 0x87, 0x3f, 0x57, 0x52, 0x3a, 0x82, 0xea,
+ 0x88, 0xe0, 0x58, 0x30, 0x35, 0x5d, 0xe5, 0x8d,
+ 0xa0, 0xc8, 0x70, 0x18, 0x1d, 0x75, 0xcd, 0xa5,
+ 0xc7, 0xaf, 0x17, 0x7f, 0x7a, 0x12, 0xaa, 0xc2,
+ 0x6e, 0x06, 0xbe, 0xd6, 0xd3, 0xbb, 0x03, 0x6b,
+ 0x09, 0x61, 0xd9, 0xb1, 0xb4, 0xdc, 0x64, 0x0c,
+ },
+ {
+ 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d,
+ 0xde, 0xb7, 0x0c, 0x65, 0x67, 0x0e, 0xb5, 0xdc,
+ 0xb1, 0xd8, 0x63, 0x0a, 0x08, 0x61, 0xda, 0xb3,
+ 0xa1, 0xc8, 0x73, 0x1a, 0x18, 0x71, 0xca, 0xa3,
+ 0xce, 0xa7, 0x1c, 0x75, 0x77, 0x1e, 0xa5, 0xcc,
+ 0x7f, 0x16, 0xad, 0xc4, 0xc6, 0xaf, 0x14, 0x7d,
+ 0x10, 0x79, 0xc2, 0xab, 0xa9, 0xc0, 0x7b, 0x12,
+ 0x5f, 0x36, 0x8d, 0xe4, 0xe6, 0x8f, 0x34, 0x5d,
+ 0x30, 0x59, 0xe2, 0x8b, 0x89, 0xe0, 0x5b, 0x32,
+ 0x81, 0xe8, 0x53, 0x3a, 0x38, 0x51, 0xea, 0x83,
+ 0xee, 0x87, 0x3c, 0x55, 0x57, 0x3e, 0x85, 0xec,
+ 0xfe, 0x97, 0x2c, 0x45, 0x47, 0x2e, 0x95, 0xfc,
+ 0x91, 0xf8, 0x43, 0x2a, 0x28, 0x41, 0xfa, 0x93,
+ 0x20, 0x49, 0xf2, 0x9b, 0x99, 0xf0, 0x4b, 0x22,
+ 0x4f, 0x26, 0x9d, 0xf4, 0xf6, 0x9f, 0x24, 0x4d,
+ 0xbe, 0xd7, 0x6c, 0x05, 0x07, 0x6e, 0xd5, 0xbc,
+ 0xd1, 0xb8, 0x03, 0x6a, 0x68, 0x01, 0xba, 0xd3,
+ 0x60, 0x09, 0xb2, 0xdb, 0xd9, 0xb0, 0x0b, 0x62,
+ 0x0f, 0x66, 0xdd, 0xb4, 0xb6, 0xdf, 0x64, 0x0d,
+ 0x1f, 0x76, 0xcd, 0xa4, 0xa6, 0xcf, 0x74, 0x1d,
+ 0x70, 0x19, 0xa2, 0xcb, 0xc9, 0xa0, 0x1b, 0x72,
+ 0xc1, 0xa8, 0x13, 0x7a, 0x78, 0x11, 0xaa, 0xc3,
+ 0xae, 0xc7, 0x7c, 0x15, 0x17, 0x7e, 0xc5, 0xac,
+ 0xe1, 0x88, 0x33, 0x5a, 0x58, 0x31, 0x8a, 0xe3,
+ 0x8e, 0xe7, 0x5c, 0x35, 0x37, 0x5e, 0xe5, 0x8c,
+ 0x3f, 0x56, 0xed, 0x84, 0x86, 0xef, 0x54, 0x3d,
+ 0x50, 0x39, 0x82, 0xeb, 0xe9, 0x80, 0x3b, 0x52,
+ 0x40, 0x29, 0x92, 0xfb, 0xf9, 0x90, 0x2b, 0x42,
+ 0x2f, 0x46, 0xfd, 0x94, 0x96, 0xff, 0x44, 0x2d,
+ 0x9e, 0xf7, 0x4c, 0x25, 0x27, 0x4e, 0xf5, 0x9c,
+ 0xf1, 0x98, 0x23, 0x4a, 0x48, 0x21, 0x9a, 0xf3,
+ },
+ {
+ 0x00, 0x6a, 0xd4, 0xbe, 0xb5, 0xdf, 0x61, 0x0b,
+ 0x77, 0x1d, 0xa3, 0xc9, 0xc2, 0xa8, 0x16, 0x7c,
+ 0xee, 0x84, 0x3a, 0x50, 0x5b, 0x31, 0x8f, 0xe5,
+ 0x99, 0xf3, 0x4d, 0x27, 0x2c, 0x46, 0xf8, 0x92,
+ 0xc1, 0xab, 0x15, 0x7f, 0x74, 0x1e, 0xa0, 0xca,
+ 0xb6, 0xdc, 0x62, 0x08, 0x03, 0x69, 0xd7, 0xbd,
+ 0x2f, 0x45, 0xfb, 0x91, 0x9a, 0xf0, 0x4e, 0x24,
+ 0x58, 0x32, 0x8c, 0xe6, 0xed, 0x87, 0x39, 0x53,
+ 0x9f, 0xf5, 0x4b, 0x21, 0x2a, 0x40, 0xfe, 0x94,
+ 0xe8, 0x82, 0x3c, 0x56, 0x5d, 0x37, 0x89, 0xe3,
+ 0x71, 0x1b, 0xa5, 0xcf, 0xc4, 0xae, 0x10, 0x7a,
+ 0x06, 0x6c, 0xd2, 0xb8, 0xb3, 0xd9, 0x67, 0x0d,
+ 0x5e, 0x34, 0x8a, 0xe0, 0xeb, 0x81, 0x3f, 0x55,
+ 0x29, 0x43, 0xfd, 0x97, 0x9c, 0xf6, 0x48, 0x22,
+ 0xb0, 0xda, 0x64, 0x0e, 0x05, 0x6f, 0xd1, 0xbb,
+ 0xc7, 0xad, 0x13, 0x79, 0x72, 0x18, 0xa6, 0xcc,
+ 0x23, 0x49, 0xf7, 0x9d, 0x96, 0xfc, 0x42, 0x28,
+ 0x54, 0x3e, 0x80, 0xea, 0xe1, 0x8b, 0x35, 0x5f,
+ 0xcd, 0xa7, 0x19, 0x73, 0x78, 0x12, 0xac, 0xc6,
+ 0xba, 0xd0, 0x6e, 0x04, 0x0f, 0x65, 0xdb, 0xb1,
+ 0xe2, 0x88, 0x36, 0x5c, 0x57, 0x3d, 0x83, 0xe9,
+ 0x95, 0xff, 0x41, 0x2b, 0x20, 0x4a, 0xf4, 0x9e,
+ 0x0c, 0x66, 0xd8, 0xb2, 0xb9, 0xd3, 0x6d, 0x07,
+ 0x7b, 0x11, 0xaf, 0xc5, 0xce, 0xa4, 0x1a, 0x70,
+ 0xbc, 0xd6, 0x68, 0x02, 0x09, 0x63, 0xdd, 0xb7,
+ 0xcb, 0xa1, 0x1f, 0x75, 0x7e, 0x14, 0xaa, 0xc0,
+ 0x52, 0x38, 0x86, 0xec, 0xe7, 0x8d, 0x33, 0x59,
+ 0x25, 0x4f, 0xf1, 0x9b, 0x90, 0xfa, 0x44, 0x2e,
+ 0x7d, 0x17, 0xa9, 0xc3, 0xc8, 0xa2, 0x1c, 0x76,
+ 0x0a, 0x60, 0xde, 0xb4, 0xbf, 0xd5, 0x6b, 0x01,
+ 0x93, 0xf9, 0x47, 0x2d, 0x26, 0x4c, 0xf2, 0x98,
+ 0xe4, 0x8e, 0x30, 0x5a, 0x51, 0x3b, 0x85, 0xef,
+ },
+ {
+ 0x00, 0x6b, 0xd6, 0xbd, 0xb1, 0xda, 0x67, 0x0c,
+ 0x7f, 0x14, 0xa9, 0xc2, 0xce, 0xa5, 0x18, 0x73,
+ 0xfe, 0x95, 0x28, 0x43, 0x4f, 0x24, 0x99, 0xf2,
+ 0x81, 0xea, 0x57, 0x3c, 0x30, 0x5b, 0xe6, 0x8d,
+ 0xe1, 0x8a, 0x37, 0x5c, 0x50, 0x3b, 0x86, 0xed,
+ 0x9e, 0xf5, 0x48, 0x23, 0x2f, 0x44, 0xf9, 0x92,
+ 0x1f, 0x74, 0xc9, 0xa2, 0xae, 0xc5, 0x78, 0x13,
+ 0x60, 0x0b, 0xb6, 0xdd, 0xd1, 0xba, 0x07, 0x6c,
+ 0xdf, 0xb4, 0x09, 0x62, 0x6e, 0x05, 0xb8, 0xd3,
+ 0xa0, 0xcb, 0x76, 0x1d, 0x11, 0x7a, 0xc7, 0xac,
+ 0x21, 0x4a, 0xf7, 0x9c, 0x90, 0xfb, 0x46, 0x2d,
+ 0x5e, 0x35, 0x88, 0xe3, 0xef, 0x84, 0x39, 0x52,
+ 0x3e, 0x55, 0xe8, 0x83, 0x8f, 0xe4, 0x59, 0x32,
+ 0x41, 0x2a, 0x97, 0xfc, 0xf0, 0x9b, 0x26, 0x4d,
+ 0xc0, 0xab, 0x16, 0x7d, 0x71, 0x1a, 0xa7, 0xcc,
+ 0xbf, 0xd4, 0x69, 0x02, 0x0e, 0x65, 0xd8, 0xb3,
+ 0xa3, 0xc8, 0x75, 0x1e, 0x12, 0x79, 0xc4, 0xaf,
+ 0xdc, 0xb7, 0x0a, 0x61, 0x6d, 0x06, 0xbb, 0xd0,
+ 0x5d, 0x36, 0x8b, 0xe0, 0xec, 0x87, 0x3a, 0x51,
+ 0x22, 0x49, 0xf4, 0x9f, 0x93, 0xf8, 0x45, 0x2e,
+ 0x42, 0x29, 0x94, 0xff, 0xf3, 0x98, 0x25, 0x4e,
+ 0x3d, 0x56, 0xeb, 0x80, 0x8c, 0xe7, 0x5a, 0x31,
+ 0xbc, 0xd7, 0x6a, 0x01, 0x0d, 0x66, 0xdb, 0xb0,
+ 0xc3, 0xa8, 0x15, 0x7e, 0x72, 0x19, 0xa4, 0xcf,
+ 0x7c, 0x17, 0xaa, 0xc1, 0xcd, 0xa6, 0x1b, 0x70,
+ 0x03, 0x68, 0xd5, 0xbe, 0xb2, 0xd9, 0x64, 0x0f,
+ 0x82, 0xe9, 0x54, 0x3f, 0x33, 0x58, 0xe5, 0x8e,
+ 0xfd, 0x96, 0x2b, 0x40, 0x4c, 0x27, 0x9a, 0xf1,
+ 0x9d, 0xf6, 0x4b, 0x20, 0x2c, 0x47, 0xfa, 0x91,
+ 0xe2, 0x89, 0x34, 0x5f, 0x53, 0x38, 0x85, 0xee,
+ 0x63, 0x08, 0xb5, 0xde, 0xd2, 0xb9, 0x04, 0x6f,
+ 0x1c, 0x77, 0xca, 0xa1, 0xad, 0xc6, 0x7b, 0x10,
+ },
+ {
+ 0x00, 0x6c, 0xd8, 0xb4, 0xad, 0xc1, 0x75, 0x19,
+ 0x47, 0x2b, 0x9f, 0xf3, 0xea, 0x86, 0x32, 0x5e,
+ 0x8e, 0xe2, 0x56, 0x3a, 0x23, 0x4f, 0xfb, 0x97,
+ 0xc9, 0xa5, 0x11, 0x7d, 0x64, 0x08, 0xbc, 0xd0,
+ 0x01, 0x6d, 0xd9, 0xb5, 0xac, 0xc0, 0x74, 0x18,
+ 0x46, 0x2a, 0x9e, 0xf2, 0xeb, 0x87, 0x33, 0x5f,
+ 0x8f, 0xe3, 0x57, 0x3b, 0x22, 0x4e, 0xfa, 0x96,
+ 0xc8, 0xa4, 0x10, 0x7c, 0x65, 0x09, 0xbd, 0xd1,
+ 0x02, 0x6e, 0xda, 0xb6, 0xaf, 0xc3, 0x77, 0x1b,
+ 0x45, 0x29, 0x9d, 0xf1, 0xe8, 0x84, 0x30, 0x5c,
+ 0x8c, 0xe0, 0x54, 0x38, 0x21, 0x4d, 0xf9, 0x95,
+ 0xcb, 0xa7, 0x13, 0x7f, 0x66, 0x0a, 0xbe, 0xd2,
+ 0x03, 0x6f, 0xdb, 0xb7, 0xae, 0xc2, 0x76, 0x1a,
+ 0x44, 0x28, 0x9c, 0xf0, 0xe9, 0x85, 0x31, 0x5d,
+ 0x8d, 0xe1, 0x55, 0x39, 0x20, 0x4c, 0xf8, 0x94,
+ 0xca, 0xa6, 0x12, 0x7e, 0x67, 0x0b, 0xbf, 0xd3,
+ 0x04, 0x68, 0xdc, 0xb0, 0xa9, 0xc5, 0x71, 0x1d,
+ 0x43, 0x2f, 0x9b, 0xf7, 0xee, 0x82, 0x36, 0x5a,
+ 0x8a, 0xe6, 0x52, 0x3e, 0x27, 0x4b, 0xff, 0x93,
+ 0xcd, 0xa1, 0x15, 0x79, 0x60, 0x0c, 0xb8, 0xd4,
+ 0x05, 0x69, 0xdd, 0xb1, 0xa8, 0xc4, 0x70, 0x1c,
+ 0x42, 0x2e, 0x9a, 0xf6, 0xef, 0x83, 0x37, 0x5b,
+ 0x8b, 0xe7, 0x53, 0x3f, 0x26, 0x4a, 0xfe, 0x92,
+ 0xcc, 0xa0, 0x14, 0x78, 0x61, 0x0d, 0xb9, 0xd5,
+ 0x06, 0x6a, 0xde, 0xb2, 0xab, 0xc7, 0x73, 0x1f,
+ 0x41, 0x2d, 0x99, 0xf5, 0xec, 0x80, 0x34, 0x58,
+ 0x88, 0xe4, 0x50, 0x3c, 0x25, 0x49, 0xfd, 0x91,
+ 0xcf, 0xa3, 0x17, 0x7b, 0x62, 0x0e, 0xba, 0xd6,
+ 0x07, 0x6b, 0xdf, 0xb3, 0xaa, 0xc6, 0x72, 0x1e,
+ 0x40, 0x2c, 0x98, 0xf4, 0xed, 0x81, 0x35, 0x59,
+ 0x89, 0xe5, 0x51, 0x3d, 0x24, 0x48, 0xfc, 0x90,
+ 0xce, 0xa2, 0x16, 0x7a, 0x63, 0x0f, 0xbb, 0xd7,
+ },
+ {
+ 0x00, 0x6d, 0xda, 0xb7, 0xa9, 0xc4, 0x73, 0x1e,
+ 0x4f, 0x22, 0x95, 0xf8, 0xe6, 0x8b, 0x3c, 0x51,
+ 0x9e, 0xf3, 0x44, 0x29, 0x37, 0x5a, 0xed, 0x80,
+ 0xd1, 0xbc, 0x0b, 0x66, 0x78, 0x15, 0xa2, 0xcf,
+ 0x21, 0x4c, 0xfb, 0x96, 0x88, 0xe5, 0x52, 0x3f,
+ 0x6e, 0x03, 0xb4, 0xd9, 0xc7, 0xaa, 0x1d, 0x70,
+ 0xbf, 0xd2, 0x65, 0x08, 0x16, 0x7b, 0xcc, 0xa1,
+ 0xf0, 0x9d, 0x2a, 0x47, 0x59, 0x34, 0x83, 0xee,
+ 0x42, 0x2f, 0x98, 0xf5, 0xeb, 0x86, 0x31, 0x5c,
+ 0x0d, 0x60, 0xd7, 0xba, 0xa4, 0xc9, 0x7e, 0x13,
+ 0xdc, 0xb1, 0x06, 0x6b, 0x75, 0x18, 0xaf, 0xc2,
+ 0x93, 0xfe, 0x49, 0x24, 0x3a, 0x57, 0xe0, 0x8d,
+ 0x63, 0x0e, 0xb9, 0xd4, 0xca, 0xa7, 0x10, 0x7d,
+ 0x2c, 0x41, 0xf6, 0x9b, 0x85, 0xe8, 0x5f, 0x32,
+ 0xfd, 0x90, 0x27, 0x4a, 0x54, 0x39, 0x8e, 0xe3,
+ 0xb2, 0xdf, 0x68, 0x05, 0x1b, 0x76, 0xc1, 0xac,
+ 0x84, 0xe9, 0x5e, 0x33, 0x2d, 0x40, 0xf7, 0x9a,
+ 0xcb, 0xa6, 0x11, 0x7c, 0x62, 0x0f, 0xb8, 0xd5,
+ 0x1a, 0x77, 0xc0, 0xad, 0xb3, 0xde, 0x69, 0x04,
+ 0x55, 0x38, 0x8f, 0xe2, 0xfc, 0x91, 0x26, 0x4b,
+ 0xa5, 0xc8, 0x7f, 0x12, 0x0c, 0x61, 0xd6, 0xbb,
+ 0xea, 0x87, 0x30, 0x5d, 0x43, 0x2e, 0x99, 0xf4,
+ 0x3b, 0x56, 0xe1, 0x8c, 0x92, 0xff, 0x48, 0x25,
+ 0x74, 0x19, 0xae, 0xc3, 0xdd, 0xb0, 0x07, 0x6a,
+ 0xc6, 0xab, 0x1c, 0x71, 0x6f, 0x02, 0xb5, 0xd8,
+ 0x89, 0xe4, 0x53, 0x3e, 0x20, 0x4d, 0xfa, 0x97,
+ 0x58, 0x35, 0x82, 0xef, 0xf1, 0x9c, 0x2b, 0x46,
+ 0x17, 0x7a, 0xcd, 0xa0, 0xbe, 0xd3, 0x64, 0x09,
+ 0xe7, 0x8a, 0x3d, 0x50, 0x4e, 0x23, 0x94, 0xf9,
+ 0xa8, 0xc5, 0x72, 0x1f, 0x01, 0x6c, 0xdb, 0xb6,
+ 0x79, 0x14, 0xa3, 0xce, 0xd0, 0xbd, 0x0a, 0x67,
+ 0x36, 0x5b, 0xec, 0x81, 0x9f, 0xf2, 0x45, 0x28,
+ },
+ {
+ 0x00, 0x6e, 0xdc, 0xb2, 0xa5, 0xcb, 0x79, 0x17,
+ 0x57, 0x39, 0x8b, 0xe5, 0xf2, 0x9c, 0x2e, 0x40,
+ 0xae, 0xc0, 0x72, 0x1c, 0x0b, 0x65, 0xd7, 0xb9,
+ 0xf9, 0x97, 0x25, 0x4b, 0x5c, 0x32, 0x80, 0xee,
+ 0x41, 0x2f, 0x9d, 0xf3, 0xe4, 0x8a, 0x38, 0x56,
+ 0x16, 0x78, 0xca, 0xa4, 0xb3, 0xdd, 0x6f, 0x01,
+ 0xef, 0x81, 0x33, 0x5d, 0x4a, 0x24, 0x96, 0xf8,
+ 0xb8, 0xd6, 0x64, 0x0a, 0x1d, 0x73, 0xc1, 0xaf,
+ 0x82, 0xec, 0x5e, 0x30, 0x27, 0x49, 0xfb, 0x95,
+ 0xd5, 0xbb, 0x09, 0x67, 0x70, 0x1e, 0xac, 0xc2,
+ 0x2c, 0x42, 0xf0, 0x9e, 0x89, 0xe7, 0x55, 0x3b,
+ 0x7b, 0x15, 0xa7, 0xc9, 0xde, 0xb0, 0x02, 0x6c,
+ 0xc3, 0xad, 0x1f, 0x71, 0x66, 0x08, 0xba, 0xd4,
+ 0x94, 0xfa, 0x48, 0x26, 0x31, 0x5f, 0xed, 0x83,
+ 0x6d, 0x03, 0xb1, 0xdf, 0xc8, 0xa6, 0x14, 0x7a,
+ 0x3a, 0x54, 0xe6, 0x88, 0x9f, 0xf1, 0x43, 0x2d,
+ 0x19, 0x77, 0xc5, 0xab, 0xbc, 0xd2, 0x60, 0x0e,
+ 0x4e, 0x20, 0x92, 0xfc, 0xeb, 0x85, 0x37, 0x59,
+ 0xb7, 0xd9, 0x6b, 0x05, 0x12, 0x7c, 0xce, 0xa0,
+ 0xe0, 0x8e, 0x3c, 0x52, 0x45, 0x2b, 0x99, 0xf7,
+ 0x58, 0x36, 0x84, 0xea, 0xfd, 0x93, 0x21, 0x4f,
+ 0x0f, 0x61, 0xd3, 0xbd, 0xaa, 0xc4, 0x76, 0x18,
+ 0xf6, 0x98, 0x2a, 0x44, 0x53, 0x3d, 0x8f, 0xe1,
+ 0xa1, 0xcf, 0x7d, 0x13, 0x04, 0x6a, 0xd8, 0xb6,
+ 0x9b, 0xf5, 0x47, 0x29, 0x3e, 0x50, 0xe2, 0x8c,
+ 0xcc, 0xa2, 0x10, 0x7e, 0x69, 0x07, 0xb5, 0xdb,
+ 0x35, 0x5b, 0xe9, 0x87, 0x90, 0xfe, 0x4c, 0x22,
+ 0x62, 0x0c, 0xbe, 0xd0, 0xc7, 0xa9, 0x1b, 0x75,
+ 0xda, 0xb4, 0x06, 0x68, 0x7f, 0x11, 0xa3, 0xcd,
+ 0x8d, 0xe3, 0x51, 0x3f, 0x28, 0x46, 0xf4, 0x9a,
+ 0x74, 0x1a, 0xa8, 0xc6, 0xd1, 0xbf, 0x0d, 0x63,
+ 0x23, 0x4d, 0xff, 0x91, 0x86, 0xe8, 0x5a, 0x34,
+ },
+ {
+ 0x00, 0x6f, 0xde, 0xb1, 0xa1, 0xce, 0x7f, 0x10,
+ 0x5f, 0x30, 0x81, 0xee, 0xfe, 0x91, 0x20, 0x4f,
+ 0xbe, 0xd1, 0x60, 0x0f, 0x1f, 0x70, 0xc1, 0xae,
+ 0xe1, 0x8e, 0x3f, 0x50, 0x40, 0x2f, 0x9e, 0xf1,
+ 0x61, 0x0e, 0xbf, 0xd0, 0xc0, 0xaf, 0x1e, 0x71,
+ 0x3e, 0x51, 0xe0, 0x8f, 0x9f, 0xf0, 0x41, 0x2e,
+ 0xdf, 0xb0, 0x01, 0x6e, 0x7e, 0x11, 0xa0, 0xcf,
+ 0x80, 0xef, 0x5e, 0x31, 0x21, 0x4e, 0xff, 0x90,
+ 0xc2, 0xad, 0x1c, 0x73, 0x63, 0x0c, 0xbd, 0xd2,
+ 0x9d, 0xf2, 0x43, 0x2c, 0x3c, 0x53, 0xe2, 0x8d,
+ 0x7c, 0x13, 0xa2, 0xcd, 0xdd, 0xb2, 0x03, 0x6c,
+ 0x23, 0x4c, 0xfd, 0x92, 0x82, 0xed, 0x5c, 0x33,
+ 0xa3, 0xcc, 0x7d, 0x12, 0x02, 0x6d, 0xdc, 0xb3,
+ 0xfc, 0x93, 0x22, 0x4d, 0x5d, 0x32, 0x83, 0xec,
+ 0x1d, 0x72, 0xc3, 0xac, 0xbc, 0xd3, 0x62, 0x0d,
+ 0x42, 0x2d, 0x9c, 0xf3, 0xe3, 0x8c, 0x3d, 0x52,
+ 0x99, 0xf6, 0x47, 0x28, 0x38, 0x57, 0xe6, 0x89,
+ 0xc6, 0xa9, 0x18, 0x77, 0x67, 0x08, 0xb9, 0xd6,
+ 0x27, 0x48, 0xf9, 0x96, 0x86, 0xe9, 0x58, 0x37,
+ 0x78, 0x17, 0xa6, 0xc9, 0xd9, 0xb6, 0x07, 0x68,
+ 0xf8, 0x97, 0x26, 0x49, 0x59, 0x36, 0x87, 0xe8,
+ 0xa7, 0xc8, 0x79, 0x16, 0x06, 0x69, 0xd8, 0xb7,
+ 0x46, 0x29, 0x98, 0xf7, 0xe7, 0x88, 0x39, 0x56,
+ 0x19, 0x76, 0xc7, 0xa8, 0xb8, 0xd7, 0x66, 0x09,
+ 0x5b, 0x34, 0x85, 0xea, 0xfa, 0x95, 0x24, 0x4b,
+ 0x04, 0x6b, 0xda, 0xb5, 0xa5, 0xca, 0x7b, 0x14,
+ 0xe5, 0x8a, 0x3b, 0x54, 0x44, 0x2b, 0x9a, 0xf5,
+ 0xba, 0xd5, 0x64, 0x0b, 0x1b, 0x74, 0xc5, 0xaa,
+ 0x3a, 0x55, 0xe4, 0x8b, 0x9b, 0xf4, 0x45, 0x2a,
+ 0x65, 0x0a, 0xbb, 0xd4, 0xc4, 0xab, 0x1a, 0x75,
+ 0x84, 0xeb, 0x5a, 0x35, 0x25, 0x4a, 0xfb, 0x94,
+ 0xdb, 0xb4, 0x05, 0x6a, 0x7a, 0x15, 0xa4, 0xcb,
+ },
+ {
+ 0x00, 0x70, 0xe0, 0x90, 0xdd, 0xad, 0x3d, 0x4d,
+ 0xa7, 0xd7, 0x47, 0x37, 0x7a, 0x0a, 0x9a, 0xea,
+ 0x53, 0x23, 0xb3, 0xc3, 0x8e, 0xfe, 0x6e, 0x1e,
+ 0xf4, 0x84, 0x14, 0x64, 0x29, 0x59, 0xc9, 0xb9,
+ 0xa6, 0xd6, 0x46, 0x36, 0x7b, 0x0b, 0x9b, 0xeb,
+ 0x01, 0x71, 0xe1, 0x91, 0xdc, 0xac, 0x3c, 0x4c,
+ 0xf5, 0x85, 0x15, 0x65, 0x28, 0x58, 0xc8, 0xb8,
+ 0x52, 0x22, 0xb2, 0xc2, 0x8f, 0xff, 0x6f, 0x1f,
+ 0x51, 0x21, 0xb1, 0xc1, 0x8c, 0xfc, 0x6c, 0x1c,
+ 0xf6, 0x86, 0x16, 0x66, 0x2b, 0x5b, 0xcb, 0xbb,
+ 0x02, 0x72, 0xe2, 0x92, 0xdf, 0xaf, 0x3f, 0x4f,
+ 0xa5, 0xd5, 0x45, 0x35, 0x78, 0x08, 0x98, 0xe8,
+ 0xf7, 0x87, 0x17, 0x67, 0x2a, 0x5a, 0xca, 0xba,
+ 0x50, 0x20, 0xb0, 0xc0, 0x8d, 0xfd, 0x6d, 0x1d,
+ 0xa4, 0xd4, 0x44, 0x34, 0x79, 0x09, 0x99, 0xe9,
+ 0x03, 0x73, 0xe3, 0x93, 0xde, 0xae, 0x3e, 0x4e,
+ 0xa2, 0xd2, 0x42, 0x32, 0x7f, 0x0f, 0x9f, 0xef,
+ 0x05, 0x75, 0xe5, 0x95, 0xd8, 0xa8, 0x38, 0x48,
+ 0xf1, 0x81, 0x11, 0x61, 0x2c, 0x5c, 0xcc, 0xbc,
+ 0x56, 0x26, 0xb6, 0xc6, 0x8b, 0xfb, 0x6b, 0x1b,
+ 0x04, 0x74, 0xe4, 0x94, 0xd9, 0xa9, 0x39, 0x49,
+ 0xa3, 0xd3, 0x43, 0x33, 0x7e, 0x0e, 0x9e, 0xee,
+ 0x57, 0x27, 0xb7, 0xc7, 0x8a, 0xfa, 0x6a, 0x1a,
+ 0xf0, 0x80, 0x10, 0x60, 0x2d, 0x5d, 0xcd, 0xbd,
+ 0xf3, 0x83, 0x13, 0x63, 0x2e, 0x5e, 0xce, 0xbe,
+ 0x54, 0x24, 0xb4, 0xc4, 0x89, 0xf9, 0x69, 0x19,
+ 0xa0, 0xd0, 0x40, 0x30, 0x7d, 0x0d, 0x9d, 0xed,
+ 0x07, 0x77, 0xe7, 0x97, 0xda, 0xaa, 0x3a, 0x4a,
+ 0x55, 0x25, 0xb5, 0xc5, 0x88, 0xf8, 0x68, 0x18,
+ 0xf2, 0x82, 0x12, 0x62, 0x2f, 0x5f, 0xcf, 0xbf,
+ 0x06, 0x76, 0xe6, 0x96, 0xdb, 0xab, 0x3b, 0x4b,
+ 0xa1, 0xd1, 0x41, 0x31, 0x7c, 0x0c, 0x9c, 0xec,
+ },
+ {
+ 0x00, 0x71, 0xe2, 0x93, 0xd9, 0xa8, 0x3b, 0x4a,
+ 0xaf, 0xde, 0x4d, 0x3c, 0x76, 0x07, 0x94, 0xe5,
+ 0x43, 0x32, 0xa1, 0xd0, 0x9a, 0xeb, 0x78, 0x09,
+ 0xec, 0x9d, 0x0e, 0x7f, 0x35, 0x44, 0xd7, 0xa6,
+ 0x86, 0xf7, 0x64, 0x15, 0x5f, 0x2e, 0xbd, 0xcc,
+ 0x29, 0x58, 0xcb, 0xba, 0xf0, 0x81, 0x12, 0x63,
+ 0xc5, 0xb4, 0x27, 0x56, 0x1c, 0x6d, 0xfe, 0x8f,
+ 0x6a, 0x1b, 0x88, 0xf9, 0xb3, 0xc2, 0x51, 0x20,
+ 0x11, 0x60, 0xf3, 0x82, 0xc8, 0xb9, 0x2a, 0x5b,
+ 0xbe, 0xcf, 0x5c, 0x2d, 0x67, 0x16, 0x85, 0xf4,
+ 0x52, 0x23, 0xb0, 0xc1, 0x8b, 0xfa, 0x69, 0x18,
+ 0xfd, 0x8c, 0x1f, 0x6e, 0x24, 0x55, 0xc6, 0xb7,
+ 0x97, 0xe6, 0x75, 0x04, 0x4e, 0x3f, 0xac, 0xdd,
+ 0x38, 0x49, 0xda, 0xab, 0xe1, 0x90, 0x03, 0x72,
+ 0xd4, 0xa5, 0x36, 0x47, 0x0d, 0x7c, 0xef, 0x9e,
+ 0x7b, 0x0a, 0x99, 0xe8, 0xa2, 0xd3, 0x40, 0x31,
+ 0x22, 0x53, 0xc0, 0xb1, 0xfb, 0x8a, 0x19, 0x68,
+ 0x8d, 0xfc, 0x6f, 0x1e, 0x54, 0x25, 0xb6, 0xc7,
+ 0x61, 0x10, 0x83, 0xf2, 0xb8, 0xc9, 0x5a, 0x2b,
+ 0xce, 0xbf, 0x2c, 0x5d, 0x17, 0x66, 0xf5, 0x84,
+ 0xa4, 0xd5, 0x46, 0x37, 0x7d, 0x0c, 0x9f, 0xee,
+ 0x0b, 0x7a, 0xe9, 0x98, 0xd2, 0xa3, 0x30, 0x41,
+ 0xe7, 0x96, 0x05, 0x74, 0x3e, 0x4f, 0xdc, 0xad,
+ 0x48, 0x39, 0xaa, 0xdb, 0x91, 0xe0, 0x73, 0x02,
+ 0x33, 0x42, 0xd1, 0xa0, 0xea, 0x9b, 0x08, 0x79,
+ 0x9c, 0xed, 0x7e, 0x0f, 0x45, 0x34, 0xa7, 0xd6,
+ 0x70, 0x01, 0x92, 0xe3, 0xa9, 0xd8, 0x4b, 0x3a,
+ 0xdf, 0xae, 0x3d, 0x4c, 0x06, 0x77, 0xe4, 0x95,
+ 0xb5, 0xc4, 0x57, 0x26, 0x6c, 0x1d, 0x8e, 0xff,
+ 0x1a, 0x6b, 0xf8, 0x89, 0xc3, 0xb2, 0x21, 0x50,
+ 0xf6, 0x87, 0x14, 0x65, 0x2f, 0x5e, 0xcd, 0xbc,
+ 0x59, 0x28, 0xbb, 0xca, 0x80, 0xf1, 0x62, 0x13,
+ },
+ {
+ 0x00, 0x72, 0xe4, 0x96, 0xd5, 0xa7, 0x31, 0x43,
+ 0xb7, 0xc5, 0x53, 0x21, 0x62, 0x10, 0x86, 0xf4,
+ 0x73, 0x01, 0x97, 0xe5, 0xa6, 0xd4, 0x42, 0x30,
+ 0xc4, 0xb6, 0x20, 0x52, 0x11, 0x63, 0xf5, 0x87,
+ 0xe6, 0x94, 0x02, 0x70, 0x33, 0x41, 0xd7, 0xa5,
+ 0x51, 0x23, 0xb5, 0xc7, 0x84, 0xf6, 0x60, 0x12,
+ 0x95, 0xe7, 0x71, 0x03, 0x40, 0x32, 0xa4, 0xd6,
+ 0x22, 0x50, 0xc6, 0xb4, 0xf7, 0x85, 0x13, 0x61,
+ 0xd1, 0xa3, 0x35, 0x47, 0x04, 0x76, 0xe0, 0x92,
+ 0x66, 0x14, 0x82, 0xf0, 0xb3, 0xc1, 0x57, 0x25,
+ 0xa2, 0xd0, 0x46, 0x34, 0x77, 0x05, 0x93, 0xe1,
+ 0x15, 0x67, 0xf1, 0x83, 0xc0, 0xb2, 0x24, 0x56,
+ 0x37, 0x45, 0xd3, 0xa1, 0xe2, 0x90, 0x06, 0x74,
+ 0x80, 0xf2, 0x64, 0x16, 0x55, 0x27, 0xb1, 0xc3,
+ 0x44, 0x36, 0xa0, 0xd2, 0x91, 0xe3, 0x75, 0x07,
+ 0xf3, 0x81, 0x17, 0x65, 0x26, 0x54, 0xc2, 0xb0,
+ 0xbf, 0xcd, 0x5b, 0x29, 0x6a, 0x18, 0x8e, 0xfc,
+ 0x08, 0x7a, 0xec, 0x9e, 0xdd, 0xaf, 0x39, 0x4b,
+ 0xcc, 0xbe, 0x28, 0x5a, 0x19, 0x6b, 0xfd, 0x8f,
+ 0x7b, 0x09, 0x9f, 0xed, 0xae, 0xdc, 0x4a, 0x38,
+ 0x59, 0x2b, 0xbd, 0xcf, 0x8c, 0xfe, 0x68, 0x1a,
+ 0xee, 0x9c, 0x0a, 0x78, 0x3b, 0x49, 0xdf, 0xad,
+ 0x2a, 0x58, 0xce, 0xbc, 0xff, 0x8d, 0x1b, 0x69,
+ 0x9d, 0xef, 0x79, 0x0b, 0x48, 0x3a, 0xac, 0xde,
+ 0x6e, 0x1c, 0x8a, 0xf8, 0xbb, 0xc9, 0x5f, 0x2d,
+ 0xd9, 0xab, 0x3d, 0x4f, 0x0c, 0x7e, 0xe8, 0x9a,
+ 0x1d, 0x6f, 0xf9, 0x8b, 0xc8, 0xba, 0x2c, 0x5e,
+ 0xaa, 0xd8, 0x4e, 0x3c, 0x7f, 0x0d, 0x9b, 0xe9,
+ 0x88, 0xfa, 0x6c, 0x1e, 0x5d, 0x2f, 0xb9, 0xcb,
+ 0x3f, 0x4d, 0xdb, 0xa9, 0xea, 0x98, 0x0e, 0x7c,
+ 0xfb, 0x89, 0x1f, 0x6d, 0x2e, 0x5c, 0xca, 0xb8,
+ 0x4c, 0x3e, 0xa8, 0xda, 0x99, 0xeb, 0x7d, 0x0f,
+ },
+ {
+ 0x00, 0x73, 0xe6, 0x95, 0xd1, 0xa2, 0x37, 0x44,
+ 0xbf, 0xcc, 0x59, 0x2a, 0x6e, 0x1d, 0x88, 0xfb,
+ 0x63, 0x10, 0x85, 0xf6, 0xb2, 0xc1, 0x54, 0x27,
+ 0xdc, 0xaf, 0x3a, 0x49, 0x0d, 0x7e, 0xeb, 0x98,
+ 0xc6, 0xb5, 0x20, 0x53, 0x17, 0x64, 0xf1, 0x82,
+ 0x79, 0x0a, 0x9f, 0xec, 0xa8, 0xdb, 0x4e, 0x3d,
+ 0xa5, 0xd6, 0x43, 0x30, 0x74, 0x07, 0x92, 0xe1,
+ 0x1a, 0x69, 0xfc, 0x8f, 0xcb, 0xb8, 0x2d, 0x5e,
+ 0x91, 0xe2, 0x77, 0x04, 0x40, 0x33, 0xa6, 0xd5,
+ 0x2e, 0x5d, 0xc8, 0xbb, 0xff, 0x8c, 0x19, 0x6a,
+ 0xf2, 0x81, 0x14, 0x67, 0x23, 0x50, 0xc5, 0xb6,
+ 0x4d, 0x3e, 0xab, 0xd8, 0x9c, 0xef, 0x7a, 0x09,
+ 0x57, 0x24, 0xb1, 0xc2, 0x86, 0xf5, 0x60, 0x13,
+ 0xe8, 0x9b, 0x0e, 0x7d, 0x39, 0x4a, 0xdf, 0xac,
+ 0x34, 0x47, 0xd2, 0xa1, 0xe5, 0x96, 0x03, 0x70,
+ 0x8b, 0xf8, 0x6d, 0x1e, 0x5a, 0x29, 0xbc, 0xcf,
+ 0x3f, 0x4c, 0xd9, 0xaa, 0xee, 0x9d, 0x08, 0x7b,
+ 0x80, 0xf3, 0x66, 0x15, 0x51, 0x22, 0xb7, 0xc4,
+ 0x5c, 0x2f, 0xba, 0xc9, 0x8d, 0xfe, 0x6b, 0x18,
+ 0xe3, 0x90, 0x05, 0x76, 0x32, 0x41, 0xd4, 0xa7,
+ 0xf9, 0x8a, 0x1f, 0x6c, 0x28, 0x5b, 0xce, 0xbd,
+ 0x46, 0x35, 0xa0, 0xd3, 0x97, 0xe4, 0x71, 0x02,
+ 0x9a, 0xe9, 0x7c, 0x0f, 0x4b, 0x38, 0xad, 0xde,
+ 0x25, 0x56, 0xc3, 0xb0, 0xf4, 0x87, 0x12, 0x61,
+ 0xae, 0xdd, 0x48, 0x3b, 0x7f, 0x0c, 0x99, 0xea,
+ 0x11, 0x62, 0xf7, 0x84, 0xc0, 0xb3, 0x26, 0x55,
+ 0xcd, 0xbe, 0x2b, 0x58, 0x1c, 0x6f, 0xfa, 0x89,
+ 0x72, 0x01, 0x94, 0xe7, 0xa3, 0xd0, 0x45, 0x36,
+ 0x68, 0x1b, 0x8e, 0xfd, 0xb9, 0xca, 0x5f, 0x2c,
+ 0xd7, 0xa4, 0x31, 0x42, 0x06, 0x75, 0xe0, 0x93,
+ 0x0b, 0x78, 0xed, 0x9e, 0xda, 0xa9, 0x3c, 0x4f,
+ 0xb4, 0xc7, 0x52, 0x21, 0x65, 0x16, 0x83, 0xf0,
+ },
+ {
+ 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6,
+ 0x13, 0x67, 0xfb, 0x8f, 0xde, 0xaa, 0x36, 0x42,
+ 0x94, 0xe0, 0x7c, 0x08, 0x59, 0x2d, 0xb1, 0xc5,
+ 0x26, 0x52, 0xce, 0xba, 0xeb, 0x9f, 0x03, 0x77,
+ 0xa1, 0xd5, 0x49, 0x3d, 0x6c, 0x18, 0x84, 0xf0,
+ 0x35, 0x41, 0xdd, 0xa9, 0xf8, 0x8c, 0x10, 0x64,
+ 0xb2, 0xc6, 0x5a, 0x2e, 0x7f, 0x0b, 0x97, 0xe3,
+ 0x4c, 0x38, 0xa4, 0xd0, 0x81, 0xf5, 0x69, 0x1d,
+ 0xcb, 0xbf, 0x23, 0x57, 0x06, 0x72, 0xee, 0x9a,
+ 0x5f, 0x2b, 0xb7, 0xc3, 0x92, 0xe6, 0x7a, 0x0e,
+ 0xd8, 0xac, 0x30, 0x44, 0x15, 0x61, 0xfd, 0x89,
+ 0x6a, 0x1e, 0x82, 0xf6, 0xa7, 0xd3, 0x4f, 0x3b,
+ 0xed, 0x99, 0x05, 0x71, 0x20, 0x54, 0xc8, 0xbc,
+ 0x79, 0x0d, 0x91, 0xe5, 0xb4, 0xc0, 0x5c, 0x28,
+ 0xfe, 0x8a, 0x16, 0x62, 0x33, 0x47, 0xdb, 0xaf,
+ 0x98, 0xec, 0x70, 0x04, 0x55, 0x21, 0xbd, 0xc9,
+ 0x1f, 0x6b, 0xf7, 0x83, 0xd2, 0xa6, 0x3a, 0x4e,
+ 0x8b, 0xff, 0x63, 0x17, 0x46, 0x32, 0xae, 0xda,
+ 0x0c, 0x78, 0xe4, 0x90, 0xc1, 0xb5, 0x29, 0x5d,
+ 0xbe, 0xca, 0x56, 0x22, 0x73, 0x07, 0x9b, 0xef,
+ 0x39, 0x4d, 0xd1, 0xa5, 0xf4, 0x80, 0x1c, 0x68,
+ 0xad, 0xd9, 0x45, 0x31, 0x60, 0x14, 0x88, 0xfc,
+ 0x2a, 0x5e, 0xc2, 0xb6, 0xe7, 0x93, 0x0f, 0x7b,
+ 0xd4, 0xa0, 0x3c, 0x48, 0x19, 0x6d, 0xf1, 0x85,
+ 0x53, 0x27, 0xbb, 0xcf, 0x9e, 0xea, 0x76, 0x02,
+ 0xc7, 0xb3, 0x2f, 0x5b, 0x0a, 0x7e, 0xe2, 0x96,
+ 0x40, 0x34, 0xa8, 0xdc, 0x8d, 0xf9, 0x65, 0x11,
+ 0xf2, 0x86, 0x1a, 0x6e, 0x3f, 0x4b, 0xd7, 0xa3,
+ 0x75, 0x01, 0x9d, 0xe9, 0xb8, 0xcc, 0x50, 0x24,
+ 0xe1, 0x95, 0x09, 0x7d, 0x2c, 0x58, 0xc4, 0xb0,
+ 0x66, 0x12, 0x8e, 0xfa, 0xab, 0xdf, 0x43, 0x37,
+ },
+ {
+ 0x00, 0x75, 0xea, 0x9f, 0xc9, 0xbc, 0x23, 0x56,
+ 0x8f, 0xfa, 0x65, 0x10, 0x46, 0x33, 0xac, 0xd9,
+ 0x03, 0x76, 0xe9, 0x9c, 0xca, 0xbf, 0x20, 0x55,
+ 0x8c, 0xf9, 0x66, 0x13, 0x45, 0x30, 0xaf, 0xda,
+ 0x06, 0x73, 0xec, 0x99, 0xcf, 0xba, 0x25, 0x50,
+ 0x89, 0xfc, 0x63, 0x16, 0x40, 0x35, 0xaa, 0xdf,
+ 0x05, 0x70, 0xef, 0x9a, 0xcc, 0xb9, 0x26, 0x53,
+ 0x8a, 0xff, 0x60, 0x15, 0x43, 0x36, 0xa9, 0xdc,
+ 0x0c, 0x79, 0xe6, 0x93, 0xc5, 0xb0, 0x2f, 0x5a,
+ 0x83, 0xf6, 0x69, 0x1c, 0x4a, 0x3f, 0xa0, 0xd5,
+ 0x0f, 0x7a, 0xe5, 0x90, 0xc6, 0xb3, 0x2c, 0x59,
+ 0x80, 0xf5, 0x6a, 0x1f, 0x49, 0x3c, 0xa3, 0xd6,
+ 0x0a, 0x7f, 0xe0, 0x95, 0xc3, 0xb6, 0x29, 0x5c,
+ 0x85, 0xf0, 0x6f, 0x1a, 0x4c, 0x39, 0xa6, 0xd3,
+ 0x09, 0x7c, 0xe3, 0x96, 0xc0, 0xb5, 0x2a, 0x5f,
+ 0x86, 0xf3, 0x6c, 0x19, 0x4f, 0x3a, 0xa5, 0xd0,
+ 0x18, 0x6d, 0xf2, 0x87, 0xd1, 0xa4, 0x3b, 0x4e,
+ 0x97, 0xe2, 0x7d, 0x08, 0x5e, 0x2b, 0xb4, 0xc1,
+ 0x1b, 0x6e, 0xf1, 0x84, 0xd2, 0xa7, 0x38, 0x4d,
+ 0x94, 0xe1, 0x7e, 0x0b, 0x5d, 0x28, 0xb7, 0xc2,
+ 0x1e, 0x6b, 0xf4, 0x81, 0xd7, 0xa2, 0x3d, 0x48,
+ 0x91, 0xe4, 0x7b, 0x0e, 0x58, 0x2d, 0xb2, 0xc7,
+ 0x1d, 0x68, 0xf7, 0x82, 0xd4, 0xa1, 0x3e, 0x4b,
+ 0x92, 0xe7, 0x78, 0x0d, 0x5b, 0x2e, 0xb1, 0xc4,
+ 0x14, 0x61, 0xfe, 0x8b, 0xdd, 0xa8, 0x37, 0x42,
+ 0x9b, 0xee, 0x71, 0x04, 0x52, 0x27, 0xb8, 0xcd,
+ 0x17, 0x62, 0xfd, 0x88, 0xde, 0xab, 0x34, 0x41,
+ 0x98, 0xed, 0x72, 0x07, 0x51, 0x24, 0xbb, 0xce,
+ 0x12, 0x67, 0xf8, 0x8d, 0xdb, 0xae, 0x31, 0x44,
+ 0x9d, 0xe8, 0x77, 0x02, 0x54, 0x21, 0xbe, 0xcb,
+ 0x11, 0x64, 0xfb, 0x8e, 0xd8, 0xad, 0x32, 0x47,
+ 0x9e, 0xeb, 0x74, 0x01, 0x57, 0x22, 0xbd, 0xc8,
+ },
+ {
+ 0x00, 0x76, 0xec, 0x9a, 0xc5, 0xb3, 0x29, 0x5f,
+ 0x97, 0xe1, 0x7b, 0x0d, 0x52, 0x24, 0xbe, 0xc8,
+ 0x33, 0x45, 0xdf, 0xa9, 0xf6, 0x80, 0x1a, 0x6c,
+ 0xa4, 0xd2, 0x48, 0x3e, 0x61, 0x17, 0x8d, 0xfb,
+ 0x66, 0x10, 0x8a, 0xfc, 0xa3, 0xd5, 0x4f, 0x39,
+ 0xf1, 0x87, 0x1d, 0x6b, 0x34, 0x42, 0xd8, 0xae,
+ 0x55, 0x23, 0xb9, 0xcf, 0x90, 0xe6, 0x7c, 0x0a,
+ 0xc2, 0xb4, 0x2e, 0x58, 0x07, 0x71, 0xeb, 0x9d,
+ 0xcc, 0xba, 0x20, 0x56, 0x09, 0x7f, 0xe5, 0x93,
+ 0x5b, 0x2d, 0xb7, 0xc1, 0x9e, 0xe8, 0x72, 0x04,
+ 0xff, 0x89, 0x13, 0x65, 0x3a, 0x4c, 0xd6, 0xa0,
+ 0x68, 0x1e, 0x84, 0xf2, 0xad, 0xdb, 0x41, 0x37,
+ 0xaa, 0xdc, 0x46, 0x30, 0x6f, 0x19, 0x83, 0xf5,
+ 0x3d, 0x4b, 0xd1, 0xa7, 0xf8, 0x8e, 0x14, 0x62,
+ 0x99, 0xef, 0x75, 0x03, 0x5c, 0x2a, 0xb0, 0xc6,
+ 0x0e, 0x78, 0xe2, 0x94, 0xcb, 0xbd, 0x27, 0x51,
+ 0x85, 0xf3, 0x69, 0x1f, 0x40, 0x36, 0xac, 0xda,
+ 0x12, 0x64, 0xfe, 0x88, 0xd7, 0xa1, 0x3b, 0x4d,
+ 0xb6, 0xc0, 0x5a, 0x2c, 0x73, 0x05, 0x9f, 0xe9,
+ 0x21, 0x57, 0xcd, 0xbb, 0xe4, 0x92, 0x08, 0x7e,
+ 0xe3, 0x95, 0x0f, 0x79, 0x26, 0x50, 0xca, 0xbc,
+ 0x74, 0x02, 0x98, 0xee, 0xb1, 0xc7, 0x5d, 0x2b,
+ 0xd0, 0xa6, 0x3c, 0x4a, 0x15, 0x63, 0xf9, 0x8f,
+ 0x47, 0x31, 0xab, 0xdd, 0x82, 0xf4, 0x6e, 0x18,
+ 0x49, 0x3f, 0xa5, 0xd3, 0x8c, 0xfa, 0x60, 0x16,
+ 0xde, 0xa8, 0x32, 0x44, 0x1b, 0x6d, 0xf7, 0x81,
+ 0x7a, 0x0c, 0x96, 0xe0, 0xbf, 0xc9, 0x53, 0x25,
+ 0xed, 0x9b, 0x01, 0x77, 0x28, 0x5e, 0xc4, 0xb2,
+ 0x2f, 0x59, 0xc3, 0xb5, 0xea, 0x9c, 0x06, 0x70,
+ 0xb8, 0xce, 0x54, 0x22, 0x7d, 0x0b, 0x91, 0xe7,
+ 0x1c, 0x6a, 0xf0, 0x86, 0xd9, 0xaf, 0x35, 0x43,
+ 0x8b, 0xfd, 0x67, 0x11, 0x4e, 0x38, 0xa2, 0xd4,
+ },
+ {
+ 0x00, 0x77, 0xee, 0x99, 0xc1, 0xb6, 0x2f, 0x58,
+ 0x9f, 0xe8, 0x71, 0x06, 0x5e, 0x29, 0xb0, 0xc7,
+ 0x23, 0x54, 0xcd, 0xba, 0xe2, 0x95, 0x0c, 0x7b,
+ 0xbc, 0xcb, 0x52, 0x25, 0x7d, 0x0a, 0x93, 0xe4,
+ 0x46, 0x31, 0xa8, 0xdf, 0x87, 0xf0, 0x69, 0x1e,
+ 0xd9, 0xae, 0x37, 0x40, 0x18, 0x6f, 0xf6, 0x81,
+ 0x65, 0x12, 0x8b, 0xfc, 0xa4, 0xd3, 0x4a, 0x3d,
+ 0xfa, 0x8d, 0x14, 0x63, 0x3b, 0x4c, 0xd5, 0xa2,
+ 0x8c, 0xfb, 0x62, 0x15, 0x4d, 0x3a, 0xa3, 0xd4,
+ 0x13, 0x64, 0xfd, 0x8a, 0xd2, 0xa5, 0x3c, 0x4b,
+ 0xaf, 0xd8, 0x41, 0x36, 0x6e, 0x19, 0x80, 0xf7,
+ 0x30, 0x47, 0xde, 0xa9, 0xf1, 0x86, 0x1f, 0x68,
+ 0xca, 0xbd, 0x24, 0x53, 0x0b, 0x7c, 0xe5, 0x92,
+ 0x55, 0x22, 0xbb, 0xcc, 0x94, 0xe3, 0x7a, 0x0d,
+ 0xe9, 0x9e, 0x07, 0x70, 0x28, 0x5f, 0xc6, 0xb1,
+ 0x76, 0x01, 0x98, 0xef, 0xb7, 0xc0, 0x59, 0x2e,
+ 0x05, 0x72, 0xeb, 0x9c, 0xc4, 0xb3, 0x2a, 0x5d,
+ 0x9a, 0xed, 0x74, 0x03, 0x5b, 0x2c, 0xb5, 0xc2,
+ 0x26, 0x51, 0xc8, 0xbf, 0xe7, 0x90, 0x09, 0x7e,
+ 0xb9, 0xce, 0x57, 0x20, 0x78, 0x0f, 0x96, 0xe1,
+ 0x43, 0x34, 0xad, 0xda, 0x82, 0xf5, 0x6c, 0x1b,
+ 0xdc, 0xab, 0x32, 0x45, 0x1d, 0x6a, 0xf3, 0x84,
+ 0x60, 0x17, 0x8e, 0xf9, 0xa1, 0xd6, 0x4f, 0x38,
+ 0xff, 0x88, 0x11, 0x66, 0x3e, 0x49, 0xd0, 0xa7,
+ 0x89, 0xfe, 0x67, 0x10, 0x48, 0x3f, 0xa6, 0xd1,
+ 0x16, 0x61, 0xf8, 0x8f, 0xd7, 0xa0, 0x39, 0x4e,
+ 0xaa, 0xdd, 0x44, 0x33, 0x6b, 0x1c, 0x85, 0xf2,
+ 0x35, 0x42, 0xdb, 0xac, 0xf4, 0x83, 0x1a, 0x6d,
+ 0xcf, 0xb8, 0x21, 0x56, 0x0e, 0x79, 0xe0, 0x97,
+ 0x50, 0x27, 0xbe, 0xc9, 0x91, 0xe6, 0x7f, 0x08,
+ 0xec, 0x9b, 0x02, 0x75, 0x2d, 0x5a, 0xc3, 0xb4,
+ 0x73, 0x04, 0x9d, 0xea, 0xb2, 0xc5, 0x5c, 0x2b,
+ },
+ {
+ 0x00, 0x78, 0xf0, 0x88, 0xfd, 0x85, 0x0d, 0x75,
+ 0xe7, 0x9f, 0x17, 0x6f, 0x1a, 0x62, 0xea, 0x92,
+ 0xd3, 0xab, 0x23, 0x5b, 0x2e, 0x56, 0xde, 0xa6,
+ 0x34, 0x4c, 0xc4, 0xbc, 0xc9, 0xb1, 0x39, 0x41,
+ 0xbb, 0xc3, 0x4b, 0x33, 0x46, 0x3e, 0xb6, 0xce,
+ 0x5c, 0x24, 0xac, 0xd4, 0xa1, 0xd9, 0x51, 0x29,
+ 0x68, 0x10, 0x98, 0xe0, 0x95, 0xed, 0x65, 0x1d,
+ 0x8f, 0xf7, 0x7f, 0x07, 0x72, 0x0a, 0x82, 0xfa,
+ 0x6b, 0x13, 0x9b, 0xe3, 0x96, 0xee, 0x66, 0x1e,
+ 0x8c, 0xf4, 0x7c, 0x04, 0x71, 0x09, 0x81, 0xf9,
+ 0xb8, 0xc0, 0x48, 0x30, 0x45, 0x3d, 0xb5, 0xcd,
+ 0x5f, 0x27, 0xaf, 0xd7, 0xa2, 0xda, 0x52, 0x2a,
+ 0xd0, 0xa8, 0x20, 0x58, 0x2d, 0x55, 0xdd, 0xa5,
+ 0x37, 0x4f, 0xc7, 0xbf, 0xca, 0xb2, 0x3a, 0x42,
+ 0x03, 0x7b, 0xf3, 0x8b, 0xfe, 0x86, 0x0e, 0x76,
+ 0xe4, 0x9c, 0x14, 0x6c, 0x19, 0x61, 0xe9, 0x91,
+ 0xd6, 0xae, 0x26, 0x5e, 0x2b, 0x53, 0xdb, 0xa3,
+ 0x31, 0x49, 0xc1, 0xb9, 0xcc, 0xb4, 0x3c, 0x44,
+ 0x05, 0x7d, 0xf5, 0x8d, 0xf8, 0x80, 0x08, 0x70,
+ 0xe2, 0x9a, 0x12, 0x6a, 0x1f, 0x67, 0xef, 0x97,
+ 0x6d, 0x15, 0x9d, 0xe5, 0x90, 0xe8, 0x60, 0x18,
+ 0x8a, 0xf2, 0x7a, 0x02, 0x77, 0x0f, 0x87, 0xff,
+ 0xbe, 0xc6, 0x4e, 0x36, 0x43, 0x3b, 0xb3, 0xcb,
+ 0x59, 0x21, 0xa9, 0xd1, 0xa4, 0xdc, 0x54, 0x2c,
+ 0xbd, 0xc5, 0x4d, 0x35, 0x40, 0x38, 0xb0, 0xc8,
+ 0x5a, 0x22, 0xaa, 0xd2, 0xa7, 0xdf, 0x57, 0x2f,
+ 0x6e, 0x16, 0x9e, 0xe6, 0x93, 0xeb, 0x63, 0x1b,
+ 0x89, 0xf1, 0x79, 0x01, 0x74, 0x0c, 0x84, 0xfc,
+ 0x06, 0x7e, 0xf6, 0x8e, 0xfb, 0x83, 0x0b, 0x73,
+ 0xe1, 0x99, 0x11, 0x69, 0x1c, 0x64, 0xec, 0x94,
+ 0xd5, 0xad, 0x25, 0x5d, 0x28, 0x50, 0xd8, 0xa0,
+ 0x32, 0x4a, 0xc2, 0xba, 0xcf, 0xb7, 0x3f, 0x47,
+ },
+ {
+ 0x00, 0x79, 0xf2, 0x8b, 0xf9, 0x80, 0x0b, 0x72,
+ 0xef, 0x96, 0x1d, 0x64, 0x16, 0x6f, 0xe4, 0x9d,
+ 0xc3, 0xba, 0x31, 0x48, 0x3a, 0x43, 0xc8, 0xb1,
+ 0x2c, 0x55, 0xde, 0xa7, 0xd5, 0xac, 0x27, 0x5e,
+ 0x9b, 0xe2, 0x69, 0x10, 0x62, 0x1b, 0x90, 0xe9,
+ 0x74, 0x0d, 0x86, 0xff, 0x8d, 0xf4, 0x7f, 0x06,
+ 0x58, 0x21, 0xaa, 0xd3, 0xa1, 0xd8, 0x53, 0x2a,
+ 0xb7, 0xce, 0x45, 0x3c, 0x4e, 0x37, 0xbc, 0xc5,
+ 0x2b, 0x52, 0xd9, 0xa0, 0xd2, 0xab, 0x20, 0x59,
+ 0xc4, 0xbd, 0x36, 0x4f, 0x3d, 0x44, 0xcf, 0xb6,
+ 0xe8, 0x91, 0x1a, 0x63, 0x11, 0x68, 0xe3, 0x9a,
+ 0x07, 0x7e, 0xf5, 0x8c, 0xfe, 0x87, 0x0c, 0x75,
+ 0xb0, 0xc9, 0x42, 0x3b, 0x49, 0x30, 0xbb, 0xc2,
+ 0x5f, 0x26, 0xad, 0xd4, 0xa6, 0xdf, 0x54, 0x2d,
+ 0x73, 0x0a, 0x81, 0xf8, 0x8a, 0xf3, 0x78, 0x01,
+ 0x9c, 0xe5, 0x6e, 0x17, 0x65, 0x1c, 0x97, 0xee,
+ 0x56, 0x2f, 0xa4, 0xdd, 0xaf, 0xd6, 0x5d, 0x24,
+ 0xb9, 0xc0, 0x4b, 0x32, 0x40, 0x39, 0xb2, 0xcb,
+ 0x95, 0xec, 0x67, 0x1e, 0x6c, 0x15, 0x9e, 0xe7,
+ 0x7a, 0x03, 0x88, 0xf1, 0x83, 0xfa, 0x71, 0x08,
+ 0xcd, 0xb4, 0x3f, 0x46, 0x34, 0x4d, 0xc6, 0xbf,
+ 0x22, 0x5b, 0xd0, 0xa9, 0xdb, 0xa2, 0x29, 0x50,
+ 0x0e, 0x77, 0xfc, 0x85, 0xf7, 0x8e, 0x05, 0x7c,
+ 0xe1, 0x98, 0x13, 0x6a, 0x18, 0x61, 0xea, 0x93,
+ 0x7d, 0x04, 0x8f, 0xf6, 0x84, 0xfd, 0x76, 0x0f,
+ 0x92, 0xeb, 0x60, 0x19, 0x6b, 0x12, 0x99, 0xe0,
+ 0xbe, 0xc7, 0x4c, 0x35, 0x47, 0x3e, 0xb5, 0xcc,
+ 0x51, 0x28, 0xa3, 0xda, 0xa8, 0xd1, 0x5a, 0x23,
+ 0xe6, 0x9f, 0x14, 0x6d, 0x1f, 0x66, 0xed, 0x94,
+ 0x09, 0x70, 0xfb, 0x82, 0xf0, 0x89, 0x02, 0x7b,
+ 0x25, 0x5c, 0xd7, 0xae, 0xdc, 0xa5, 0x2e, 0x57,
+ 0xca, 0xb3, 0x38, 0x41, 0x33, 0x4a, 0xc1, 0xb8,
+ },
+ {
+ 0x00, 0x7a, 0xf4, 0x8e, 0xf5, 0x8f, 0x01, 0x7b,
+ 0xf7, 0x8d, 0x03, 0x79, 0x02, 0x78, 0xf6, 0x8c,
+ 0xf3, 0x89, 0x07, 0x7d, 0x06, 0x7c, 0xf2, 0x88,
+ 0x04, 0x7e, 0xf0, 0x8a, 0xf1, 0x8b, 0x05, 0x7f,
+ 0xfb, 0x81, 0x0f, 0x75, 0x0e, 0x74, 0xfa, 0x80,
+ 0x0c, 0x76, 0xf8, 0x82, 0xf9, 0x83, 0x0d, 0x77,
+ 0x08, 0x72, 0xfc, 0x86, 0xfd, 0x87, 0x09, 0x73,
+ 0xff, 0x85, 0x0b, 0x71, 0x0a, 0x70, 0xfe, 0x84,
+ 0xeb, 0x91, 0x1f, 0x65, 0x1e, 0x64, 0xea, 0x90,
+ 0x1c, 0x66, 0xe8, 0x92, 0xe9, 0x93, 0x1d, 0x67,
+ 0x18, 0x62, 0xec, 0x96, 0xed, 0x97, 0x19, 0x63,
+ 0xef, 0x95, 0x1b, 0x61, 0x1a, 0x60, 0xee, 0x94,
+ 0x10, 0x6a, 0xe4, 0x9e, 0xe5, 0x9f, 0x11, 0x6b,
+ 0xe7, 0x9d, 0x13, 0x69, 0x12, 0x68, 0xe6, 0x9c,
+ 0xe3, 0x99, 0x17, 0x6d, 0x16, 0x6c, 0xe2, 0x98,
+ 0x14, 0x6e, 0xe0, 0x9a, 0xe1, 0x9b, 0x15, 0x6f,
+ 0xcb, 0xb1, 0x3f, 0x45, 0x3e, 0x44, 0xca, 0xb0,
+ 0x3c, 0x46, 0xc8, 0xb2, 0xc9, 0xb3, 0x3d, 0x47,
+ 0x38, 0x42, 0xcc, 0xb6, 0xcd, 0xb7, 0x39, 0x43,
+ 0xcf, 0xb5, 0x3b, 0x41, 0x3a, 0x40, 0xce, 0xb4,
+ 0x30, 0x4a, 0xc4, 0xbe, 0xc5, 0xbf, 0x31, 0x4b,
+ 0xc7, 0xbd, 0x33, 0x49, 0x32, 0x48, 0xc6, 0xbc,
+ 0xc3, 0xb9, 0x37, 0x4d, 0x36, 0x4c, 0xc2, 0xb8,
+ 0x34, 0x4e, 0xc0, 0xba, 0xc1, 0xbb, 0x35, 0x4f,
+ 0x20, 0x5a, 0xd4, 0xae, 0xd5, 0xaf, 0x21, 0x5b,
+ 0xd7, 0xad, 0x23, 0x59, 0x22, 0x58, 0xd6, 0xac,
+ 0xd3, 0xa9, 0x27, 0x5d, 0x26, 0x5c, 0xd2, 0xa8,
+ 0x24, 0x5e, 0xd0, 0xaa, 0xd1, 0xab, 0x25, 0x5f,
+ 0xdb, 0xa1, 0x2f, 0x55, 0x2e, 0x54, 0xda, 0xa0,
+ 0x2c, 0x56, 0xd8, 0xa2, 0xd9, 0xa3, 0x2d, 0x57,
+ 0x28, 0x52, 0xdc, 0xa6, 0xdd, 0xa7, 0x29, 0x53,
+ 0xdf, 0xa5, 0x2b, 0x51, 0x2a, 0x50, 0xde, 0xa4,
+ },
+ {
+ 0x00, 0x7b, 0xf6, 0x8d, 0xf1, 0x8a, 0x07, 0x7c,
+ 0xff, 0x84, 0x09, 0x72, 0x0e, 0x75, 0xf8, 0x83,
+ 0xe3, 0x98, 0x15, 0x6e, 0x12, 0x69, 0xe4, 0x9f,
+ 0x1c, 0x67, 0xea, 0x91, 0xed, 0x96, 0x1b, 0x60,
+ 0xdb, 0xa0, 0x2d, 0x56, 0x2a, 0x51, 0xdc, 0xa7,
+ 0x24, 0x5f, 0xd2, 0xa9, 0xd5, 0xae, 0x23, 0x58,
+ 0x38, 0x43, 0xce, 0xb5, 0xc9, 0xb2, 0x3f, 0x44,
+ 0xc7, 0xbc, 0x31, 0x4a, 0x36, 0x4d, 0xc0, 0xbb,
+ 0xab, 0xd0, 0x5d, 0x26, 0x5a, 0x21, 0xac, 0xd7,
+ 0x54, 0x2f, 0xa2, 0xd9, 0xa5, 0xde, 0x53, 0x28,
+ 0x48, 0x33, 0xbe, 0xc5, 0xb9, 0xc2, 0x4f, 0x34,
+ 0xb7, 0xcc, 0x41, 0x3a, 0x46, 0x3d, 0xb0, 0xcb,
+ 0x70, 0x0b, 0x86, 0xfd, 0x81, 0xfa, 0x77, 0x0c,
+ 0x8f, 0xf4, 0x79, 0x02, 0x7e, 0x05, 0x88, 0xf3,
+ 0x93, 0xe8, 0x65, 0x1e, 0x62, 0x19, 0x94, 0xef,
+ 0x6c, 0x17, 0x9a, 0xe1, 0x9d, 0xe6, 0x6b, 0x10,
+ 0x4b, 0x30, 0xbd, 0xc6, 0xba, 0xc1, 0x4c, 0x37,
+ 0xb4, 0xcf, 0x42, 0x39, 0x45, 0x3e, 0xb3, 0xc8,
+ 0xa8, 0xd3, 0x5e, 0x25, 0x59, 0x22, 0xaf, 0xd4,
+ 0x57, 0x2c, 0xa1, 0xda, 0xa6, 0xdd, 0x50, 0x2b,
+ 0x90, 0xeb, 0x66, 0x1d, 0x61, 0x1a, 0x97, 0xec,
+ 0x6f, 0x14, 0x99, 0xe2, 0x9e, 0xe5, 0x68, 0x13,
+ 0x73, 0x08, 0x85, 0xfe, 0x82, 0xf9, 0x74, 0x0f,
+ 0x8c, 0xf7, 0x7a, 0x01, 0x7d, 0x06, 0x8b, 0xf0,
+ 0xe0, 0x9b, 0x16, 0x6d, 0x11, 0x6a, 0xe7, 0x9c,
+ 0x1f, 0x64, 0xe9, 0x92, 0xee, 0x95, 0x18, 0x63,
+ 0x03, 0x78, 0xf5, 0x8e, 0xf2, 0x89, 0x04, 0x7f,
+ 0xfc, 0x87, 0x0a, 0x71, 0x0d, 0x76, 0xfb, 0x80,
+ 0x3b, 0x40, 0xcd, 0xb6, 0xca, 0xb1, 0x3c, 0x47,
+ 0xc4, 0xbf, 0x32, 0x49, 0x35, 0x4e, 0xc3, 0xb8,
+ 0xd8, 0xa3, 0x2e, 0x55, 0x29, 0x52, 0xdf, 0xa4,
+ 0x27, 0x5c, 0xd1, 0xaa, 0xd6, 0xad, 0x20, 0x5b,
+ },
+ {
+ 0x00, 0x7c, 0xf8, 0x84, 0xed, 0x91, 0x15, 0x69,
+ 0xc7, 0xbb, 0x3f, 0x43, 0x2a, 0x56, 0xd2, 0xae,
+ 0x93, 0xef, 0x6b, 0x17, 0x7e, 0x02, 0x86, 0xfa,
+ 0x54, 0x28, 0xac, 0xd0, 0xb9, 0xc5, 0x41, 0x3d,
+ 0x3b, 0x47, 0xc3, 0xbf, 0xd6, 0xaa, 0x2e, 0x52,
+ 0xfc, 0x80, 0x04, 0x78, 0x11, 0x6d, 0xe9, 0x95,
+ 0xa8, 0xd4, 0x50, 0x2c, 0x45, 0x39, 0xbd, 0xc1,
+ 0x6f, 0x13, 0x97, 0xeb, 0x82, 0xfe, 0x7a, 0x06,
+ 0x76, 0x0a, 0x8e, 0xf2, 0x9b, 0xe7, 0x63, 0x1f,
+ 0xb1, 0xcd, 0x49, 0x35, 0x5c, 0x20, 0xa4, 0xd8,
+ 0xe5, 0x99, 0x1d, 0x61, 0x08, 0x74, 0xf0, 0x8c,
+ 0x22, 0x5e, 0xda, 0xa6, 0xcf, 0xb3, 0x37, 0x4b,
+ 0x4d, 0x31, 0xb5, 0xc9, 0xa0, 0xdc, 0x58, 0x24,
+ 0x8a, 0xf6, 0x72, 0x0e, 0x67, 0x1b, 0x9f, 0xe3,
+ 0xde, 0xa2, 0x26, 0x5a, 0x33, 0x4f, 0xcb, 0xb7,
+ 0x19, 0x65, 0xe1, 0x9d, 0xf4, 0x88, 0x0c, 0x70,
+ 0xec, 0x90, 0x14, 0x68, 0x01, 0x7d, 0xf9, 0x85,
+ 0x2b, 0x57, 0xd3, 0xaf, 0xc6, 0xba, 0x3e, 0x42,
+ 0x7f, 0x03, 0x87, 0xfb, 0x92, 0xee, 0x6a, 0x16,
+ 0xb8, 0xc4, 0x40, 0x3c, 0x55, 0x29, 0xad, 0xd1,
+ 0xd7, 0xab, 0x2f, 0x53, 0x3a, 0x46, 0xc2, 0xbe,
+ 0x10, 0x6c, 0xe8, 0x94, 0xfd, 0x81, 0x05, 0x79,
+ 0x44, 0x38, 0xbc, 0xc0, 0xa9, 0xd5, 0x51, 0x2d,
+ 0x83, 0xff, 0x7b, 0x07, 0x6e, 0x12, 0x96, 0xea,
+ 0x9a, 0xe6, 0x62, 0x1e, 0x77, 0x0b, 0x8f, 0xf3,
+ 0x5d, 0x21, 0xa5, 0xd9, 0xb0, 0xcc, 0x48, 0x34,
+ 0x09, 0x75, 0xf1, 0x8d, 0xe4, 0x98, 0x1c, 0x60,
+ 0xce, 0xb2, 0x36, 0x4a, 0x23, 0x5f, 0xdb, 0xa7,
+ 0xa1, 0xdd, 0x59, 0x25, 0x4c, 0x30, 0xb4, 0xc8,
+ 0x66, 0x1a, 0x9e, 0xe2, 0x8b, 0xf7, 0x73, 0x0f,
+ 0x32, 0x4e, 0xca, 0xb6, 0xdf, 0xa3, 0x27, 0x5b,
+ 0xf5, 0x89, 0x0d, 0x71, 0x18, 0x64, 0xe0, 0x9c,
+ },
+ {
+ 0x00, 0x7d, 0xfa, 0x87, 0xe9, 0x94, 0x13, 0x6e,
+ 0xcf, 0xb2, 0x35, 0x48, 0x26, 0x5b, 0xdc, 0xa1,
+ 0x83, 0xfe, 0x79, 0x04, 0x6a, 0x17, 0x90, 0xed,
+ 0x4c, 0x31, 0xb6, 0xcb, 0xa5, 0xd8, 0x5f, 0x22,
+ 0x1b, 0x66, 0xe1, 0x9c, 0xf2, 0x8f, 0x08, 0x75,
+ 0xd4, 0xa9, 0x2e, 0x53, 0x3d, 0x40, 0xc7, 0xba,
+ 0x98, 0xe5, 0x62, 0x1f, 0x71, 0x0c, 0x8b, 0xf6,
+ 0x57, 0x2a, 0xad, 0xd0, 0xbe, 0xc3, 0x44, 0x39,
+ 0x36, 0x4b, 0xcc, 0xb1, 0xdf, 0xa2, 0x25, 0x58,
+ 0xf9, 0x84, 0x03, 0x7e, 0x10, 0x6d, 0xea, 0x97,
+ 0xb5, 0xc8, 0x4f, 0x32, 0x5c, 0x21, 0xa6, 0xdb,
+ 0x7a, 0x07, 0x80, 0xfd, 0x93, 0xee, 0x69, 0x14,
+ 0x2d, 0x50, 0xd7, 0xaa, 0xc4, 0xb9, 0x3e, 0x43,
+ 0xe2, 0x9f, 0x18, 0x65, 0x0b, 0x76, 0xf1, 0x8c,
+ 0xae, 0xd3, 0x54, 0x29, 0x47, 0x3a, 0xbd, 0xc0,
+ 0x61, 0x1c, 0x9b, 0xe6, 0x88, 0xf5, 0x72, 0x0f,
+ 0x6c, 0x11, 0x96, 0xeb, 0x85, 0xf8, 0x7f, 0x02,
+ 0xa3, 0xde, 0x59, 0x24, 0x4a, 0x37, 0xb0, 0xcd,
+ 0xef, 0x92, 0x15, 0x68, 0x06, 0x7b, 0xfc, 0x81,
+ 0x20, 0x5d, 0xda, 0xa7, 0xc9, 0xb4, 0x33, 0x4e,
+ 0x77, 0x0a, 0x8d, 0xf0, 0x9e, 0xe3, 0x64, 0x19,
+ 0xb8, 0xc5, 0x42, 0x3f, 0x51, 0x2c, 0xab, 0xd6,
+ 0xf4, 0x89, 0x0e, 0x73, 0x1d, 0x60, 0xe7, 0x9a,
+ 0x3b, 0x46, 0xc1, 0xbc, 0xd2, 0xaf, 0x28, 0x55,
+ 0x5a, 0x27, 0xa0, 0xdd, 0xb3, 0xce, 0x49, 0x34,
+ 0x95, 0xe8, 0x6f, 0x12, 0x7c, 0x01, 0x86, 0xfb,
+ 0xd9, 0xa4, 0x23, 0x5e, 0x30, 0x4d, 0xca, 0xb7,
+ 0x16, 0x6b, 0xec, 0x91, 0xff, 0x82, 0x05, 0x78,
+ 0x41, 0x3c, 0xbb, 0xc6, 0xa8, 0xd5, 0x52, 0x2f,
+ 0x8e, 0xf3, 0x74, 0x09, 0x67, 0x1a, 0x9d, 0xe0,
+ 0xc2, 0xbf, 0x38, 0x45, 0x2b, 0x56, 0xd1, 0xac,
+ 0x0d, 0x70, 0xf7, 0x8a, 0xe4, 0x99, 0x1e, 0x63,
+ },
+ {
+ 0x00, 0x7e, 0xfc, 0x82, 0xe5, 0x9b, 0x19, 0x67,
+ 0xd7, 0xa9, 0x2b, 0x55, 0x32, 0x4c, 0xce, 0xb0,
+ 0xb3, 0xcd, 0x4f, 0x31, 0x56, 0x28, 0xaa, 0xd4,
+ 0x64, 0x1a, 0x98, 0xe6, 0x81, 0xff, 0x7d, 0x03,
+ 0x7b, 0x05, 0x87, 0xf9, 0x9e, 0xe0, 0x62, 0x1c,
+ 0xac, 0xd2, 0x50, 0x2e, 0x49, 0x37, 0xb5, 0xcb,
+ 0xc8, 0xb6, 0x34, 0x4a, 0x2d, 0x53, 0xd1, 0xaf,
+ 0x1f, 0x61, 0xe3, 0x9d, 0xfa, 0x84, 0x06, 0x78,
+ 0xf6, 0x88, 0x0a, 0x74, 0x13, 0x6d, 0xef, 0x91,
+ 0x21, 0x5f, 0xdd, 0xa3, 0xc4, 0xba, 0x38, 0x46,
+ 0x45, 0x3b, 0xb9, 0xc7, 0xa0, 0xde, 0x5c, 0x22,
+ 0x92, 0xec, 0x6e, 0x10, 0x77, 0x09, 0x8b, 0xf5,
+ 0x8d, 0xf3, 0x71, 0x0f, 0x68, 0x16, 0x94, 0xea,
+ 0x5a, 0x24, 0xa6, 0xd8, 0xbf, 0xc1, 0x43, 0x3d,
+ 0x3e, 0x40, 0xc2, 0xbc, 0xdb, 0xa5, 0x27, 0x59,
+ 0xe9, 0x97, 0x15, 0x6b, 0x0c, 0x72, 0xf0, 0x8e,
+ 0xf1, 0x8f, 0x0d, 0x73, 0x14, 0x6a, 0xe8, 0x96,
+ 0x26, 0x58, 0xda, 0xa4, 0xc3, 0xbd, 0x3f, 0x41,
+ 0x42, 0x3c, 0xbe, 0xc0, 0xa7, 0xd9, 0x5b, 0x25,
+ 0x95, 0xeb, 0x69, 0x17, 0x70, 0x0e, 0x8c, 0xf2,
+ 0x8a, 0xf4, 0x76, 0x08, 0x6f, 0x11, 0x93, 0xed,
+ 0x5d, 0x23, 0xa1, 0xdf, 0xb8, 0xc6, 0x44, 0x3a,
+ 0x39, 0x47, 0xc5, 0xbb, 0xdc, 0xa2, 0x20, 0x5e,
+ 0xee, 0x90, 0x12, 0x6c, 0x0b, 0x75, 0xf7, 0x89,
+ 0x07, 0x79, 0xfb, 0x85, 0xe2, 0x9c, 0x1e, 0x60,
+ 0xd0, 0xae, 0x2c, 0x52, 0x35, 0x4b, 0xc9, 0xb7,
+ 0xb4, 0xca, 0x48, 0x36, 0x51, 0x2f, 0xad, 0xd3,
+ 0x63, 0x1d, 0x9f, 0xe1, 0x86, 0xf8, 0x7a, 0x04,
+ 0x7c, 0x02, 0x80, 0xfe, 0x99, 0xe7, 0x65, 0x1b,
+ 0xab, 0xd5, 0x57, 0x29, 0x4e, 0x30, 0xb2, 0xcc,
+ 0xcf, 0xb1, 0x33, 0x4d, 0x2a, 0x54, 0xd6, 0xa8,
+ 0x18, 0x66, 0xe4, 0x9a, 0xfd, 0x83, 0x01, 0x7f,
+ },
+ {
+ 0x00, 0x7f, 0xfe, 0x81, 0xe1, 0x9e, 0x1f, 0x60,
+ 0xdf, 0xa0, 0x21, 0x5e, 0x3e, 0x41, 0xc0, 0xbf,
+ 0xa3, 0xdc, 0x5d, 0x22, 0x42, 0x3d, 0xbc, 0xc3,
+ 0x7c, 0x03, 0x82, 0xfd, 0x9d, 0xe2, 0x63, 0x1c,
+ 0x5b, 0x24, 0xa5, 0xda, 0xba, 0xc5, 0x44, 0x3b,
+ 0x84, 0xfb, 0x7a, 0x05, 0x65, 0x1a, 0x9b, 0xe4,
+ 0xf8, 0x87, 0x06, 0x79, 0x19, 0x66, 0xe7, 0x98,
+ 0x27, 0x58, 0xd9, 0xa6, 0xc6, 0xb9, 0x38, 0x47,
+ 0xb6, 0xc9, 0x48, 0x37, 0x57, 0x28, 0xa9, 0xd6,
+ 0x69, 0x16, 0x97, 0xe8, 0x88, 0xf7, 0x76, 0x09,
+ 0x15, 0x6a, 0xeb, 0x94, 0xf4, 0x8b, 0x0a, 0x75,
+ 0xca, 0xb5, 0x34, 0x4b, 0x2b, 0x54, 0xd5, 0xaa,
+ 0xed, 0x92, 0x13, 0x6c, 0x0c, 0x73, 0xf2, 0x8d,
+ 0x32, 0x4d, 0xcc, 0xb3, 0xd3, 0xac, 0x2d, 0x52,
+ 0x4e, 0x31, 0xb0, 0xcf, 0xaf, 0xd0, 0x51, 0x2e,
+ 0x91, 0xee, 0x6f, 0x10, 0x70, 0x0f, 0x8e, 0xf1,
+ 0x71, 0x0e, 0x8f, 0xf0, 0x90, 0xef, 0x6e, 0x11,
+ 0xae, 0xd1, 0x50, 0x2f, 0x4f, 0x30, 0xb1, 0xce,
+ 0xd2, 0xad, 0x2c, 0x53, 0x33, 0x4c, 0xcd, 0xb2,
+ 0x0d, 0x72, 0xf3, 0x8c, 0xec, 0x93, 0x12, 0x6d,
+ 0x2a, 0x55, 0xd4, 0xab, 0xcb, 0xb4, 0x35, 0x4a,
+ 0xf5, 0x8a, 0x0b, 0x74, 0x14, 0x6b, 0xea, 0x95,
+ 0x89, 0xf6, 0x77, 0x08, 0x68, 0x17, 0x96, 0xe9,
+ 0x56, 0x29, 0xa8, 0xd7, 0xb7, 0xc8, 0x49, 0x36,
+ 0xc7, 0xb8, 0x39, 0x46, 0x26, 0x59, 0xd8, 0xa7,
+ 0x18, 0x67, 0xe6, 0x99, 0xf9, 0x86, 0x07, 0x78,
+ 0x64, 0x1b, 0x9a, 0xe5, 0x85, 0xfa, 0x7b, 0x04,
+ 0xbb, 0xc4, 0x45, 0x3a, 0x5a, 0x25, 0xa4, 0xdb,
+ 0x9c, 0xe3, 0x62, 0x1d, 0x7d, 0x02, 0x83, 0xfc,
+ 0x43, 0x3c, 0xbd, 0xc2, 0xa2, 0xdd, 0x5c, 0x23,
+ 0x3f, 0x40, 0xc1, 0xbe, 0xde, 0xa1, 0x20, 0x5f,
+ 0xe0, 0x9f, 0x1e, 0x61, 0x01, 0x7e, 0xff, 0x80,
+ },
+ {
+ 0x00, 0x80, 0x1d, 0x9d, 0x3a, 0xba, 0x27, 0xa7,
+ 0x74, 0xf4, 0x69, 0xe9, 0x4e, 0xce, 0x53, 0xd3,
+ 0xe8, 0x68, 0xf5, 0x75, 0xd2, 0x52, 0xcf, 0x4f,
+ 0x9c, 0x1c, 0x81, 0x01, 0xa6, 0x26, 0xbb, 0x3b,
+ 0xcd, 0x4d, 0xd0, 0x50, 0xf7, 0x77, 0xea, 0x6a,
+ 0xb9, 0x39, 0xa4, 0x24, 0x83, 0x03, 0x9e, 0x1e,
+ 0x25, 0xa5, 0x38, 0xb8, 0x1f, 0x9f, 0x02, 0x82,
+ 0x51, 0xd1, 0x4c, 0xcc, 0x6b, 0xeb, 0x76, 0xf6,
+ 0x87, 0x07, 0x9a, 0x1a, 0xbd, 0x3d, 0xa0, 0x20,
+ 0xf3, 0x73, 0xee, 0x6e, 0xc9, 0x49, 0xd4, 0x54,
+ 0x6f, 0xef, 0x72, 0xf2, 0x55, 0xd5, 0x48, 0xc8,
+ 0x1b, 0x9b, 0x06, 0x86, 0x21, 0xa1, 0x3c, 0xbc,
+ 0x4a, 0xca, 0x57, 0xd7, 0x70, 0xf0, 0x6d, 0xed,
+ 0x3e, 0xbe, 0x23, 0xa3, 0x04, 0x84, 0x19, 0x99,
+ 0xa2, 0x22, 0xbf, 0x3f, 0x98, 0x18, 0x85, 0x05,
+ 0xd6, 0x56, 0xcb, 0x4b, 0xec, 0x6c, 0xf1, 0x71,
+ 0x13, 0x93, 0x0e, 0x8e, 0x29, 0xa9, 0x34, 0xb4,
+ 0x67, 0xe7, 0x7a, 0xfa, 0x5d, 0xdd, 0x40, 0xc0,
+ 0xfb, 0x7b, 0xe6, 0x66, 0xc1, 0x41, 0xdc, 0x5c,
+ 0x8f, 0x0f, 0x92, 0x12, 0xb5, 0x35, 0xa8, 0x28,
+ 0xde, 0x5e, 0xc3, 0x43, 0xe4, 0x64, 0xf9, 0x79,
+ 0xaa, 0x2a, 0xb7, 0x37, 0x90, 0x10, 0x8d, 0x0d,
+ 0x36, 0xb6, 0x2b, 0xab, 0x0c, 0x8c, 0x11, 0x91,
+ 0x42, 0xc2, 0x5f, 0xdf, 0x78, 0xf8, 0x65, 0xe5,
+ 0x94, 0x14, 0x89, 0x09, 0xae, 0x2e, 0xb3, 0x33,
+ 0xe0, 0x60, 0xfd, 0x7d, 0xda, 0x5a, 0xc7, 0x47,
+ 0x7c, 0xfc, 0x61, 0xe1, 0x46, 0xc6, 0x5b, 0xdb,
+ 0x08, 0x88, 0x15, 0x95, 0x32, 0xb2, 0x2f, 0xaf,
+ 0x59, 0xd9, 0x44, 0xc4, 0x63, 0xe3, 0x7e, 0xfe,
+ 0x2d, 0xad, 0x30, 0xb0, 0x17, 0x97, 0x0a, 0x8a,
+ 0xb1, 0x31, 0xac, 0x2c, 0x8b, 0x0b, 0x96, 0x16,
+ 0xc5, 0x45, 0xd8, 0x58, 0xff, 0x7f, 0xe2, 0x62,
+ },
+ {
+ 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc,
+ 0xf8, 0x79, 0xe7, 0x66, 0xc6, 0x47, 0xd9, 0x58,
+ 0x84, 0x05, 0x9b, 0x1a, 0xba, 0x3b, 0xa5, 0x24,
+ 0xed, 0x6c, 0xf2, 0x73, 0xd3, 0x52, 0xcc, 0x4d,
+ 0x91, 0x10, 0x8e, 0x0f, 0xaf, 0x2e, 0xb0, 0x31,
+ 0x15, 0x94, 0x0a, 0x8b, 0x2b, 0xaa, 0x34, 0xb5,
+ 0x69, 0xe8, 0x76, 0xf7, 0x57, 0xd6, 0x48, 0xc9,
+ 0xc7, 0x46, 0xd8, 0x59, 0xf9, 0x78, 0xe6, 0x67,
+ 0xbb, 0x3a, 0xa4, 0x25, 0x85, 0x04, 0x9a, 0x1b,
+ 0x3f, 0xbe, 0x20, 0xa1, 0x01, 0x80, 0x1e, 0x9f,
+ 0x43, 0xc2, 0x5c, 0xdd, 0x7d, 0xfc, 0x62, 0xe3,
+ 0x2a, 0xab, 0x35, 0xb4, 0x14, 0x95, 0x0b, 0x8a,
+ 0x56, 0xd7, 0x49, 0xc8, 0x68, 0xe9, 0x77, 0xf6,
+ 0xd2, 0x53, 0xcd, 0x4c, 0xec, 0x6d, 0xf3, 0x72,
+ 0xae, 0x2f, 0xb1, 0x30, 0x90, 0x11, 0x8f, 0x0e,
+ 0x93, 0x12, 0x8c, 0x0d, 0xad, 0x2c, 0xb2, 0x33,
+ 0xef, 0x6e, 0xf0, 0x71, 0xd1, 0x50, 0xce, 0x4f,
+ 0x6b, 0xea, 0x74, 0xf5, 0x55, 0xd4, 0x4a, 0xcb,
+ 0x17, 0x96, 0x08, 0x89, 0x29, 0xa8, 0x36, 0xb7,
+ 0x7e, 0xff, 0x61, 0xe0, 0x40, 0xc1, 0x5f, 0xde,
+ 0x02, 0x83, 0x1d, 0x9c, 0x3c, 0xbd, 0x23, 0xa2,
+ 0x86, 0x07, 0x99, 0x18, 0xb8, 0x39, 0xa7, 0x26,
+ 0xfa, 0x7b, 0xe5, 0x64, 0xc4, 0x45, 0xdb, 0x5a,
+ 0x54, 0xd5, 0x4b, 0xca, 0x6a, 0xeb, 0x75, 0xf4,
+ 0x28, 0xa9, 0x37, 0xb6, 0x16, 0x97, 0x09, 0x88,
+ 0xac, 0x2d, 0xb3, 0x32, 0x92, 0x13, 0x8d, 0x0c,
+ 0xd0, 0x51, 0xcf, 0x4e, 0xee, 0x6f, 0xf1, 0x70,
+ 0xb9, 0x38, 0xa6, 0x27, 0x87, 0x06, 0x98, 0x19,
+ 0xc5, 0x44, 0xda, 0x5b, 0xfb, 0x7a, 0xe4, 0x65,
+ 0x41, 0xc0, 0x5e, 0xdf, 0x7f, 0xfe, 0x60, 0xe1,
+ 0x3d, 0xbc, 0x22, 0xa3, 0x03, 0x82, 0x1c, 0x9d,
+ },
+ {
+ 0x00, 0x82, 0x19, 0x9b, 0x32, 0xb0, 0x2b, 0xa9,
+ 0x64, 0xe6, 0x7d, 0xff, 0x56, 0xd4, 0x4f, 0xcd,
+ 0xc8, 0x4a, 0xd1, 0x53, 0xfa, 0x78, 0xe3, 0x61,
+ 0xac, 0x2e, 0xb5, 0x37, 0x9e, 0x1c, 0x87, 0x05,
+ 0x8d, 0x0f, 0x94, 0x16, 0xbf, 0x3d, 0xa6, 0x24,
+ 0xe9, 0x6b, 0xf0, 0x72, 0xdb, 0x59, 0xc2, 0x40,
+ 0x45, 0xc7, 0x5c, 0xde, 0x77, 0xf5, 0x6e, 0xec,
+ 0x21, 0xa3, 0x38, 0xba, 0x13, 0x91, 0x0a, 0x88,
+ 0x07, 0x85, 0x1e, 0x9c, 0x35, 0xb7, 0x2c, 0xae,
+ 0x63, 0xe1, 0x7a, 0xf8, 0x51, 0xd3, 0x48, 0xca,
+ 0xcf, 0x4d, 0xd6, 0x54, 0xfd, 0x7f, 0xe4, 0x66,
+ 0xab, 0x29, 0xb2, 0x30, 0x99, 0x1b, 0x80, 0x02,
+ 0x8a, 0x08, 0x93, 0x11, 0xb8, 0x3a, 0xa1, 0x23,
+ 0xee, 0x6c, 0xf7, 0x75, 0xdc, 0x5e, 0xc5, 0x47,
+ 0x42, 0xc0, 0x5b, 0xd9, 0x70, 0xf2, 0x69, 0xeb,
+ 0x26, 0xa4, 0x3f, 0xbd, 0x14, 0x96, 0x0d, 0x8f,
+ 0x0e, 0x8c, 0x17, 0x95, 0x3c, 0xbe, 0x25, 0xa7,
+ 0x6a, 0xe8, 0x73, 0xf1, 0x58, 0xda, 0x41, 0xc3,
+ 0xc6, 0x44, 0xdf, 0x5d, 0xf4, 0x76, 0xed, 0x6f,
+ 0xa2, 0x20, 0xbb, 0x39, 0x90, 0x12, 0x89, 0x0b,
+ 0x83, 0x01, 0x9a, 0x18, 0xb1, 0x33, 0xa8, 0x2a,
+ 0xe7, 0x65, 0xfe, 0x7c, 0xd5, 0x57, 0xcc, 0x4e,
+ 0x4b, 0xc9, 0x52, 0xd0, 0x79, 0xfb, 0x60, 0xe2,
+ 0x2f, 0xad, 0x36, 0xb4, 0x1d, 0x9f, 0x04, 0x86,
+ 0x09, 0x8b, 0x10, 0x92, 0x3b, 0xb9, 0x22, 0xa0,
+ 0x6d, 0xef, 0x74, 0xf6, 0x5f, 0xdd, 0x46, 0xc4,
+ 0xc1, 0x43, 0xd8, 0x5a, 0xf3, 0x71, 0xea, 0x68,
+ 0xa5, 0x27, 0xbc, 0x3e, 0x97, 0x15, 0x8e, 0x0c,
+ 0x84, 0x06, 0x9d, 0x1f, 0xb6, 0x34, 0xaf, 0x2d,
+ 0xe0, 0x62, 0xf9, 0x7b, 0xd2, 0x50, 0xcb, 0x49,
+ 0x4c, 0xce, 0x55, 0xd7, 0x7e, 0xfc, 0x67, 0xe5,
+ 0x28, 0xaa, 0x31, 0xb3, 0x1a, 0x98, 0x03, 0x81,
+ },
+ {
+ 0x00, 0x83, 0x1b, 0x98, 0x36, 0xb5, 0x2d, 0xae,
+ 0x6c, 0xef, 0x77, 0xf4, 0x5a, 0xd9, 0x41, 0xc2,
+ 0xd8, 0x5b, 0xc3, 0x40, 0xee, 0x6d, 0xf5, 0x76,
+ 0xb4, 0x37, 0xaf, 0x2c, 0x82, 0x01, 0x99, 0x1a,
+ 0xad, 0x2e, 0xb6, 0x35, 0x9b, 0x18, 0x80, 0x03,
+ 0xc1, 0x42, 0xda, 0x59, 0xf7, 0x74, 0xec, 0x6f,
+ 0x75, 0xf6, 0x6e, 0xed, 0x43, 0xc0, 0x58, 0xdb,
+ 0x19, 0x9a, 0x02, 0x81, 0x2f, 0xac, 0x34, 0xb7,
+ 0x47, 0xc4, 0x5c, 0xdf, 0x71, 0xf2, 0x6a, 0xe9,
+ 0x2b, 0xa8, 0x30, 0xb3, 0x1d, 0x9e, 0x06, 0x85,
+ 0x9f, 0x1c, 0x84, 0x07, 0xa9, 0x2a, 0xb2, 0x31,
+ 0xf3, 0x70, 0xe8, 0x6b, 0xc5, 0x46, 0xde, 0x5d,
+ 0xea, 0x69, 0xf1, 0x72, 0xdc, 0x5f, 0xc7, 0x44,
+ 0x86, 0x05, 0x9d, 0x1e, 0xb0, 0x33, 0xab, 0x28,
+ 0x32, 0xb1, 0x29, 0xaa, 0x04, 0x87, 0x1f, 0x9c,
+ 0x5e, 0xdd, 0x45, 0xc6, 0x68, 0xeb, 0x73, 0xf0,
+ 0x8e, 0x0d, 0x95, 0x16, 0xb8, 0x3b, 0xa3, 0x20,
+ 0xe2, 0x61, 0xf9, 0x7a, 0xd4, 0x57, 0xcf, 0x4c,
+ 0x56, 0xd5, 0x4d, 0xce, 0x60, 0xe3, 0x7b, 0xf8,
+ 0x3a, 0xb9, 0x21, 0xa2, 0x0c, 0x8f, 0x17, 0x94,
+ 0x23, 0xa0, 0x38, 0xbb, 0x15, 0x96, 0x0e, 0x8d,
+ 0x4f, 0xcc, 0x54, 0xd7, 0x79, 0xfa, 0x62, 0xe1,
+ 0xfb, 0x78, 0xe0, 0x63, 0xcd, 0x4e, 0xd6, 0x55,
+ 0x97, 0x14, 0x8c, 0x0f, 0xa1, 0x22, 0xba, 0x39,
+ 0xc9, 0x4a, 0xd2, 0x51, 0xff, 0x7c, 0xe4, 0x67,
+ 0xa5, 0x26, 0xbe, 0x3d, 0x93, 0x10, 0x88, 0x0b,
+ 0x11, 0x92, 0x0a, 0x89, 0x27, 0xa4, 0x3c, 0xbf,
+ 0x7d, 0xfe, 0x66, 0xe5, 0x4b, 0xc8, 0x50, 0xd3,
+ 0x64, 0xe7, 0x7f, 0xfc, 0x52, 0xd1, 0x49, 0xca,
+ 0x08, 0x8b, 0x13, 0x90, 0x3e, 0xbd, 0x25, 0xa6,
+ 0xbc, 0x3f, 0xa7, 0x24, 0x8a, 0x09, 0x91, 0x12,
+ 0xd0, 0x53, 0xcb, 0x48, 0xe6, 0x65, 0xfd, 0x7e,
+ },
+ {
+ 0x00, 0x84, 0x15, 0x91, 0x2a, 0xae, 0x3f, 0xbb,
+ 0x54, 0xd0, 0x41, 0xc5, 0x7e, 0xfa, 0x6b, 0xef,
+ 0xa8, 0x2c, 0xbd, 0x39, 0x82, 0x06, 0x97, 0x13,
+ 0xfc, 0x78, 0xe9, 0x6d, 0xd6, 0x52, 0xc3, 0x47,
+ 0x4d, 0xc9, 0x58, 0xdc, 0x67, 0xe3, 0x72, 0xf6,
+ 0x19, 0x9d, 0x0c, 0x88, 0x33, 0xb7, 0x26, 0xa2,
+ 0xe5, 0x61, 0xf0, 0x74, 0xcf, 0x4b, 0xda, 0x5e,
+ 0xb1, 0x35, 0xa4, 0x20, 0x9b, 0x1f, 0x8e, 0x0a,
+ 0x9a, 0x1e, 0x8f, 0x0b, 0xb0, 0x34, 0xa5, 0x21,
+ 0xce, 0x4a, 0xdb, 0x5f, 0xe4, 0x60, 0xf1, 0x75,
+ 0x32, 0xb6, 0x27, 0xa3, 0x18, 0x9c, 0x0d, 0x89,
+ 0x66, 0xe2, 0x73, 0xf7, 0x4c, 0xc8, 0x59, 0xdd,
+ 0xd7, 0x53, 0xc2, 0x46, 0xfd, 0x79, 0xe8, 0x6c,
+ 0x83, 0x07, 0x96, 0x12, 0xa9, 0x2d, 0xbc, 0x38,
+ 0x7f, 0xfb, 0x6a, 0xee, 0x55, 0xd1, 0x40, 0xc4,
+ 0x2b, 0xaf, 0x3e, 0xba, 0x01, 0x85, 0x14, 0x90,
+ 0x29, 0xad, 0x3c, 0xb8, 0x03, 0x87, 0x16, 0x92,
+ 0x7d, 0xf9, 0x68, 0xec, 0x57, 0xd3, 0x42, 0xc6,
+ 0x81, 0x05, 0x94, 0x10, 0xab, 0x2f, 0xbe, 0x3a,
+ 0xd5, 0x51, 0xc0, 0x44, 0xff, 0x7b, 0xea, 0x6e,
+ 0x64, 0xe0, 0x71, 0xf5, 0x4e, 0xca, 0x5b, 0xdf,
+ 0x30, 0xb4, 0x25, 0xa1, 0x1a, 0x9e, 0x0f, 0x8b,
+ 0xcc, 0x48, 0xd9, 0x5d, 0xe6, 0x62, 0xf3, 0x77,
+ 0x98, 0x1c, 0x8d, 0x09, 0xb2, 0x36, 0xa7, 0x23,
+ 0xb3, 0x37, 0xa6, 0x22, 0x99, 0x1d, 0x8c, 0x08,
+ 0xe7, 0x63, 0xf2, 0x76, 0xcd, 0x49, 0xd8, 0x5c,
+ 0x1b, 0x9f, 0x0e, 0x8a, 0x31, 0xb5, 0x24, 0xa0,
+ 0x4f, 0xcb, 0x5a, 0xde, 0x65, 0xe1, 0x70, 0xf4,
+ 0xfe, 0x7a, 0xeb, 0x6f, 0xd4, 0x50, 0xc1, 0x45,
+ 0xaa, 0x2e, 0xbf, 0x3b, 0x80, 0x04, 0x95, 0x11,
+ 0x56, 0xd2, 0x43, 0xc7, 0x7c, 0xf8, 0x69, 0xed,
+ 0x02, 0x86, 0x17, 0x93, 0x28, 0xac, 0x3d, 0xb9,
+ },
+ {
+ 0x00, 0x85, 0x17, 0x92, 0x2e, 0xab, 0x39, 0xbc,
+ 0x5c, 0xd9, 0x4b, 0xce, 0x72, 0xf7, 0x65, 0xe0,
+ 0xb8, 0x3d, 0xaf, 0x2a, 0x96, 0x13, 0x81, 0x04,
+ 0xe4, 0x61, 0xf3, 0x76, 0xca, 0x4f, 0xdd, 0x58,
+ 0x6d, 0xe8, 0x7a, 0xff, 0x43, 0xc6, 0x54, 0xd1,
+ 0x31, 0xb4, 0x26, 0xa3, 0x1f, 0x9a, 0x08, 0x8d,
+ 0xd5, 0x50, 0xc2, 0x47, 0xfb, 0x7e, 0xec, 0x69,
+ 0x89, 0x0c, 0x9e, 0x1b, 0xa7, 0x22, 0xb0, 0x35,
+ 0xda, 0x5f, 0xcd, 0x48, 0xf4, 0x71, 0xe3, 0x66,
+ 0x86, 0x03, 0x91, 0x14, 0xa8, 0x2d, 0xbf, 0x3a,
+ 0x62, 0xe7, 0x75, 0xf0, 0x4c, 0xc9, 0x5b, 0xde,
+ 0x3e, 0xbb, 0x29, 0xac, 0x10, 0x95, 0x07, 0x82,
+ 0xb7, 0x32, 0xa0, 0x25, 0x99, 0x1c, 0x8e, 0x0b,
+ 0xeb, 0x6e, 0xfc, 0x79, 0xc5, 0x40, 0xd2, 0x57,
+ 0x0f, 0x8a, 0x18, 0x9d, 0x21, 0xa4, 0x36, 0xb3,
+ 0x53, 0xd6, 0x44, 0xc1, 0x7d, 0xf8, 0x6a, 0xef,
+ 0xa9, 0x2c, 0xbe, 0x3b, 0x87, 0x02, 0x90, 0x15,
+ 0xf5, 0x70, 0xe2, 0x67, 0xdb, 0x5e, 0xcc, 0x49,
+ 0x11, 0x94, 0x06, 0x83, 0x3f, 0xba, 0x28, 0xad,
+ 0x4d, 0xc8, 0x5a, 0xdf, 0x63, 0xe6, 0x74, 0xf1,
+ 0xc4, 0x41, 0xd3, 0x56, 0xea, 0x6f, 0xfd, 0x78,
+ 0x98, 0x1d, 0x8f, 0x0a, 0xb6, 0x33, 0xa1, 0x24,
+ 0x7c, 0xf9, 0x6b, 0xee, 0x52, 0xd7, 0x45, 0xc0,
+ 0x20, 0xa5, 0x37, 0xb2, 0x0e, 0x8b, 0x19, 0x9c,
+ 0x73, 0xf6, 0x64, 0xe1, 0x5d, 0xd8, 0x4a, 0xcf,
+ 0x2f, 0xaa, 0x38, 0xbd, 0x01, 0x84, 0x16, 0x93,
+ 0xcb, 0x4e, 0xdc, 0x59, 0xe5, 0x60, 0xf2, 0x77,
+ 0x97, 0x12, 0x80, 0x05, 0xb9, 0x3c, 0xae, 0x2b,
+ 0x1e, 0x9b, 0x09, 0x8c, 0x30, 0xb5, 0x27, 0xa2,
+ 0x42, 0xc7, 0x55, 0xd0, 0x6c, 0xe9, 0x7b, 0xfe,
+ 0xa6, 0x23, 0xb1, 0x34, 0x88, 0x0d, 0x9f, 0x1a,
+ 0xfa, 0x7f, 0xed, 0x68, 0xd4, 0x51, 0xc3, 0x46,
+ },
+ {
+ 0x00, 0x86, 0x11, 0x97, 0x22, 0xa4, 0x33, 0xb5,
+ 0x44, 0xc2, 0x55, 0xd3, 0x66, 0xe0, 0x77, 0xf1,
+ 0x88, 0x0e, 0x99, 0x1f, 0xaa, 0x2c, 0xbb, 0x3d,
+ 0xcc, 0x4a, 0xdd, 0x5b, 0xee, 0x68, 0xff, 0x79,
+ 0x0d, 0x8b, 0x1c, 0x9a, 0x2f, 0xa9, 0x3e, 0xb8,
+ 0x49, 0xcf, 0x58, 0xde, 0x6b, 0xed, 0x7a, 0xfc,
+ 0x85, 0x03, 0x94, 0x12, 0xa7, 0x21, 0xb6, 0x30,
+ 0xc1, 0x47, 0xd0, 0x56, 0xe3, 0x65, 0xf2, 0x74,
+ 0x1a, 0x9c, 0x0b, 0x8d, 0x38, 0xbe, 0x29, 0xaf,
+ 0x5e, 0xd8, 0x4f, 0xc9, 0x7c, 0xfa, 0x6d, 0xeb,
+ 0x92, 0x14, 0x83, 0x05, 0xb0, 0x36, 0xa1, 0x27,
+ 0xd6, 0x50, 0xc7, 0x41, 0xf4, 0x72, 0xe5, 0x63,
+ 0x17, 0x91, 0x06, 0x80, 0x35, 0xb3, 0x24, 0xa2,
+ 0x53, 0xd5, 0x42, 0xc4, 0x71, 0xf7, 0x60, 0xe6,
+ 0x9f, 0x19, 0x8e, 0x08, 0xbd, 0x3b, 0xac, 0x2a,
+ 0xdb, 0x5d, 0xca, 0x4c, 0xf9, 0x7f, 0xe8, 0x6e,
+ 0x34, 0xb2, 0x25, 0xa3, 0x16, 0x90, 0x07, 0x81,
+ 0x70, 0xf6, 0x61, 0xe7, 0x52, 0xd4, 0x43, 0xc5,
+ 0xbc, 0x3a, 0xad, 0x2b, 0x9e, 0x18, 0x8f, 0x09,
+ 0xf8, 0x7e, 0xe9, 0x6f, 0xda, 0x5c, 0xcb, 0x4d,
+ 0x39, 0xbf, 0x28, 0xae, 0x1b, 0x9d, 0x0a, 0x8c,
+ 0x7d, 0xfb, 0x6c, 0xea, 0x5f, 0xd9, 0x4e, 0xc8,
+ 0xb1, 0x37, 0xa0, 0x26, 0x93, 0x15, 0x82, 0x04,
+ 0xf5, 0x73, 0xe4, 0x62, 0xd7, 0x51, 0xc6, 0x40,
+ 0x2e, 0xa8, 0x3f, 0xb9, 0x0c, 0x8a, 0x1d, 0x9b,
+ 0x6a, 0xec, 0x7b, 0xfd, 0x48, 0xce, 0x59, 0xdf,
+ 0xa6, 0x20, 0xb7, 0x31, 0x84, 0x02, 0x95, 0x13,
+ 0xe2, 0x64, 0xf3, 0x75, 0xc0, 0x46, 0xd1, 0x57,
+ 0x23, 0xa5, 0x32, 0xb4, 0x01, 0x87, 0x10, 0x96,
+ 0x67, 0xe1, 0x76, 0xf0, 0x45, 0xc3, 0x54, 0xd2,
+ 0xab, 0x2d, 0xba, 0x3c, 0x89, 0x0f, 0x98, 0x1e,
+ 0xef, 0x69, 0xfe, 0x78, 0xcd, 0x4b, 0xdc, 0x5a,
+ },
+ {
+ 0x00, 0x87, 0x13, 0x94, 0x26, 0xa1, 0x35, 0xb2,
+ 0x4c, 0xcb, 0x5f, 0xd8, 0x6a, 0xed, 0x79, 0xfe,
+ 0x98, 0x1f, 0x8b, 0x0c, 0xbe, 0x39, 0xad, 0x2a,
+ 0xd4, 0x53, 0xc7, 0x40, 0xf2, 0x75, 0xe1, 0x66,
+ 0x2d, 0xaa, 0x3e, 0xb9, 0x0b, 0x8c, 0x18, 0x9f,
+ 0x61, 0xe6, 0x72, 0xf5, 0x47, 0xc0, 0x54, 0xd3,
+ 0xb5, 0x32, 0xa6, 0x21, 0x93, 0x14, 0x80, 0x07,
+ 0xf9, 0x7e, 0xea, 0x6d, 0xdf, 0x58, 0xcc, 0x4b,
+ 0x5a, 0xdd, 0x49, 0xce, 0x7c, 0xfb, 0x6f, 0xe8,
+ 0x16, 0x91, 0x05, 0x82, 0x30, 0xb7, 0x23, 0xa4,
+ 0xc2, 0x45, 0xd1, 0x56, 0xe4, 0x63, 0xf7, 0x70,
+ 0x8e, 0x09, 0x9d, 0x1a, 0xa8, 0x2f, 0xbb, 0x3c,
+ 0x77, 0xf0, 0x64, 0xe3, 0x51, 0xd6, 0x42, 0xc5,
+ 0x3b, 0xbc, 0x28, 0xaf, 0x1d, 0x9a, 0x0e, 0x89,
+ 0xef, 0x68, 0xfc, 0x7b, 0xc9, 0x4e, 0xda, 0x5d,
+ 0xa3, 0x24, 0xb0, 0x37, 0x85, 0x02, 0x96, 0x11,
+ 0xb4, 0x33, 0xa7, 0x20, 0x92, 0x15, 0x81, 0x06,
+ 0xf8, 0x7f, 0xeb, 0x6c, 0xde, 0x59, 0xcd, 0x4a,
+ 0x2c, 0xab, 0x3f, 0xb8, 0x0a, 0x8d, 0x19, 0x9e,
+ 0x60, 0xe7, 0x73, 0xf4, 0x46, 0xc1, 0x55, 0xd2,
+ 0x99, 0x1e, 0x8a, 0x0d, 0xbf, 0x38, 0xac, 0x2b,
+ 0xd5, 0x52, 0xc6, 0x41, 0xf3, 0x74, 0xe0, 0x67,
+ 0x01, 0x86, 0x12, 0x95, 0x27, 0xa0, 0x34, 0xb3,
+ 0x4d, 0xca, 0x5e, 0xd9, 0x6b, 0xec, 0x78, 0xff,
+ 0xee, 0x69, 0xfd, 0x7a, 0xc8, 0x4f, 0xdb, 0x5c,
+ 0xa2, 0x25, 0xb1, 0x36, 0x84, 0x03, 0x97, 0x10,
+ 0x76, 0xf1, 0x65, 0xe2, 0x50, 0xd7, 0x43, 0xc4,
+ 0x3a, 0xbd, 0x29, 0xae, 0x1c, 0x9b, 0x0f, 0x88,
+ 0xc3, 0x44, 0xd0, 0x57, 0xe5, 0x62, 0xf6, 0x71,
+ 0x8f, 0x08, 0x9c, 0x1b, 0xa9, 0x2e, 0xba, 0x3d,
+ 0x5b, 0xdc, 0x48, 0xcf, 0x7d, 0xfa, 0x6e, 0xe9,
+ 0x17, 0x90, 0x04, 0x83, 0x31, 0xb6, 0x22, 0xa5,
+ },
+ {
+ 0x00, 0x88, 0x0d, 0x85, 0x1a, 0x92, 0x17, 0x9f,
+ 0x34, 0xbc, 0x39, 0xb1, 0x2e, 0xa6, 0x23, 0xab,
+ 0x68, 0xe0, 0x65, 0xed, 0x72, 0xfa, 0x7f, 0xf7,
+ 0x5c, 0xd4, 0x51, 0xd9, 0x46, 0xce, 0x4b, 0xc3,
+ 0xd0, 0x58, 0xdd, 0x55, 0xca, 0x42, 0xc7, 0x4f,
+ 0xe4, 0x6c, 0xe9, 0x61, 0xfe, 0x76, 0xf3, 0x7b,
+ 0xb8, 0x30, 0xb5, 0x3d, 0xa2, 0x2a, 0xaf, 0x27,
+ 0x8c, 0x04, 0x81, 0x09, 0x96, 0x1e, 0x9b, 0x13,
+ 0xbd, 0x35, 0xb0, 0x38, 0xa7, 0x2f, 0xaa, 0x22,
+ 0x89, 0x01, 0x84, 0x0c, 0x93, 0x1b, 0x9e, 0x16,
+ 0xd5, 0x5d, 0xd8, 0x50, 0xcf, 0x47, 0xc2, 0x4a,
+ 0xe1, 0x69, 0xec, 0x64, 0xfb, 0x73, 0xf6, 0x7e,
+ 0x6d, 0xe5, 0x60, 0xe8, 0x77, 0xff, 0x7a, 0xf2,
+ 0x59, 0xd1, 0x54, 0xdc, 0x43, 0xcb, 0x4e, 0xc6,
+ 0x05, 0x8d, 0x08, 0x80, 0x1f, 0x97, 0x12, 0x9a,
+ 0x31, 0xb9, 0x3c, 0xb4, 0x2b, 0xa3, 0x26, 0xae,
+ 0x67, 0xef, 0x6a, 0xe2, 0x7d, 0xf5, 0x70, 0xf8,
+ 0x53, 0xdb, 0x5e, 0xd6, 0x49, 0xc1, 0x44, 0xcc,
+ 0x0f, 0x87, 0x02, 0x8a, 0x15, 0x9d, 0x18, 0x90,
+ 0x3b, 0xb3, 0x36, 0xbe, 0x21, 0xa9, 0x2c, 0xa4,
+ 0xb7, 0x3f, 0xba, 0x32, 0xad, 0x25, 0xa0, 0x28,
+ 0x83, 0x0b, 0x8e, 0x06, 0x99, 0x11, 0x94, 0x1c,
+ 0xdf, 0x57, 0xd2, 0x5a, 0xc5, 0x4d, 0xc8, 0x40,
+ 0xeb, 0x63, 0xe6, 0x6e, 0xf1, 0x79, 0xfc, 0x74,
+ 0xda, 0x52, 0xd7, 0x5f, 0xc0, 0x48, 0xcd, 0x45,
+ 0xee, 0x66, 0xe3, 0x6b, 0xf4, 0x7c, 0xf9, 0x71,
+ 0xb2, 0x3a, 0xbf, 0x37, 0xa8, 0x20, 0xa5, 0x2d,
+ 0x86, 0x0e, 0x8b, 0x03, 0x9c, 0x14, 0x91, 0x19,
+ 0x0a, 0x82, 0x07, 0x8f, 0x10, 0x98, 0x1d, 0x95,
+ 0x3e, 0xb6, 0x33, 0xbb, 0x24, 0xac, 0x29, 0xa1,
+ 0x62, 0xea, 0x6f, 0xe7, 0x78, 0xf0, 0x75, 0xfd,
+ 0x56, 0xde, 0x5b, 0xd3, 0x4c, 0xc4, 0x41, 0xc9,
+ },
+ {
+ 0x00, 0x89, 0x0f, 0x86, 0x1e, 0x97, 0x11, 0x98,
+ 0x3c, 0xb5, 0x33, 0xba, 0x22, 0xab, 0x2d, 0xa4,
+ 0x78, 0xf1, 0x77, 0xfe, 0x66, 0xef, 0x69, 0xe0,
+ 0x44, 0xcd, 0x4b, 0xc2, 0x5a, 0xd3, 0x55, 0xdc,
+ 0xf0, 0x79, 0xff, 0x76, 0xee, 0x67, 0xe1, 0x68,
+ 0xcc, 0x45, 0xc3, 0x4a, 0xd2, 0x5b, 0xdd, 0x54,
+ 0x88, 0x01, 0x87, 0x0e, 0x96, 0x1f, 0x99, 0x10,
+ 0xb4, 0x3d, 0xbb, 0x32, 0xaa, 0x23, 0xa5, 0x2c,
+ 0xfd, 0x74, 0xf2, 0x7b, 0xe3, 0x6a, 0xec, 0x65,
+ 0xc1, 0x48, 0xce, 0x47, 0xdf, 0x56, 0xd0, 0x59,
+ 0x85, 0x0c, 0x8a, 0x03, 0x9b, 0x12, 0x94, 0x1d,
+ 0xb9, 0x30, 0xb6, 0x3f, 0xa7, 0x2e, 0xa8, 0x21,
+ 0x0d, 0x84, 0x02, 0x8b, 0x13, 0x9a, 0x1c, 0x95,
+ 0x31, 0xb8, 0x3e, 0xb7, 0x2f, 0xa6, 0x20, 0xa9,
+ 0x75, 0xfc, 0x7a, 0xf3, 0x6b, 0xe2, 0x64, 0xed,
+ 0x49, 0xc0, 0x46, 0xcf, 0x57, 0xde, 0x58, 0xd1,
+ 0xe7, 0x6e, 0xe8, 0x61, 0xf9, 0x70, 0xf6, 0x7f,
+ 0xdb, 0x52, 0xd4, 0x5d, 0xc5, 0x4c, 0xca, 0x43,
+ 0x9f, 0x16, 0x90, 0x19, 0x81, 0x08, 0x8e, 0x07,
+ 0xa3, 0x2a, 0xac, 0x25, 0xbd, 0x34, 0xb2, 0x3b,
+ 0x17, 0x9e, 0x18, 0x91, 0x09, 0x80, 0x06, 0x8f,
+ 0x2b, 0xa2, 0x24, 0xad, 0x35, 0xbc, 0x3a, 0xb3,
+ 0x6f, 0xe6, 0x60, 0xe9, 0x71, 0xf8, 0x7e, 0xf7,
+ 0x53, 0xda, 0x5c, 0xd5, 0x4d, 0xc4, 0x42, 0xcb,
+ 0x1a, 0x93, 0x15, 0x9c, 0x04, 0x8d, 0x0b, 0x82,
+ 0x26, 0xaf, 0x29, 0xa0, 0x38, 0xb1, 0x37, 0xbe,
+ 0x62, 0xeb, 0x6d, 0xe4, 0x7c, 0xf5, 0x73, 0xfa,
+ 0x5e, 0xd7, 0x51, 0xd8, 0x40, 0xc9, 0x4f, 0xc6,
+ 0xea, 0x63, 0xe5, 0x6c, 0xf4, 0x7d, 0xfb, 0x72,
+ 0xd6, 0x5f, 0xd9, 0x50, 0xc8, 0x41, 0xc7, 0x4e,
+ 0x92, 0x1b, 0x9d, 0x14, 0x8c, 0x05, 0x83, 0x0a,
+ 0xae, 0x27, 0xa1, 0x28, 0xb0, 0x39, 0xbf, 0x36,
+ },
+ {
+ 0x00, 0x8a, 0x09, 0x83, 0x12, 0x98, 0x1b, 0x91,
+ 0x24, 0xae, 0x2d, 0xa7, 0x36, 0xbc, 0x3f, 0xb5,
+ 0x48, 0xc2, 0x41, 0xcb, 0x5a, 0xd0, 0x53, 0xd9,
+ 0x6c, 0xe6, 0x65, 0xef, 0x7e, 0xf4, 0x77, 0xfd,
+ 0x90, 0x1a, 0x99, 0x13, 0x82, 0x08, 0x8b, 0x01,
+ 0xb4, 0x3e, 0xbd, 0x37, 0xa6, 0x2c, 0xaf, 0x25,
+ 0xd8, 0x52, 0xd1, 0x5b, 0xca, 0x40, 0xc3, 0x49,
+ 0xfc, 0x76, 0xf5, 0x7f, 0xee, 0x64, 0xe7, 0x6d,
+ 0x3d, 0xb7, 0x34, 0xbe, 0x2f, 0xa5, 0x26, 0xac,
+ 0x19, 0x93, 0x10, 0x9a, 0x0b, 0x81, 0x02, 0x88,
+ 0x75, 0xff, 0x7c, 0xf6, 0x67, 0xed, 0x6e, 0xe4,
+ 0x51, 0xdb, 0x58, 0xd2, 0x43, 0xc9, 0x4a, 0xc0,
+ 0xad, 0x27, 0xa4, 0x2e, 0xbf, 0x35, 0xb6, 0x3c,
+ 0x89, 0x03, 0x80, 0x0a, 0x9b, 0x11, 0x92, 0x18,
+ 0xe5, 0x6f, 0xec, 0x66, 0xf7, 0x7d, 0xfe, 0x74,
+ 0xc1, 0x4b, 0xc8, 0x42, 0xd3, 0x59, 0xda, 0x50,
+ 0x7a, 0xf0, 0x73, 0xf9, 0x68, 0xe2, 0x61, 0xeb,
+ 0x5e, 0xd4, 0x57, 0xdd, 0x4c, 0xc6, 0x45, 0xcf,
+ 0x32, 0xb8, 0x3b, 0xb1, 0x20, 0xaa, 0x29, 0xa3,
+ 0x16, 0x9c, 0x1f, 0x95, 0x04, 0x8e, 0x0d, 0x87,
+ 0xea, 0x60, 0xe3, 0x69, 0xf8, 0x72, 0xf1, 0x7b,
+ 0xce, 0x44, 0xc7, 0x4d, 0xdc, 0x56, 0xd5, 0x5f,
+ 0xa2, 0x28, 0xab, 0x21, 0xb0, 0x3a, 0xb9, 0x33,
+ 0x86, 0x0c, 0x8f, 0x05, 0x94, 0x1e, 0x9d, 0x17,
+ 0x47, 0xcd, 0x4e, 0xc4, 0x55, 0xdf, 0x5c, 0xd6,
+ 0x63, 0xe9, 0x6a, 0xe0, 0x71, 0xfb, 0x78, 0xf2,
+ 0x0f, 0x85, 0x06, 0x8c, 0x1d, 0x97, 0x14, 0x9e,
+ 0x2b, 0xa1, 0x22, 0xa8, 0x39, 0xb3, 0x30, 0xba,
+ 0xd7, 0x5d, 0xde, 0x54, 0xc5, 0x4f, 0xcc, 0x46,
+ 0xf3, 0x79, 0xfa, 0x70, 0xe1, 0x6b, 0xe8, 0x62,
+ 0x9f, 0x15, 0x96, 0x1c, 0x8d, 0x07, 0x84, 0x0e,
+ 0xbb, 0x31, 0xb2, 0x38, 0xa9, 0x23, 0xa0, 0x2a,
+ },
+ {
+ 0x00, 0x8b, 0x0b, 0x80, 0x16, 0x9d, 0x1d, 0x96,
+ 0x2c, 0xa7, 0x27, 0xac, 0x3a, 0xb1, 0x31, 0xba,
+ 0x58, 0xd3, 0x53, 0xd8, 0x4e, 0xc5, 0x45, 0xce,
+ 0x74, 0xff, 0x7f, 0xf4, 0x62, 0xe9, 0x69, 0xe2,
+ 0xb0, 0x3b, 0xbb, 0x30, 0xa6, 0x2d, 0xad, 0x26,
+ 0x9c, 0x17, 0x97, 0x1c, 0x8a, 0x01, 0x81, 0x0a,
+ 0xe8, 0x63, 0xe3, 0x68, 0xfe, 0x75, 0xf5, 0x7e,
+ 0xc4, 0x4f, 0xcf, 0x44, 0xd2, 0x59, 0xd9, 0x52,
+ 0x7d, 0xf6, 0x76, 0xfd, 0x6b, 0xe0, 0x60, 0xeb,
+ 0x51, 0xda, 0x5a, 0xd1, 0x47, 0xcc, 0x4c, 0xc7,
+ 0x25, 0xae, 0x2e, 0xa5, 0x33, 0xb8, 0x38, 0xb3,
+ 0x09, 0x82, 0x02, 0x89, 0x1f, 0x94, 0x14, 0x9f,
+ 0xcd, 0x46, 0xc6, 0x4d, 0xdb, 0x50, 0xd0, 0x5b,
+ 0xe1, 0x6a, 0xea, 0x61, 0xf7, 0x7c, 0xfc, 0x77,
+ 0x95, 0x1e, 0x9e, 0x15, 0x83, 0x08, 0x88, 0x03,
+ 0xb9, 0x32, 0xb2, 0x39, 0xaf, 0x24, 0xa4, 0x2f,
+ 0xfa, 0x71, 0xf1, 0x7a, 0xec, 0x67, 0xe7, 0x6c,
+ 0xd6, 0x5d, 0xdd, 0x56, 0xc0, 0x4b, 0xcb, 0x40,
+ 0xa2, 0x29, 0xa9, 0x22, 0xb4, 0x3f, 0xbf, 0x34,
+ 0x8e, 0x05, 0x85, 0x0e, 0x98, 0x13, 0x93, 0x18,
+ 0x4a, 0xc1, 0x41, 0xca, 0x5c, 0xd7, 0x57, 0xdc,
+ 0x66, 0xed, 0x6d, 0xe6, 0x70, 0xfb, 0x7b, 0xf0,
+ 0x12, 0x99, 0x19, 0x92, 0x04, 0x8f, 0x0f, 0x84,
+ 0x3e, 0xb5, 0x35, 0xbe, 0x28, 0xa3, 0x23, 0xa8,
+ 0x87, 0x0c, 0x8c, 0x07, 0x91, 0x1a, 0x9a, 0x11,
+ 0xab, 0x20, 0xa0, 0x2b, 0xbd, 0x36, 0xb6, 0x3d,
+ 0xdf, 0x54, 0xd4, 0x5f, 0xc9, 0x42, 0xc2, 0x49,
+ 0xf3, 0x78, 0xf8, 0x73, 0xe5, 0x6e, 0xee, 0x65,
+ 0x37, 0xbc, 0x3c, 0xb7, 0x21, 0xaa, 0x2a, 0xa1,
+ 0x1b, 0x90, 0x10, 0x9b, 0x0d, 0x86, 0x06, 0x8d,
+ 0x6f, 0xe4, 0x64, 0xef, 0x79, 0xf2, 0x72, 0xf9,
+ 0x43, 0xc8, 0x48, 0xc3, 0x55, 0xde, 0x5e, 0xd5,
+ },
+ {
+ 0x00, 0x8c, 0x05, 0x89, 0x0a, 0x86, 0x0f, 0x83,
+ 0x14, 0x98, 0x11, 0x9d, 0x1e, 0x92, 0x1b, 0x97,
+ 0x28, 0xa4, 0x2d, 0xa1, 0x22, 0xae, 0x27, 0xab,
+ 0x3c, 0xb0, 0x39, 0xb5, 0x36, 0xba, 0x33, 0xbf,
+ 0x50, 0xdc, 0x55, 0xd9, 0x5a, 0xd6, 0x5f, 0xd3,
+ 0x44, 0xc8, 0x41, 0xcd, 0x4e, 0xc2, 0x4b, 0xc7,
+ 0x78, 0xf4, 0x7d, 0xf1, 0x72, 0xfe, 0x77, 0xfb,
+ 0x6c, 0xe0, 0x69, 0xe5, 0x66, 0xea, 0x63, 0xef,
+ 0xa0, 0x2c, 0xa5, 0x29, 0xaa, 0x26, 0xaf, 0x23,
+ 0xb4, 0x38, 0xb1, 0x3d, 0xbe, 0x32, 0xbb, 0x37,
+ 0x88, 0x04, 0x8d, 0x01, 0x82, 0x0e, 0x87, 0x0b,
+ 0x9c, 0x10, 0x99, 0x15, 0x96, 0x1a, 0x93, 0x1f,
+ 0xf0, 0x7c, 0xf5, 0x79, 0xfa, 0x76, 0xff, 0x73,
+ 0xe4, 0x68, 0xe1, 0x6d, 0xee, 0x62, 0xeb, 0x67,
+ 0xd8, 0x54, 0xdd, 0x51, 0xd2, 0x5e, 0xd7, 0x5b,
+ 0xcc, 0x40, 0xc9, 0x45, 0xc6, 0x4a, 0xc3, 0x4f,
+ 0x5d, 0xd1, 0x58, 0xd4, 0x57, 0xdb, 0x52, 0xde,
+ 0x49, 0xc5, 0x4c, 0xc0, 0x43, 0xcf, 0x46, 0xca,
+ 0x75, 0xf9, 0x70, 0xfc, 0x7f, 0xf3, 0x7a, 0xf6,
+ 0x61, 0xed, 0x64, 0xe8, 0x6b, 0xe7, 0x6e, 0xe2,
+ 0x0d, 0x81, 0x08, 0x84, 0x07, 0x8b, 0x02, 0x8e,
+ 0x19, 0x95, 0x1c, 0x90, 0x13, 0x9f, 0x16, 0x9a,
+ 0x25, 0xa9, 0x20, 0xac, 0x2f, 0xa3, 0x2a, 0xa6,
+ 0x31, 0xbd, 0x34, 0xb8, 0x3b, 0xb7, 0x3e, 0xb2,
+ 0xfd, 0x71, 0xf8, 0x74, 0xf7, 0x7b, 0xf2, 0x7e,
+ 0xe9, 0x65, 0xec, 0x60, 0xe3, 0x6f, 0xe6, 0x6a,
+ 0xd5, 0x59, 0xd0, 0x5c, 0xdf, 0x53, 0xda, 0x56,
+ 0xc1, 0x4d, 0xc4, 0x48, 0xcb, 0x47, 0xce, 0x42,
+ 0xad, 0x21, 0xa8, 0x24, 0xa7, 0x2b, 0xa2, 0x2e,
+ 0xb9, 0x35, 0xbc, 0x30, 0xb3, 0x3f, 0xb6, 0x3a,
+ 0x85, 0x09, 0x80, 0x0c, 0x8f, 0x03, 0x8a, 0x06,
+ 0x91, 0x1d, 0x94, 0x18, 0x9b, 0x17, 0x9e, 0x12,
+ },
+ {
+ 0x00, 0x8d, 0x07, 0x8a, 0x0e, 0x83, 0x09, 0x84,
+ 0x1c, 0x91, 0x1b, 0x96, 0x12, 0x9f, 0x15, 0x98,
+ 0x38, 0xb5, 0x3f, 0xb2, 0x36, 0xbb, 0x31, 0xbc,
+ 0x24, 0xa9, 0x23, 0xae, 0x2a, 0xa7, 0x2d, 0xa0,
+ 0x70, 0xfd, 0x77, 0xfa, 0x7e, 0xf3, 0x79, 0xf4,
+ 0x6c, 0xe1, 0x6b, 0xe6, 0x62, 0xef, 0x65, 0xe8,
+ 0x48, 0xc5, 0x4f, 0xc2, 0x46, 0xcb, 0x41, 0xcc,
+ 0x54, 0xd9, 0x53, 0xde, 0x5a, 0xd7, 0x5d, 0xd0,
+ 0xe0, 0x6d, 0xe7, 0x6a, 0xee, 0x63, 0xe9, 0x64,
+ 0xfc, 0x71, 0xfb, 0x76, 0xf2, 0x7f, 0xf5, 0x78,
+ 0xd8, 0x55, 0xdf, 0x52, 0xd6, 0x5b, 0xd1, 0x5c,
+ 0xc4, 0x49, 0xc3, 0x4e, 0xca, 0x47, 0xcd, 0x40,
+ 0x90, 0x1d, 0x97, 0x1a, 0x9e, 0x13, 0x99, 0x14,
+ 0x8c, 0x01, 0x8b, 0x06, 0x82, 0x0f, 0x85, 0x08,
+ 0xa8, 0x25, 0xaf, 0x22, 0xa6, 0x2b, 0xa1, 0x2c,
+ 0xb4, 0x39, 0xb3, 0x3e, 0xba, 0x37, 0xbd, 0x30,
+ 0xdd, 0x50, 0xda, 0x57, 0xd3, 0x5e, 0xd4, 0x59,
+ 0xc1, 0x4c, 0xc6, 0x4b, 0xcf, 0x42, 0xc8, 0x45,
+ 0xe5, 0x68, 0xe2, 0x6f, 0xeb, 0x66, 0xec, 0x61,
+ 0xf9, 0x74, 0xfe, 0x73, 0xf7, 0x7a, 0xf0, 0x7d,
+ 0xad, 0x20, 0xaa, 0x27, 0xa3, 0x2e, 0xa4, 0x29,
+ 0xb1, 0x3c, 0xb6, 0x3b, 0xbf, 0x32, 0xb8, 0x35,
+ 0x95, 0x18, 0x92, 0x1f, 0x9b, 0x16, 0x9c, 0x11,
+ 0x89, 0x04, 0x8e, 0x03, 0x87, 0x0a, 0x80, 0x0d,
+ 0x3d, 0xb0, 0x3a, 0xb7, 0x33, 0xbe, 0x34, 0xb9,
+ 0x21, 0xac, 0x26, 0xab, 0x2f, 0xa2, 0x28, 0xa5,
+ 0x05, 0x88, 0x02, 0x8f, 0x0b, 0x86, 0x0c, 0x81,
+ 0x19, 0x94, 0x1e, 0x93, 0x17, 0x9a, 0x10, 0x9d,
+ 0x4d, 0xc0, 0x4a, 0xc7, 0x43, 0xce, 0x44, 0xc9,
+ 0x51, 0xdc, 0x56, 0xdb, 0x5f, 0xd2, 0x58, 0xd5,
+ 0x75, 0xf8, 0x72, 0xff, 0x7b, 0xf6, 0x7c, 0xf1,
+ 0x69, 0xe4, 0x6e, 0xe3, 0x67, 0xea, 0x60, 0xed,
+ },
+ {
+ 0x00, 0x8e, 0x01, 0x8f, 0x02, 0x8c, 0x03, 0x8d,
+ 0x04, 0x8a, 0x05, 0x8b, 0x06, 0x88, 0x07, 0x89,
+ 0x08, 0x86, 0x09, 0x87, 0x0a, 0x84, 0x0b, 0x85,
+ 0x0c, 0x82, 0x0d, 0x83, 0x0e, 0x80, 0x0f, 0x81,
+ 0x10, 0x9e, 0x11, 0x9f, 0x12, 0x9c, 0x13, 0x9d,
+ 0x14, 0x9a, 0x15, 0x9b, 0x16, 0x98, 0x17, 0x99,
+ 0x18, 0x96, 0x19, 0x97, 0x1a, 0x94, 0x1b, 0x95,
+ 0x1c, 0x92, 0x1d, 0x93, 0x1e, 0x90, 0x1f, 0x91,
+ 0x20, 0xae, 0x21, 0xaf, 0x22, 0xac, 0x23, 0xad,
+ 0x24, 0xaa, 0x25, 0xab, 0x26, 0xa8, 0x27, 0xa9,
+ 0x28, 0xa6, 0x29, 0xa7, 0x2a, 0xa4, 0x2b, 0xa5,
+ 0x2c, 0xa2, 0x2d, 0xa3, 0x2e, 0xa0, 0x2f, 0xa1,
+ 0x30, 0xbe, 0x31, 0xbf, 0x32, 0xbc, 0x33, 0xbd,
+ 0x34, 0xba, 0x35, 0xbb, 0x36, 0xb8, 0x37, 0xb9,
+ 0x38, 0xb6, 0x39, 0xb7, 0x3a, 0xb4, 0x3b, 0xb5,
+ 0x3c, 0xb2, 0x3d, 0xb3, 0x3e, 0xb0, 0x3f, 0xb1,
+ 0x40, 0xce, 0x41, 0xcf, 0x42, 0xcc, 0x43, 0xcd,
+ 0x44, 0xca, 0x45, 0xcb, 0x46, 0xc8, 0x47, 0xc9,
+ 0x48, 0xc6, 0x49, 0xc7, 0x4a, 0xc4, 0x4b, 0xc5,
+ 0x4c, 0xc2, 0x4d, 0xc3, 0x4e, 0xc0, 0x4f, 0xc1,
+ 0x50, 0xde, 0x51, 0xdf, 0x52, 0xdc, 0x53, 0xdd,
+ 0x54, 0xda, 0x55, 0xdb, 0x56, 0xd8, 0x57, 0xd9,
+ 0x58, 0xd6, 0x59, 0xd7, 0x5a, 0xd4, 0x5b, 0xd5,
+ 0x5c, 0xd2, 0x5d, 0xd3, 0x5e, 0xd0, 0x5f, 0xd1,
+ 0x60, 0xee, 0x61, 0xef, 0x62, 0xec, 0x63, 0xed,
+ 0x64, 0xea, 0x65, 0xeb, 0x66, 0xe8, 0x67, 0xe9,
+ 0x68, 0xe6, 0x69, 0xe7, 0x6a, 0xe4, 0x6b, 0xe5,
+ 0x6c, 0xe2, 0x6d, 0xe3, 0x6e, 0xe0, 0x6f, 0xe1,
+ 0x70, 0xfe, 0x71, 0xff, 0x72, 0xfc, 0x73, 0xfd,
+ 0x74, 0xfa, 0x75, 0xfb, 0x76, 0xf8, 0x77, 0xf9,
+ 0x78, 0xf6, 0x79, 0xf7, 0x7a, 0xf4, 0x7b, 0xf5,
+ 0x7c, 0xf2, 0x7d, 0xf3, 0x7e, 0xf0, 0x7f, 0xf1,
+ },
+ {
+ 0x00, 0x8f, 0x03, 0x8c, 0x06, 0x89, 0x05, 0x8a,
+ 0x0c, 0x83, 0x0f, 0x80, 0x0a, 0x85, 0x09, 0x86,
+ 0x18, 0x97, 0x1b, 0x94, 0x1e, 0x91, 0x1d, 0x92,
+ 0x14, 0x9b, 0x17, 0x98, 0x12, 0x9d, 0x11, 0x9e,
+ 0x30, 0xbf, 0x33, 0xbc, 0x36, 0xb9, 0x35, 0xba,
+ 0x3c, 0xb3, 0x3f, 0xb0, 0x3a, 0xb5, 0x39, 0xb6,
+ 0x28, 0xa7, 0x2b, 0xa4, 0x2e, 0xa1, 0x2d, 0xa2,
+ 0x24, 0xab, 0x27, 0xa8, 0x22, 0xad, 0x21, 0xae,
+ 0x60, 0xef, 0x63, 0xec, 0x66, 0xe9, 0x65, 0xea,
+ 0x6c, 0xe3, 0x6f, 0xe0, 0x6a, 0xe5, 0x69, 0xe6,
+ 0x78, 0xf7, 0x7b, 0xf4, 0x7e, 0xf1, 0x7d, 0xf2,
+ 0x74, 0xfb, 0x77, 0xf8, 0x72, 0xfd, 0x71, 0xfe,
+ 0x50, 0xdf, 0x53, 0xdc, 0x56, 0xd9, 0x55, 0xda,
+ 0x5c, 0xd3, 0x5f, 0xd0, 0x5a, 0xd5, 0x59, 0xd6,
+ 0x48, 0xc7, 0x4b, 0xc4, 0x4e, 0xc1, 0x4d, 0xc2,
+ 0x44, 0xcb, 0x47, 0xc8, 0x42, 0xcd, 0x41, 0xce,
+ 0xc0, 0x4f, 0xc3, 0x4c, 0xc6, 0x49, 0xc5, 0x4a,
+ 0xcc, 0x43, 0xcf, 0x40, 0xca, 0x45, 0xc9, 0x46,
+ 0xd8, 0x57, 0xdb, 0x54, 0xde, 0x51, 0xdd, 0x52,
+ 0xd4, 0x5b, 0xd7, 0x58, 0xd2, 0x5d, 0xd1, 0x5e,
+ 0xf0, 0x7f, 0xf3, 0x7c, 0xf6, 0x79, 0xf5, 0x7a,
+ 0xfc, 0x73, 0xff, 0x70, 0xfa, 0x75, 0xf9, 0x76,
+ 0xe8, 0x67, 0xeb, 0x64, 0xee, 0x61, 0xed, 0x62,
+ 0xe4, 0x6b, 0xe7, 0x68, 0xe2, 0x6d, 0xe1, 0x6e,
+ 0xa0, 0x2f, 0xa3, 0x2c, 0xa6, 0x29, 0xa5, 0x2a,
+ 0xac, 0x23, 0xaf, 0x20, 0xaa, 0x25, 0xa9, 0x26,
+ 0xb8, 0x37, 0xbb, 0x34, 0xbe, 0x31, 0xbd, 0x32,
+ 0xb4, 0x3b, 0xb7, 0x38, 0xb2, 0x3d, 0xb1, 0x3e,
+ 0x90, 0x1f, 0x93, 0x1c, 0x96, 0x19, 0x95, 0x1a,
+ 0x9c, 0x13, 0x9f, 0x10, 0x9a, 0x15, 0x99, 0x16,
+ 0x88, 0x07, 0x8b, 0x04, 0x8e, 0x01, 0x8d, 0x02,
+ 0x84, 0x0b, 0x87, 0x08, 0x82, 0x0d, 0x81, 0x0e,
+ },
+ {
+ 0x00, 0x90, 0x3d, 0xad, 0x7a, 0xea, 0x47, 0xd7,
+ 0xf4, 0x64, 0xc9, 0x59, 0x8e, 0x1e, 0xb3, 0x23,
+ 0xf5, 0x65, 0xc8, 0x58, 0x8f, 0x1f, 0xb2, 0x22,
+ 0x01, 0x91, 0x3c, 0xac, 0x7b, 0xeb, 0x46, 0xd6,
+ 0xf7, 0x67, 0xca, 0x5a, 0x8d, 0x1d, 0xb0, 0x20,
+ 0x03, 0x93, 0x3e, 0xae, 0x79, 0xe9, 0x44, 0xd4,
+ 0x02, 0x92, 0x3f, 0xaf, 0x78, 0xe8, 0x45, 0xd5,
+ 0xf6, 0x66, 0xcb, 0x5b, 0x8c, 0x1c, 0xb1, 0x21,
+ 0xf3, 0x63, 0xce, 0x5e, 0x89, 0x19, 0xb4, 0x24,
+ 0x07, 0x97, 0x3a, 0xaa, 0x7d, 0xed, 0x40, 0xd0,
+ 0x06, 0x96, 0x3b, 0xab, 0x7c, 0xec, 0x41, 0xd1,
+ 0xf2, 0x62, 0xcf, 0x5f, 0x88, 0x18, 0xb5, 0x25,
+ 0x04, 0x94, 0x39, 0xa9, 0x7e, 0xee, 0x43, 0xd3,
+ 0xf0, 0x60, 0xcd, 0x5d, 0x8a, 0x1a, 0xb7, 0x27,
+ 0xf1, 0x61, 0xcc, 0x5c, 0x8b, 0x1b, 0xb6, 0x26,
+ 0x05, 0x95, 0x38, 0xa8, 0x7f, 0xef, 0x42, 0xd2,
+ 0xfb, 0x6b, 0xc6, 0x56, 0x81, 0x11, 0xbc, 0x2c,
+ 0x0f, 0x9f, 0x32, 0xa2, 0x75, 0xe5, 0x48, 0xd8,
+ 0x0e, 0x9e, 0x33, 0xa3, 0x74, 0xe4, 0x49, 0xd9,
+ 0xfa, 0x6a, 0xc7, 0x57, 0x80, 0x10, 0xbd, 0x2d,
+ 0x0c, 0x9c, 0x31, 0xa1, 0x76, 0xe6, 0x4b, 0xdb,
+ 0xf8, 0x68, 0xc5, 0x55, 0x82, 0x12, 0xbf, 0x2f,
+ 0xf9, 0x69, 0xc4, 0x54, 0x83, 0x13, 0xbe, 0x2e,
+ 0x0d, 0x9d, 0x30, 0xa0, 0x77, 0xe7, 0x4a, 0xda,
+ 0x08, 0x98, 0x35, 0xa5, 0x72, 0xe2, 0x4f, 0xdf,
+ 0xfc, 0x6c, 0xc1, 0x51, 0x86, 0x16, 0xbb, 0x2b,
+ 0xfd, 0x6d, 0xc0, 0x50, 0x87, 0x17, 0xba, 0x2a,
+ 0x09, 0x99, 0x34, 0xa4, 0x73, 0xe3, 0x4e, 0xde,
+ 0xff, 0x6f, 0xc2, 0x52, 0x85, 0x15, 0xb8, 0x28,
+ 0x0b, 0x9b, 0x36, 0xa6, 0x71, 0xe1, 0x4c, 0xdc,
+ 0x0a, 0x9a, 0x37, 0xa7, 0x70, 0xe0, 0x4d, 0xdd,
+ 0xfe, 0x6e, 0xc3, 0x53, 0x84, 0x14, 0xb9, 0x29,
+ },
+ {
+ 0x00, 0x91, 0x3f, 0xae, 0x7e, 0xef, 0x41, 0xd0,
+ 0xfc, 0x6d, 0xc3, 0x52, 0x82, 0x13, 0xbd, 0x2c,
+ 0xe5, 0x74, 0xda, 0x4b, 0x9b, 0x0a, 0xa4, 0x35,
+ 0x19, 0x88, 0x26, 0xb7, 0x67, 0xf6, 0x58, 0xc9,
+ 0xd7, 0x46, 0xe8, 0x79, 0xa9, 0x38, 0x96, 0x07,
+ 0x2b, 0xba, 0x14, 0x85, 0x55, 0xc4, 0x6a, 0xfb,
+ 0x32, 0xa3, 0x0d, 0x9c, 0x4c, 0xdd, 0x73, 0xe2,
+ 0xce, 0x5f, 0xf1, 0x60, 0xb0, 0x21, 0x8f, 0x1e,
+ 0xb3, 0x22, 0x8c, 0x1d, 0xcd, 0x5c, 0xf2, 0x63,
+ 0x4f, 0xde, 0x70, 0xe1, 0x31, 0xa0, 0x0e, 0x9f,
+ 0x56, 0xc7, 0x69, 0xf8, 0x28, 0xb9, 0x17, 0x86,
+ 0xaa, 0x3b, 0x95, 0x04, 0xd4, 0x45, 0xeb, 0x7a,
+ 0x64, 0xf5, 0x5b, 0xca, 0x1a, 0x8b, 0x25, 0xb4,
+ 0x98, 0x09, 0xa7, 0x36, 0xe6, 0x77, 0xd9, 0x48,
+ 0x81, 0x10, 0xbe, 0x2f, 0xff, 0x6e, 0xc0, 0x51,
+ 0x7d, 0xec, 0x42, 0xd3, 0x03, 0x92, 0x3c, 0xad,
+ 0x7b, 0xea, 0x44, 0xd5, 0x05, 0x94, 0x3a, 0xab,
+ 0x87, 0x16, 0xb8, 0x29, 0xf9, 0x68, 0xc6, 0x57,
+ 0x9e, 0x0f, 0xa1, 0x30, 0xe0, 0x71, 0xdf, 0x4e,
+ 0x62, 0xf3, 0x5d, 0xcc, 0x1c, 0x8d, 0x23, 0xb2,
+ 0xac, 0x3d, 0x93, 0x02, 0xd2, 0x43, 0xed, 0x7c,
+ 0x50, 0xc1, 0x6f, 0xfe, 0x2e, 0xbf, 0x11, 0x80,
+ 0x49, 0xd8, 0x76, 0xe7, 0x37, 0xa6, 0x08, 0x99,
+ 0xb5, 0x24, 0x8a, 0x1b, 0xcb, 0x5a, 0xf4, 0x65,
+ 0xc8, 0x59, 0xf7, 0x66, 0xb6, 0x27, 0x89, 0x18,
+ 0x34, 0xa5, 0x0b, 0x9a, 0x4a, 0xdb, 0x75, 0xe4,
+ 0x2d, 0xbc, 0x12, 0x83, 0x53, 0xc2, 0x6c, 0xfd,
+ 0xd1, 0x40, 0xee, 0x7f, 0xaf, 0x3e, 0x90, 0x01,
+ 0x1f, 0x8e, 0x20, 0xb1, 0x61, 0xf0, 0x5e, 0xcf,
+ 0xe3, 0x72, 0xdc, 0x4d, 0x9d, 0x0c, 0xa2, 0x33,
+ 0xfa, 0x6b, 0xc5, 0x54, 0x84, 0x15, 0xbb, 0x2a,
+ 0x06, 0x97, 0x39, 0xa8, 0x78, 0xe9, 0x47, 0xd6,
+ },
+ {
+ 0x00, 0x92, 0x39, 0xab, 0x72, 0xe0, 0x4b, 0xd9,
+ 0xe4, 0x76, 0xdd, 0x4f, 0x96, 0x04, 0xaf, 0x3d,
+ 0xd5, 0x47, 0xec, 0x7e, 0xa7, 0x35, 0x9e, 0x0c,
+ 0x31, 0xa3, 0x08, 0x9a, 0x43, 0xd1, 0x7a, 0xe8,
+ 0xb7, 0x25, 0x8e, 0x1c, 0xc5, 0x57, 0xfc, 0x6e,
+ 0x53, 0xc1, 0x6a, 0xf8, 0x21, 0xb3, 0x18, 0x8a,
+ 0x62, 0xf0, 0x5b, 0xc9, 0x10, 0x82, 0x29, 0xbb,
+ 0x86, 0x14, 0xbf, 0x2d, 0xf4, 0x66, 0xcd, 0x5f,
+ 0x73, 0xe1, 0x4a, 0xd8, 0x01, 0x93, 0x38, 0xaa,
+ 0x97, 0x05, 0xae, 0x3c, 0xe5, 0x77, 0xdc, 0x4e,
+ 0xa6, 0x34, 0x9f, 0x0d, 0xd4, 0x46, 0xed, 0x7f,
+ 0x42, 0xd0, 0x7b, 0xe9, 0x30, 0xa2, 0x09, 0x9b,
+ 0xc4, 0x56, 0xfd, 0x6f, 0xb6, 0x24, 0x8f, 0x1d,
+ 0x20, 0xb2, 0x19, 0x8b, 0x52, 0xc0, 0x6b, 0xf9,
+ 0x11, 0x83, 0x28, 0xba, 0x63, 0xf1, 0x5a, 0xc8,
+ 0xf5, 0x67, 0xcc, 0x5e, 0x87, 0x15, 0xbe, 0x2c,
+ 0xe6, 0x74, 0xdf, 0x4d, 0x94, 0x06, 0xad, 0x3f,
+ 0x02, 0x90, 0x3b, 0xa9, 0x70, 0xe2, 0x49, 0xdb,
+ 0x33, 0xa1, 0x0a, 0x98, 0x41, 0xd3, 0x78, 0xea,
+ 0xd7, 0x45, 0xee, 0x7c, 0xa5, 0x37, 0x9c, 0x0e,
+ 0x51, 0xc3, 0x68, 0xfa, 0x23, 0xb1, 0x1a, 0x88,
+ 0xb5, 0x27, 0x8c, 0x1e, 0xc7, 0x55, 0xfe, 0x6c,
+ 0x84, 0x16, 0xbd, 0x2f, 0xf6, 0x64, 0xcf, 0x5d,
+ 0x60, 0xf2, 0x59, 0xcb, 0x12, 0x80, 0x2b, 0xb9,
+ 0x95, 0x07, 0xac, 0x3e, 0xe7, 0x75, 0xde, 0x4c,
+ 0x71, 0xe3, 0x48, 0xda, 0x03, 0x91, 0x3a, 0xa8,
+ 0x40, 0xd2, 0x79, 0xeb, 0x32, 0xa0, 0x0b, 0x99,
+ 0xa4, 0x36, 0x9d, 0x0f, 0xd6, 0x44, 0xef, 0x7d,
+ 0x22, 0xb0, 0x1b, 0x89, 0x50, 0xc2, 0x69, 0xfb,
+ 0xc6, 0x54, 0xff, 0x6d, 0xb4, 0x26, 0x8d, 0x1f,
+ 0xf7, 0x65, 0xce, 0x5c, 0x85, 0x17, 0xbc, 0x2e,
+ 0x13, 0x81, 0x2a, 0xb8, 0x61, 0xf3, 0x58, 0xca,
+ },
+ {
+ 0x00, 0x93, 0x3b, 0xa8, 0x76, 0xe5, 0x4d, 0xde,
+ 0xec, 0x7f, 0xd7, 0x44, 0x9a, 0x09, 0xa1, 0x32,
+ 0xc5, 0x56, 0xfe, 0x6d, 0xb3, 0x20, 0x88, 0x1b,
+ 0x29, 0xba, 0x12, 0x81, 0x5f, 0xcc, 0x64, 0xf7,
+ 0x97, 0x04, 0xac, 0x3f, 0xe1, 0x72, 0xda, 0x49,
+ 0x7b, 0xe8, 0x40, 0xd3, 0x0d, 0x9e, 0x36, 0xa5,
+ 0x52, 0xc1, 0x69, 0xfa, 0x24, 0xb7, 0x1f, 0x8c,
+ 0xbe, 0x2d, 0x85, 0x16, 0xc8, 0x5b, 0xf3, 0x60,
+ 0x33, 0xa0, 0x08, 0x9b, 0x45, 0xd6, 0x7e, 0xed,
+ 0xdf, 0x4c, 0xe4, 0x77, 0xa9, 0x3a, 0x92, 0x01,
+ 0xf6, 0x65, 0xcd, 0x5e, 0x80, 0x13, 0xbb, 0x28,
+ 0x1a, 0x89, 0x21, 0xb2, 0x6c, 0xff, 0x57, 0xc4,
+ 0xa4, 0x37, 0x9f, 0x0c, 0xd2, 0x41, 0xe9, 0x7a,
+ 0x48, 0xdb, 0x73, 0xe0, 0x3e, 0xad, 0x05, 0x96,
+ 0x61, 0xf2, 0x5a, 0xc9, 0x17, 0x84, 0x2c, 0xbf,
+ 0x8d, 0x1e, 0xb6, 0x25, 0xfb, 0x68, 0xc0, 0x53,
+ 0x66, 0xf5, 0x5d, 0xce, 0x10, 0x83, 0x2b, 0xb8,
+ 0x8a, 0x19, 0xb1, 0x22, 0xfc, 0x6f, 0xc7, 0x54,
+ 0xa3, 0x30, 0x98, 0x0b, 0xd5, 0x46, 0xee, 0x7d,
+ 0x4f, 0xdc, 0x74, 0xe7, 0x39, 0xaa, 0x02, 0x91,
+ 0xf1, 0x62, 0xca, 0x59, 0x87, 0x14, 0xbc, 0x2f,
+ 0x1d, 0x8e, 0x26, 0xb5, 0x6b, 0xf8, 0x50, 0xc3,
+ 0x34, 0xa7, 0x0f, 0x9c, 0x42, 0xd1, 0x79, 0xea,
+ 0xd8, 0x4b, 0xe3, 0x70, 0xae, 0x3d, 0x95, 0x06,
+ 0x55, 0xc6, 0x6e, 0xfd, 0x23, 0xb0, 0x18, 0x8b,
+ 0xb9, 0x2a, 0x82, 0x11, 0xcf, 0x5c, 0xf4, 0x67,
+ 0x90, 0x03, 0xab, 0x38, 0xe6, 0x75, 0xdd, 0x4e,
+ 0x7c, 0xef, 0x47, 0xd4, 0x0a, 0x99, 0x31, 0xa2,
+ 0xc2, 0x51, 0xf9, 0x6a, 0xb4, 0x27, 0x8f, 0x1c,
+ 0x2e, 0xbd, 0x15, 0x86, 0x58, 0xcb, 0x63, 0xf0,
+ 0x07, 0x94, 0x3c, 0xaf, 0x71, 0xe2, 0x4a, 0xd9,
+ 0xeb, 0x78, 0xd0, 0x43, 0x9d, 0x0e, 0xa6, 0x35,
+ },
+ {
+ 0x00, 0x94, 0x35, 0xa1, 0x6a, 0xfe, 0x5f, 0xcb,
+ 0xd4, 0x40, 0xe1, 0x75, 0xbe, 0x2a, 0x8b, 0x1f,
+ 0xb5, 0x21, 0x80, 0x14, 0xdf, 0x4b, 0xea, 0x7e,
+ 0x61, 0xf5, 0x54, 0xc0, 0x0b, 0x9f, 0x3e, 0xaa,
+ 0x77, 0xe3, 0x42, 0xd6, 0x1d, 0x89, 0x28, 0xbc,
+ 0xa3, 0x37, 0x96, 0x02, 0xc9, 0x5d, 0xfc, 0x68,
+ 0xc2, 0x56, 0xf7, 0x63, 0xa8, 0x3c, 0x9d, 0x09,
+ 0x16, 0x82, 0x23, 0xb7, 0x7c, 0xe8, 0x49, 0xdd,
+ 0xee, 0x7a, 0xdb, 0x4f, 0x84, 0x10, 0xb1, 0x25,
+ 0x3a, 0xae, 0x0f, 0x9b, 0x50, 0xc4, 0x65, 0xf1,
+ 0x5b, 0xcf, 0x6e, 0xfa, 0x31, 0xa5, 0x04, 0x90,
+ 0x8f, 0x1b, 0xba, 0x2e, 0xe5, 0x71, 0xd0, 0x44,
+ 0x99, 0x0d, 0xac, 0x38, 0xf3, 0x67, 0xc6, 0x52,
+ 0x4d, 0xd9, 0x78, 0xec, 0x27, 0xb3, 0x12, 0x86,
+ 0x2c, 0xb8, 0x19, 0x8d, 0x46, 0xd2, 0x73, 0xe7,
+ 0xf8, 0x6c, 0xcd, 0x59, 0x92, 0x06, 0xa7, 0x33,
+ 0xc1, 0x55, 0xf4, 0x60, 0xab, 0x3f, 0x9e, 0x0a,
+ 0x15, 0x81, 0x20, 0xb4, 0x7f, 0xeb, 0x4a, 0xde,
+ 0x74, 0xe0, 0x41, 0xd5, 0x1e, 0x8a, 0x2b, 0xbf,
+ 0xa0, 0x34, 0x95, 0x01, 0xca, 0x5e, 0xff, 0x6b,
+ 0xb6, 0x22, 0x83, 0x17, 0xdc, 0x48, 0xe9, 0x7d,
+ 0x62, 0xf6, 0x57, 0xc3, 0x08, 0x9c, 0x3d, 0xa9,
+ 0x03, 0x97, 0x36, 0xa2, 0x69, 0xfd, 0x5c, 0xc8,
+ 0xd7, 0x43, 0xe2, 0x76, 0xbd, 0x29, 0x88, 0x1c,
+ 0x2f, 0xbb, 0x1a, 0x8e, 0x45, 0xd1, 0x70, 0xe4,
+ 0xfb, 0x6f, 0xce, 0x5a, 0x91, 0x05, 0xa4, 0x30,
+ 0x9a, 0x0e, 0xaf, 0x3b, 0xf0, 0x64, 0xc5, 0x51,
+ 0x4e, 0xda, 0x7b, 0xef, 0x24, 0xb0, 0x11, 0x85,
+ 0x58, 0xcc, 0x6d, 0xf9, 0x32, 0xa6, 0x07, 0x93,
+ 0x8c, 0x18, 0xb9, 0x2d, 0xe6, 0x72, 0xd3, 0x47,
+ 0xed, 0x79, 0xd8, 0x4c, 0x87, 0x13, 0xb2, 0x26,
+ 0x39, 0xad, 0x0c, 0x98, 0x53, 0xc7, 0x66, 0xf2,
+ },
+ {
+ 0x00, 0x95, 0x37, 0xa2, 0x6e, 0xfb, 0x59, 0xcc,
+ 0xdc, 0x49, 0xeb, 0x7e, 0xb2, 0x27, 0x85, 0x10,
+ 0xa5, 0x30, 0x92, 0x07, 0xcb, 0x5e, 0xfc, 0x69,
+ 0x79, 0xec, 0x4e, 0xdb, 0x17, 0x82, 0x20, 0xb5,
+ 0x57, 0xc2, 0x60, 0xf5, 0x39, 0xac, 0x0e, 0x9b,
+ 0x8b, 0x1e, 0xbc, 0x29, 0xe5, 0x70, 0xd2, 0x47,
+ 0xf2, 0x67, 0xc5, 0x50, 0x9c, 0x09, 0xab, 0x3e,
+ 0x2e, 0xbb, 0x19, 0x8c, 0x40, 0xd5, 0x77, 0xe2,
+ 0xae, 0x3b, 0x99, 0x0c, 0xc0, 0x55, 0xf7, 0x62,
+ 0x72, 0xe7, 0x45, 0xd0, 0x1c, 0x89, 0x2b, 0xbe,
+ 0x0b, 0x9e, 0x3c, 0xa9, 0x65, 0xf0, 0x52, 0xc7,
+ 0xd7, 0x42, 0xe0, 0x75, 0xb9, 0x2c, 0x8e, 0x1b,
+ 0xf9, 0x6c, 0xce, 0x5b, 0x97, 0x02, 0xa0, 0x35,
+ 0x25, 0xb0, 0x12, 0x87, 0x4b, 0xde, 0x7c, 0xe9,
+ 0x5c, 0xc9, 0x6b, 0xfe, 0x32, 0xa7, 0x05, 0x90,
+ 0x80, 0x15, 0xb7, 0x22, 0xee, 0x7b, 0xd9, 0x4c,
+ 0x41, 0xd4, 0x76, 0xe3, 0x2f, 0xba, 0x18, 0x8d,
+ 0x9d, 0x08, 0xaa, 0x3f, 0xf3, 0x66, 0xc4, 0x51,
+ 0xe4, 0x71, 0xd3, 0x46, 0x8a, 0x1f, 0xbd, 0x28,
+ 0x38, 0xad, 0x0f, 0x9a, 0x56, 0xc3, 0x61, 0xf4,
+ 0x16, 0x83, 0x21, 0xb4, 0x78, 0xed, 0x4f, 0xda,
+ 0xca, 0x5f, 0xfd, 0x68, 0xa4, 0x31, 0x93, 0x06,
+ 0xb3, 0x26, 0x84, 0x11, 0xdd, 0x48, 0xea, 0x7f,
+ 0x6f, 0xfa, 0x58, 0xcd, 0x01, 0x94, 0x36, 0xa3,
+ 0xef, 0x7a, 0xd8, 0x4d, 0x81, 0x14, 0xb6, 0x23,
+ 0x33, 0xa6, 0x04, 0x91, 0x5d, 0xc8, 0x6a, 0xff,
+ 0x4a, 0xdf, 0x7d, 0xe8, 0x24, 0xb1, 0x13, 0x86,
+ 0x96, 0x03, 0xa1, 0x34, 0xf8, 0x6d, 0xcf, 0x5a,
+ 0xb8, 0x2d, 0x8f, 0x1a, 0xd6, 0x43, 0xe1, 0x74,
+ 0x64, 0xf1, 0x53, 0xc6, 0x0a, 0x9f, 0x3d, 0xa8,
+ 0x1d, 0x88, 0x2a, 0xbf, 0x73, 0xe6, 0x44, 0xd1,
+ 0xc1, 0x54, 0xf6, 0x63, 0xaf, 0x3a, 0x98, 0x0d,
+ },
+ {
+ 0x00, 0x96, 0x31, 0xa7, 0x62, 0xf4, 0x53, 0xc5,
+ 0xc4, 0x52, 0xf5, 0x63, 0xa6, 0x30, 0x97, 0x01,
+ 0x95, 0x03, 0xa4, 0x32, 0xf7, 0x61, 0xc6, 0x50,
+ 0x51, 0xc7, 0x60, 0xf6, 0x33, 0xa5, 0x02, 0x94,
+ 0x37, 0xa1, 0x06, 0x90, 0x55, 0xc3, 0x64, 0xf2,
+ 0xf3, 0x65, 0xc2, 0x54, 0x91, 0x07, 0xa0, 0x36,
+ 0xa2, 0x34, 0x93, 0x05, 0xc0, 0x56, 0xf1, 0x67,
+ 0x66, 0xf0, 0x57, 0xc1, 0x04, 0x92, 0x35, 0xa3,
+ 0x6e, 0xf8, 0x5f, 0xc9, 0x0c, 0x9a, 0x3d, 0xab,
+ 0xaa, 0x3c, 0x9b, 0x0d, 0xc8, 0x5e, 0xf9, 0x6f,
+ 0xfb, 0x6d, 0xca, 0x5c, 0x99, 0x0f, 0xa8, 0x3e,
+ 0x3f, 0xa9, 0x0e, 0x98, 0x5d, 0xcb, 0x6c, 0xfa,
+ 0x59, 0xcf, 0x68, 0xfe, 0x3b, 0xad, 0x0a, 0x9c,
+ 0x9d, 0x0b, 0xac, 0x3a, 0xff, 0x69, 0xce, 0x58,
+ 0xcc, 0x5a, 0xfd, 0x6b, 0xae, 0x38, 0x9f, 0x09,
+ 0x08, 0x9e, 0x39, 0xaf, 0x6a, 0xfc, 0x5b, 0xcd,
+ 0xdc, 0x4a, 0xed, 0x7b, 0xbe, 0x28, 0x8f, 0x19,
+ 0x18, 0x8e, 0x29, 0xbf, 0x7a, 0xec, 0x4b, 0xdd,
+ 0x49, 0xdf, 0x78, 0xee, 0x2b, 0xbd, 0x1a, 0x8c,
+ 0x8d, 0x1b, 0xbc, 0x2a, 0xef, 0x79, 0xde, 0x48,
+ 0xeb, 0x7d, 0xda, 0x4c, 0x89, 0x1f, 0xb8, 0x2e,
+ 0x2f, 0xb9, 0x1e, 0x88, 0x4d, 0xdb, 0x7c, 0xea,
+ 0x7e, 0xe8, 0x4f, 0xd9, 0x1c, 0x8a, 0x2d, 0xbb,
+ 0xba, 0x2c, 0x8b, 0x1d, 0xd8, 0x4e, 0xe9, 0x7f,
+ 0xb2, 0x24, 0x83, 0x15, 0xd0, 0x46, 0xe1, 0x77,
+ 0x76, 0xe0, 0x47, 0xd1, 0x14, 0x82, 0x25, 0xb3,
+ 0x27, 0xb1, 0x16, 0x80, 0x45, 0xd3, 0x74, 0xe2,
+ 0xe3, 0x75, 0xd2, 0x44, 0x81, 0x17, 0xb0, 0x26,
+ 0x85, 0x13, 0xb4, 0x22, 0xe7, 0x71, 0xd6, 0x40,
+ 0x41, 0xd7, 0x70, 0xe6, 0x23, 0xb5, 0x12, 0x84,
+ 0x10, 0x86, 0x21, 0xb7, 0x72, 0xe4, 0x43, 0xd5,
+ 0xd4, 0x42, 0xe5, 0x73, 0xb6, 0x20, 0x87, 0x11,
+ },
+ {
+ 0x00, 0x97, 0x33, 0xa4, 0x66, 0xf1, 0x55, 0xc2,
+ 0xcc, 0x5b, 0xff, 0x68, 0xaa, 0x3d, 0x99, 0x0e,
+ 0x85, 0x12, 0xb6, 0x21, 0xe3, 0x74, 0xd0, 0x47,
+ 0x49, 0xde, 0x7a, 0xed, 0x2f, 0xb8, 0x1c, 0x8b,
+ 0x17, 0x80, 0x24, 0xb3, 0x71, 0xe6, 0x42, 0xd5,
+ 0xdb, 0x4c, 0xe8, 0x7f, 0xbd, 0x2a, 0x8e, 0x19,
+ 0x92, 0x05, 0xa1, 0x36, 0xf4, 0x63, 0xc7, 0x50,
+ 0x5e, 0xc9, 0x6d, 0xfa, 0x38, 0xaf, 0x0b, 0x9c,
+ 0x2e, 0xb9, 0x1d, 0x8a, 0x48, 0xdf, 0x7b, 0xec,
+ 0xe2, 0x75, 0xd1, 0x46, 0x84, 0x13, 0xb7, 0x20,
+ 0xab, 0x3c, 0x98, 0x0f, 0xcd, 0x5a, 0xfe, 0x69,
+ 0x67, 0xf0, 0x54, 0xc3, 0x01, 0x96, 0x32, 0xa5,
+ 0x39, 0xae, 0x0a, 0x9d, 0x5f, 0xc8, 0x6c, 0xfb,
+ 0xf5, 0x62, 0xc6, 0x51, 0x93, 0x04, 0xa0, 0x37,
+ 0xbc, 0x2b, 0x8f, 0x18, 0xda, 0x4d, 0xe9, 0x7e,
+ 0x70, 0xe7, 0x43, 0xd4, 0x16, 0x81, 0x25, 0xb2,
+ 0x5c, 0xcb, 0x6f, 0xf8, 0x3a, 0xad, 0x09, 0x9e,
+ 0x90, 0x07, 0xa3, 0x34, 0xf6, 0x61, 0xc5, 0x52,
+ 0xd9, 0x4e, 0xea, 0x7d, 0xbf, 0x28, 0x8c, 0x1b,
+ 0x15, 0x82, 0x26, 0xb1, 0x73, 0xe4, 0x40, 0xd7,
+ 0x4b, 0xdc, 0x78, 0xef, 0x2d, 0xba, 0x1e, 0x89,
+ 0x87, 0x10, 0xb4, 0x23, 0xe1, 0x76, 0xd2, 0x45,
+ 0xce, 0x59, 0xfd, 0x6a, 0xa8, 0x3f, 0x9b, 0x0c,
+ 0x02, 0x95, 0x31, 0xa6, 0x64, 0xf3, 0x57, 0xc0,
+ 0x72, 0xe5, 0x41, 0xd6, 0x14, 0x83, 0x27, 0xb0,
+ 0xbe, 0x29, 0x8d, 0x1a, 0xd8, 0x4f, 0xeb, 0x7c,
+ 0xf7, 0x60, 0xc4, 0x53, 0x91, 0x06, 0xa2, 0x35,
+ 0x3b, 0xac, 0x08, 0x9f, 0x5d, 0xca, 0x6e, 0xf9,
+ 0x65, 0xf2, 0x56, 0xc1, 0x03, 0x94, 0x30, 0xa7,
+ 0xa9, 0x3e, 0x9a, 0x0d, 0xcf, 0x58, 0xfc, 0x6b,
+ 0xe0, 0x77, 0xd3, 0x44, 0x86, 0x11, 0xb5, 0x22,
+ 0x2c, 0xbb, 0x1f, 0x88, 0x4a, 0xdd, 0x79, 0xee,
+ },
+ {
+ 0x00, 0x98, 0x2d, 0xb5, 0x5a, 0xc2, 0x77, 0xef,
+ 0xb4, 0x2c, 0x99, 0x01, 0xee, 0x76, 0xc3, 0x5b,
+ 0x75, 0xed, 0x58, 0xc0, 0x2f, 0xb7, 0x02, 0x9a,
+ 0xc1, 0x59, 0xec, 0x74, 0x9b, 0x03, 0xb6, 0x2e,
+ 0xea, 0x72, 0xc7, 0x5f, 0xb0, 0x28, 0x9d, 0x05,
+ 0x5e, 0xc6, 0x73, 0xeb, 0x04, 0x9c, 0x29, 0xb1,
+ 0x9f, 0x07, 0xb2, 0x2a, 0xc5, 0x5d, 0xe8, 0x70,
+ 0x2b, 0xb3, 0x06, 0x9e, 0x71, 0xe9, 0x5c, 0xc4,
+ 0xc9, 0x51, 0xe4, 0x7c, 0x93, 0x0b, 0xbe, 0x26,
+ 0x7d, 0xe5, 0x50, 0xc8, 0x27, 0xbf, 0x0a, 0x92,
+ 0xbc, 0x24, 0x91, 0x09, 0xe6, 0x7e, 0xcb, 0x53,
+ 0x08, 0x90, 0x25, 0xbd, 0x52, 0xca, 0x7f, 0xe7,
+ 0x23, 0xbb, 0x0e, 0x96, 0x79, 0xe1, 0x54, 0xcc,
+ 0x97, 0x0f, 0xba, 0x22, 0xcd, 0x55, 0xe0, 0x78,
+ 0x56, 0xce, 0x7b, 0xe3, 0x0c, 0x94, 0x21, 0xb9,
+ 0xe2, 0x7a, 0xcf, 0x57, 0xb8, 0x20, 0x95, 0x0d,
+ 0x8f, 0x17, 0xa2, 0x3a, 0xd5, 0x4d, 0xf8, 0x60,
+ 0x3b, 0xa3, 0x16, 0x8e, 0x61, 0xf9, 0x4c, 0xd4,
+ 0xfa, 0x62, 0xd7, 0x4f, 0xa0, 0x38, 0x8d, 0x15,
+ 0x4e, 0xd6, 0x63, 0xfb, 0x14, 0x8c, 0x39, 0xa1,
+ 0x65, 0xfd, 0x48, 0xd0, 0x3f, 0xa7, 0x12, 0x8a,
+ 0xd1, 0x49, 0xfc, 0x64, 0x8b, 0x13, 0xa6, 0x3e,
+ 0x10, 0x88, 0x3d, 0xa5, 0x4a, 0xd2, 0x67, 0xff,
+ 0xa4, 0x3c, 0x89, 0x11, 0xfe, 0x66, 0xd3, 0x4b,
+ 0x46, 0xde, 0x6b, 0xf3, 0x1c, 0x84, 0x31, 0xa9,
+ 0xf2, 0x6a, 0xdf, 0x47, 0xa8, 0x30, 0x85, 0x1d,
+ 0x33, 0xab, 0x1e, 0x86, 0x69, 0xf1, 0x44, 0xdc,
+ 0x87, 0x1f, 0xaa, 0x32, 0xdd, 0x45, 0xf0, 0x68,
+ 0xac, 0x34, 0x81, 0x19, 0xf6, 0x6e, 0xdb, 0x43,
+ 0x18, 0x80, 0x35, 0xad, 0x42, 0xda, 0x6f, 0xf7,
+ 0xd9, 0x41, 0xf4, 0x6c, 0x83, 0x1b, 0xae, 0x36,
+ 0x6d, 0xf5, 0x40, 0xd8, 0x37, 0xaf, 0x1a, 0x82,
+ },
+ {
+ 0x00, 0x99, 0x2f, 0xb6, 0x5e, 0xc7, 0x71, 0xe8,
+ 0xbc, 0x25, 0x93, 0x0a, 0xe2, 0x7b, 0xcd, 0x54,
+ 0x65, 0xfc, 0x4a, 0xd3, 0x3b, 0xa2, 0x14, 0x8d,
+ 0xd9, 0x40, 0xf6, 0x6f, 0x87, 0x1e, 0xa8, 0x31,
+ 0xca, 0x53, 0xe5, 0x7c, 0x94, 0x0d, 0xbb, 0x22,
+ 0x76, 0xef, 0x59, 0xc0, 0x28, 0xb1, 0x07, 0x9e,
+ 0xaf, 0x36, 0x80, 0x19, 0xf1, 0x68, 0xde, 0x47,
+ 0x13, 0x8a, 0x3c, 0xa5, 0x4d, 0xd4, 0x62, 0xfb,
+ 0x89, 0x10, 0xa6, 0x3f, 0xd7, 0x4e, 0xf8, 0x61,
+ 0x35, 0xac, 0x1a, 0x83, 0x6b, 0xf2, 0x44, 0xdd,
+ 0xec, 0x75, 0xc3, 0x5a, 0xb2, 0x2b, 0x9d, 0x04,
+ 0x50, 0xc9, 0x7f, 0xe6, 0x0e, 0x97, 0x21, 0xb8,
+ 0x43, 0xda, 0x6c, 0xf5, 0x1d, 0x84, 0x32, 0xab,
+ 0xff, 0x66, 0xd0, 0x49, 0xa1, 0x38, 0x8e, 0x17,
+ 0x26, 0xbf, 0x09, 0x90, 0x78, 0xe1, 0x57, 0xce,
+ 0x9a, 0x03, 0xb5, 0x2c, 0xc4, 0x5d, 0xeb, 0x72,
+ 0x0f, 0x96, 0x20, 0xb9, 0x51, 0xc8, 0x7e, 0xe7,
+ 0xb3, 0x2a, 0x9c, 0x05, 0xed, 0x74, 0xc2, 0x5b,
+ 0x6a, 0xf3, 0x45, 0xdc, 0x34, 0xad, 0x1b, 0x82,
+ 0xd6, 0x4f, 0xf9, 0x60, 0x88, 0x11, 0xa7, 0x3e,
+ 0xc5, 0x5c, 0xea, 0x73, 0x9b, 0x02, 0xb4, 0x2d,
+ 0x79, 0xe0, 0x56, 0xcf, 0x27, 0xbe, 0x08, 0x91,
+ 0xa0, 0x39, 0x8f, 0x16, 0xfe, 0x67, 0xd1, 0x48,
+ 0x1c, 0x85, 0x33, 0xaa, 0x42, 0xdb, 0x6d, 0xf4,
+ 0x86, 0x1f, 0xa9, 0x30, 0xd8, 0x41, 0xf7, 0x6e,
+ 0x3a, 0xa3, 0x15, 0x8c, 0x64, 0xfd, 0x4b, 0xd2,
+ 0xe3, 0x7a, 0xcc, 0x55, 0xbd, 0x24, 0x92, 0x0b,
+ 0x5f, 0xc6, 0x70, 0xe9, 0x01, 0x98, 0x2e, 0xb7,
+ 0x4c, 0xd5, 0x63, 0xfa, 0x12, 0x8b, 0x3d, 0xa4,
+ 0xf0, 0x69, 0xdf, 0x46, 0xae, 0x37, 0x81, 0x18,
+ 0x29, 0xb0, 0x06, 0x9f, 0x77, 0xee, 0x58, 0xc1,
+ 0x95, 0x0c, 0xba, 0x23, 0xcb, 0x52, 0xe4, 0x7d,
+ },
+ {
+ 0x00, 0x9a, 0x29, 0xb3, 0x52, 0xc8, 0x7b, 0xe1,
+ 0xa4, 0x3e, 0x8d, 0x17, 0xf6, 0x6c, 0xdf, 0x45,
+ 0x55, 0xcf, 0x7c, 0xe6, 0x07, 0x9d, 0x2e, 0xb4,
+ 0xf1, 0x6b, 0xd8, 0x42, 0xa3, 0x39, 0x8a, 0x10,
+ 0xaa, 0x30, 0x83, 0x19, 0xf8, 0x62, 0xd1, 0x4b,
+ 0x0e, 0x94, 0x27, 0xbd, 0x5c, 0xc6, 0x75, 0xef,
+ 0xff, 0x65, 0xd6, 0x4c, 0xad, 0x37, 0x84, 0x1e,
+ 0x5b, 0xc1, 0x72, 0xe8, 0x09, 0x93, 0x20, 0xba,
+ 0x49, 0xd3, 0x60, 0xfa, 0x1b, 0x81, 0x32, 0xa8,
+ 0xed, 0x77, 0xc4, 0x5e, 0xbf, 0x25, 0x96, 0x0c,
+ 0x1c, 0x86, 0x35, 0xaf, 0x4e, 0xd4, 0x67, 0xfd,
+ 0xb8, 0x22, 0x91, 0x0b, 0xea, 0x70, 0xc3, 0x59,
+ 0xe3, 0x79, 0xca, 0x50, 0xb1, 0x2b, 0x98, 0x02,
+ 0x47, 0xdd, 0x6e, 0xf4, 0x15, 0x8f, 0x3c, 0xa6,
+ 0xb6, 0x2c, 0x9f, 0x05, 0xe4, 0x7e, 0xcd, 0x57,
+ 0x12, 0x88, 0x3b, 0xa1, 0x40, 0xda, 0x69, 0xf3,
+ 0x92, 0x08, 0xbb, 0x21, 0xc0, 0x5a, 0xe9, 0x73,
+ 0x36, 0xac, 0x1f, 0x85, 0x64, 0xfe, 0x4d, 0xd7,
+ 0xc7, 0x5d, 0xee, 0x74, 0x95, 0x0f, 0xbc, 0x26,
+ 0x63, 0xf9, 0x4a, 0xd0, 0x31, 0xab, 0x18, 0x82,
+ 0x38, 0xa2, 0x11, 0x8b, 0x6a, 0xf0, 0x43, 0xd9,
+ 0x9c, 0x06, 0xb5, 0x2f, 0xce, 0x54, 0xe7, 0x7d,
+ 0x6d, 0xf7, 0x44, 0xde, 0x3f, 0xa5, 0x16, 0x8c,
+ 0xc9, 0x53, 0xe0, 0x7a, 0x9b, 0x01, 0xb2, 0x28,
+ 0xdb, 0x41, 0xf2, 0x68, 0x89, 0x13, 0xa0, 0x3a,
+ 0x7f, 0xe5, 0x56, 0xcc, 0x2d, 0xb7, 0x04, 0x9e,
+ 0x8e, 0x14, 0xa7, 0x3d, 0xdc, 0x46, 0xf5, 0x6f,
+ 0x2a, 0xb0, 0x03, 0x99, 0x78, 0xe2, 0x51, 0xcb,
+ 0x71, 0xeb, 0x58, 0xc2, 0x23, 0xb9, 0x0a, 0x90,
+ 0xd5, 0x4f, 0xfc, 0x66, 0x87, 0x1d, 0xae, 0x34,
+ 0x24, 0xbe, 0x0d, 0x97, 0x76, 0xec, 0x5f, 0xc5,
+ 0x80, 0x1a, 0xa9, 0x33, 0xd2, 0x48, 0xfb, 0x61,
+ },
+ {
+ 0x00, 0x9b, 0x2b, 0xb0, 0x56, 0xcd, 0x7d, 0xe6,
+ 0xac, 0x37, 0x87, 0x1c, 0xfa, 0x61, 0xd1, 0x4a,
+ 0x45, 0xde, 0x6e, 0xf5, 0x13, 0x88, 0x38, 0xa3,
+ 0xe9, 0x72, 0xc2, 0x59, 0xbf, 0x24, 0x94, 0x0f,
+ 0x8a, 0x11, 0xa1, 0x3a, 0xdc, 0x47, 0xf7, 0x6c,
+ 0x26, 0xbd, 0x0d, 0x96, 0x70, 0xeb, 0x5b, 0xc0,
+ 0xcf, 0x54, 0xe4, 0x7f, 0x99, 0x02, 0xb2, 0x29,
+ 0x63, 0xf8, 0x48, 0xd3, 0x35, 0xae, 0x1e, 0x85,
+ 0x09, 0x92, 0x22, 0xb9, 0x5f, 0xc4, 0x74, 0xef,
+ 0xa5, 0x3e, 0x8e, 0x15, 0xf3, 0x68, 0xd8, 0x43,
+ 0x4c, 0xd7, 0x67, 0xfc, 0x1a, 0x81, 0x31, 0xaa,
+ 0xe0, 0x7b, 0xcb, 0x50, 0xb6, 0x2d, 0x9d, 0x06,
+ 0x83, 0x18, 0xa8, 0x33, 0xd5, 0x4e, 0xfe, 0x65,
+ 0x2f, 0xb4, 0x04, 0x9f, 0x79, 0xe2, 0x52, 0xc9,
+ 0xc6, 0x5d, 0xed, 0x76, 0x90, 0x0b, 0xbb, 0x20,
+ 0x6a, 0xf1, 0x41, 0xda, 0x3c, 0xa7, 0x17, 0x8c,
+ 0x12, 0x89, 0x39, 0xa2, 0x44, 0xdf, 0x6f, 0xf4,
+ 0xbe, 0x25, 0x95, 0x0e, 0xe8, 0x73, 0xc3, 0x58,
+ 0x57, 0xcc, 0x7c, 0xe7, 0x01, 0x9a, 0x2a, 0xb1,
+ 0xfb, 0x60, 0xd0, 0x4b, 0xad, 0x36, 0x86, 0x1d,
+ 0x98, 0x03, 0xb3, 0x28, 0xce, 0x55, 0xe5, 0x7e,
+ 0x34, 0xaf, 0x1f, 0x84, 0x62, 0xf9, 0x49, 0xd2,
+ 0xdd, 0x46, 0xf6, 0x6d, 0x8b, 0x10, 0xa0, 0x3b,
+ 0x71, 0xea, 0x5a, 0xc1, 0x27, 0xbc, 0x0c, 0x97,
+ 0x1b, 0x80, 0x30, 0xab, 0x4d, 0xd6, 0x66, 0xfd,
+ 0xb7, 0x2c, 0x9c, 0x07, 0xe1, 0x7a, 0xca, 0x51,
+ 0x5e, 0xc5, 0x75, 0xee, 0x08, 0x93, 0x23, 0xb8,
+ 0xf2, 0x69, 0xd9, 0x42, 0xa4, 0x3f, 0x8f, 0x14,
+ 0x91, 0x0a, 0xba, 0x21, 0xc7, 0x5c, 0xec, 0x77,
+ 0x3d, 0xa6, 0x16, 0x8d, 0x6b, 0xf0, 0x40, 0xdb,
+ 0xd4, 0x4f, 0xff, 0x64, 0x82, 0x19, 0xa9, 0x32,
+ 0x78, 0xe3, 0x53, 0xc8, 0x2e, 0xb5, 0x05, 0x9e,
+ },
+ {
+ 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67,
+ 0x35, 0xa9, 0x10, 0x8c, 0x7f, 0xe3, 0x5a, 0xc6,
+ 0xa1, 0x3d, 0x84, 0x18, 0xeb, 0x77, 0xce, 0x52,
+ 0x6a, 0xf6, 0x4f, 0xd3, 0x20, 0xbc, 0x05, 0x99,
+ 0xfe, 0x62, 0xdb, 0x47, 0xb4, 0x28, 0x91, 0x0d,
+ 0x5f, 0xc3, 0x7a, 0xe6, 0x15, 0x89, 0x30, 0xac,
+ 0xcb, 0x57, 0xee, 0x72, 0x81, 0x1d, 0xa4, 0x38,
+ 0xd4, 0x48, 0xf1, 0x6d, 0x9e, 0x02, 0xbb, 0x27,
+ 0x40, 0xdc, 0x65, 0xf9, 0x0a, 0x96, 0x2f, 0xb3,
+ 0xe1, 0x7d, 0xc4, 0x58, 0xab, 0x37, 0x8e, 0x12,
+ 0x75, 0xe9, 0x50, 0xcc, 0x3f, 0xa3, 0x1a, 0x86,
+ 0xbe, 0x22, 0x9b, 0x07, 0xf4, 0x68, 0xd1, 0x4d,
+ 0x2a, 0xb6, 0x0f, 0x93, 0x60, 0xfc, 0x45, 0xd9,
+ 0x8b, 0x17, 0xae, 0x32, 0xc1, 0x5d, 0xe4, 0x78,
+ 0x1f, 0x83, 0x3a, 0xa6, 0x55, 0xc9, 0x70, 0xec,
+ 0xb5, 0x29, 0x90, 0x0c, 0xff, 0x63, 0xda, 0x46,
+ 0x21, 0xbd, 0x04, 0x98, 0x6b, 0xf7, 0x4e, 0xd2,
+ 0x80, 0x1c, 0xa5, 0x39, 0xca, 0x56, 0xef, 0x73,
+ 0x14, 0x88, 0x31, 0xad, 0x5e, 0xc2, 0x7b, 0xe7,
+ 0xdf, 0x43, 0xfa, 0x66, 0x95, 0x09, 0xb0, 0x2c,
+ 0x4b, 0xd7, 0x6e, 0xf2, 0x01, 0x9d, 0x24, 0xb8,
+ 0xea, 0x76, 0xcf, 0x53, 0xa0, 0x3c, 0x85, 0x19,
+ 0x7e, 0xe2, 0x5b, 0xc7, 0x34, 0xa8, 0x11, 0x8d,
+ 0x61, 0xfd, 0x44, 0xd8, 0x2b, 0xb7, 0x0e, 0x92,
+ 0xf5, 0x69, 0xd0, 0x4c, 0xbf, 0x23, 0x9a, 0x06,
+ 0x54, 0xc8, 0x71, 0xed, 0x1e, 0x82, 0x3b, 0xa7,
+ 0xc0, 0x5c, 0xe5, 0x79, 0x8a, 0x16, 0xaf, 0x33,
+ 0x0b, 0x97, 0x2e, 0xb2, 0x41, 0xdd, 0x64, 0xf8,
+ 0x9f, 0x03, 0xba, 0x26, 0xd5, 0x49, 0xf0, 0x6c,
+ 0x3e, 0xa2, 0x1b, 0x87, 0x74, 0xe8, 0x51, 0xcd,
+ 0xaa, 0x36, 0x8f, 0x13, 0xe0, 0x7c, 0xc5, 0x59,
+ },
+ {
+ 0x00, 0x9d, 0x27, 0xba, 0x4e, 0xd3, 0x69, 0xf4,
+ 0x9c, 0x01, 0xbb, 0x26, 0xd2, 0x4f, 0xf5, 0x68,
+ 0x25, 0xb8, 0x02, 0x9f, 0x6b, 0xf6, 0x4c, 0xd1,
+ 0xb9, 0x24, 0x9e, 0x03, 0xf7, 0x6a, 0xd0, 0x4d,
+ 0x4a, 0xd7, 0x6d, 0xf0, 0x04, 0x99, 0x23, 0xbe,
+ 0xd6, 0x4b, 0xf1, 0x6c, 0x98, 0x05, 0xbf, 0x22,
+ 0x6f, 0xf2, 0x48, 0xd5, 0x21, 0xbc, 0x06, 0x9b,
+ 0xf3, 0x6e, 0xd4, 0x49, 0xbd, 0x20, 0x9a, 0x07,
+ 0x94, 0x09, 0xb3, 0x2e, 0xda, 0x47, 0xfd, 0x60,
+ 0x08, 0x95, 0x2f, 0xb2, 0x46, 0xdb, 0x61, 0xfc,
+ 0xb1, 0x2c, 0x96, 0x0b, 0xff, 0x62, 0xd8, 0x45,
+ 0x2d, 0xb0, 0x0a, 0x97, 0x63, 0xfe, 0x44, 0xd9,
+ 0xde, 0x43, 0xf9, 0x64, 0x90, 0x0d, 0xb7, 0x2a,
+ 0x42, 0xdf, 0x65, 0xf8, 0x0c, 0x91, 0x2b, 0xb6,
+ 0xfb, 0x66, 0xdc, 0x41, 0xb5, 0x28, 0x92, 0x0f,
+ 0x67, 0xfa, 0x40, 0xdd, 0x29, 0xb4, 0x0e, 0x93,
+ 0x35, 0xa8, 0x12, 0x8f, 0x7b, 0xe6, 0x5c, 0xc1,
+ 0xa9, 0x34, 0x8e, 0x13, 0xe7, 0x7a, 0xc0, 0x5d,
+ 0x10, 0x8d, 0x37, 0xaa, 0x5e, 0xc3, 0x79, 0xe4,
+ 0x8c, 0x11, 0xab, 0x36, 0xc2, 0x5f, 0xe5, 0x78,
+ 0x7f, 0xe2, 0x58, 0xc5, 0x31, 0xac, 0x16, 0x8b,
+ 0xe3, 0x7e, 0xc4, 0x59, 0xad, 0x30, 0x8a, 0x17,
+ 0x5a, 0xc7, 0x7d, 0xe0, 0x14, 0x89, 0x33, 0xae,
+ 0xc6, 0x5b, 0xe1, 0x7c, 0x88, 0x15, 0xaf, 0x32,
+ 0xa1, 0x3c, 0x86, 0x1b, 0xef, 0x72, 0xc8, 0x55,
+ 0x3d, 0xa0, 0x1a, 0x87, 0x73, 0xee, 0x54, 0xc9,
+ 0x84, 0x19, 0xa3, 0x3e, 0xca, 0x57, 0xed, 0x70,
+ 0x18, 0x85, 0x3f, 0xa2, 0x56, 0xcb, 0x71, 0xec,
+ 0xeb, 0x76, 0xcc, 0x51, 0xa5, 0x38, 0x82, 0x1f,
+ 0x77, 0xea, 0x50, 0xcd, 0x39, 0xa4, 0x1e, 0x83,
+ 0xce, 0x53, 0xe9, 0x74, 0x80, 0x1d, 0xa7, 0x3a,
+ 0x52, 0xcf, 0x75, 0xe8, 0x1c, 0x81, 0x3b, 0xa6,
+ },
+ {
+ 0x00, 0x9e, 0x21, 0xbf, 0x42, 0xdc, 0x63, 0xfd,
+ 0x84, 0x1a, 0xa5, 0x3b, 0xc6, 0x58, 0xe7, 0x79,
+ 0x15, 0x8b, 0x34, 0xaa, 0x57, 0xc9, 0x76, 0xe8,
+ 0x91, 0x0f, 0xb0, 0x2e, 0xd3, 0x4d, 0xf2, 0x6c,
+ 0x2a, 0xb4, 0x0b, 0x95, 0x68, 0xf6, 0x49, 0xd7,
+ 0xae, 0x30, 0x8f, 0x11, 0xec, 0x72, 0xcd, 0x53,
+ 0x3f, 0xa1, 0x1e, 0x80, 0x7d, 0xe3, 0x5c, 0xc2,
+ 0xbb, 0x25, 0x9a, 0x04, 0xf9, 0x67, 0xd8, 0x46,
+ 0x54, 0xca, 0x75, 0xeb, 0x16, 0x88, 0x37, 0xa9,
+ 0xd0, 0x4e, 0xf1, 0x6f, 0x92, 0x0c, 0xb3, 0x2d,
+ 0x41, 0xdf, 0x60, 0xfe, 0x03, 0x9d, 0x22, 0xbc,
+ 0xc5, 0x5b, 0xe4, 0x7a, 0x87, 0x19, 0xa6, 0x38,
+ 0x7e, 0xe0, 0x5f, 0xc1, 0x3c, 0xa2, 0x1d, 0x83,
+ 0xfa, 0x64, 0xdb, 0x45, 0xb8, 0x26, 0x99, 0x07,
+ 0x6b, 0xf5, 0x4a, 0xd4, 0x29, 0xb7, 0x08, 0x96,
+ 0xef, 0x71, 0xce, 0x50, 0xad, 0x33, 0x8c, 0x12,
+ 0xa8, 0x36, 0x89, 0x17, 0xea, 0x74, 0xcb, 0x55,
+ 0x2c, 0xb2, 0x0d, 0x93, 0x6e, 0xf0, 0x4f, 0xd1,
+ 0xbd, 0x23, 0x9c, 0x02, 0xff, 0x61, 0xde, 0x40,
+ 0x39, 0xa7, 0x18, 0x86, 0x7b, 0xe5, 0x5a, 0xc4,
+ 0x82, 0x1c, 0xa3, 0x3d, 0xc0, 0x5e, 0xe1, 0x7f,
+ 0x06, 0x98, 0x27, 0xb9, 0x44, 0xda, 0x65, 0xfb,
+ 0x97, 0x09, 0xb6, 0x28, 0xd5, 0x4b, 0xf4, 0x6a,
+ 0x13, 0x8d, 0x32, 0xac, 0x51, 0xcf, 0x70, 0xee,
+ 0xfc, 0x62, 0xdd, 0x43, 0xbe, 0x20, 0x9f, 0x01,
+ 0x78, 0xe6, 0x59, 0xc7, 0x3a, 0xa4, 0x1b, 0x85,
+ 0xe9, 0x77, 0xc8, 0x56, 0xab, 0x35, 0x8a, 0x14,
+ 0x6d, 0xf3, 0x4c, 0xd2, 0x2f, 0xb1, 0x0e, 0x90,
+ 0xd6, 0x48, 0xf7, 0x69, 0x94, 0x0a, 0xb5, 0x2b,
+ 0x52, 0xcc, 0x73, 0xed, 0x10, 0x8e, 0x31, 0xaf,
+ 0xc3, 0x5d, 0xe2, 0x7c, 0x81, 0x1f, 0xa0, 0x3e,
+ 0x47, 0xd9, 0x66, 0xf8, 0x05, 0x9b, 0x24, 0xba,
+ },
+ {
+ 0x00, 0x9f, 0x23, 0xbc, 0x46, 0xd9, 0x65, 0xfa,
+ 0x8c, 0x13, 0xaf, 0x30, 0xca, 0x55, 0xe9, 0x76,
+ 0x05, 0x9a, 0x26, 0xb9, 0x43, 0xdc, 0x60, 0xff,
+ 0x89, 0x16, 0xaa, 0x35, 0xcf, 0x50, 0xec, 0x73,
+ 0x0a, 0x95, 0x29, 0xb6, 0x4c, 0xd3, 0x6f, 0xf0,
+ 0x86, 0x19, 0xa5, 0x3a, 0xc0, 0x5f, 0xe3, 0x7c,
+ 0x0f, 0x90, 0x2c, 0xb3, 0x49, 0xd6, 0x6a, 0xf5,
+ 0x83, 0x1c, 0xa0, 0x3f, 0xc5, 0x5a, 0xe6, 0x79,
+ 0x14, 0x8b, 0x37, 0xa8, 0x52, 0xcd, 0x71, 0xee,
+ 0x98, 0x07, 0xbb, 0x24, 0xde, 0x41, 0xfd, 0x62,
+ 0x11, 0x8e, 0x32, 0xad, 0x57, 0xc8, 0x74, 0xeb,
+ 0x9d, 0x02, 0xbe, 0x21, 0xdb, 0x44, 0xf8, 0x67,
+ 0x1e, 0x81, 0x3d, 0xa2, 0x58, 0xc7, 0x7b, 0xe4,
+ 0x92, 0x0d, 0xb1, 0x2e, 0xd4, 0x4b, 0xf7, 0x68,
+ 0x1b, 0x84, 0x38, 0xa7, 0x5d, 0xc2, 0x7e, 0xe1,
+ 0x97, 0x08, 0xb4, 0x2b, 0xd1, 0x4e, 0xf2, 0x6d,
+ 0x28, 0xb7, 0x0b, 0x94, 0x6e, 0xf1, 0x4d, 0xd2,
+ 0xa4, 0x3b, 0x87, 0x18, 0xe2, 0x7d, 0xc1, 0x5e,
+ 0x2d, 0xb2, 0x0e, 0x91, 0x6b, 0xf4, 0x48, 0xd7,
+ 0xa1, 0x3e, 0x82, 0x1d, 0xe7, 0x78, 0xc4, 0x5b,
+ 0x22, 0xbd, 0x01, 0x9e, 0x64, 0xfb, 0x47, 0xd8,
+ 0xae, 0x31, 0x8d, 0x12, 0xe8, 0x77, 0xcb, 0x54,
+ 0x27, 0xb8, 0x04, 0x9b, 0x61, 0xfe, 0x42, 0xdd,
+ 0xab, 0x34, 0x88, 0x17, 0xed, 0x72, 0xce, 0x51,
+ 0x3c, 0xa3, 0x1f, 0x80, 0x7a, 0xe5, 0x59, 0xc6,
+ 0xb0, 0x2f, 0x93, 0x0c, 0xf6, 0x69, 0xd5, 0x4a,
+ 0x39, 0xa6, 0x1a, 0x85, 0x7f, 0xe0, 0x5c, 0xc3,
+ 0xb5, 0x2a, 0x96, 0x09, 0xf3, 0x6c, 0xd0, 0x4f,
+ 0x36, 0xa9, 0x15, 0x8a, 0x70, 0xef, 0x53, 0xcc,
+ 0xba, 0x25, 0x99, 0x06, 0xfc, 0x63, 0xdf, 0x40,
+ 0x33, 0xac, 0x10, 0x8f, 0x75, 0xea, 0x56, 0xc9,
+ 0xbf, 0x20, 0x9c, 0x03, 0xf9, 0x66, 0xda, 0x45,
+ },
+ {
+ 0x00, 0xa0, 0x5d, 0xfd, 0xba, 0x1a, 0xe7, 0x47,
+ 0x69, 0xc9, 0x34, 0x94, 0xd3, 0x73, 0x8e, 0x2e,
+ 0xd2, 0x72, 0x8f, 0x2f, 0x68, 0xc8, 0x35, 0x95,
+ 0xbb, 0x1b, 0xe6, 0x46, 0x01, 0xa1, 0x5c, 0xfc,
+ 0xb9, 0x19, 0xe4, 0x44, 0x03, 0xa3, 0x5e, 0xfe,
+ 0xd0, 0x70, 0x8d, 0x2d, 0x6a, 0xca, 0x37, 0x97,
+ 0x6b, 0xcb, 0x36, 0x96, 0xd1, 0x71, 0x8c, 0x2c,
+ 0x02, 0xa2, 0x5f, 0xff, 0xb8, 0x18, 0xe5, 0x45,
+ 0x6f, 0xcf, 0x32, 0x92, 0xd5, 0x75, 0x88, 0x28,
+ 0x06, 0xa6, 0x5b, 0xfb, 0xbc, 0x1c, 0xe1, 0x41,
+ 0xbd, 0x1d, 0xe0, 0x40, 0x07, 0xa7, 0x5a, 0xfa,
+ 0xd4, 0x74, 0x89, 0x29, 0x6e, 0xce, 0x33, 0x93,
+ 0xd6, 0x76, 0x8b, 0x2b, 0x6c, 0xcc, 0x31, 0x91,
+ 0xbf, 0x1f, 0xe2, 0x42, 0x05, 0xa5, 0x58, 0xf8,
+ 0x04, 0xa4, 0x59, 0xf9, 0xbe, 0x1e, 0xe3, 0x43,
+ 0x6d, 0xcd, 0x30, 0x90, 0xd7, 0x77, 0x8a, 0x2a,
+ 0xde, 0x7e, 0x83, 0x23, 0x64, 0xc4, 0x39, 0x99,
+ 0xb7, 0x17, 0xea, 0x4a, 0x0d, 0xad, 0x50, 0xf0,
+ 0x0c, 0xac, 0x51, 0xf1, 0xb6, 0x16, 0xeb, 0x4b,
+ 0x65, 0xc5, 0x38, 0x98, 0xdf, 0x7f, 0x82, 0x22,
+ 0x67, 0xc7, 0x3a, 0x9a, 0xdd, 0x7d, 0x80, 0x20,
+ 0x0e, 0xae, 0x53, 0xf3, 0xb4, 0x14, 0xe9, 0x49,
+ 0xb5, 0x15, 0xe8, 0x48, 0x0f, 0xaf, 0x52, 0xf2,
+ 0xdc, 0x7c, 0x81, 0x21, 0x66, 0xc6, 0x3b, 0x9b,
+ 0xb1, 0x11, 0xec, 0x4c, 0x0b, 0xab, 0x56, 0xf6,
+ 0xd8, 0x78, 0x85, 0x25, 0x62, 0xc2, 0x3f, 0x9f,
+ 0x63, 0xc3, 0x3e, 0x9e, 0xd9, 0x79, 0x84, 0x24,
+ 0x0a, 0xaa, 0x57, 0xf7, 0xb0, 0x10, 0xed, 0x4d,
+ 0x08, 0xa8, 0x55, 0xf5, 0xb2, 0x12, 0xef, 0x4f,
+ 0x61, 0xc1, 0x3c, 0x9c, 0xdb, 0x7b, 0x86, 0x26,
+ 0xda, 0x7a, 0x87, 0x27, 0x60, 0xc0, 0x3d, 0x9d,
+ 0xb3, 0x13, 0xee, 0x4e, 0x09, 0xa9, 0x54, 0xf4,
+ },
+ {
+ 0x00, 0xa1, 0x5f, 0xfe, 0xbe, 0x1f, 0xe1, 0x40,
+ 0x61, 0xc0, 0x3e, 0x9f, 0xdf, 0x7e, 0x80, 0x21,
+ 0xc2, 0x63, 0x9d, 0x3c, 0x7c, 0xdd, 0x23, 0x82,
+ 0xa3, 0x02, 0xfc, 0x5d, 0x1d, 0xbc, 0x42, 0xe3,
+ 0x99, 0x38, 0xc6, 0x67, 0x27, 0x86, 0x78, 0xd9,
+ 0xf8, 0x59, 0xa7, 0x06, 0x46, 0xe7, 0x19, 0xb8,
+ 0x5b, 0xfa, 0x04, 0xa5, 0xe5, 0x44, 0xba, 0x1b,
+ 0x3a, 0x9b, 0x65, 0xc4, 0x84, 0x25, 0xdb, 0x7a,
+ 0x2f, 0x8e, 0x70, 0xd1, 0x91, 0x30, 0xce, 0x6f,
+ 0x4e, 0xef, 0x11, 0xb0, 0xf0, 0x51, 0xaf, 0x0e,
+ 0xed, 0x4c, 0xb2, 0x13, 0x53, 0xf2, 0x0c, 0xad,
+ 0x8c, 0x2d, 0xd3, 0x72, 0x32, 0x93, 0x6d, 0xcc,
+ 0xb6, 0x17, 0xe9, 0x48, 0x08, 0xa9, 0x57, 0xf6,
+ 0xd7, 0x76, 0x88, 0x29, 0x69, 0xc8, 0x36, 0x97,
+ 0x74, 0xd5, 0x2b, 0x8a, 0xca, 0x6b, 0x95, 0x34,
+ 0x15, 0xb4, 0x4a, 0xeb, 0xab, 0x0a, 0xf4, 0x55,
+ 0x5e, 0xff, 0x01, 0xa0, 0xe0, 0x41, 0xbf, 0x1e,
+ 0x3f, 0x9e, 0x60, 0xc1, 0x81, 0x20, 0xde, 0x7f,
+ 0x9c, 0x3d, 0xc3, 0x62, 0x22, 0x83, 0x7d, 0xdc,
+ 0xfd, 0x5c, 0xa2, 0x03, 0x43, 0xe2, 0x1c, 0xbd,
+ 0xc7, 0x66, 0x98, 0x39, 0x79, 0xd8, 0x26, 0x87,
+ 0xa6, 0x07, 0xf9, 0x58, 0x18, 0xb9, 0x47, 0xe6,
+ 0x05, 0xa4, 0x5a, 0xfb, 0xbb, 0x1a, 0xe4, 0x45,
+ 0x64, 0xc5, 0x3b, 0x9a, 0xda, 0x7b, 0x85, 0x24,
+ 0x71, 0xd0, 0x2e, 0x8f, 0xcf, 0x6e, 0x90, 0x31,
+ 0x10, 0xb1, 0x4f, 0xee, 0xae, 0x0f, 0xf1, 0x50,
+ 0xb3, 0x12, 0xec, 0x4d, 0x0d, 0xac, 0x52, 0xf3,
+ 0xd2, 0x73, 0x8d, 0x2c, 0x6c, 0xcd, 0x33, 0x92,
+ 0xe8, 0x49, 0xb7, 0x16, 0x56, 0xf7, 0x09, 0xa8,
+ 0x89, 0x28, 0xd6, 0x77, 0x37, 0x96, 0x68, 0xc9,
+ 0x2a, 0x8b, 0x75, 0xd4, 0x94, 0x35, 0xcb, 0x6a,
+ 0x4b, 0xea, 0x14, 0xb5, 0xf5, 0x54, 0xaa, 0x0b,
+ },
+ {
+ 0x00, 0xa2, 0x59, 0xfb, 0xb2, 0x10, 0xeb, 0x49,
+ 0x79, 0xdb, 0x20, 0x82, 0xcb, 0x69, 0x92, 0x30,
+ 0xf2, 0x50, 0xab, 0x09, 0x40, 0xe2, 0x19, 0xbb,
+ 0x8b, 0x29, 0xd2, 0x70, 0x39, 0x9b, 0x60, 0xc2,
+ 0xf9, 0x5b, 0xa0, 0x02, 0x4b, 0xe9, 0x12, 0xb0,
+ 0x80, 0x22, 0xd9, 0x7b, 0x32, 0x90, 0x6b, 0xc9,
+ 0x0b, 0xa9, 0x52, 0xf0, 0xb9, 0x1b, 0xe0, 0x42,
+ 0x72, 0xd0, 0x2b, 0x89, 0xc0, 0x62, 0x99, 0x3b,
+ 0xef, 0x4d, 0xb6, 0x14, 0x5d, 0xff, 0x04, 0xa6,
+ 0x96, 0x34, 0xcf, 0x6d, 0x24, 0x86, 0x7d, 0xdf,
+ 0x1d, 0xbf, 0x44, 0xe6, 0xaf, 0x0d, 0xf6, 0x54,
+ 0x64, 0xc6, 0x3d, 0x9f, 0xd6, 0x74, 0x8f, 0x2d,
+ 0x16, 0xb4, 0x4f, 0xed, 0xa4, 0x06, 0xfd, 0x5f,
+ 0x6f, 0xcd, 0x36, 0x94, 0xdd, 0x7f, 0x84, 0x26,
+ 0xe4, 0x46, 0xbd, 0x1f, 0x56, 0xf4, 0x0f, 0xad,
+ 0x9d, 0x3f, 0xc4, 0x66, 0x2f, 0x8d, 0x76, 0xd4,
+ 0xc3, 0x61, 0x9a, 0x38, 0x71, 0xd3, 0x28, 0x8a,
+ 0xba, 0x18, 0xe3, 0x41, 0x08, 0xaa, 0x51, 0xf3,
+ 0x31, 0x93, 0x68, 0xca, 0x83, 0x21, 0xda, 0x78,
+ 0x48, 0xea, 0x11, 0xb3, 0xfa, 0x58, 0xa3, 0x01,
+ 0x3a, 0x98, 0x63, 0xc1, 0x88, 0x2a, 0xd1, 0x73,
+ 0x43, 0xe1, 0x1a, 0xb8, 0xf1, 0x53, 0xa8, 0x0a,
+ 0xc8, 0x6a, 0x91, 0x33, 0x7a, 0xd8, 0x23, 0x81,
+ 0xb1, 0x13, 0xe8, 0x4a, 0x03, 0xa1, 0x5a, 0xf8,
+ 0x2c, 0x8e, 0x75, 0xd7, 0x9e, 0x3c, 0xc7, 0x65,
+ 0x55, 0xf7, 0x0c, 0xae, 0xe7, 0x45, 0xbe, 0x1c,
+ 0xde, 0x7c, 0x87, 0x25, 0x6c, 0xce, 0x35, 0x97,
+ 0xa7, 0x05, 0xfe, 0x5c, 0x15, 0xb7, 0x4c, 0xee,
+ 0xd5, 0x77, 0x8c, 0x2e, 0x67, 0xc5, 0x3e, 0x9c,
+ 0xac, 0x0e, 0xf5, 0x57, 0x1e, 0xbc, 0x47, 0xe5,
+ 0x27, 0x85, 0x7e, 0xdc, 0x95, 0x37, 0xcc, 0x6e,
+ 0x5e, 0xfc, 0x07, 0xa5, 0xec, 0x4e, 0xb5, 0x17,
+ },
+ {
+ 0x00, 0xa3, 0x5b, 0xf8, 0xb6, 0x15, 0xed, 0x4e,
+ 0x71, 0xd2, 0x2a, 0x89, 0xc7, 0x64, 0x9c, 0x3f,
+ 0xe2, 0x41, 0xb9, 0x1a, 0x54, 0xf7, 0x0f, 0xac,
+ 0x93, 0x30, 0xc8, 0x6b, 0x25, 0x86, 0x7e, 0xdd,
+ 0xd9, 0x7a, 0x82, 0x21, 0x6f, 0xcc, 0x34, 0x97,
+ 0xa8, 0x0b, 0xf3, 0x50, 0x1e, 0xbd, 0x45, 0xe6,
+ 0x3b, 0x98, 0x60, 0xc3, 0x8d, 0x2e, 0xd6, 0x75,
+ 0x4a, 0xe9, 0x11, 0xb2, 0xfc, 0x5f, 0xa7, 0x04,
+ 0xaf, 0x0c, 0xf4, 0x57, 0x19, 0xba, 0x42, 0xe1,
+ 0xde, 0x7d, 0x85, 0x26, 0x68, 0xcb, 0x33, 0x90,
+ 0x4d, 0xee, 0x16, 0xb5, 0xfb, 0x58, 0xa0, 0x03,
+ 0x3c, 0x9f, 0x67, 0xc4, 0x8a, 0x29, 0xd1, 0x72,
+ 0x76, 0xd5, 0x2d, 0x8e, 0xc0, 0x63, 0x9b, 0x38,
+ 0x07, 0xa4, 0x5c, 0xff, 0xb1, 0x12, 0xea, 0x49,
+ 0x94, 0x37, 0xcf, 0x6c, 0x22, 0x81, 0x79, 0xda,
+ 0xe5, 0x46, 0xbe, 0x1d, 0x53, 0xf0, 0x08, 0xab,
+ 0x43, 0xe0, 0x18, 0xbb, 0xf5, 0x56, 0xae, 0x0d,
+ 0x32, 0x91, 0x69, 0xca, 0x84, 0x27, 0xdf, 0x7c,
+ 0xa1, 0x02, 0xfa, 0x59, 0x17, 0xb4, 0x4c, 0xef,
+ 0xd0, 0x73, 0x8b, 0x28, 0x66, 0xc5, 0x3d, 0x9e,
+ 0x9a, 0x39, 0xc1, 0x62, 0x2c, 0x8f, 0x77, 0xd4,
+ 0xeb, 0x48, 0xb0, 0x13, 0x5d, 0xfe, 0x06, 0xa5,
+ 0x78, 0xdb, 0x23, 0x80, 0xce, 0x6d, 0x95, 0x36,
+ 0x09, 0xaa, 0x52, 0xf1, 0xbf, 0x1c, 0xe4, 0x47,
+ 0xec, 0x4f, 0xb7, 0x14, 0x5a, 0xf9, 0x01, 0xa2,
+ 0x9d, 0x3e, 0xc6, 0x65, 0x2b, 0x88, 0x70, 0xd3,
+ 0x0e, 0xad, 0x55, 0xf6, 0xb8, 0x1b, 0xe3, 0x40,
+ 0x7f, 0xdc, 0x24, 0x87, 0xc9, 0x6a, 0x92, 0x31,
+ 0x35, 0x96, 0x6e, 0xcd, 0x83, 0x20, 0xd8, 0x7b,
+ 0x44, 0xe7, 0x1f, 0xbc, 0xf2, 0x51, 0xa9, 0x0a,
+ 0xd7, 0x74, 0x8c, 0x2f, 0x61, 0xc2, 0x3a, 0x99,
+ 0xa6, 0x05, 0xfd, 0x5e, 0x10, 0xb3, 0x4b, 0xe8,
+ },
+ {
+ 0x00, 0xa4, 0x55, 0xf1, 0xaa, 0x0e, 0xff, 0x5b,
+ 0x49, 0xed, 0x1c, 0xb8, 0xe3, 0x47, 0xb6, 0x12,
+ 0x92, 0x36, 0xc7, 0x63, 0x38, 0x9c, 0x6d, 0xc9,
+ 0xdb, 0x7f, 0x8e, 0x2a, 0x71, 0xd5, 0x24, 0x80,
+ 0x39, 0x9d, 0x6c, 0xc8, 0x93, 0x37, 0xc6, 0x62,
+ 0x70, 0xd4, 0x25, 0x81, 0xda, 0x7e, 0x8f, 0x2b,
+ 0xab, 0x0f, 0xfe, 0x5a, 0x01, 0xa5, 0x54, 0xf0,
+ 0xe2, 0x46, 0xb7, 0x13, 0x48, 0xec, 0x1d, 0xb9,
+ 0x72, 0xd6, 0x27, 0x83, 0xd8, 0x7c, 0x8d, 0x29,
+ 0x3b, 0x9f, 0x6e, 0xca, 0x91, 0x35, 0xc4, 0x60,
+ 0xe0, 0x44, 0xb5, 0x11, 0x4a, 0xee, 0x1f, 0xbb,
+ 0xa9, 0x0d, 0xfc, 0x58, 0x03, 0xa7, 0x56, 0xf2,
+ 0x4b, 0xef, 0x1e, 0xba, 0xe1, 0x45, 0xb4, 0x10,
+ 0x02, 0xa6, 0x57, 0xf3, 0xa8, 0x0c, 0xfd, 0x59,
+ 0xd9, 0x7d, 0x8c, 0x28, 0x73, 0xd7, 0x26, 0x82,
+ 0x90, 0x34, 0xc5, 0x61, 0x3a, 0x9e, 0x6f, 0xcb,
+ 0xe4, 0x40, 0xb1, 0x15, 0x4e, 0xea, 0x1b, 0xbf,
+ 0xad, 0x09, 0xf8, 0x5c, 0x07, 0xa3, 0x52, 0xf6,
+ 0x76, 0xd2, 0x23, 0x87, 0xdc, 0x78, 0x89, 0x2d,
+ 0x3f, 0x9b, 0x6a, 0xce, 0x95, 0x31, 0xc0, 0x64,
+ 0xdd, 0x79, 0x88, 0x2c, 0x77, 0xd3, 0x22, 0x86,
+ 0x94, 0x30, 0xc1, 0x65, 0x3e, 0x9a, 0x6b, 0xcf,
+ 0x4f, 0xeb, 0x1a, 0xbe, 0xe5, 0x41, 0xb0, 0x14,
+ 0x06, 0xa2, 0x53, 0xf7, 0xac, 0x08, 0xf9, 0x5d,
+ 0x96, 0x32, 0xc3, 0x67, 0x3c, 0x98, 0x69, 0xcd,
+ 0xdf, 0x7b, 0x8a, 0x2e, 0x75, 0xd1, 0x20, 0x84,
+ 0x04, 0xa0, 0x51, 0xf5, 0xae, 0x0a, 0xfb, 0x5f,
+ 0x4d, 0xe9, 0x18, 0xbc, 0xe7, 0x43, 0xb2, 0x16,
+ 0xaf, 0x0b, 0xfa, 0x5e, 0x05, 0xa1, 0x50, 0xf4,
+ 0xe6, 0x42, 0xb3, 0x17, 0x4c, 0xe8, 0x19, 0xbd,
+ 0x3d, 0x99, 0x68, 0xcc, 0x97, 0x33, 0xc2, 0x66,
+ 0x74, 0xd0, 0x21, 0x85, 0xde, 0x7a, 0x8b, 0x2f,
+ },
+ {
+ 0x00, 0xa5, 0x57, 0xf2, 0xae, 0x0b, 0xf9, 0x5c,
+ 0x41, 0xe4, 0x16, 0xb3, 0xef, 0x4a, 0xb8, 0x1d,
+ 0x82, 0x27, 0xd5, 0x70, 0x2c, 0x89, 0x7b, 0xde,
+ 0xc3, 0x66, 0x94, 0x31, 0x6d, 0xc8, 0x3a, 0x9f,
+ 0x19, 0xbc, 0x4e, 0xeb, 0xb7, 0x12, 0xe0, 0x45,
+ 0x58, 0xfd, 0x0f, 0xaa, 0xf6, 0x53, 0xa1, 0x04,
+ 0x9b, 0x3e, 0xcc, 0x69, 0x35, 0x90, 0x62, 0xc7,
+ 0xda, 0x7f, 0x8d, 0x28, 0x74, 0xd1, 0x23, 0x86,
+ 0x32, 0x97, 0x65, 0xc0, 0x9c, 0x39, 0xcb, 0x6e,
+ 0x73, 0xd6, 0x24, 0x81, 0xdd, 0x78, 0x8a, 0x2f,
+ 0xb0, 0x15, 0xe7, 0x42, 0x1e, 0xbb, 0x49, 0xec,
+ 0xf1, 0x54, 0xa6, 0x03, 0x5f, 0xfa, 0x08, 0xad,
+ 0x2b, 0x8e, 0x7c, 0xd9, 0x85, 0x20, 0xd2, 0x77,
+ 0x6a, 0xcf, 0x3d, 0x98, 0xc4, 0x61, 0x93, 0x36,
+ 0xa9, 0x0c, 0xfe, 0x5b, 0x07, 0xa2, 0x50, 0xf5,
+ 0xe8, 0x4d, 0xbf, 0x1a, 0x46, 0xe3, 0x11, 0xb4,
+ 0x64, 0xc1, 0x33, 0x96, 0xca, 0x6f, 0x9d, 0x38,
+ 0x25, 0x80, 0x72, 0xd7, 0x8b, 0x2e, 0xdc, 0x79,
+ 0xe6, 0x43, 0xb1, 0x14, 0x48, 0xed, 0x1f, 0xba,
+ 0xa7, 0x02, 0xf0, 0x55, 0x09, 0xac, 0x5e, 0xfb,
+ 0x7d, 0xd8, 0x2a, 0x8f, 0xd3, 0x76, 0x84, 0x21,
+ 0x3c, 0x99, 0x6b, 0xce, 0x92, 0x37, 0xc5, 0x60,
+ 0xff, 0x5a, 0xa8, 0x0d, 0x51, 0xf4, 0x06, 0xa3,
+ 0xbe, 0x1b, 0xe9, 0x4c, 0x10, 0xb5, 0x47, 0xe2,
+ 0x56, 0xf3, 0x01, 0xa4, 0xf8, 0x5d, 0xaf, 0x0a,
+ 0x17, 0xb2, 0x40, 0xe5, 0xb9, 0x1c, 0xee, 0x4b,
+ 0xd4, 0x71, 0x83, 0x26, 0x7a, 0xdf, 0x2d, 0x88,
+ 0x95, 0x30, 0xc2, 0x67, 0x3b, 0x9e, 0x6c, 0xc9,
+ 0x4f, 0xea, 0x18, 0xbd, 0xe1, 0x44, 0xb6, 0x13,
+ 0x0e, 0xab, 0x59, 0xfc, 0xa0, 0x05, 0xf7, 0x52,
+ 0xcd, 0x68, 0x9a, 0x3f, 0x63, 0xc6, 0x34, 0x91,
+ 0x8c, 0x29, 0xdb, 0x7e, 0x22, 0x87, 0x75, 0xd0,
+ },
+ {
+ 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c,
+ 0xb2, 0x14, 0xe3, 0x45, 0x10, 0xb6, 0x41, 0xe7,
+ 0xeb, 0x4d, 0xba, 0x1c, 0x49, 0xef, 0x18, 0xbe,
+ 0x79, 0xdf, 0x28, 0x8e, 0xdb, 0x7d, 0x8a, 0x2c,
+ 0x20, 0x86, 0x71, 0xd7, 0x82, 0x24, 0xd3, 0x75,
+ 0xcb, 0x6d, 0x9a, 0x3c, 0x69, 0xcf, 0x38, 0x9e,
+ 0x92, 0x34, 0xc3, 0x65, 0x30, 0x96, 0x61, 0xc7,
+ 0xf2, 0x54, 0xa3, 0x05, 0x50, 0xf6, 0x01, 0xa7,
+ 0xab, 0x0d, 0xfa, 0x5c, 0x09, 0xaf, 0x58, 0xfe,
+ 0x40, 0xe6, 0x11, 0xb7, 0xe2, 0x44, 0xb3, 0x15,
+ 0x19, 0xbf, 0x48, 0xee, 0xbb, 0x1d, 0xea, 0x4c,
+ 0x8b, 0x2d, 0xda, 0x7c, 0x29, 0x8f, 0x78, 0xde,
+ 0xd2, 0x74, 0x83, 0x25, 0x70, 0xd6, 0x21, 0x87,
+ 0x39, 0x9f, 0x68, 0xce, 0x9b, 0x3d, 0xca, 0x6c,
+ 0x60, 0xc6, 0x31, 0x97, 0xc2, 0x64, 0x93, 0x35,
+ 0xf9, 0x5f, 0xa8, 0x0e, 0x5b, 0xfd, 0x0a, 0xac,
+ 0xa0, 0x06, 0xf1, 0x57, 0x02, 0xa4, 0x53, 0xf5,
+ 0x4b, 0xed, 0x1a, 0xbc, 0xe9, 0x4f, 0xb8, 0x1e,
+ 0x12, 0xb4, 0x43, 0xe5, 0xb0, 0x16, 0xe1, 0x47,
+ 0x80, 0x26, 0xd1, 0x77, 0x22, 0x84, 0x73, 0xd5,
+ 0xd9, 0x7f, 0x88, 0x2e, 0x7b, 0xdd, 0x2a, 0x8c,
+ 0x32, 0x94, 0x63, 0xc5, 0x90, 0x36, 0xc1, 0x67,
+ 0x6b, 0xcd, 0x3a, 0x9c, 0xc9, 0x6f, 0x98, 0x3e,
+ 0x0b, 0xad, 0x5a, 0xfc, 0xa9, 0x0f, 0xf8, 0x5e,
+ 0x52, 0xf4, 0x03, 0xa5, 0xf0, 0x56, 0xa1, 0x07,
+ 0xb9, 0x1f, 0xe8, 0x4e, 0x1b, 0xbd, 0x4a, 0xec,
+ 0xe0, 0x46, 0xb1, 0x17, 0x42, 0xe4, 0x13, 0xb5,
+ 0x72, 0xd4, 0x23, 0x85, 0xd0, 0x76, 0x81, 0x27,
+ 0x2b, 0x8d, 0x7a, 0xdc, 0x89, 0x2f, 0xd8, 0x7e,
+ 0xc0, 0x66, 0x91, 0x37, 0x62, 0xc4, 0x33, 0x95,
+ 0x99, 0x3f, 0xc8, 0x6e, 0x3b, 0x9d, 0x6a, 0xcc,
+ },
+ {
+ 0x00, 0xa7, 0x53, 0xf4, 0xa6, 0x01, 0xf5, 0x52,
+ 0x51, 0xf6, 0x02, 0xa5, 0xf7, 0x50, 0xa4, 0x03,
+ 0xa2, 0x05, 0xf1, 0x56, 0x04, 0xa3, 0x57, 0xf0,
+ 0xf3, 0x54, 0xa0, 0x07, 0x55, 0xf2, 0x06, 0xa1,
+ 0x59, 0xfe, 0x0a, 0xad, 0xff, 0x58, 0xac, 0x0b,
+ 0x08, 0xaf, 0x5b, 0xfc, 0xae, 0x09, 0xfd, 0x5a,
+ 0xfb, 0x5c, 0xa8, 0x0f, 0x5d, 0xfa, 0x0e, 0xa9,
+ 0xaa, 0x0d, 0xf9, 0x5e, 0x0c, 0xab, 0x5f, 0xf8,
+ 0xb2, 0x15, 0xe1, 0x46, 0x14, 0xb3, 0x47, 0xe0,
+ 0xe3, 0x44, 0xb0, 0x17, 0x45, 0xe2, 0x16, 0xb1,
+ 0x10, 0xb7, 0x43, 0xe4, 0xb6, 0x11, 0xe5, 0x42,
+ 0x41, 0xe6, 0x12, 0xb5, 0xe7, 0x40, 0xb4, 0x13,
+ 0xeb, 0x4c, 0xb8, 0x1f, 0x4d, 0xea, 0x1e, 0xb9,
+ 0xba, 0x1d, 0xe9, 0x4e, 0x1c, 0xbb, 0x4f, 0xe8,
+ 0x49, 0xee, 0x1a, 0xbd, 0xef, 0x48, 0xbc, 0x1b,
+ 0x18, 0xbf, 0x4b, 0xec, 0xbe, 0x19, 0xed, 0x4a,
+ 0x79, 0xde, 0x2a, 0x8d, 0xdf, 0x78, 0x8c, 0x2b,
+ 0x28, 0x8f, 0x7b, 0xdc, 0x8e, 0x29, 0xdd, 0x7a,
+ 0xdb, 0x7c, 0x88, 0x2f, 0x7d, 0xda, 0x2e, 0x89,
+ 0x8a, 0x2d, 0xd9, 0x7e, 0x2c, 0x8b, 0x7f, 0xd8,
+ 0x20, 0x87, 0x73, 0xd4, 0x86, 0x21, 0xd5, 0x72,
+ 0x71, 0xd6, 0x22, 0x85, 0xd7, 0x70, 0x84, 0x23,
+ 0x82, 0x25, 0xd1, 0x76, 0x24, 0x83, 0x77, 0xd0,
+ 0xd3, 0x74, 0x80, 0x27, 0x75, 0xd2, 0x26, 0x81,
+ 0xcb, 0x6c, 0x98, 0x3f, 0x6d, 0xca, 0x3e, 0x99,
+ 0x9a, 0x3d, 0xc9, 0x6e, 0x3c, 0x9b, 0x6f, 0xc8,
+ 0x69, 0xce, 0x3a, 0x9d, 0xcf, 0x68, 0x9c, 0x3b,
+ 0x38, 0x9f, 0x6b, 0xcc, 0x9e, 0x39, 0xcd, 0x6a,
+ 0x92, 0x35, 0xc1, 0x66, 0x34, 0x93, 0x67, 0xc0,
+ 0xc3, 0x64, 0x90, 0x37, 0x65, 0xc2, 0x36, 0x91,
+ 0x30, 0x97, 0x63, 0xc4, 0x96, 0x31, 0xc5, 0x62,
+ 0x61, 0xc6, 0x32, 0x95, 0xc7, 0x60, 0x94, 0x33,
+ },
+ {
+ 0x00, 0xa8, 0x4d, 0xe5, 0x9a, 0x32, 0xd7, 0x7f,
+ 0x29, 0x81, 0x64, 0xcc, 0xb3, 0x1b, 0xfe, 0x56,
+ 0x52, 0xfa, 0x1f, 0xb7, 0xc8, 0x60, 0x85, 0x2d,
+ 0x7b, 0xd3, 0x36, 0x9e, 0xe1, 0x49, 0xac, 0x04,
+ 0xa4, 0x0c, 0xe9, 0x41, 0x3e, 0x96, 0x73, 0xdb,
+ 0x8d, 0x25, 0xc0, 0x68, 0x17, 0xbf, 0x5a, 0xf2,
+ 0xf6, 0x5e, 0xbb, 0x13, 0x6c, 0xc4, 0x21, 0x89,
+ 0xdf, 0x77, 0x92, 0x3a, 0x45, 0xed, 0x08, 0xa0,
+ 0x55, 0xfd, 0x18, 0xb0, 0xcf, 0x67, 0x82, 0x2a,
+ 0x7c, 0xd4, 0x31, 0x99, 0xe6, 0x4e, 0xab, 0x03,
+ 0x07, 0xaf, 0x4a, 0xe2, 0x9d, 0x35, 0xd0, 0x78,
+ 0x2e, 0x86, 0x63, 0xcb, 0xb4, 0x1c, 0xf9, 0x51,
+ 0xf1, 0x59, 0xbc, 0x14, 0x6b, 0xc3, 0x26, 0x8e,
+ 0xd8, 0x70, 0x95, 0x3d, 0x42, 0xea, 0x0f, 0xa7,
+ 0xa3, 0x0b, 0xee, 0x46, 0x39, 0x91, 0x74, 0xdc,
+ 0x8a, 0x22, 0xc7, 0x6f, 0x10, 0xb8, 0x5d, 0xf5,
+ 0xaa, 0x02, 0xe7, 0x4f, 0x30, 0x98, 0x7d, 0xd5,
+ 0x83, 0x2b, 0xce, 0x66, 0x19, 0xb1, 0x54, 0xfc,
+ 0xf8, 0x50, 0xb5, 0x1d, 0x62, 0xca, 0x2f, 0x87,
+ 0xd1, 0x79, 0x9c, 0x34, 0x4b, 0xe3, 0x06, 0xae,
+ 0x0e, 0xa6, 0x43, 0xeb, 0x94, 0x3c, 0xd9, 0x71,
+ 0x27, 0x8f, 0x6a, 0xc2, 0xbd, 0x15, 0xf0, 0x58,
+ 0x5c, 0xf4, 0x11, 0xb9, 0xc6, 0x6e, 0x8b, 0x23,
+ 0x75, 0xdd, 0x38, 0x90, 0xef, 0x47, 0xa2, 0x0a,
+ 0xff, 0x57, 0xb2, 0x1a, 0x65, 0xcd, 0x28, 0x80,
+ 0xd6, 0x7e, 0x9b, 0x33, 0x4c, 0xe4, 0x01, 0xa9,
+ 0xad, 0x05, 0xe0, 0x48, 0x37, 0x9f, 0x7a, 0xd2,
+ 0x84, 0x2c, 0xc9, 0x61, 0x1e, 0xb6, 0x53, 0xfb,
+ 0x5b, 0xf3, 0x16, 0xbe, 0xc1, 0x69, 0x8c, 0x24,
+ 0x72, 0xda, 0x3f, 0x97, 0xe8, 0x40, 0xa5, 0x0d,
+ 0x09, 0xa1, 0x44, 0xec, 0x93, 0x3b, 0xde, 0x76,
+ 0x20, 0x88, 0x6d, 0xc5, 0xba, 0x12, 0xf7, 0x5f,
+ },
+ {
+ 0x00, 0xa9, 0x4f, 0xe6, 0x9e, 0x37, 0xd1, 0x78,
+ 0x21, 0x88, 0x6e, 0xc7, 0xbf, 0x16, 0xf0, 0x59,
+ 0x42, 0xeb, 0x0d, 0xa4, 0xdc, 0x75, 0x93, 0x3a,
+ 0x63, 0xca, 0x2c, 0x85, 0xfd, 0x54, 0xb2, 0x1b,
+ 0x84, 0x2d, 0xcb, 0x62, 0x1a, 0xb3, 0x55, 0xfc,
+ 0xa5, 0x0c, 0xea, 0x43, 0x3b, 0x92, 0x74, 0xdd,
+ 0xc6, 0x6f, 0x89, 0x20, 0x58, 0xf1, 0x17, 0xbe,
+ 0xe7, 0x4e, 0xa8, 0x01, 0x79, 0xd0, 0x36, 0x9f,
+ 0x15, 0xbc, 0x5a, 0xf3, 0x8b, 0x22, 0xc4, 0x6d,
+ 0x34, 0x9d, 0x7b, 0xd2, 0xaa, 0x03, 0xe5, 0x4c,
+ 0x57, 0xfe, 0x18, 0xb1, 0xc9, 0x60, 0x86, 0x2f,
+ 0x76, 0xdf, 0x39, 0x90, 0xe8, 0x41, 0xa7, 0x0e,
+ 0x91, 0x38, 0xde, 0x77, 0x0f, 0xa6, 0x40, 0xe9,
+ 0xb0, 0x19, 0xff, 0x56, 0x2e, 0x87, 0x61, 0xc8,
+ 0xd3, 0x7a, 0x9c, 0x35, 0x4d, 0xe4, 0x02, 0xab,
+ 0xf2, 0x5b, 0xbd, 0x14, 0x6c, 0xc5, 0x23, 0x8a,
+ 0x2a, 0x83, 0x65, 0xcc, 0xb4, 0x1d, 0xfb, 0x52,
+ 0x0b, 0xa2, 0x44, 0xed, 0x95, 0x3c, 0xda, 0x73,
+ 0x68, 0xc1, 0x27, 0x8e, 0xf6, 0x5f, 0xb9, 0x10,
+ 0x49, 0xe0, 0x06, 0xaf, 0xd7, 0x7e, 0x98, 0x31,
+ 0xae, 0x07, 0xe1, 0x48, 0x30, 0x99, 0x7f, 0xd6,
+ 0x8f, 0x26, 0xc0, 0x69, 0x11, 0xb8, 0x5e, 0xf7,
+ 0xec, 0x45, 0xa3, 0x0a, 0x72, 0xdb, 0x3d, 0x94,
+ 0xcd, 0x64, 0x82, 0x2b, 0x53, 0xfa, 0x1c, 0xb5,
+ 0x3f, 0x96, 0x70, 0xd9, 0xa1, 0x08, 0xee, 0x47,
+ 0x1e, 0xb7, 0x51, 0xf8, 0x80, 0x29, 0xcf, 0x66,
+ 0x7d, 0xd4, 0x32, 0x9b, 0xe3, 0x4a, 0xac, 0x05,
+ 0x5c, 0xf5, 0x13, 0xba, 0xc2, 0x6b, 0x8d, 0x24,
+ 0xbb, 0x12, 0xf4, 0x5d, 0x25, 0x8c, 0x6a, 0xc3,
+ 0x9a, 0x33, 0xd5, 0x7c, 0x04, 0xad, 0x4b, 0xe2,
+ 0xf9, 0x50, 0xb6, 0x1f, 0x67, 0xce, 0x28, 0x81,
+ 0xd8, 0x71, 0x97, 0x3e, 0x46, 0xef, 0x09, 0xa0,
+ },
+ {
+ 0x00, 0xaa, 0x49, 0xe3, 0x92, 0x38, 0xdb, 0x71,
+ 0x39, 0x93, 0x70, 0xda, 0xab, 0x01, 0xe2, 0x48,
+ 0x72, 0xd8, 0x3b, 0x91, 0xe0, 0x4a, 0xa9, 0x03,
+ 0x4b, 0xe1, 0x02, 0xa8, 0xd9, 0x73, 0x90, 0x3a,
+ 0xe4, 0x4e, 0xad, 0x07, 0x76, 0xdc, 0x3f, 0x95,
+ 0xdd, 0x77, 0x94, 0x3e, 0x4f, 0xe5, 0x06, 0xac,
+ 0x96, 0x3c, 0xdf, 0x75, 0x04, 0xae, 0x4d, 0xe7,
+ 0xaf, 0x05, 0xe6, 0x4c, 0x3d, 0x97, 0x74, 0xde,
+ 0xd5, 0x7f, 0x9c, 0x36, 0x47, 0xed, 0x0e, 0xa4,
+ 0xec, 0x46, 0xa5, 0x0f, 0x7e, 0xd4, 0x37, 0x9d,
+ 0xa7, 0x0d, 0xee, 0x44, 0x35, 0x9f, 0x7c, 0xd6,
+ 0x9e, 0x34, 0xd7, 0x7d, 0x0c, 0xa6, 0x45, 0xef,
+ 0x31, 0x9b, 0x78, 0xd2, 0xa3, 0x09, 0xea, 0x40,
+ 0x08, 0xa2, 0x41, 0xeb, 0x9a, 0x30, 0xd3, 0x79,
+ 0x43, 0xe9, 0x0a, 0xa0, 0xd1, 0x7b, 0x98, 0x32,
+ 0x7a, 0xd0, 0x33, 0x99, 0xe8, 0x42, 0xa1, 0x0b,
+ 0xb7, 0x1d, 0xfe, 0x54, 0x25, 0x8f, 0x6c, 0xc6,
+ 0x8e, 0x24, 0xc7, 0x6d, 0x1c, 0xb6, 0x55, 0xff,
+ 0xc5, 0x6f, 0x8c, 0x26, 0x57, 0xfd, 0x1e, 0xb4,
+ 0xfc, 0x56, 0xb5, 0x1f, 0x6e, 0xc4, 0x27, 0x8d,
+ 0x53, 0xf9, 0x1a, 0xb0, 0xc1, 0x6b, 0x88, 0x22,
+ 0x6a, 0xc0, 0x23, 0x89, 0xf8, 0x52, 0xb1, 0x1b,
+ 0x21, 0x8b, 0x68, 0xc2, 0xb3, 0x19, 0xfa, 0x50,
+ 0x18, 0xb2, 0x51, 0xfb, 0x8a, 0x20, 0xc3, 0x69,
+ 0x62, 0xc8, 0x2b, 0x81, 0xf0, 0x5a, 0xb9, 0x13,
+ 0x5b, 0xf1, 0x12, 0xb8, 0xc9, 0x63, 0x80, 0x2a,
+ 0x10, 0xba, 0x59, 0xf3, 0x82, 0x28, 0xcb, 0x61,
+ 0x29, 0x83, 0x60, 0xca, 0xbb, 0x11, 0xf2, 0x58,
+ 0x86, 0x2c, 0xcf, 0x65, 0x14, 0xbe, 0x5d, 0xf7,
+ 0xbf, 0x15, 0xf6, 0x5c, 0x2d, 0x87, 0x64, 0xce,
+ 0xf4, 0x5e, 0xbd, 0x17, 0x66, 0xcc, 0x2f, 0x85,
+ 0xcd, 0x67, 0x84, 0x2e, 0x5f, 0xf5, 0x16, 0xbc,
+ },
+ {
+ 0x00, 0xab, 0x4b, 0xe0, 0x96, 0x3d, 0xdd, 0x76,
+ 0x31, 0x9a, 0x7a, 0xd1, 0xa7, 0x0c, 0xec, 0x47,
+ 0x62, 0xc9, 0x29, 0x82, 0xf4, 0x5f, 0xbf, 0x14,
+ 0x53, 0xf8, 0x18, 0xb3, 0xc5, 0x6e, 0x8e, 0x25,
+ 0xc4, 0x6f, 0x8f, 0x24, 0x52, 0xf9, 0x19, 0xb2,
+ 0xf5, 0x5e, 0xbe, 0x15, 0x63, 0xc8, 0x28, 0x83,
+ 0xa6, 0x0d, 0xed, 0x46, 0x30, 0x9b, 0x7b, 0xd0,
+ 0x97, 0x3c, 0xdc, 0x77, 0x01, 0xaa, 0x4a, 0xe1,
+ 0x95, 0x3e, 0xde, 0x75, 0x03, 0xa8, 0x48, 0xe3,
+ 0xa4, 0x0f, 0xef, 0x44, 0x32, 0x99, 0x79, 0xd2,
+ 0xf7, 0x5c, 0xbc, 0x17, 0x61, 0xca, 0x2a, 0x81,
+ 0xc6, 0x6d, 0x8d, 0x26, 0x50, 0xfb, 0x1b, 0xb0,
+ 0x51, 0xfa, 0x1a, 0xb1, 0xc7, 0x6c, 0x8c, 0x27,
+ 0x60, 0xcb, 0x2b, 0x80, 0xf6, 0x5d, 0xbd, 0x16,
+ 0x33, 0x98, 0x78, 0xd3, 0xa5, 0x0e, 0xee, 0x45,
+ 0x02, 0xa9, 0x49, 0xe2, 0x94, 0x3f, 0xdf, 0x74,
+ 0x37, 0x9c, 0x7c, 0xd7, 0xa1, 0x0a, 0xea, 0x41,
+ 0x06, 0xad, 0x4d, 0xe6, 0x90, 0x3b, 0xdb, 0x70,
+ 0x55, 0xfe, 0x1e, 0xb5, 0xc3, 0x68, 0x88, 0x23,
+ 0x64, 0xcf, 0x2f, 0x84, 0xf2, 0x59, 0xb9, 0x12,
+ 0xf3, 0x58, 0xb8, 0x13, 0x65, 0xce, 0x2e, 0x85,
+ 0xc2, 0x69, 0x89, 0x22, 0x54, 0xff, 0x1f, 0xb4,
+ 0x91, 0x3a, 0xda, 0x71, 0x07, 0xac, 0x4c, 0xe7,
+ 0xa0, 0x0b, 0xeb, 0x40, 0x36, 0x9d, 0x7d, 0xd6,
+ 0xa2, 0x09, 0xe9, 0x42, 0x34, 0x9f, 0x7f, 0xd4,
+ 0x93, 0x38, 0xd8, 0x73, 0x05, 0xae, 0x4e, 0xe5,
+ 0xc0, 0x6b, 0x8b, 0x20, 0x56, 0xfd, 0x1d, 0xb6,
+ 0xf1, 0x5a, 0xba, 0x11, 0x67, 0xcc, 0x2c, 0x87,
+ 0x66, 0xcd, 0x2d, 0x86, 0xf0, 0x5b, 0xbb, 0x10,
+ 0x57, 0xfc, 0x1c, 0xb7, 0xc1, 0x6a, 0x8a, 0x21,
+ 0x04, 0xaf, 0x4f, 0xe4, 0x92, 0x39, 0xd9, 0x72,
+ 0x35, 0x9e, 0x7e, 0xd5, 0xa3, 0x08, 0xe8, 0x43,
+ },
+ {
+ 0x00, 0xac, 0x45, 0xe9, 0x8a, 0x26, 0xcf, 0x63,
+ 0x09, 0xa5, 0x4c, 0xe0, 0x83, 0x2f, 0xc6, 0x6a,
+ 0x12, 0xbe, 0x57, 0xfb, 0x98, 0x34, 0xdd, 0x71,
+ 0x1b, 0xb7, 0x5e, 0xf2, 0x91, 0x3d, 0xd4, 0x78,
+ 0x24, 0x88, 0x61, 0xcd, 0xae, 0x02, 0xeb, 0x47,
+ 0x2d, 0x81, 0x68, 0xc4, 0xa7, 0x0b, 0xe2, 0x4e,
+ 0x36, 0x9a, 0x73, 0xdf, 0xbc, 0x10, 0xf9, 0x55,
+ 0x3f, 0x93, 0x7a, 0xd6, 0xb5, 0x19, 0xf0, 0x5c,
+ 0x48, 0xe4, 0x0d, 0xa1, 0xc2, 0x6e, 0x87, 0x2b,
+ 0x41, 0xed, 0x04, 0xa8, 0xcb, 0x67, 0x8e, 0x22,
+ 0x5a, 0xf6, 0x1f, 0xb3, 0xd0, 0x7c, 0x95, 0x39,
+ 0x53, 0xff, 0x16, 0xba, 0xd9, 0x75, 0x9c, 0x30,
+ 0x6c, 0xc0, 0x29, 0x85, 0xe6, 0x4a, 0xa3, 0x0f,
+ 0x65, 0xc9, 0x20, 0x8c, 0xef, 0x43, 0xaa, 0x06,
+ 0x7e, 0xd2, 0x3b, 0x97, 0xf4, 0x58, 0xb1, 0x1d,
+ 0x77, 0xdb, 0x32, 0x9e, 0xfd, 0x51, 0xb8, 0x14,
+ 0x90, 0x3c, 0xd5, 0x79, 0x1a, 0xb6, 0x5f, 0xf3,
+ 0x99, 0x35, 0xdc, 0x70, 0x13, 0xbf, 0x56, 0xfa,
+ 0x82, 0x2e, 0xc7, 0x6b, 0x08, 0xa4, 0x4d, 0xe1,
+ 0x8b, 0x27, 0xce, 0x62, 0x01, 0xad, 0x44, 0xe8,
+ 0xb4, 0x18, 0xf1, 0x5d, 0x3e, 0x92, 0x7b, 0xd7,
+ 0xbd, 0x11, 0xf8, 0x54, 0x37, 0x9b, 0x72, 0xde,
+ 0xa6, 0x0a, 0xe3, 0x4f, 0x2c, 0x80, 0x69, 0xc5,
+ 0xaf, 0x03, 0xea, 0x46, 0x25, 0x89, 0x60, 0xcc,
+ 0xd8, 0x74, 0x9d, 0x31, 0x52, 0xfe, 0x17, 0xbb,
+ 0xd1, 0x7d, 0x94, 0x38, 0x5b, 0xf7, 0x1e, 0xb2,
+ 0xca, 0x66, 0x8f, 0x23, 0x40, 0xec, 0x05, 0xa9,
+ 0xc3, 0x6f, 0x86, 0x2a, 0x49, 0xe5, 0x0c, 0xa0,
+ 0xfc, 0x50, 0xb9, 0x15, 0x76, 0xda, 0x33, 0x9f,
+ 0xf5, 0x59, 0xb0, 0x1c, 0x7f, 0xd3, 0x3a, 0x96,
+ 0xee, 0x42, 0xab, 0x07, 0x64, 0xc8, 0x21, 0x8d,
+ 0xe7, 0x4b, 0xa2, 0x0e, 0x6d, 0xc1, 0x28, 0x84,
+ },
+ {
+ 0x00, 0xad, 0x47, 0xea, 0x8e, 0x23, 0xc9, 0x64,
+ 0x01, 0xac, 0x46, 0xeb, 0x8f, 0x22, 0xc8, 0x65,
+ 0x02, 0xaf, 0x45, 0xe8, 0x8c, 0x21, 0xcb, 0x66,
+ 0x03, 0xae, 0x44, 0xe9, 0x8d, 0x20, 0xca, 0x67,
+ 0x04, 0xa9, 0x43, 0xee, 0x8a, 0x27, 0xcd, 0x60,
+ 0x05, 0xa8, 0x42, 0xef, 0x8b, 0x26, 0xcc, 0x61,
+ 0x06, 0xab, 0x41, 0xec, 0x88, 0x25, 0xcf, 0x62,
+ 0x07, 0xaa, 0x40, 0xed, 0x89, 0x24, 0xce, 0x63,
+ 0x08, 0xa5, 0x4f, 0xe2, 0x86, 0x2b, 0xc1, 0x6c,
+ 0x09, 0xa4, 0x4e, 0xe3, 0x87, 0x2a, 0xc0, 0x6d,
+ 0x0a, 0xa7, 0x4d, 0xe0, 0x84, 0x29, 0xc3, 0x6e,
+ 0x0b, 0xa6, 0x4c, 0xe1, 0x85, 0x28, 0xc2, 0x6f,
+ 0x0c, 0xa1, 0x4b, 0xe6, 0x82, 0x2f, 0xc5, 0x68,
+ 0x0d, 0xa0, 0x4a, 0xe7, 0x83, 0x2e, 0xc4, 0x69,
+ 0x0e, 0xa3, 0x49, 0xe4, 0x80, 0x2d, 0xc7, 0x6a,
+ 0x0f, 0xa2, 0x48, 0xe5, 0x81, 0x2c, 0xc6, 0x6b,
+ 0x10, 0xbd, 0x57, 0xfa, 0x9e, 0x33, 0xd9, 0x74,
+ 0x11, 0xbc, 0x56, 0xfb, 0x9f, 0x32, 0xd8, 0x75,
+ 0x12, 0xbf, 0x55, 0xf8, 0x9c, 0x31, 0xdb, 0x76,
+ 0x13, 0xbe, 0x54, 0xf9, 0x9d, 0x30, 0xda, 0x77,
+ 0x14, 0xb9, 0x53, 0xfe, 0x9a, 0x37, 0xdd, 0x70,
+ 0x15, 0xb8, 0x52, 0xff, 0x9b, 0x36, 0xdc, 0x71,
+ 0x16, 0xbb, 0x51, 0xfc, 0x98, 0x35, 0xdf, 0x72,
+ 0x17, 0xba, 0x50, 0xfd, 0x99, 0x34, 0xde, 0x73,
+ 0x18, 0xb5, 0x5f, 0xf2, 0x96, 0x3b, 0xd1, 0x7c,
+ 0x19, 0xb4, 0x5e, 0xf3, 0x97, 0x3a, 0xd0, 0x7d,
+ 0x1a, 0xb7, 0x5d, 0xf0, 0x94, 0x39, 0xd3, 0x7e,
+ 0x1b, 0xb6, 0x5c, 0xf1, 0x95, 0x38, 0xd2, 0x7f,
+ 0x1c, 0xb1, 0x5b, 0xf6, 0x92, 0x3f, 0xd5, 0x78,
+ 0x1d, 0xb0, 0x5a, 0xf7, 0x93, 0x3e, 0xd4, 0x79,
+ 0x1e, 0xb3, 0x59, 0xf4, 0x90, 0x3d, 0xd7, 0x7a,
+ 0x1f, 0xb2, 0x58, 0xf5, 0x91, 0x3c, 0xd6, 0x7b,
+ },
+ {
+ 0x00, 0xae, 0x41, 0xef, 0x82, 0x2c, 0xc3, 0x6d,
+ 0x19, 0xb7, 0x58, 0xf6, 0x9b, 0x35, 0xda, 0x74,
+ 0x32, 0x9c, 0x73, 0xdd, 0xb0, 0x1e, 0xf1, 0x5f,
+ 0x2b, 0x85, 0x6a, 0xc4, 0xa9, 0x07, 0xe8, 0x46,
+ 0x64, 0xca, 0x25, 0x8b, 0xe6, 0x48, 0xa7, 0x09,
+ 0x7d, 0xd3, 0x3c, 0x92, 0xff, 0x51, 0xbe, 0x10,
+ 0x56, 0xf8, 0x17, 0xb9, 0xd4, 0x7a, 0x95, 0x3b,
+ 0x4f, 0xe1, 0x0e, 0xa0, 0xcd, 0x63, 0x8c, 0x22,
+ 0xc8, 0x66, 0x89, 0x27, 0x4a, 0xe4, 0x0b, 0xa5,
+ 0xd1, 0x7f, 0x90, 0x3e, 0x53, 0xfd, 0x12, 0xbc,
+ 0xfa, 0x54, 0xbb, 0x15, 0x78, 0xd6, 0x39, 0x97,
+ 0xe3, 0x4d, 0xa2, 0x0c, 0x61, 0xcf, 0x20, 0x8e,
+ 0xac, 0x02, 0xed, 0x43, 0x2e, 0x80, 0x6f, 0xc1,
+ 0xb5, 0x1b, 0xf4, 0x5a, 0x37, 0x99, 0x76, 0xd8,
+ 0x9e, 0x30, 0xdf, 0x71, 0x1c, 0xb2, 0x5d, 0xf3,
+ 0x87, 0x29, 0xc6, 0x68, 0x05, 0xab, 0x44, 0xea,
+ 0x8d, 0x23, 0xcc, 0x62, 0x0f, 0xa1, 0x4e, 0xe0,
+ 0x94, 0x3a, 0xd5, 0x7b, 0x16, 0xb8, 0x57, 0xf9,
+ 0xbf, 0x11, 0xfe, 0x50, 0x3d, 0x93, 0x7c, 0xd2,
+ 0xa6, 0x08, 0xe7, 0x49, 0x24, 0x8a, 0x65, 0xcb,
+ 0xe9, 0x47, 0xa8, 0x06, 0x6b, 0xc5, 0x2a, 0x84,
+ 0xf0, 0x5e, 0xb1, 0x1f, 0x72, 0xdc, 0x33, 0x9d,
+ 0xdb, 0x75, 0x9a, 0x34, 0x59, 0xf7, 0x18, 0xb6,
+ 0xc2, 0x6c, 0x83, 0x2d, 0x40, 0xee, 0x01, 0xaf,
+ 0x45, 0xeb, 0x04, 0xaa, 0xc7, 0x69, 0x86, 0x28,
+ 0x5c, 0xf2, 0x1d, 0xb3, 0xde, 0x70, 0x9f, 0x31,
+ 0x77, 0xd9, 0x36, 0x98, 0xf5, 0x5b, 0xb4, 0x1a,
+ 0x6e, 0xc0, 0x2f, 0x81, 0xec, 0x42, 0xad, 0x03,
+ 0x21, 0x8f, 0x60, 0xce, 0xa3, 0x0d, 0xe2, 0x4c,
+ 0x38, 0x96, 0x79, 0xd7, 0xba, 0x14, 0xfb, 0x55,
+ 0x13, 0xbd, 0x52, 0xfc, 0x91, 0x3f, 0xd0, 0x7e,
+ 0x0a, 0xa4, 0x4b, 0xe5, 0x88, 0x26, 0xc9, 0x67,
+ },
+ {
+ 0x00, 0xaf, 0x43, 0xec, 0x86, 0x29, 0xc5, 0x6a,
+ 0x11, 0xbe, 0x52, 0xfd, 0x97, 0x38, 0xd4, 0x7b,
+ 0x22, 0x8d, 0x61, 0xce, 0xa4, 0x0b, 0xe7, 0x48,
+ 0x33, 0x9c, 0x70, 0xdf, 0xb5, 0x1a, 0xf6, 0x59,
+ 0x44, 0xeb, 0x07, 0xa8, 0xc2, 0x6d, 0x81, 0x2e,
+ 0x55, 0xfa, 0x16, 0xb9, 0xd3, 0x7c, 0x90, 0x3f,
+ 0x66, 0xc9, 0x25, 0x8a, 0xe0, 0x4f, 0xa3, 0x0c,
+ 0x77, 0xd8, 0x34, 0x9b, 0xf1, 0x5e, 0xb2, 0x1d,
+ 0x88, 0x27, 0xcb, 0x64, 0x0e, 0xa1, 0x4d, 0xe2,
+ 0x99, 0x36, 0xda, 0x75, 0x1f, 0xb0, 0x5c, 0xf3,
+ 0xaa, 0x05, 0xe9, 0x46, 0x2c, 0x83, 0x6f, 0xc0,
+ 0xbb, 0x14, 0xf8, 0x57, 0x3d, 0x92, 0x7e, 0xd1,
+ 0xcc, 0x63, 0x8f, 0x20, 0x4a, 0xe5, 0x09, 0xa6,
+ 0xdd, 0x72, 0x9e, 0x31, 0x5b, 0xf4, 0x18, 0xb7,
+ 0xee, 0x41, 0xad, 0x02, 0x68, 0xc7, 0x2b, 0x84,
+ 0xff, 0x50, 0xbc, 0x13, 0x79, 0xd6, 0x3a, 0x95,
+ 0x0d, 0xa2, 0x4e, 0xe1, 0x8b, 0x24, 0xc8, 0x67,
+ 0x1c, 0xb3, 0x5f, 0xf0, 0x9a, 0x35, 0xd9, 0x76,
+ 0x2f, 0x80, 0x6c, 0xc3, 0xa9, 0x06, 0xea, 0x45,
+ 0x3e, 0x91, 0x7d, 0xd2, 0xb8, 0x17, 0xfb, 0x54,
+ 0x49, 0xe6, 0x0a, 0xa5, 0xcf, 0x60, 0x8c, 0x23,
+ 0x58, 0xf7, 0x1b, 0xb4, 0xde, 0x71, 0x9d, 0x32,
+ 0x6b, 0xc4, 0x28, 0x87, 0xed, 0x42, 0xae, 0x01,
+ 0x7a, 0xd5, 0x39, 0x96, 0xfc, 0x53, 0xbf, 0x10,
+ 0x85, 0x2a, 0xc6, 0x69, 0x03, 0xac, 0x40, 0xef,
+ 0x94, 0x3b, 0xd7, 0x78, 0x12, 0xbd, 0x51, 0xfe,
+ 0xa7, 0x08, 0xe4, 0x4b, 0x21, 0x8e, 0x62, 0xcd,
+ 0xb6, 0x19, 0xf5, 0x5a, 0x30, 0x9f, 0x73, 0xdc,
+ 0xc1, 0x6e, 0x82, 0x2d, 0x47, 0xe8, 0x04, 0xab,
+ 0xd0, 0x7f, 0x93, 0x3c, 0x56, 0xf9, 0x15, 0xba,
+ 0xe3, 0x4c, 0xa0, 0x0f, 0x65, 0xca, 0x26, 0x89,
+ 0xf2, 0x5d, 0xb1, 0x1e, 0x74, 0xdb, 0x37, 0x98,
+ },
+ {
+ 0x00, 0xb0, 0x7d, 0xcd, 0xfa, 0x4a, 0x87, 0x37,
+ 0xe9, 0x59, 0x94, 0x24, 0x13, 0xa3, 0x6e, 0xde,
+ 0xcf, 0x7f, 0xb2, 0x02, 0x35, 0x85, 0x48, 0xf8,
+ 0x26, 0x96, 0x5b, 0xeb, 0xdc, 0x6c, 0xa1, 0x11,
+ 0x83, 0x33, 0xfe, 0x4e, 0x79, 0xc9, 0x04, 0xb4,
+ 0x6a, 0xda, 0x17, 0xa7, 0x90, 0x20, 0xed, 0x5d,
+ 0x4c, 0xfc, 0x31, 0x81, 0xb6, 0x06, 0xcb, 0x7b,
+ 0xa5, 0x15, 0xd8, 0x68, 0x5f, 0xef, 0x22, 0x92,
+ 0x1b, 0xab, 0x66, 0xd6, 0xe1, 0x51, 0x9c, 0x2c,
+ 0xf2, 0x42, 0x8f, 0x3f, 0x08, 0xb8, 0x75, 0xc5,
+ 0xd4, 0x64, 0xa9, 0x19, 0x2e, 0x9e, 0x53, 0xe3,
+ 0x3d, 0x8d, 0x40, 0xf0, 0xc7, 0x77, 0xba, 0x0a,
+ 0x98, 0x28, 0xe5, 0x55, 0x62, 0xd2, 0x1f, 0xaf,
+ 0x71, 0xc1, 0x0c, 0xbc, 0x8b, 0x3b, 0xf6, 0x46,
+ 0x57, 0xe7, 0x2a, 0x9a, 0xad, 0x1d, 0xd0, 0x60,
+ 0xbe, 0x0e, 0xc3, 0x73, 0x44, 0xf4, 0x39, 0x89,
+ 0x36, 0x86, 0x4b, 0xfb, 0xcc, 0x7c, 0xb1, 0x01,
+ 0xdf, 0x6f, 0xa2, 0x12, 0x25, 0x95, 0x58, 0xe8,
+ 0xf9, 0x49, 0x84, 0x34, 0x03, 0xb3, 0x7e, 0xce,
+ 0x10, 0xa0, 0x6d, 0xdd, 0xea, 0x5a, 0x97, 0x27,
+ 0xb5, 0x05, 0xc8, 0x78, 0x4f, 0xff, 0x32, 0x82,
+ 0x5c, 0xec, 0x21, 0x91, 0xa6, 0x16, 0xdb, 0x6b,
+ 0x7a, 0xca, 0x07, 0xb7, 0x80, 0x30, 0xfd, 0x4d,
+ 0x93, 0x23, 0xee, 0x5e, 0x69, 0xd9, 0x14, 0xa4,
+ 0x2d, 0x9d, 0x50, 0xe0, 0xd7, 0x67, 0xaa, 0x1a,
+ 0xc4, 0x74, 0xb9, 0x09, 0x3e, 0x8e, 0x43, 0xf3,
+ 0xe2, 0x52, 0x9f, 0x2f, 0x18, 0xa8, 0x65, 0xd5,
+ 0x0b, 0xbb, 0x76, 0xc6, 0xf1, 0x41, 0x8c, 0x3c,
+ 0xae, 0x1e, 0xd3, 0x63, 0x54, 0xe4, 0x29, 0x99,
+ 0x47, 0xf7, 0x3a, 0x8a, 0xbd, 0x0d, 0xc0, 0x70,
+ 0x61, 0xd1, 0x1c, 0xac, 0x9b, 0x2b, 0xe6, 0x56,
+ 0x88, 0x38, 0xf5, 0x45, 0x72, 0xc2, 0x0f, 0xbf,
+ },
+ {
+ 0x00, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x30,
+ 0xe1, 0x50, 0x9e, 0x2f, 0x1f, 0xae, 0x60, 0xd1,
+ 0xdf, 0x6e, 0xa0, 0x11, 0x21, 0x90, 0x5e, 0xef,
+ 0x3e, 0x8f, 0x41, 0xf0, 0xc0, 0x71, 0xbf, 0x0e,
+ 0xa3, 0x12, 0xdc, 0x6d, 0x5d, 0xec, 0x22, 0x93,
+ 0x42, 0xf3, 0x3d, 0x8c, 0xbc, 0x0d, 0xc3, 0x72,
+ 0x7c, 0xcd, 0x03, 0xb2, 0x82, 0x33, 0xfd, 0x4c,
+ 0x9d, 0x2c, 0xe2, 0x53, 0x63, 0xd2, 0x1c, 0xad,
+ 0x5b, 0xea, 0x24, 0x95, 0xa5, 0x14, 0xda, 0x6b,
+ 0xba, 0x0b, 0xc5, 0x74, 0x44, 0xf5, 0x3b, 0x8a,
+ 0x84, 0x35, 0xfb, 0x4a, 0x7a, 0xcb, 0x05, 0xb4,
+ 0x65, 0xd4, 0x1a, 0xab, 0x9b, 0x2a, 0xe4, 0x55,
+ 0xf8, 0x49, 0x87, 0x36, 0x06, 0xb7, 0x79, 0xc8,
+ 0x19, 0xa8, 0x66, 0xd7, 0xe7, 0x56, 0x98, 0x29,
+ 0x27, 0x96, 0x58, 0xe9, 0xd9, 0x68, 0xa6, 0x17,
+ 0xc6, 0x77, 0xb9, 0x08, 0x38, 0x89, 0x47, 0xf6,
+ 0xb6, 0x07, 0xc9, 0x78, 0x48, 0xf9, 0x37, 0x86,
+ 0x57, 0xe6, 0x28, 0x99, 0xa9, 0x18, 0xd6, 0x67,
+ 0x69, 0xd8, 0x16, 0xa7, 0x97, 0x26, 0xe8, 0x59,
+ 0x88, 0x39, 0xf7, 0x46, 0x76, 0xc7, 0x09, 0xb8,
+ 0x15, 0xa4, 0x6a, 0xdb, 0xeb, 0x5a, 0x94, 0x25,
+ 0xf4, 0x45, 0x8b, 0x3a, 0x0a, 0xbb, 0x75, 0xc4,
+ 0xca, 0x7b, 0xb5, 0x04, 0x34, 0x85, 0x4b, 0xfa,
+ 0x2b, 0x9a, 0x54, 0xe5, 0xd5, 0x64, 0xaa, 0x1b,
+ 0xed, 0x5c, 0x92, 0x23, 0x13, 0xa2, 0x6c, 0xdd,
+ 0x0c, 0xbd, 0x73, 0xc2, 0xf2, 0x43, 0x8d, 0x3c,
+ 0x32, 0x83, 0x4d, 0xfc, 0xcc, 0x7d, 0xb3, 0x02,
+ 0xd3, 0x62, 0xac, 0x1d, 0x2d, 0x9c, 0x52, 0xe3,
+ 0x4e, 0xff, 0x31, 0x80, 0xb0, 0x01, 0xcf, 0x7e,
+ 0xaf, 0x1e, 0xd0, 0x61, 0x51, 0xe0, 0x2e, 0x9f,
+ 0x91, 0x20, 0xee, 0x5f, 0x6f, 0xde, 0x10, 0xa1,
+ 0x70, 0xc1, 0x0f, 0xbe, 0x8e, 0x3f, 0xf1, 0x40,
+ },
+ {
+ 0x00, 0xb2, 0x79, 0xcb, 0xf2, 0x40, 0x8b, 0x39,
+ 0xf9, 0x4b, 0x80, 0x32, 0x0b, 0xb9, 0x72, 0xc0,
+ 0xef, 0x5d, 0x96, 0x24, 0x1d, 0xaf, 0x64, 0xd6,
+ 0x16, 0xa4, 0x6f, 0xdd, 0xe4, 0x56, 0x9d, 0x2f,
+ 0xc3, 0x71, 0xba, 0x08, 0x31, 0x83, 0x48, 0xfa,
+ 0x3a, 0x88, 0x43, 0xf1, 0xc8, 0x7a, 0xb1, 0x03,
+ 0x2c, 0x9e, 0x55, 0xe7, 0xde, 0x6c, 0xa7, 0x15,
+ 0xd5, 0x67, 0xac, 0x1e, 0x27, 0x95, 0x5e, 0xec,
+ 0x9b, 0x29, 0xe2, 0x50, 0x69, 0xdb, 0x10, 0xa2,
+ 0x62, 0xd0, 0x1b, 0xa9, 0x90, 0x22, 0xe9, 0x5b,
+ 0x74, 0xc6, 0x0d, 0xbf, 0x86, 0x34, 0xff, 0x4d,
+ 0x8d, 0x3f, 0xf4, 0x46, 0x7f, 0xcd, 0x06, 0xb4,
+ 0x58, 0xea, 0x21, 0x93, 0xaa, 0x18, 0xd3, 0x61,
+ 0xa1, 0x13, 0xd8, 0x6a, 0x53, 0xe1, 0x2a, 0x98,
+ 0xb7, 0x05, 0xce, 0x7c, 0x45, 0xf7, 0x3c, 0x8e,
+ 0x4e, 0xfc, 0x37, 0x85, 0xbc, 0x0e, 0xc5, 0x77,
+ 0x2b, 0x99, 0x52, 0xe0, 0xd9, 0x6b, 0xa0, 0x12,
+ 0xd2, 0x60, 0xab, 0x19, 0x20, 0x92, 0x59, 0xeb,
+ 0xc4, 0x76, 0xbd, 0x0f, 0x36, 0x84, 0x4f, 0xfd,
+ 0x3d, 0x8f, 0x44, 0xf6, 0xcf, 0x7d, 0xb6, 0x04,
+ 0xe8, 0x5a, 0x91, 0x23, 0x1a, 0xa8, 0x63, 0xd1,
+ 0x11, 0xa3, 0x68, 0xda, 0xe3, 0x51, 0x9a, 0x28,
+ 0x07, 0xb5, 0x7e, 0xcc, 0xf5, 0x47, 0x8c, 0x3e,
+ 0xfe, 0x4c, 0x87, 0x35, 0x0c, 0xbe, 0x75, 0xc7,
+ 0xb0, 0x02, 0xc9, 0x7b, 0x42, 0xf0, 0x3b, 0x89,
+ 0x49, 0xfb, 0x30, 0x82, 0xbb, 0x09, 0xc2, 0x70,
+ 0x5f, 0xed, 0x26, 0x94, 0xad, 0x1f, 0xd4, 0x66,
+ 0xa6, 0x14, 0xdf, 0x6d, 0x54, 0xe6, 0x2d, 0x9f,
+ 0x73, 0xc1, 0x0a, 0xb8, 0x81, 0x33, 0xf8, 0x4a,
+ 0x8a, 0x38, 0xf3, 0x41, 0x78, 0xca, 0x01, 0xb3,
+ 0x9c, 0x2e, 0xe5, 0x57, 0x6e, 0xdc, 0x17, 0xa5,
+ 0x65, 0xd7, 0x1c, 0xae, 0x97, 0x25, 0xee, 0x5c,
+ },
+ {
+ 0x00, 0xb3, 0x7b, 0xc8, 0xf6, 0x45, 0x8d, 0x3e,
+ 0xf1, 0x42, 0x8a, 0x39, 0x07, 0xb4, 0x7c, 0xcf,
+ 0xff, 0x4c, 0x84, 0x37, 0x09, 0xba, 0x72, 0xc1,
+ 0x0e, 0xbd, 0x75, 0xc6, 0xf8, 0x4b, 0x83, 0x30,
+ 0xe3, 0x50, 0x98, 0x2b, 0x15, 0xa6, 0x6e, 0xdd,
+ 0x12, 0xa1, 0x69, 0xda, 0xe4, 0x57, 0x9f, 0x2c,
+ 0x1c, 0xaf, 0x67, 0xd4, 0xea, 0x59, 0x91, 0x22,
+ 0xed, 0x5e, 0x96, 0x25, 0x1b, 0xa8, 0x60, 0xd3,
+ 0xdb, 0x68, 0xa0, 0x13, 0x2d, 0x9e, 0x56, 0xe5,
+ 0x2a, 0x99, 0x51, 0xe2, 0xdc, 0x6f, 0xa7, 0x14,
+ 0x24, 0x97, 0x5f, 0xec, 0xd2, 0x61, 0xa9, 0x1a,
+ 0xd5, 0x66, 0xae, 0x1d, 0x23, 0x90, 0x58, 0xeb,
+ 0x38, 0x8b, 0x43, 0xf0, 0xce, 0x7d, 0xb5, 0x06,
+ 0xc9, 0x7a, 0xb2, 0x01, 0x3f, 0x8c, 0x44, 0xf7,
+ 0xc7, 0x74, 0xbc, 0x0f, 0x31, 0x82, 0x4a, 0xf9,
+ 0x36, 0x85, 0x4d, 0xfe, 0xc0, 0x73, 0xbb, 0x08,
+ 0xab, 0x18, 0xd0, 0x63, 0x5d, 0xee, 0x26, 0x95,
+ 0x5a, 0xe9, 0x21, 0x92, 0xac, 0x1f, 0xd7, 0x64,
+ 0x54, 0xe7, 0x2f, 0x9c, 0xa2, 0x11, 0xd9, 0x6a,
+ 0xa5, 0x16, 0xde, 0x6d, 0x53, 0xe0, 0x28, 0x9b,
+ 0x48, 0xfb, 0x33, 0x80, 0xbe, 0x0d, 0xc5, 0x76,
+ 0xb9, 0x0a, 0xc2, 0x71, 0x4f, 0xfc, 0x34, 0x87,
+ 0xb7, 0x04, 0xcc, 0x7f, 0x41, 0xf2, 0x3a, 0x89,
+ 0x46, 0xf5, 0x3d, 0x8e, 0xb0, 0x03, 0xcb, 0x78,
+ 0x70, 0xc3, 0x0b, 0xb8, 0x86, 0x35, 0xfd, 0x4e,
+ 0x81, 0x32, 0xfa, 0x49, 0x77, 0xc4, 0x0c, 0xbf,
+ 0x8f, 0x3c, 0xf4, 0x47, 0x79, 0xca, 0x02, 0xb1,
+ 0x7e, 0xcd, 0x05, 0xb6, 0x88, 0x3b, 0xf3, 0x40,
+ 0x93, 0x20, 0xe8, 0x5b, 0x65, 0xd6, 0x1e, 0xad,
+ 0x62, 0xd1, 0x19, 0xaa, 0x94, 0x27, 0xef, 0x5c,
+ 0x6c, 0xdf, 0x17, 0xa4, 0x9a, 0x29, 0xe1, 0x52,
+ 0x9d, 0x2e, 0xe6, 0x55, 0x6b, 0xd8, 0x10, 0xa3,
+ },
+ {
+ 0x00, 0xb4, 0x75, 0xc1, 0xea, 0x5e, 0x9f, 0x2b,
+ 0xc9, 0x7d, 0xbc, 0x08, 0x23, 0x97, 0x56, 0xe2,
+ 0x8f, 0x3b, 0xfa, 0x4e, 0x65, 0xd1, 0x10, 0xa4,
+ 0x46, 0xf2, 0x33, 0x87, 0xac, 0x18, 0xd9, 0x6d,
+ 0x03, 0xb7, 0x76, 0xc2, 0xe9, 0x5d, 0x9c, 0x28,
+ 0xca, 0x7e, 0xbf, 0x0b, 0x20, 0x94, 0x55, 0xe1,
+ 0x8c, 0x38, 0xf9, 0x4d, 0x66, 0xd2, 0x13, 0xa7,
+ 0x45, 0xf1, 0x30, 0x84, 0xaf, 0x1b, 0xda, 0x6e,
+ 0x06, 0xb2, 0x73, 0xc7, 0xec, 0x58, 0x99, 0x2d,
+ 0xcf, 0x7b, 0xba, 0x0e, 0x25, 0x91, 0x50, 0xe4,
+ 0x89, 0x3d, 0xfc, 0x48, 0x63, 0xd7, 0x16, 0xa2,
+ 0x40, 0xf4, 0x35, 0x81, 0xaa, 0x1e, 0xdf, 0x6b,
+ 0x05, 0xb1, 0x70, 0xc4, 0xef, 0x5b, 0x9a, 0x2e,
+ 0xcc, 0x78, 0xb9, 0x0d, 0x26, 0x92, 0x53, 0xe7,
+ 0x8a, 0x3e, 0xff, 0x4b, 0x60, 0xd4, 0x15, 0xa1,
+ 0x43, 0xf7, 0x36, 0x82, 0xa9, 0x1d, 0xdc, 0x68,
+ 0x0c, 0xb8, 0x79, 0xcd, 0xe6, 0x52, 0x93, 0x27,
+ 0xc5, 0x71, 0xb0, 0x04, 0x2f, 0x9b, 0x5a, 0xee,
+ 0x83, 0x37, 0xf6, 0x42, 0x69, 0xdd, 0x1c, 0xa8,
+ 0x4a, 0xfe, 0x3f, 0x8b, 0xa0, 0x14, 0xd5, 0x61,
+ 0x0f, 0xbb, 0x7a, 0xce, 0xe5, 0x51, 0x90, 0x24,
+ 0xc6, 0x72, 0xb3, 0x07, 0x2c, 0x98, 0x59, 0xed,
+ 0x80, 0x34, 0xf5, 0x41, 0x6a, 0xde, 0x1f, 0xab,
+ 0x49, 0xfd, 0x3c, 0x88, 0xa3, 0x17, 0xd6, 0x62,
+ 0x0a, 0xbe, 0x7f, 0xcb, 0xe0, 0x54, 0x95, 0x21,
+ 0xc3, 0x77, 0xb6, 0x02, 0x29, 0x9d, 0x5c, 0xe8,
+ 0x85, 0x31, 0xf0, 0x44, 0x6f, 0xdb, 0x1a, 0xae,
+ 0x4c, 0xf8, 0x39, 0x8d, 0xa6, 0x12, 0xd3, 0x67,
+ 0x09, 0xbd, 0x7c, 0xc8, 0xe3, 0x57, 0x96, 0x22,
+ 0xc0, 0x74, 0xb5, 0x01, 0x2a, 0x9e, 0x5f, 0xeb,
+ 0x86, 0x32, 0xf3, 0x47, 0x6c, 0xd8, 0x19, 0xad,
+ 0x4f, 0xfb, 0x3a, 0x8e, 0xa5, 0x11, 0xd0, 0x64,
+ },
+ {
+ 0x00, 0xb5, 0x77, 0xc2, 0xee, 0x5b, 0x99, 0x2c,
+ 0xc1, 0x74, 0xb6, 0x03, 0x2f, 0x9a, 0x58, 0xed,
+ 0x9f, 0x2a, 0xe8, 0x5d, 0x71, 0xc4, 0x06, 0xb3,
+ 0x5e, 0xeb, 0x29, 0x9c, 0xb0, 0x05, 0xc7, 0x72,
+ 0x23, 0x96, 0x54, 0xe1, 0xcd, 0x78, 0xba, 0x0f,
+ 0xe2, 0x57, 0x95, 0x20, 0x0c, 0xb9, 0x7b, 0xce,
+ 0xbc, 0x09, 0xcb, 0x7e, 0x52, 0xe7, 0x25, 0x90,
+ 0x7d, 0xc8, 0x0a, 0xbf, 0x93, 0x26, 0xe4, 0x51,
+ 0x46, 0xf3, 0x31, 0x84, 0xa8, 0x1d, 0xdf, 0x6a,
+ 0x87, 0x32, 0xf0, 0x45, 0x69, 0xdc, 0x1e, 0xab,
+ 0xd9, 0x6c, 0xae, 0x1b, 0x37, 0x82, 0x40, 0xf5,
+ 0x18, 0xad, 0x6f, 0xda, 0xf6, 0x43, 0x81, 0x34,
+ 0x65, 0xd0, 0x12, 0xa7, 0x8b, 0x3e, 0xfc, 0x49,
+ 0xa4, 0x11, 0xd3, 0x66, 0x4a, 0xff, 0x3d, 0x88,
+ 0xfa, 0x4f, 0x8d, 0x38, 0x14, 0xa1, 0x63, 0xd6,
+ 0x3b, 0x8e, 0x4c, 0xf9, 0xd5, 0x60, 0xa2, 0x17,
+ 0x8c, 0x39, 0xfb, 0x4e, 0x62, 0xd7, 0x15, 0xa0,
+ 0x4d, 0xf8, 0x3a, 0x8f, 0xa3, 0x16, 0xd4, 0x61,
+ 0x13, 0xa6, 0x64, 0xd1, 0xfd, 0x48, 0x8a, 0x3f,
+ 0xd2, 0x67, 0xa5, 0x10, 0x3c, 0x89, 0x4b, 0xfe,
+ 0xaf, 0x1a, 0xd8, 0x6d, 0x41, 0xf4, 0x36, 0x83,
+ 0x6e, 0xdb, 0x19, 0xac, 0x80, 0x35, 0xf7, 0x42,
+ 0x30, 0x85, 0x47, 0xf2, 0xde, 0x6b, 0xa9, 0x1c,
+ 0xf1, 0x44, 0x86, 0x33, 0x1f, 0xaa, 0x68, 0xdd,
+ 0xca, 0x7f, 0xbd, 0x08, 0x24, 0x91, 0x53, 0xe6,
+ 0x0b, 0xbe, 0x7c, 0xc9, 0xe5, 0x50, 0x92, 0x27,
+ 0x55, 0xe0, 0x22, 0x97, 0xbb, 0x0e, 0xcc, 0x79,
+ 0x94, 0x21, 0xe3, 0x56, 0x7a, 0xcf, 0x0d, 0xb8,
+ 0xe9, 0x5c, 0x9e, 0x2b, 0x07, 0xb2, 0x70, 0xc5,
+ 0x28, 0x9d, 0x5f, 0xea, 0xc6, 0x73, 0xb1, 0x04,
+ 0x76, 0xc3, 0x01, 0xb4, 0x98, 0x2d, 0xef, 0x5a,
+ 0xb7, 0x02, 0xc0, 0x75, 0x59, 0xec, 0x2e, 0x9b,
+ },
+ {
+ 0x00, 0xb6, 0x71, 0xc7, 0xe2, 0x54, 0x93, 0x25,
+ 0xd9, 0x6f, 0xa8, 0x1e, 0x3b, 0x8d, 0x4a, 0xfc,
+ 0xaf, 0x19, 0xde, 0x68, 0x4d, 0xfb, 0x3c, 0x8a,
+ 0x76, 0xc0, 0x07, 0xb1, 0x94, 0x22, 0xe5, 0x53,
+ 0x43, 0xf5, 0x32, 0x84, 0xa1, 0x17, 0xd0, 0x66,
+ 0x9a, 0x2c, 0xeb, 0x5d, 0x78, 0xce, 0x09, 0xbf,
+ 0xec, 0x5a, 0x9d, 0x2b, 0x0e, 0xb8, 0x7f, 0xc9,
+ 0x35, 0x83, 0x44, 0xf2, 0xd7, 0x61, 0xa6, 0x10,
+ 0x86, 0x30, 0xf7, 0x41, 0x64, 0xd2, 0x15, 0xa3,
+ 0x5f, 0xe9, 0x2e, 0x98, 0xbd, 0x0b, 0xcc, 0x7a,
+ 0x29, 0x9f, 0x58, 0xee, 0xcb, 0x7d, 0xba, 0x0c,
+ 0xf0, 0x46, 0x81, 0x37, 0x12, 0xa4, 0x63, 0xd5,
+ 0xc5, 0x73, 0xb4, 0x02, 0x27, 0x91, 0x56, 0xe0,
+ 0x1c, 0xaa, 0x6d, 0xdb, 0xfe, 0x48, 0x8f, 0x39,
+ 0x6a, 0xdc, 0x1b, 0xad, 0x88, 0x3e, 0xf9, 0x4f,
+ 0xb3, 0x05, 0xc2, 0x74, 0x51, 0xe7, 0x20, 0x96,
+ 0x11, 0xa7, 0x60, 0xd6, 0xf3, 0x45, 0x82, 0x34,
+ 0xc8, 0x7e, 0xb9, 0x0f, 0x2a, 0x9c, 0x5b, 0xed,
+ 0xbe, 0x08, 0xcf, 0x79, 0x5c, 0xea, 0x2d, 0x9b,
+ 0x67, 0xd1, 0x16, 0xa0, 0x85, 0x33, 0xf4, 0x42,
+ 0x52, 0xe4, 0x23, 0x95, 0xb0, 0x06, 0xc1, 0x77,
+ 0x8b, 0x3d, 0xfa, 0x4c, 0x69, 0xdf, 0x18, 0xae,
+ 0xfd, 0x4b, 0x8c, 0x3a, 0x1f, 0xa9, 0x6e, 0xd8,
+ 0x24, 0x92, 0x55, 0xe3, 0xc6, 0x70, 0xb7, 0x01,
+ 0x97, 0x21, 0xe6, 0x50, 0x75, 0xc3, 0x04, 0xb2,
+ 0x4e, 0xf8, 0x3f, 0x89, 0xac, 0x1a, 0xdd, 0x6b,
+ 0x38, 0x8e, 0x49, 0xff, 0xda, 0x6c, 0xab, 0x1d,
+ 0xe1, 0x57, 0x90, 0x26, 0x03, 0xb5, 0x72, 0xc4,
+ 0xd4, 0x62, 0xa5, 0x13, 0x36, 0x80, 0x47, 0xf1,
+ 0x0d, 0xbb, 0x7c, 0xca, 0xef, 0x59, 0x9e, 0x28,
+ 0x7b, 0xcd, 0x0a, 0xbc, 0x99, 0x2f, 0xe8, 0x5e,
+ 0xa2, 0x14, 0xd3, 0x65, 0x40, 0xf6, 0x31, 0x87,
+ },
+ {
+ 0x00, 0xb7, 0x73, 0xc4, 0xe6, 0x51, 0x95, 0x22,
+ 0xd1, 0x66, 0xa2, 0x15, 0x37, 0x80, 0x44, 0xf3,
+ 0xbf, 0x08, 0xcc, 0x7b, 0x59, 0xee, 0x2a, 0x9d,
+ 0x6e, 0xd9, 0x1d, 0xaa, 0x88, 0x3f, 0xfb, 0x4c,
+ 0x63, 0xd4, 0x10, 0xa7, 0x85, 0x32, 0xf6, 0x41,
+ 0xb2, 0x05, 0xc1, 0x76, 0x54, 0xe3, 0x27, 0x90,
+ 0xdc, 0x6b, 0xaf, 0x18, 0x3a, 0x8d, 0x49, 0xfe,
+ 0x0d, 0xba, 0x7e, 0xc9, 0xeb, 0x5c, 0x98, 0x2f,
+ 0xc6, 0x71, 0xb5, 0x02, 0x20, 0x97, 0x53, 0xe4,
+ 0x17, 0xa0, 0x64, 0xd3, 0xf1, 0x46, 0x82, 0x35,
+ 0x79, 0xce, 0x0a, 0xbd, 0x9f, 0x28, 0xec, 0x5b,
+ 0xa8, 0x1f, 0xdb, 0x6c, 0x4e, 0xf9, 0x3d, 0x8a,
+ 0xa5, 0x12, 0xd6, 0x61, 0x43, 0xf4, 0x30, 0x87,
+ 0x74, 0xc3, 0x07, 0xb0, 0x92, 0x25, 0xe1, 0x56,
+ 0x1a, 0xad, 0x69, 0xde, 0xfc, 0x4b, 0x8f, 0x38,
+ 0xcb, 0x7c, 0xb8, 0x0f, 0x2d, 0x9a, 0x5e, 0xe9,
+ 0x91, 0x26, 0xe2, 0x55, 0x77, 0xc0, 0x04, 0xb3,
+ 0x40, 0xf7, 0x33, 0x84, 0xa6, 0x11, 0xd5, 0x62,
+ 0x2e, 0x99, 0x5d, 0xea, 0xc8, 0x7f, 0xbb, 0x0c,
+ 0xff, 0x48, 0x8c, 0x3b, 0x19, 0xae, 0x6a, 0xdd,
+ 0xf2, 0x45, 0x81, 0x36, 0x14, 0xa3, 0x67, 0xd0,
+ 0x23, 0x94, 0x50, 0xe7, 0xc5, 0x72, 0xb6, 0x01,
+ 0x4d, 0xfa, 0x3e, 0x89, 0xab, 0x1c, 0xd8, 0x6f,
+ 0x9c, 0x2b, 0xef, 0x58, 0x7a, 0xcd, 0x09, 0xbe,
+ 0x57, 0xe0, 0x24, 0x93, 0xb1, 0x06, 0xc2, 0x75,
+ 0x86, 0x31, 0xf5, 0x42, 0x60, 0xd7, 0x13, 0xa4,
+ 0xe8, 0x5f, 0x9b, 0x2c, 0x0e, 0xb9, 0x7d, 0xca,
+ 0x39, 0x8e, 0x4a, 0xfd, 0xdf, 0x68, 0xac, 0x1b,
+ 0x34, 0x83, 0x47, 0xf0, 0xd2, 0x65, 0xa1, 0x16,
+ 0xe5, 0x52, 0x96, 0x21, 0x03, 0xb4, 0x70, 0xc7,
+ 0x8b, 0x3c, 0xf8, 0x4f, 0x6d, 0xda, 0x1e, 0xa9,
+ 0x5a, 0xed, 0x29, 0x9e, 0xbc, 0x0b, 0xcf, 0x78,
+ },
+ {
+ 0x00, 0xb8, 0x6d, 0xd5, 0xda, 0x62, 0xb7, 0x0f,
+ 0xa9, 0x11, 0xc4, 0x7c, 0x73, 0xcb, 0x1e, 0xa6,
+ 0x4f, 0xf7, 0x22, 0x9a, 0x95, 0x2d, 0xf8, 0x40,
+ 0xe6, 0x5e, 0x8b, 0x33, 0x3c, 0x84, 0x51, 0xe9,
+ 0x9e, 0x26, 0xf3, 0x4b, 0x44, 0xfc, 0x29, 0x91,
+ 0x37, 0x8f, 0x5a, 0xe2, 0xed, 0x55, 0x80, 0x38,
+ 0xd1, 0x69, 0xbc, 0x04, 0x0b, 0xb3, 0x66, 0xde,
+ 0x78, 0xc0, 0x15, 0xad, 0xa2, 0x1a, 0xcf, 0x77,
+ 0x21, 0x99, 0x4c, 0xf4, 0xfb, 0x43, 0x96, 0x2e,
+ 0x88, 0x30, 0xe5, 0x5d, 0x52, 0xea, 0x3f, 0x87,
+ 0x6e, 0xd6, 0x03, 0xbb, 0xb4, 0x0c, 0xd9, 0x61,
+ 0xc7, 0x7f, 0xaa, 0x12, 0x1d, 0xa5, 0x70, 0xc8,
+ 0xbf, 0x07, 0xd2, 0x6a, 0x65, 0xdd, 0x08, 0xb0,
+ 0x16, 0xae, 0x7b, 0xc3, 0xcc, 0x74, 0xa1, 0x19,
+ 0xf0, 0x48, 0x9d, 0x25, 0x2a, 0x92, 0x47, 0xff,
+ 0x59, 0xe1, 0x34, 0x8c, 0x83, 0x3b, 0xee, 0x56,
+ 0x42, 0xfa, 0x2f, 0x97, 0x98, 0x20, 0xf5, 0x4d,
+ 0xeb, 0x53, 0x86, 0x3e, 0x31, 0x89, 0x5c, 0xe4,
+ 0x0d, 0xb5, 0x60, 0xd8, 0xd7, 0x6f, 0xba, 0x02,
+ 0xa4, 0x1c, 0xc9, 0x71, 0x7e, 0xc6, 0x13, 0xab,
+ 0xdc, 0x64, 0xb1, 0x09, 0x06, 0xbe, 0x6b, 0xd3,
+ 0x75, 0xcd, 0x18, 0xa0, 0xaf, 0x17, 0xc2, 0x7a,
+ 0x93, 0x2b, 0xfe, 0x46, 0x49, 0xf1, 0x24, 0x9c,
+ 0x3a, 0x82, 0x57, 0xef, 0xe0, 0x58, 0x8d, 0x35,
+ 0x63, 0xdb, 0x0e, 0xb6, 0xb9, 0x01, 0xd4, 0x6c,
+ 0xca, 0x72, 0xa7, 0x1f, 0x10, 0xa8, 0x7d, 0xc5,
+ 0x2c, 0x94, 0x41, 0xf9, 0xf6, 0x4e, 0x9b, 0x23,
+ 0x85, 0x3d, 0xe8, 0x50, 0x5f, 0xe7, 0x32, 0x8a,
+ 0xfd, 0x45, 0x90, 0x28, 0x27, 0x9f, 0x4a, 0xf2,
+ 0x54, 0xec, 0x39, 0x81, 0x8e, 0x36, 0xe3, 0x5b,
+ 0xb2, 0x0a, 0xdf, 0x67, 0x68, 0xd0, 0x05, 0xbd,
+ 0x1b, 0xa3, 0x76, 0xce, 0xc1, 0x79, 0xac, 0x14,
+ },
+ {
+ 0x00, 0xb9, 0x6f, 0xd6, 0xde, 0x67, 0xb1, 0x08,
+ 0xa1, 0x18, 0xce, 0x77, 0x7f, 0xc6, 0x10, 0xa9,
+ 0x5f, 0xe6, 0x30, 0x89, 0x81, 0x38, 0xee, 0x57,
+ 0xfe, 0x47, 0x91, 0x28, 0x20, 0x99, 0x4f, 0xf6,
+ 0xbe, 0x07, 0xd1, 0x68, 0x60, 0xd9, 0x0f, 0xb6,
+ 0x1f, 0xa6, 0x70, 0xc9, 0xc1, 0x78, 0xae, 0x17,
+ 0xe1, 0x58, 0x8e, 0x37, 0x3f, 0x86, 0x50, 0xe9,
+ 0x40, 0xf9, 0x2f, 0x96, 0x9e, 0x27, 0xf1, 0x48,
+ 0x61, 0xd8, 0x0e, 0xb7, 0xbf, 0x06, 0xd0, 0x69,
+ 0xc0, 0x79, 0xaf, 0x16, 0x1e, 0xa7, 0x71, 0xc8,
+ 0x3e, 0x87, 0x51, 0xe8, 0xe0, 0x59, 0x8f, 0x36,
+ 0x9f, 0x26, 0xf0, 0x49, 0x41, 0xf8, 0x2e, 0x97,
+ 0xdf, 0x66, 0xb0, 0x09, 0x01, 0xb8, 0x6e, 0xd7,
+ 0x7e, 0xc7, 0x11, 0xa8, 0xa0, 0x19, 0xcf, 0x76,
+ 0x80, 0x39, 0xef, 0x56, 0x5e, 0xe7, 0x31, 0x88,
+ 0x21, 0x98, 0x4e, 0xf7, 0xff, 0x46, 0x90, 0x29,
+ 0xc2, 0x7b, 0xad, 0x14, 0x1c, 0xa5, 0x73, 0xca,
+ 0x63, 0xda, 0x0c, 0xb5, 0xbd, 0x04, 0xd2, 0x6b,
+ 0x9d, 0x24, 0xf2, 0x4b, 0x43, 0xfa, 0x2c, 0x95,
+ 0x3c, 0x85, 0x53, 0xea, 0xe2, 0x5b, 0x8d, 0x34,
+ 0x7c, 0xc5, 0x13, 0xaa, 0xa2, 0x1b, 0xcd, 0x74,
+ 0xdd, 0x64, 0xb2, 0x0b, 0x03, 0xba, 0x6c, 0xd5,
+ 0x23, 0x9a, 0x4c, 0xf5, 0xfd, 0x44, 0x92, 0x2b,
+ 0x82, 0x3b, 0xed, 0x54, 0x5c, 0xe5, 0x33, 0x8a,
+ 0xa3, 0x1a, 0xcc, 0x75, 0x7d, 0xc4, 0x12, 0xab,
+ 0x02, 0xbb, 0x6d, 0xd4, 0xdc, 0x65, 0xb3, 0x0a,
+ 0xfc, 0x45, 0x93, 0x2a, 0x22, 0x9b, 0x4d, 0xf4,
+ 0x5d, 0xe4, 0x32, 0x8b, 0x83, 0x3a, 0xec, 0x55,
+ 0x1d, 0xa4, 0x72, 0xcb, 0xc3, 0x7a, 0xac, 0x15,
+ 0xbc, 0x05, 0xd3, 0x6a, 0x62, 0xdb, 0x0d, 0xb4,
+ 0x42, 0xfb, 0x2d, 0x94, 0x9c, 0x25, 0xf3, 0x4a,
+ 0xe3, 0x5a, 0x8c, 0x35, 0x3d, 0x84, 0x52, 0xeb,
+ },
+ {
+ 0x00, 0xba, 0x69, 0xd3, 0xd2, 0x68, 0xbb, 0x01,
+ 0xb9, 0x03, 0xd0, 0x6a, 0x6b, 0xd1, 0x02, 0xb8,
+ 0x6f, 0xd5, 0x06, 0xbc, 0xbd, 0x07, 0xd4, 0x6e,
+ 0xd6, 0x6c, 0xbf, 0x05, 0x04, 0xbe, 0x6d, 0xd7,
+ 0xde, 0x64, 0xb7, 0x0d, 0x0c, 0xb6, 0x65, 0xdf,
+ 0x67, 0xdd, 0x0e, 0xb4, 0xb5, 0x0f, 0xdc, 0x66,
+ 0xb1, 0x0b, 0xd8, 0x62, 0x63, 0xd9, 0x0a, 0xb0,
+ 0x08, 0xb2, 0x61, 0xdb, 0xda, 0x60, 0xb3, 0x09,
+ 0xa1, 0x1b, 0xc8, 0x72, 0x73, 0xc9, 0x1a, 0xa0,
+ 0x18, 0xa2, 0x71, 0xcb, 0xca, 0x70, 0xa3, 0x19,
+ 0xce, 0x74, 0xa7, 0x1d, 0x1c, 0xa6, 0x75, 0xcf,
+ 0x77, 0xcd, 0x1e, 0xa4, 0xa5, 0x1f, 0xcc, 0x76,
+ 0x7f, 0xc5, 0x16, 0xac, 0xad, 0x17, 0xc4, 0x7e,
+ 0xc6, 0x7c, 0xaf, 0x15, 0x14, 0xae, 0x7d, 0xc7,
+ 0x10, 0xaa, 0x79, 0xc3, 0xc2, 0x78, 0xab, 0x11,
+ 0xa9, 0x13, 0xc0, 0x7a, 0x7b, 0xc1, 0x12, 0xa8,
+ 0x5f, 0xe5, 0x36, 0x8c, 0x8d, 0x37, 0xe4, 0x5e,
+ 0xe6, 0x5c, 0x8f, 0x35, 0x34, 0x8e, 0x5d, 0xe7,
+ 0x30, 0x8a, 0x59, 0xe3, 0xe2, 0x58, 0x8b, 0x31,
+ 0x89, 0x33, 0xe0, 0x5a, 0x5b, 0xe1, 0x32, 0x88,
+ 0x81, 0x3b, 0xe8, 0x52, 0x53, 0xe9, 0x3a, 0x80,
+ 0x38, 0x82, 0x51, 0xeb, 0xea, 0x50, 0x83, 0x39,
+ 0xee, 0x54, 0x87, 0x3d, 0x3c, 0x86, 0x55, 0xef,
+ 0x57, 0xed, 0x3e, 0x84, 0x85, 0x3f, 0xec, 0x56,
+ 0xfe, 0x44, 0x97, 0x2d, 0x2c, 0x96, 0x45, 0xff,
+ 0x47, 0xfd, 0x2e, 0x94, 0x95, 0x2f, 0xfc, 0x46,
+ 0x91, 0x2b, 0xf8, 0x42, 0x43, 0xf9, 0x2a, 0x90,
+ 0x28, 0x92, 0x41, 0xfb, 0xfa, 0x40, 0x93, 0x29,
+ 0x20, 0x9a, 0x49, 0xf3, 0xf2, 0x48, 0x9b, 0x21,
+ 0x99, 0x23, 0xf0, 0x4a, 0x4b, 0xf1, 0x22, 0x98,
+ 0x4f, 0xf5, 0x26, 0x9c, 0x9d, 0x27, 0xf4, 0x4e,
+ 0xf6, 0x4c, 0x9f, 0x25, 0x24, 0x9e, 0x4d, 0xf7,
+ },
+ {
+ 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7,
+ 0x7f, 0xc4, 0x14, 0xaf, 0xa9, 0x12, 0xc2, 0x79,
+ 0xce, 0x75, 0xa5, 0x1e, 0x18, 0xa3, 0x73, 0xc8,
+ 0xfe, 0x45, 0x95, 0x2e, 0x28, 0x93, 0x43, 0xf8,
+ 0x4f, 0xf4, 0x24, 0x9f, 0x99, 0x22, 0xf2, 0x49,
+ 0x81, 0x3a, 0xea, 0x51, 0x57, 0xec, 0x3c, 0x87,
+ 0x30, 0x8b, 0x5b, 0xe0, 0xe6, 0x5d, 0x8d, 0x36,
+ 0xe1, 0x5a, 0x8a, 0x31, 0x37, 0x8c, 0x5c, 0xe7,
+ 0x50, 0xeb, 0x3b, 0x80, 0x86, 0x3d, 0xed, 0x56,
+ 0x9e, 0x25, 0xf5, 0x4e, 0x48, 0xf3, 0x23, 0x98,
+ 0x2f, 0x94, 0x44, 0xff, 0xf9, 0x42, 0x92, 0x29,
+ 0x1f, 0xa4, 0x74, 0xcf, 0xc9, 0x72, 0xa2, 0x19,
+ 0xae, 0x15, 0xc5, 0x7e, 0x78, 0xc3, 0x13, 0xa8,
+ 0x60, 0xdb, 0x0b, 0xb0, 0xb6, 0x0d, 0xdd, 0x66,
+ 0xd1, 0x6a, 0xba, 0x01, 0x07, 0xbc, 0x6c, 0xd7,
+ 0xdf, 0x64, 0xb4, 0x0f, 0x09, 0xb2, 0x62, 0xd9,
+ 0x6e, 0xd5, 0x05, 0xbe, 0xb8, 0x03, 0xd3, 0x68,
+ 0xa0, 0x1b, 0xcb, 0x70, 0x76, 0xcd, 0x1d, 0xa6,
+ 0x11, 0xaa, 0x7a, 0xc1, 0xc7, 0x7c, 0xac, 0x17,
+ 0x21, 0x9a, 0x4a, 0xf1, 0xf7, 0x4c, 0x9c, 0x27,
+ 0x90, 0x2b, 0xfb, 0x40, 0x46, 0xfd, 0x2d, 0x96,
+ 0x5e, 0xe5, 0x35, 0x8e, 0x88, 0x33, 0xe3, 0x58,
+ 0xef, 0x54, 0x84, 0x3f, 0x39, 0x82, 0x52, 0xe9,
+ 0x3e, 0x85, 0x55, 0xee, 0xe8, 0x53, 0x83, 0x38,
+ 0x8f, 0x34, 0xe4, 0x5f, 0x59, 0xe2, 0x32, 0x89,
+ 0x41, 0xfa, 0x2a, 0x91, 0x97, 0x2c, 0xfc, 0x47,
+ 0xf0, 0x4b, 0x9b, 0x20, 0x26, 0x9d, 0x4d, 0xf6,
+ 0xc0, 0x7b, 0xab, 0x10, 0x16, 0xad, 0x7d, 0xc6,
+ 0x71, 0xca, 0x1a, 0xa1, 0xa7, 0x1c, 0xcc, 0x77,
+ 0xbf, 0x04, 0xd4, 0x6f, 0x69, 0xd2, 0x02, 0xb9,
+ 0x0e, 0xb5, 0x65, 0xde, 0xd8, 0x63, 0xb3, 0x08,
+ },
+ {
+ 0x00, 0xbc, 0x65, 0xd9, 0xca, 0x76, 0xaf, 0x13,
+ 0x89, 0x35, 0xec, 0x50, 0x43, 0xff, 0x26, 0x9a,
+ 0x0f, 0xb3, 0x6a, 0xd6, 0xc5, 0x79, 0xa0, 0x1c,
+ 0x86, 0x3a, 0xe3, 0x5f, 0x4c, 0xf0, 0x29, 0x95,
+ 0x1e, 0xa2, 0x7b, 0xc7, 0xd4, 0x68, 0xb1, 0x0d,
+ 0x97, 0x2b, 0xf2, 0x4e, 0x5d, 0xe1, 0x38, 0x84,
+ 0x11, 0xad, 0x74, 0xc8, 0xdb, 0x67, 0xbe, 0x02,
+ 0x98, 0x24, 0xfd, 0x41, 0x52, 0xee, 0x37, 0x8b,
+ 0x3c, 0x80, 0x59, 0xe5, 0xf6, 0x4a, 0x93, 0x2f,
+ 0xb5, 0x09, 0xd0, 0x6c, 0x7f, 0xc3, 0x1a, 0xa6,
+ 0x33, 0x8f, 0x56, 0xea, 0xf9, 0x45, 0x9c, 0x20,
+ 0xba, 0x06, 0xdf, 0x63, 0x70, 0xcc, 0x15, 0xa9,
+ 0x22, 0x9e, 0x47, 0xfb, 0xe8, 0x54, 0x8d, 0x31,
+ 0xab, 0x17, 0xce, 0x72, 0x61, 0xdd, 0x04, 0xb8,
+ 0x2d, 0x91, 0x48, 0xf4, 0xe7, 0x5b, 0x82, 0x3e,
+ 0xa4, 0x18, 0xc1, 0x7d, 0x6e, 0xd2, 0x0b, 0xb7,
+ 0x78, 0xc4, 0x1d, 0xa1, 0xb2, 0x0e, 0xd7, 0x6b,
+ 0xf1, 0x4d, 0x94, 0x28, 0x3b, 0x87, 0x5e, 0xe2,
+ 0x77, 0xcb, 0x12, 0xae, 0xbd, 0x01, 0xd8, 0x64,
+ 0xfe, 0x42, 0x9b, 0x27, 0x34, 0x88, 0x51, 0xed,
+ 0x66, 0xda, 0x03, 0xbf, 0xac, 0x10, 0xc9, 0x75,
+ 0xef, 0x53, 0x8a, 0x36, 0x25, 0x99, 0x40, 0xfc,
+ 0x69, 0xd5, 0x0c, 0xb0, 0xa3, 0x1f, 0xc6, 0x7a,
+ 0xe0, 0x5c, 0x85, 0x39, 0x2a, 0x96, 0x4f, 0xf3,
+ 0x44, 0xf8, 0x21, 0x9d, 0x8e, 0x32, 0xeb, 0x57,
+ 0xcd, 0x71, 0xa8, 0x14, 0x07, 0xbb, 0x62, 0xde,
+ 0x4b, 0xf7, 0x2e, 0x92, 0x81, 0x3d, 0xe4, 0x58,
+ 0xc2, 0x7e, 0xa7, 0x1b, 0x08, 0xb4, 0x6d, 0xd1,
+ 0x5a, 0xe6, 0x3f, 0x83, 0x90, 0x2c, 0xf5, 0x49,
+ 0xd3, 0x6f, 0xb6, 0x0a, 0x19, 0xa5, 0x7c, 0xc0,
+ 0x55, 0xe9, 0x30, 0x8c, 0x9f, 0x23, 0xfa, 0x46,
+ 0xdc, 0x60, 0xb9, 0x05, 0x16, 0xaa, 0x73, 0xcf,
+ },
+ {
+ 0x00, 0xbd, 0x67, 0xda, 0xce, 0x73, 0xa9, 0x14,
+ 0x81, 0x3c, 0xe6, 0x5b, 0x4f, 0xf2, 0x28, 0x95,
+ 0x1f, 0xa2, 0x78, 0xc5, 0xd1, 0x6c, 0xb6, 0x0b,
+ 0x9e, 0x23, 0xf9, 0x44, 0x50, 0xed, 0x37, 0x8a,
+ 0x3e, 0x83, 0x59, 0xe4, 0xf0, 0x4d, 0x97, 0x2a,
+ 0xbf, 0x02, 0xd8, 0x65, 0x71, 0xcc, 0x16, 0xab,
+ 0x21, 0x9c, 0x46, 0xfb, 0xef, 0x52, 0x88, 0x35,
+ 0xa0, 0x1d, 0xc7, 0x7a, 0x6e, 0xd3, 0x09, 0xb4,
+ 0x7c, 0xc1, 0x1b, 0xa6, 0xb2, 0x0f, 0xd5, 0x68,
+ 0xfd, 0x40, 0x9a, 0x27, 0x33, 0x8e, 0x54, 0xe9,
+ 0x63, 0xde, 0x04, 0xb9, 0xad, 0x10, 0xca, 0x77,
+ 0xe2, 0x5f, 0x85, 0x38, 0x2c, 0x91, 0x4b, 0xf6,
+ 0x42, 0xff, 0x25, 0x98, 0x8c, 0x31, 0xeb, 0x56,
+ 0xc3, 0x7e, 0xa4, 0x19, 0x0d, 0xb0, 0x6a, 0xd7,
+ 0x5d, 0xe0, 0x3a, 0x87, 0x93, 0x2e, 0xf4, 0x49,
+ 0xdc, 0x61, 0xbb, 0x06, 0x12, 0xaf, 0x75, 0xc8,
+ 0xf8, 0x45, 0x9f, 0x22, 0x36, 0x8b, 0x51, 0xec,
+ 0x79, 0xc4, 0x1e, 0xa3, 0xb7, 0x0a, 0xd0, 0x6d,
+ 0xe7, 0x5a, 0x80, 0x3d, 0x29, 0x94, 0x4e, 0xf3,
+ 0x66, 0xdb, 0x01, 0xbc, 0xa8, 0x15, 0xcf, 0x72,
+ 0xc6, 0x7b, 0xa1, 0x1c, 0x08, 0xb5, 0x6f, 0xd2,
+ 0x47, 0xfa, 0x20, 0x9d, 0x89, 0x34, 0xee, 0x53,
+ 0xd9, 0x64, 0xbe, 0x03, 0x17, 0xaa, 0x70, 0xcd,
+ 0x58, 0xe5, 0x3f, 0x82, 0x96, 0x2b, 0xf1, 0x4c,
+ 0x84, 0x39, 0xe3, 0x5e, 0x4a, 0xf7, 0x2d, 0x90,
+ 0x05, 0xb8, 0x62, 0xdf, 0xcb, 0x76, 0xac, 0x11,
+ 0x9b, 0x26, 0xfc, 0x41, 0x55, 0xe8, 0x32, 0x8f,
+ 0x1a, 0xa7, 0x7d, 0xc0, 0xd4, 0x69, 0xb3, 0x0e,
+ 0xba, 0x07, 0xdd, 0x60, 0x74, 0xc9, 0x13, 0xae,
+ 0x3b, 0x86, 0x5c, 0xe1, 0xf5, 0x48, 0x92, 0x2f,
+ 0xa5, 0x18, 0xc2, 0x7f, 0x6b, 0xd6, 0x0c, 0xb1,
+ 0x24, 0x99, 0x43, 0xfe, 0xea, 0x57, 0x8d, 0x30,
+ },
+ {
+ 0x00, 0xbe, 0x61, 0xdf, 0xc2, 0x7c, 0xa3, 0x1d,
+ 0x99, 0x27, 0xf8, 0x46, 0x5b, 0xe5, 0x3a, 0x84,
+ 0x2f, 0x91, 0x4e, 0xf0, 0xed, 0x53, 0x8c, 0x32,
+ 0xb6, 0x08, 0xd7, 0x69, 0x74, 0xca, 0x15, 0xab,
+ 0x5e, 0xe0, 0x3f, 0x81, 0x9c, 0x22, 0xfd, 0x43,
+ 0xc7, 0x79, 0xa6, 0x18, 0x05, 0xbb, 0x64, 0xda,
+ 0x71, 0xcf, 0x10, 0xae, 0xb3, 0x0d, 0xd2, 0x6c,
+ 0xe8, 0x56, 0x89, 0x37, 0x2a, 0x94, 0x4b, 0xf5,
+ 0xbc, 0x02, 0xdd, 0x63, 0x7e, 0xc0, 0x1f, 0xa1,
+ 0x25, 0x9b, 0x44, 0xfa, 0xe7, 0x59, 0x86, 0x38,
+ 0x93, 0x2d, 0xf2, 0x4c, 0x51, 0xef, 0x30, 0x8e,
+ 0x0a, 0xb4, 0x6b, 0xd5, 0xc8, 0x76, 0xa9, 0x17,
+ 0xe2, 0x5c, 0x83, 0x3d, 0x20, 0x9e, 0x41, 0xff,
+ 0x7b, 0xc5, 0x1a, 0xa4, 0xb9, 0x07, 0xd8, 0x66,
+ 0xcd, 0x73, 0xac, 0x12, 0x0f, 0xb1, 0x6e, 0xd0,
+ 0x54, 0xea, 0x35, 0x8b, 0x96, 0x28, 0xf7, 0x49,
+ 0x65, 0xdb, 0x04, 0xba, 0xa7, 0x19, 0xc6, 0x78,
+ 0xfc, 0x42, 0x9d, 0x23, 0x3e, 0x80, 0x5f, 0xe1,
+ 0x4a, 0xf4, 0x2b, 0x95, 0x88, 0x36, 0xe9, 0x57,
+ 0xd3, 0x6d, 0xb2, 0x0c, 0x11, 0xaf, 0x70, 0xce,
+ 0x3b, 0x85, 0x5a, 0xe4, 0xf9, 0x47, 0x98, 0x26,
+ 0xa2, 0x1c, 0xc3, 0x7d, 0x60, 0xde, 0x01, 0xbf,
+ 0x14, 0xaa, 0x75, 0xcb, 0xd6, 0x68, 0xb7, 0x09,
+ 0x8d, 0x33, 0xec, 0x52, 0x4f, 0xf1, 0x2e, 0x90,
+ 0xd9, 0x67, 0xb8, 0x06, 0x1b, 0xa5, 0x7a, 0xc4,
+ 0x40, 0xfe, 0x21, 0x9f, 0x82, 0x3c, 0xe3, 0x5d,
+ 0xf6, 0x48, 0x97, 0x29, 0x34, 0x8a, 0x55, 0xeb,
+ 0x6f, 0xd1, 0x0e, 0xb0, 0xad, 0x13, 0xcc, 0x72,
+ 0x87, 0x39, 0xe6, 0x58, 0x45, 0xfb, 0x24, 0x9a,
+ 0x1e, 0xa0, 0x7f, 0xc1, 0xdc, 0x62, 0xbd, 0x03,
+ 0xa8, 0x16, 0xc9, 0x77, 0x6a, 0xd4, 0x0b, 0xb5,
+ 0x31, 0x8f, 0x50, 0xee, 0xf3, 0x4d, 0x92, 0x2c,
+ },
+ {
+ 0x00, 0xbf, 0x63, 0xdc, 0xc6, 0x79, 0xa5, 0x1a,
+ 0x91, 0x2e, 0xf2, 0x4d, 0x57, 0xe8, 0x34, 0x8b,
+ 0x3f, 0x80, 0x5c, 0xe3, 0xf9, 0x46, 0x9a, 0x25,
+ 0xae, 0x11, 0xcd, 0x72, 0x68, 0xd7, 0x0b, 0xb4,
+ 0x7e, 0xc1, 0x1d, 0xa2, 0xb8, 0x07, 0xdb, 0x64,
+ 0xef, 0x50, 0x8c, 0x33, 0x29, 0x96, 0x4a, 0xf5,
+ 0x41, 0xfe, 0x22, 0x9d, 0x87, 0x38, 0xe4, 0x5b,
+ 0xd0, 0x6f, 0xb3, 0x0c, 0x16, 0xa9, 0x75, 0xca,
+ 0xfc, 0x43, 0x9f, 0x20, 0x3a, 0x85, 0x59, 0xe6,
+ 0x6d, 0xd2, 0x0e, 0xb1, 0xab, 0x14, 0xc8, 0x77,
+ 0xc3, 0x7c, 0xa0, 0x1f, 0x05, 0xba, 0x66, 0xd9,
+ 0x52, 0xed, 0x31, 0x8e, 0x94, 0x2b, 0xf7, 0x48,
+ 0x82, 0x3d, 0xe1, 0x5e, 0x44, 0xfb, 0x27, 0x98,
+ 0x13, 0xac, 0x70, 0xcf, 0xd5, 0x6a, 0xb6, 0x09,
+ 0xbd, 0x02, 0xde, 0x61, 0x7b, 0xc4, 0x18, 0xa7,
+ 0x2c, 0x93, 0x4f, 0xf0, 0xea, 0x55, 0x89, 0x36,
+ 0xe5, 0x5a, 0x86, 0x39, 0x23, 0x9c, 0x40, 0xff,
+ 0x74, 0xcb, 0x17, 0xa8, 0xb2, 0x0d, 0xd1, 0x6e,
+ 0xda, 0x65, 0xb9, 0x06, 0x1c, 0xa3, 0x7f, 0xc0,
+ 0x4b, 0xf4, 0x28, 0x97, 0x8d, 0x32, 0xee, 0x51,
+ 0x9b, 0x24, 0xf8, 0x47, 0x5d, 0xe2, 0x3e, 0x81,
+ 0x0a, 0xb5, 0x69, 0xd6, 0xcc, 0x73, 0xaf, 0x10,
+ 0xa4, 0x1b, 0xc7, 0x78, 0x62, 0xdd, 0x01, 0xbe,
+ 0x35, 0x8a, 0x56, 0xe9, 0xf3, 0x4c, 0x90, 0x2f,
+ 0x19, 0xa6, 0x7a, 0xc5, 0xdf, 0x60, 0xbc, 0x03,
+ 0x88, 0x37, 0xeb, 0x54, 0x4e, 0xf1, 0x2d, 0x92,
+ 0x26, 0x99, 0x45, 0xfa, 0xe0, 0x5f, 0x83, 0x3c,
+ 0xb7, 0x08, 0xd4, 0x6b, 0x71, 0xce, 0x12, 0xad,
+ 0x67, 0xd8, 0x04, 0xbb, 0xa1, 0x1e, 0xc2, 0x7d,
+ 0xf6, 0x49, 0x95, 0x2a, 0x30, 0x8f, 0x53, 0xec,
+ 0x58, 0xe7, 0x3b, 0x84, 0x9e, 0x21, 0xfd, 0x42,
+ 0xc9, 0x76, 0xaa, 0x15, 0x0f, 0xb0, 0x6c, 0xd3,
+ },
+ {
+ 0x00, 0xc0, 0x9d, 0x5d, 0x27, 0xe7, 0xba, 0x7a,
+ 0x4e, 0x8e, 0xd3, 0x13, 0x69, 0xa9, 0xf4, 0x34,
+ 0x9c, 0x5c, 0x01, 0xc1, 0xbb, 0x7b, 0x26, 0xe6,
+ 0xd2, 0x12, 0x4f, 0x8f, 0xf5, 0x35, 0x68, 0xa8,
+ 0x25, 0xe5, 0xb8, 0x78, 0x02, 0xc2, 0x9f, 0x5f,
+ 0x6b, 0xab, 0xf6, 0x36, 0x4c, 0x8c, 0xd1, 0x11,
+ 0xb9, 0x79, 0x24, 0xe4, 0x9e, 0x5e, 0x03, 0xc3,
+ 0xf7, 0x37, 0x6a, 0xaa, 0xd0, 0x10, 0x4d, 0x8d,
+ 0x4a, 0x8a, 0xd7, 0x17, 0x6d, 0xad, 0xf0, 0x30,
+ 0x04, 0xc4, 0x99, 0x59, 0x23, 0xe3, 0xbe, 0x7e,
+ 0xd6, 0x16, 0x4b, 0x8b, 0xf1, 0x31, 0x6c, 0xac,
+ 0x98, 0x58, 0x05, 0xc5, 0xbf, 0x7f, 0x22, 0xe2,
+ 0x6f, 0xaf, 0xf2, 0x32, 0x48, 0x88, 0xd5, 0x15,
+ 0x21, 0xe1, 0xbc, 0x7c, 0x06, 0xc6, 0x9b, 0x5b,
+ 0xf3, 0x33, 0x6e, 0xae, 0xd4, 0x14, 0x49, 0x89,
+ 0xbd, 0x7d, 0x20, 0xe0, 0x9a, 0x5a, 0x07, 0xc7,
+ 0x94, 0x54, 0x09, 0xc9, 0xb3, 0x73, 0x2e, 0xee,
+ 0xda, 0x1a, 0x47, 0x87, 0xfd, 0x3d, 0x60, 0xa0,
+ 0x08, 0xc8, 0x95, 0x55, 0x2f, 0xef, 0xb2, 0x72,
+ 0x46, 0x86, 0xdb, 0x1b, 0x61, 0xa1, 0xfc, 0x3c,
+ 0xb1, 0x71, 0x2c, 0xec, 0x96, 0x56, 0x0b, 0xcb,
+ 0xff, 0x3f, 0x62, 0xa2, 0xd8, 0x18, 0x45, 0x85,
+ 0x2d, 0xed, 0xb0, 0x70, 0x0a, 0xca, 0x97, 0x57,
+ 0x63, 0xa3, 0xfe, 0x3e, 0x44, 0x84, 0xd9, 0x19,
+ 0xde, 0x1e, 0x43, 0x83, 0xf9, 0x39, 0x64, 0xa4,
+ 0x90, 0x50, 0x0d, 0xcd, 0xb7, 0x77, 0x2a, 0xea,
+ 0x42, 0x82, 0xdf, 0x1f, 0x65, 0xa5, 0xf8, 0x38,
+ 0x0c, 0xcc, 0x91, 0x51, 0x2b, 0xeb, 0xb6, 0x76,
+ 0xfb, 0x3b, 0x66, 0xa6, 0xdc, 0x1c, 0x41, 0x81,
+ 0xb5, 0x75, 0x28, 0xe8, 0x92, 0x52, 0x0f, 0xcf,
+ 0x67, 0xa7, 0xfa, 0x3a, 0x40, 0x80, 0xdd, 0x1d,
+ 0x29, 0xe9, 0xb4, 0x74, 0x0e, 0xce, 0x93, 0x53,
+ },
+ {
+ 0x00, 0xc1, 0x9f, 0x5e, 0x23, 0xe2, 0xbc, 0x7d,
+ 0x46, 0x87, 0xd9, 0x18, 0x65, 0xa4, 0xfa, 0x3b,
+ 0x8c, 0x4d, 0x13, 0xd2, 0xaf, 0x6e, 0x30, 0xf1,
+ 0xca, 0x0b, 0x55, 0x94, 0xe9, 0x28, 0x76, 0xb7,
+ 0x05, 0xc4, 0x9a, 0x5b, 0x26, 0xe7, 0xb9, 0x78,
+ 0x43, 0x82, 0xdc, 0x1d, 0x60, 0xa1, 0xff, 0x3e,
+ 0x89, 0x48, 0x16, 0xd7, 0xaa, 0x6b, 0x35, 0xf4,
+ 0xcf, 0x0e, 0x50, 0x91, 0xec, 0x2d, 0x73, 0xb2,
+ 0x0a, 0xcb, 0x95, 0x54, 0x29, 0xe8, 0xb6, 0x77,
+ 0x4c, 0x8d, 0xd3, 0x12, 0x6f, 0xae, 0xf0, 0x31,
+ 0x86, 0x47, 0x19, 0xd8, 0xa5, 0x64, 0x3a, 0xfb,
+ 0xc0, 0x01, 0x5f, 0x9e, 0xe3, 0x22, 0x7c, 0xbd,
+ 0x0f, 0xce, 0x90, 0x51, 0x2c, 0xed, 0xb3, 0x72,
+ 0x49, 0x88, 0xd6, 0x17, 0x6a, 0xab, 0xf5, 0x34,
+ 0x83, 0x42, 0x1c, 0xdd, 0xa0, 0x61, 0x3f, 0xfe,
+ 0xc5, 0x04, 0x5a, 0x9b, 0xe6, 0x27, 0x79, 0xb8,
+ 0x14, 0xd5, 0x8b, 0x4a, 0x37, 0xf6, 0xa8, 0x69,
+ 0x52, 0x93, 0xcd, 0x0c, 0x71, 0xb0, 0xee, 0x2f,
+ 0x98, 0x59, 0x07, 0xc6, 0xbb, 0x7a, 0x24, 0xe5,
+ 0xde, 0x1f, 0x41, 0x80, 0xfd, 0x3c, 0x62, 0xa3,
+ 0x11, 0xd0, 0x8e, 0x4f, 0x32, 0xf3, 0xad, 0x6c,
+ 0x57, 0x96, 0xc8, 0x09, 0x74, 0xb5, 0xeb, 0x2a,
+ 0x9d, 0x5c, 0x02, 0xc3, 0xbe, 0x7f, 0x21, 0xe0,
+ 0xdb, 0x1a, 0x44, 0x85, 0xf8, 0x39, 0x67, 0xa6,
+ 0x1e, 0xdf, 0x81, 0x40, 0x3d, 0xfc, 0xa2, 0x63,
+ 0x58, 0x99, 0xc7, 0x06, 0x7b, 0xba, 0xe4, 0x25,
+ 0x92, 0x53, 0x0d, 0xcc, 0xb1, 0x70, 0x2e, 0xef,
+ 0xd4, 0x15, 0x4b, 0x8a, 0xf7, 0x36, 0x68, 0xa9,
+ 0x1b, 0xda, 0x84, 0x45, 0x38, 0xf9, 0xa7, 0x66,
+ 0x5d, 0x9c, 0xc2, 0x03, 0x7e, 0xbf, 0xe1, 0x20,
+ 0x97, 0x56, 0x08, 0xc9, 0xb4, 0x75, 0x2b, 0xea,
+ 0xd1, 0x10, 0x4e, 0x8f, 0xf2, 0x33, 0x6d, 0xac,
+ },
+ {
+ 0x00, 0xc2, 0x99, 0x5b, 0x2f, 0xed, 0xb6, 0x74,
+ 0x5e, 0x9c, 0xc7, 0x05, 0x71, 0xb3, 0xe8, 0x2a,
+ 0xbc, 0x7e, 0x25, 0xe7, 0x93, 0x51, 0x0a, 0xc8,
+ 0xe2, 0x20, 0x7b, 0xb9, 0xcd, 0x0f, 0x54, 0x96,
+ 0x65, 0xa7, 0xfc, 0x3e, 0x4a, 0x88, 0xd3, 0x11,
+ 0x3b, 0xf9, 0xa2, 0x60, 0x14, 0xd6, 0x8d, 0x4f,
+ 0xd9, 0x1b, 0x40, 0x82, 0xf6, 0x34, 0x6f, 0xad,
+ 0x87, 0x45, 0x1e, 0xdc, 0xa8, 0x6a, 0x31, 0xf3,
+ 0xca, 0x08, 0x53, 0x91, 0xe5, 0x27, 0x7c, 0xbe,
+ 0x94, 0x56, 0x0d, 0xcf, 0xbb, 0x79, 0x22, 0xe0,
+ 0x76, 0xb4, 0xef, 0x2d, 0x59, 0x9b, 0xc0, 0x02,
+ 0x28, 0xea, 0xb1, 0x73, 0x07, 0xc5, 0x9e, 0x5c,
+ 0xaf, 0x6d, 0x36, 0xf4, 0x80, 0x42, 0x19, 0xdb,
+ 0xf1, 0x33, 0x68, 0xaa, 0xde, 0x1c, 0x47, 0x85,
+ 0x13, 0xd1, 0x8a, 0x48, 0x3c, 0xfe, 0xa5, 0x67,
+ 0x4d, 0x8f, 0xd4, 0x16, 0x62, 0xa0, 0xfb, 0x39,
+ 0x89, 0x4b, 0x10, 0xd2, 0xa6, 0x64, 0x3f, 0xfd,
+ 0xd7, 0x15, 0x4e, 0x8c, 0xf8, 0x3a, 0x61, 0xa3,
+ 0x35, 0xf7, 0xac, 0x6e, 0x1a, 0xd8, 0x83, 0x41,
+ 0x6b, 0xa9, 0xf2, 0x30, 0x44, 0x86, 0xdd, 0x1f,
+ 0xec, 0x2e, 0x75, 0xb7, 0xc3, 0x01, 0x5a, 0x98,
+ 0xb2, 0x70, 0x2b, 0xe9, 0x9d, 0x5f, 0x04, 0xc6,
+ 0x50, 0x92, 0xc9, 0x0b, 0x7f, 0xbd, 0xe6, 0x24,
+ 0x0e, 0xcc, 0x97, 0x55, 0x21, 0xe3, 0xb8, 0x7a,
+ 0x43, 0x81, 0xda, 0x18, 0x6c, 0xae, 0xf5, 0x37,
+ 0x1d, 0xdf, 0x84, 0x46, 0x32, 0xf0, 0xab, 0x69,
+ 0xff, 0x3d, 0x66, 0xa4, 0xd0, 0x12, 0x49, 0x8b,
+ 0xa1, 0x63, 0x38, 0xfa, 0x8e, 0x4c, 0x17, 0xd5,
+ 0x26, 0xe4, 0xbf, 0x7d, 0x09, 0xcb, 0x90, 0x52,
+ 0x78, 0xba, 0xe1, 0x23, 0x57, 0x95, 0xce, 0x0c,
+ 0x9a, 0x58, 0x03, 0xc1, 0xb5, 0x77, 0x2c, 0xee,
+ 0xc4, 0x06, 0x5d, 0x9f, 0xeb, 0x29, 0x72, 0xb0,
+ },
+ {
+ 0x00, 0xc3, 0x9b, 0x58, 0x2b, 0xe8, 0xb0, 0x73,
+ 0x56, 0x95, 0xcd, 0x0e, 0x7d, 0xbe, 0xe6, 0x25,
+ 0xac, 0x6f, 0x37, 0xf4, 0x87, 0x44, 0x1c, 0xdf,
+ 0xfa, 0x39, 0x61, 0xa2, 0xd1, 0x12, 0x4a, 0x89,
+ 0x45, 0x86, 0xde, 0x1d, 0x6e, 0xad, 0xf5, 0x36,
+ 0x13, 0xd0, 0x88, 0x4b, 0x38, 0xfb, 0xa3, 0x60,
+ 0xe9, 0x2a, 0x72, 0xb1, 0xc2, 0x01, 0x59, 0x9a,
+ 0xbf, 0x7c, 0x24, 0xe7, 0x94, 0x57, 0x0f, 0xcc,
+ 0x8a, 0x49, 0x11, 0xd2, 0xa1, 0x62, 0x3a, 0xf9,
+ 0xdc, 0x1f, 0x47, 0x84, 0xf7, 0x34, 0x6c, 0xaf,
+ 0x26, 0xe5, 0xbd, 0x7e, 0x0d, 0xce, 0x96, 0x55,
+ 0x70, 0xb3, 0xeb, 0x28, 0x5b, 0x98, 0xc0, 0x03,
+ 0xcf, 0x0c, 0x54, 0x97, 0xe4, 0x27, 0x7f, 0xbc,
+ 0x99, 0x5a, 0x02, 0xc1, 0xb2, 0x71, 0x29, 0xea,
+ 0x63, 0xa0, 0xf8, 0x3b, 0x48, 0x8b, 0xd3, 0x10,
+ 0x35, 0xf6, 0xae, 0x6d, 0x1e, 0xdd, 0x85, 0x46,
+ 0x09, 0xca, 0x92, 0x51, 0x22, 0xe1, 0xb9, 0x7a,
+ 0x5f, 0x9c, 0xc4, 0x07, 0x74, 0xb7, 0xef, 0x2c,
+ 0xa5, 0x66, 0x3e, 0xfd, 0x8e, 0x4d, 0x15, 0xd6,
+ 0xf3, 0x30, 0x68, 0xab, 0xd8, 0x1b, 0x43, 0x80,
+ 0x4c, 0x8f, 0xd7, 0x14, 0x67, 0xa4, 0xfc, 0x3f,
+ 0x1a, 0xd9, 0x81, 0x42, 0x31, 0xf2, 0xaa, 0x69,
+ 0xe0, 0x23, 0x7b, 0xb8, 0xcb, 0x08, 0x50, 0x93,
+ 0xb6, 0x75, 0x2d, 0xee, 0x9d, 0x5e, 0x06, 0xc5,
+ 0x83, 0x40, 0x18, 0xdb, 0xa8, 0x6b, 0x33, 0xf0,
+ 0xd5, 0x16, 0x4e, 0x8d, 0xfe, 0x3d, 0x65, 0xa6,
+ 0x2f, 0xec, 0xb4, 0x77, 0x04, 0xc7, 0x9f, 0x5c,
+ 0x79, 0xba, 0xe2, 0x21, 0x52, 0x91, 0xc9, 0x0a,
+ 0xc6, 0x05, 0x5d, 0x9e, 0xed, 0x2e, 0x76, 0xb5,
+ 0x90, 0x53, 0x0b, 0xc8, 0xbb, 0x78, 0x20, 0xe3,
+ 0x6a, 0xa9, 0xf1, 0x32, 0x41, 0x82, 0xda, 0x19,
+ 0x3c, 0xff, 0xa7, 0x64, 0x17, 0xd4, 0x8c, 0x4f,
+ },
+ {
+ 0x00, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0x66,
+ 0x6e, 0xaa, 0xfb, 0x3f, 0x59, 0x9d, 0xcc, 0x08,
+ 0xdc, 0x18, 0x49, 0x8d, 0xeb, 0x2f, 0x7e, 0xba,
+ 0xb2, 0x76, 0x27, 0xe3, 0x85, 0x41, 0x10, 0xd4,
+ 0xa5, 0x61, 0x30, 0xf4, 0x92, 0x56, 0x07, 0xc3,
+ 0xcb, 0x0f, 0x5e, 0x9a, 0xfc, 0x38, 0x69, 0xad,
+ 0x79, 0xbd, 0xec, 0x28, 0x4e, 0x8a, 0xdb, 0x1f,
+ 0x17, 0xd3, 0x82, 0x46, 0x20, 0xe4, 0xb5, 0x71,
+ 0x57, 0x93, 0xc2, 0x06, 0x60, 0xa4, 0xf5, 0x31,
+ 0x39, 0xfd, 0xac, 0x68, 0x0e, 0xca, 0x9b, 0x5f,
+ 0x8b, 0x4f, 0x1e, 0xda, 0xbc, 0x78, 0x29, 0xed,
+ 0xe5, 0x21, 0x70, 0xb4, 0xd2, 0x16, 0x47, 0x83,
+ 0xf2, 0x36, 0x67, 0xa3, 0xc5, 0x01, 0x50, 0x94,
+ 0x9c, 0x58, 0x09, 0xcd, 0xab, 0x6f, 0x3e, 0xfa,
+ 0x2e, 0xea, 0xbb, 0x7f, 0x19, 0xdd, 0x8c, 0x48,
+ 0x40, 0x84, 0xd5, 0x11, 0x77, 0xb3, 0xe2, 0x26,
+ 0xae, 0x6a, 0x3b, 0xff, 0x99, 0x5d, 0x0c, 0xc8,
+ 0xc0, 0x04, 0x55, 0x91, 0xf7, 0x33, 0x62, 0xa6,
+ 0x72, 0xb6, 0xe7, 0x23, 0x45, 0x81, 0xd0, 0x14,
+ 0x1c, 0xd8, 0x89, 0x4d, 0x2b, 0xef, 0xbe, 0x7a,
+ 0x0b, 0xcf, 0x9e, 0x5a, 0x3c, 0xf8, 0xa9, 0x6d,
+ 0x65, 0xa1, 0xf0, 0x34, 0x52, 0x96, 0xc7, 0x03,
+ 0xd7, 0x13, 0x42, 0x86, 0xe0, 0x24, 0x75, 0xb1,
+ 0xb9, 0x7d, 0x2c, 0xe8, 0x8e, 0x4a, 0x1b, 0xdf,
+ 0xf9, 0x3d, 0x6c, 0xa8, 0xce, 0x0a, 0x5b, 0x9f,
+ 0x97, 0x53, 0x02, 0xc6, 0xa0, 0x64, 0x35, 0xf1,
+ 0x25, 0xe1, 0xb0, 0x74, 0x12, 0xd6, 0x87, 0x43,
+ 0x4b, 0x8f, 0xde, 0x1a, 0x7c, 0xb8, 0xe9, 0x2d,
+ 0x5c, 0x98, 0xc9, 0x0d, 0x6b, 0xaf, 0xfe, 0x3a,
+ 0x32, 0xf6, 0xa7, 0x63, 0x05, 0xc1, 0x90, 0x54,
+ 0x80, 0x44, 0x15, 0xd1, 0xb7, 0x73, 0x22, 0xe6,
+ 0xee, 0x2a, 0x7b, 0xbf, 0xd9, 0x1d, 0x4c, 0x88,
+ },
+ {
+ 0x00, 0xc5, 0x97, 0x52, 0x33, 0xf6, 0xa4, 0x61,
+ 0x66, 0xa3, 0xf1, 0x34, 0x55, 0x90, 0xc2, 0x07,
+ 0xcc, 0x09, 0x5b, 0x9e, 0xff, 0x3a, 0x68, 0xad,
+ 0xaa, 0x6f, 0x3d, 0xf8, 0x99, 0x5c, 0x0e, 0xcb,
+ 0x85, 0x40, 0x12, 0xd7, 0xb6, 0x73, 0x21, 0xe4,
+ 0xe3, 0x26, 0x74, 0xb1, 0xd0, 0x15, 0x47, 0x82,
+ 0x49, 0x8c, 0xde, 0x1b, 0x7a, 0xbf, 0xed, 0x28,
+ 0x2f, 0xea, 0xb8, 0x7d, 0x1c, 0xd9, 0x8b, 0x4e,
+ 0x17, 0xd2, 0x80, 0x45, 0x24, 0xe1, 0xb3, 0x76,
+ 0x71, 0xb4, 0xe6, 0x23, 0x42, 0x87, 0xd5, 0x10,
+ 0xdb, 0x1e, 0x4c, 0x89, 0xe8, 0x2d, 0x7f, 0xba,
+ 0xbd, 0x78, 0x2a, 0xef, 0x8e, 0x4b, 0x19, 0xdc,
+ 0x92, 0x57, 0x05, 0xc0, 0xa1, 0x64, 0x36, 0xf3,
+ 0xf4, 0x31, 0x63, 0xa6, 0xc7, 0x02, 0x50, 0x95,
+ 0x5e, 0x9b, 0xc9, 0x0c, 0x6d, 0xa8, 0xfa, 0x3f,
+ 0x38, 0xfd, 0xaf, 0x6a, 0x0b, 0xce, 0x9c, 0x59,
+ 0x2e, 0xeb, 0xb9, 0x7c, 0x1d, 0xd8, 0x8a, 0x4f,
+ 0x48, 0x8d, 0xdf, 0x1a, 0x7b, 0xbe, 0xec, 0x29,
+ 0xe2, 0x27, 0x75, 0xb0, 0xd1, 0x14, 0x46, 0x83,
+ 0x84, 0x41, 0x13, 0xd6, 0xb7, 0x72, 0x20, 0xe5,
+ 0xab, 0x6e, 0x3c, 0xf9, 0x98, 0x5d, 0x0f, 0xca,
+ 0xcd, 0x08, 0x5a, 0x9f, 0xfe, 0x3b, 0x69, 0xac,
+ 0x67, 0xa2, 0xf0, 0x35, 0x54, 0x91, 0xc3, 0x06,
+ 0x01, 0xc4, 0x96, 0x53, 0x32, 0xf7, 0xa5, 0x60,
+ 0x39, 0xfc, 0xae, 0x6b, 0x0a, 0xcf, 0x9d, 0x58,
+ 0x5f, 0x9a, 0xc8, 0x0d, 0x6c, 0xa9, 0xfb, 0x3e,
+ 0xf5, 0x30, 0x62, 0xa7, 0xc6, 0x03, 0x51, 0x94,
+ 0x93, 0x56, 0x04, 0xc1, 0xa0, 0x65, 0x37, 0xf2,
+ 0xbc, 0x79, 0x2b, 0xee, 0x8f, 0x4a, 0x18, 0xdd,
+ 0xda, 0x1f, 0x4d, 0x88, 0xe9, 0x2c, 0x7e, 0xbb,
+ 0x70, 0xb5, 0xe7, 0x22, 0x43, 0x86, 0xd4, 0x11,
+ 0x16, 0xd3, 0x81, 0x44, 0x25, 0xe0, 0xb2, 0x77,
+ },
+ {
+ 0x00, 0xc6, 0x91, 0x57, 0x3f, 0xf9, 0xae, 0x68,
+ 0x7e, 0xb8, 0xef, 0x29, 0x41, 0x87, 0xd0, 0x16,
+ 0xfc, 0x3a, 0x6d, 0xab, 0xc3, 0x05, 0x52, 0x94,
+ 0x82, 0x44, 0x13, 0xd5, 0xbd, 0x7b, 0x2c, 0xea,
+ 0xe5, 0x23, 0x74, 0xb2, 0xda, 0x1c, 0x4b, 0x8d,
+ 0x9b, 0x5d, 0x0a, 0xcc, 0xa4, 0x62, 0x35, 0xf3,
+ 0x19, 0xdf, 0x88, 0x4e, 0x26, 0xe0, 0xb7, 0x71,
+ 0x67, 0xa1, 0xf6, 0x30, 0x58, 0x9e, 0xc9, 0x0f,
+ 0xd7, 0x11, 0x46, 0x80, 0xe8, 0x2e, 0x79, 0xbf,
+ 0xa9, 0x6f, 0x38, 0xfe, 0x96, 0x50, 0x07, 0xc1,
+ 0x2b, 0xed, 0xba, 0x7c, 0x14, 0xd2, 0x85, 0x43,
+ 0x55, 0x93, 0xc4, 0x02, 0x6a, 0xac, 0xfb, 0x3d,
+ 0x32, 0xf4, 0xa3, 0x65, 0x0d, 0xcb, 0x9c, 0x5a,
+ 0x4c, 0x8a, 0xdd, 0x1b, 0x73, 0xb5, 0xe2, 0x24,
+ 0xce, 0x08, 0x5f, 0x99, 0xf1, 0x37, 0x60, 0xa6,
+ 0xb0, 0x76, 0x21, 0xe7, 0x8f, 0x49, 0x1e, 0xd8,
+ 0xb3, 0x75, 0x22, 0xe4, 0x8c, 0x4a, 0x1d, 0xdb,
+ 0xcd, 0x0b, 0x5c, 0x9a, 0xf2, 0x34, 0x63, 0xa5,
+ 0x4f, 0x89, 0xde, 0x18, 0x70, 0xb6, 0xe1, 0x27,
+ 0x31, 0xf7, 0xa0, 0x66, 0x0e, 0xc8, 0x9f, 0x59,
+ 0x56, 0x90, 0xc7, 0x01, 0x69, 0xaf, 0xf8, 0x3e,
+ 0x28, 0xee, 0xb9, 0x7f, 0x17, 0xd1, 0x86, 0x40,
+ 0xaa, 0x6c, 0x3b, 0xfd, 0x95, 0x53, 0x04, 0xc2,
+ 0xd4, 0x12, 0x45, 0x83, 0xeb, 0x2d, 0x7a, 0xbc,
+ 0x64, 0xa2, 0xf5, 0x33, 0x5b, 0x9d, 0xca, 0x0c,
+ 0x1a, 0xdc, 0x8b, 0x4d, 0x25, 0xe3, 0xb4, 0x72,
+ 0x98, 0x5e, 0x09, 0xcf, 0xa7, 0x61, 0x36, 0xf0,
+ 0xe6, 0x20, 0x77, 0xb1, 0xd9, 0x1f, 0x48, 0x8e,
+ 0x81, 0x47, 0x10, 0xd6, 0xbe, 0x78, 0x2f, 0xe9,
+ 0xff, 0x39, 0x6e, 0xa8, 0xc0, 0x06, 0x51, 0x97,
+ 0x7d, 0xbb, 0xec, 0x2a, 0x42, 0x84, 0xd3, 0x15,
+ 0x03, 0xc5, 0x92, 0x54, 0x3c, 0xfa, 0xad, 0x6b,
+ },
+ {
+ 0x00, 0xc7, 0x93, 0x54, 0x3b, 0xfc, 0xa8, 0x6f,
+ 0x76, 0xb1, 0xe5, 0x22, 0x4d, 0x8a, 0xde, 0x19,
+ 0xec, 0x2b, 0x7f, 0xb8, 0xd7, 0x10, 0x44, 0x83,
+ 0x9a, 0x5d, 0x09, 0xce, 0xa1, 0x66, 0x32, 0xf5,
+ 0xc5, 0x02, 0x56, 0x91, 0xfe, 0x39, 0x6d, 0xaa,
+ 0xb3, 0x74, 0x20, 0xe7, 0x88, 0x4f, 0x1b, 0xdc,
+ 0x29, 0xee, 0xba, 0x7d, 0x12, 0xd5, 0x81, 0x46,
+ 0x5f, 0x98, 0xcc, 0x0b, 0x64, 0xa3, 0xf7, 0x30,
+ 0x97, 0x50, 0x04, 0xc3, 0xac, 0x6b, 0x3f, 0xf8,
+ 0xe1, 0x26, 0x72, 0xb5, 0xda, 0x1d, 0x49, 0x8e,
+ 0x7b, 0xbc, 0xe8, 0x2f, 0x40, 0x87, 0xd3, 0x14,
+ 0x0d, 0xca, 0x9e, 0x59, 0x36, 0xf1, 0xa5, 0x62,
+ 0x52, 0x95, 0xc1, 0x06, 0x69, 0xae, 0xfa, 0x3d,
+ 0x24, 0xe3, 0xb7, 0x70, 0x1f, 0xd8, 0x8c, 0x4b,
+ 0xbe, 0x79, 0x2d, 0xea, 0x85, 0x42, 0x16, 0xd1,
+ 0xc8, 0x0f, 0x5b, 0x9c, 0xf3, 0x34, 0x60, 0xa7,
+ 0x33, 0xf4, 0xa0, 0x67, 0x08, 0xcf, 0x9b, 0x5c,
+ 0x45, 0x82, 0xd6, 0x11, 0x7e, 0xb9, 0xed, 0x2a,
+ 0xdf, 0x18, 0x4c, 0x8b, 0xe4, 0x23, 0x77, 0xb0,
+ 0xa9, 0x6e, 0x3a, 0xfd, 0x92, 0x55, 0x01, 0xc6,
+ 0xf6, 0x31, 0x65, 0xa2, 0xcd, 0x0a, 0x5e, 0x99,
+ 0x80, 0x47, 0x13, 0xd4, 0xbb, 0x7c, 0x28, 0xef,
+ 0x1a, 0xdd, 0x89, 0x4e, 0x21, 0xe6, 0xb2, 0x75,
+ 0x6c, 0xab, 0xff, 0x38, 0x57, 0x90, 0xc4, 0x03,
+ 0xa4, 0x63, 0x37, 0xf0, 0x9f, 0x58, 0x0c, 0xcb,
+ 0xd2, 0x15, 0x41, 0x86, 0xe9, 0x2e, 0x7a, 0xbd,
+ 0x48, 0x8f, 0xdb, 0x1c, 0x73, 0xb4, 0xe0, 0x27,
+ 0x3e, 0xf9, 0xad, 0x6a, 0x05, 0xc2, 0x96, 0x51,
+ 0x61, 0xa6, 0xf2, 0x35, 0x5a, 0x9d, 0xc9, 0x0e,
+ 0x17, 0xd0, 0x84, 0x43, 0x2c, 0xeb, 0xbf, 0x78,
+ 0x8d, 0x4a, 0x1e, 0xd9, 0xb6, 0x71, 0x25, 0xe2,
+ 0xfb, 0x3c, 0x68, 0xaf, 0xc0, 0x07, 0x53, 0x94,
+ },
+ {
+ 0x00, 0xc8, 0x8d, 0x45, 0x07, 0xcf, 0x8a, 0x42,
+ 0x0e, 0xc6, 0x83, 0x4b, 0x09, 0xc1, 0x84, 0x4c,
+ 0x1c, 0xd4, 0x91, 0x59, 0x1b, 0xd3, 0x96, 0x5e,
+ 0x12, 0xda, 0x9f, 0x57, 0x15, 0xdd, 0x98, 0x50,
+ 0x38, 0xf0, 0xb5, 0x7d, 0x3f, 0xf7, 0xb2, 0x7a,
+ 0x36, 0xfe, 0xbb, 0x73, 0x31, 0xf9, 0xbc, 0x74,
+ 0x24, 0xec, 0xa9, 0x61, 0x23, 0xeb, 0xae, 0x66,
+ 0x2a, 0xe2, 0xa7, 0x6f, 0x2d, 0xe5, 0xa0, 0x68,
+ 0x70, 0xb8, 0xfd, 0x35, 0x77, 0xbf, 0xfa, 0x32,
+ 0x7e, 0xb6, 0xf3, 0x3b, 0x79, 0xb1, 0xf4, 0x3c,
+ 0x6c, 0xa4, 0xe1, 0x29, 0x6b, 0xa3, 0xe6, 0x2e,
+ 0x62, 0xaa, 0xef, 0x27, 0x65, 0xad, 0xe8, 0x20,
+ 0x48, 0x80, 0xc5, 0x0d, 0x4f, 0x87, 0xc2, 0x0a,
+ 0x46, 0x8e, 0xcb, 0x03, 0x41, 0x89, 0xcc, 0x04,
+ 0x54, 0x9c, 0xd9, 0x11, 0x53, 0x9b, 0xde, 0x16,
+ 0x5a, 0x92, 0xd7, 0x1f, 0x5d, 0x95, 0xd0, 0x18,
+ 0xe0, 0x28, 0x6d, 0xa5, 0xe7, 0x2f, 0x6a, 0xa2,
+ 0xee, 0x26, 0x63, 0xab, 0xe9, 0x21, 0x64, 0xac,
+ 0xfc, 0x34, 0x71, 0xb9, 0xfb, 0x33, 0x76, 0xbe,
+ 0xf2, 0x3a, 0x7f, 0xb7, 0xf5, 0x3d, 0x78, 0xb0,
+ 0xd8, 0x10, 0x55, 0x9d, 0xdf, 0x17, 0x52, 0x9a,
+ 0xd6, 0x1e, 0x5b, 0x93, 0xd1, 0x19, 0x5c, 0x94,
+ 0xc4, 0x0c, 0x49, 0x81, 0xc3, 0x0b, 0x4e, 0x86,
+ 0xca, 0x02, 0x47, 0x8f, 0xcd, 0x05, 0x40, 0x88,
+ 0x90, 0x58, 0x1d, 0xd5, 0x97, 0x5f, 0x1a, 0xd2,
+ 0x9e, 0x56, 0x13, 0xdb, 0x99, 0x51, 0x14, 0xdc,
+ 0x8c, 0x44, 0x01, 0xc9, 0x8b, 0x43, 0x06, 0xce,
+ 0x82, 0x4a, 0x0f, 0xc7, 0x85, 0x4d, 0x08, 0xc0,
+ 0xa8, 0x60, 0x25, 0xed, 0xaf, 0x67, 0x22, 0xea,
+ 0xa6, 0x6e, 0x2b, 0xe3, 0xa1, 0x69, 0x2c, 0xe4,
+ 0xb4, 0x7c, 0x39, 0xf1, 0xb3, 0x7b, 0x3e, 0xf6,
+ 0xba, 0x72, 0x37, 0xff, 0xbd, 0x75, 0x30, 0xf8,
+ },
+ {
+ 0x00, 0xc9, 0x8f, 0x46, 0x03, 0xca, 0x8c, 0x45,
+ 0x06, 0xcf, 0x89, 0x40, 0x05, 0xcc, 0x8a, 0x43,
+ 0x0c, 0xc5, 0x83, 0x4a, 0x0f, 0xc6, 0x80, 0x49,
+ 0x0a, 0xc3, 0x85, 0x4c, 0x09, 0xc0, 0x86, 0x4f,
+ 0x18, 0xd1, 0x97, 0x5e, 0x1b, 0xd2, 0x94, 0x5d,
+ 0x1e, 0xd7, 0x91, 0x58, 0x1d, 0xd4, 0x92, 0x5b,
+ 0x14, 0xdd, 0x9b, 0x52, 0x17, 0xde, 0x98, 0x51,
+ 0x12, 0xdb, 0x9d, 0x54, 0x11, 0xd8, 0x9e, 0x57,
+ 0x30, 0xf9, 0xbf, 0x76, 0x33, 0xfa, 0xbc, 0x75,
+ 0x36, 0xff, 0xb9, 0x70, 0x35, 0xfc, 0xba, 0x73,
+ 0x3c, 0xf5, 0xb3, 0x7a, 0x3f, 0xf6, 0xb0, 0x79,
+ 0x3a, 0xf3, 0xb5, 0x7c, 0x39, 0xf0, 0xb6, 0x7f,
+ 0x28, 0xe1, 0xa7, 0x6e, 0x2b, 0xe2, 0xa4, 0x6d,
+ 0x2e, 0xe7, 0xa1, 0x68, 0x2d, 0xe4, 0xa2, 0x6b,
+ 0x24, 0xed, 0xab, 0x62, 0x27, 0xee, 0xa8, 0x61,
+ 0x22, 0xeb, 0xad, 0x64, 0x21, 0xe8, 0xae, 0x67,
+ 0x60, 0xa9, 0xef, 0x26, 0x63, 0xaa, 0xec, 0x25,
+ 0x66, 0xaf, 0xe9, 0x20, 0x65, 0xac, 0xea, 0x23,
+ 0x6c, 0xa5, 0xe3, 0x2a, 0x6f, 0xa6, 0xe0, 0x29,
+ 0x6a, 0xa3, 0xe5, 0x2c, 0x69, 0xa0, 0xe6, 0x2f,
+ 0x78, 0xb1, 0xf7, 0x3e, 0x7b, 0xb2, 0xf4, 0x3d,
+ 0x7e, 0xb7, 0xf1, 0x38, 0x7d, 0xb4, 0xf2, 0x3b,
+ 0x74, 0xbd, 0xfb, 0x32, 0x77, 0xbe, 0xf8, 0x31,
+ 0x72, 0xbb, 0xfd, 0x34, 0x71, 0xb8, 0xfe, 0x37,
+ 0x50, 0x99, 0xdf, 0x16, 0x53, 0x9a, 0xdc, 0x15,
+ 0x56, 0x9f, 0xd9, 0x10, 0x55, 0x9c, 0xda, 0x13,
+ 0x5c, 0x95, 0xd3, 0x1a, 0x5f, 0x96, 0xd0, 0x19,
+ 0x5a, 0x93, 0xd5, 0x1c, 0x59, 0x90, 0xd6, 0x1f,
+ 0x48, 0x81, 0xc7, 0x0e, 0x4b, 0x82, 0xc4, 0x0d,
+ 0x4e, 0x87, 0xc1, 0x08, 0x4d, 0x84, 0xc2, 0x0b,
+ 0x44, 0x8d, 0xcb, 0x02, 0x47, 0x8e, 0xc8, 0x01,
+ 0x42, 0x8b, 0xcd, 0x04, 0x41, 0x88, 0xce, 0x07,
+ },
+ {
+ 0x00, 0xca, 0x89, 0x43, 0x0f, 0xc5, 0x86, 0x4c,
+ 0x1e, 0xd4, 0x97, 0x5d, 0x11, 0xdb, 0x98, 0x52,
+ 0x3c, 0xf6, 0xb5, 0x7f, 0x33, 0xf9, 0xba, 0x70,
+ 0x22, 0xe8, 0xab, 0x61, 0x2d, 0xe7, 0xa4, 0x6e,
+ 0x78, 0xb2, 0xf1, 0x3b, 0x77, 0xbd, 0xfe, 0x34,
+ 0x66, 0xac, 0xef, 0x25, 0x69, 0xa3, 0xe0, 0x2a,
+ 0x44, 0x8e, 0xcd, 0x07, 0x4b, 0x81, 0xc2, 0x08,
+ 0x5a, 0x90, 0xd3, 0x19, 0x55, 0x9f, 0xdc, 0x16,
+ 0xf0, 0x3a, 0x79, 0xb3, 0xff, 0x35, 0x76, 0xbc,
+ 0xee, 0x24, 0x67, 0xad, 0xe1, 0x2b, 0x68, 0xa2,
+ 0xcc, 0x06, 0x45, 0x8f, 0xc3, 0x09, 0x4a, 0x80,
+ 0xd2, 0x18, 0x5b, 0x91, 0xdd, 0x17, 0x54, 0x9e,
+ 0x88, 0x42, 0x01, 0xcb, 0x87, 0x4d, 0x0e, 0xc4,
+ 0x96, 0x5c, 0x1f, 0xd5, 0x99, 0x53, 0x10, 0xda,
+ 0xb4, 0x7e, 0x3d, 0xf7, 0xbb, 0x71, 0x32, 0xf8,
+ 0xaa, 0x60, 0x23, 0xe9, 0xa5, 0x6f, 0x2c, 0xe6,
+ 0xfd, 0x37, 0x74, 0xbe, 0xf2, 0x38, 0x7b, 0xb1,
+ 0xe3, 0x29, 0x6a, 0xa0, 0xec, 0x26, 0x65, 0xaf,
+ 0xc1, 0x0b, 0x48, 0x82, 0xce, 0x04, 0x47, 0x8d,
+ 0xdf, 0x15, 0x56, 0x9c, 0xd0, 0x1a, 0x59, 0x93,
+ 0x85, 0x4f, 0x0c, 0xc6, 0x8a, 0x40, 0x03, 0xc9,
+ 0x9b, 0x51, 0x12, 0xd8, 0x94, 0x5e, 0x1d, 0xd7,
+ 0xb9, 0x73, 0x30, 0xfa, 0xb6, 0x7c, 0x3f, 0xf5,
+ 0xa7, 0x6d, 0x2e, 0xe4, 0xa8, 0x62, 0x21, 0xeb,
+ 0x0d, 0xc7, 0x84, 0x4e, 0x02, 0xc8, 0x8b, 0x41,
+ 0x13, 0xd9, 0x9a, 0x50, 0x1c, 0xd6, 0x95, 0x5f,
+ 0x31, 0xfb, 0xb8, 0x72, 0x3e, 0xf4, 0xb7, 0x7d,
+ 0x2f, 0xe5, 0xa6, 0x6c, 0x20, 0xea, 0xa9, 0x63,
+ 0x75, 0xbf, 0xfc, 0x36, 0x7a, 0xb0, 0xf3, 0x39,
+ 0x6b, 0xa1, 0xe2, 0x28, 0x64, 0xae, 0xed, 0x27,
+ 0x49, 0x83, 0xc0, 0x0a, 0x46, 0x8c, 0xcf, 0x05,
+ 0x57, 0x9d, 0xde, 0x14, 0x58, 0x92, 0xd1, 0x1b,
+ },
+ {
+ 0x00, 0xcb, 0x8b, 0x40, 0x0b, 0xc0, 0x80, 0x4b,
+ 0x16, 0xdd, 0x9d, 0x56, 0x1d, 0xd6, 0x96, 0x5d,
+ 0x2c, 0xe7, 0xa7, 0x6c, 0x27, 0xec, 0xac, 0x67,
+ 0x3a, 0xf1, 0xb1, 0x7a, 0x31, 0xfa, 0xba, 0x71,
+ 0x58, 0x93, 0xd3, 0x18, 0x53, 0x98, 0xd8, 0x13,
+ 0x4e, 0x85, 0xc5, 0x0e, 0x45, 0x8e, 0xce, 0x05,
+ 0x74, 0xbf, 0xff, 0x34, 0x7f, 0xb4, 0xf4, 0x3f,
+ 0x62, 0xa9, 0xe9, 0x22, 0x69, 0xa2, 0xe2, 0x29,
+ 0xb0, 0x7b, 0x3b, 0xf0, 0xbb, 0x70, 0x30, 0xfb,
+ 0xa6, 0x6d, 0x2d, 0xe6, 0xad, 0x66, 0x26, 0xed,
+ 0x9c, 0x57, 0x17, 0xdc, 0x97, 0x5c, 0x1c, 0xd7,
+ 0x8a, 0x41, 0x01, 0xca, 0x81, 0x4a, 0x0a, 0xc1,
+ 0xe8, 0x23, 0x63, 0xa8, 0xe3, 0x28, 0x68, 0xa3,
+ 0xfe, 0x35, 0x75, 0xbe, 0xf5, 0x3e, 0x7e, 0xb5,
+ 0xc4, 0x0f, 0x4f, 0x84, 0xcf, 0x04, 0x44, 0x8f,
+ 0xd2, 0x19, 0x59, 0x92, 0xd9, 0x12, 0x52, 0x99,
+ 0x7d, 0xb6, 0xf6, 0x3d, 0x76, 0xbd, 0xfd, 0x36,
+ 0x6b, 0xa0, 0xe0, 0x2b, 0x60, 0xab, 0xeb, 0x20,
+ 0x51, 0x9a, 0xda, 0x11, 0x5a, 0x91, 0xd1, 0x1a,
+ 0x47, 0x8c, 0xcc, 0x07, 0x4c, 0x87, 0xc7, 0x0c,
+ 0x25, 0xee, 0xae, 0x65, 0x2e, 0xe5, 0xa5, 0x6e,
+ 0x33, 0xf8, 0xb8, 0x73, 0x38, 0xf3, 0xb3, 0x78,
+ 0x09, 0xc2, 0x82, 0x49, 0x02, 0xc9, 0x89, 0x42,
+ 0x1f, 0xd4, 0x94, 0x5f, 0x14, 0xdf, 0x9f, 0x54,
+ 0xcd, 0x06, 0x46, 0x8d, 0xc6, 0x0d, 0x4d, 0x86,
+ 0xdb, 0x10, 0x50, 0x9b, 0xd0, 0x1b, 0x5b, 0x90,
+ 0xe1, 0x2a, 0x6a, 0xa1, 0xea, 0x21, 0x61, 0xaa,
+ 0xf7, 0x3c, 0x7c, 0xb7, 0xfc, 0x37, 0x77, 0xbc,
+ 0x95, 0x5e, 0x1e, 0xd5, 0x9e, 0x55, 0x15, 0xde,
+ 0x83, 0x48, 0x08, 0xc3, 0x88, 0x43, 0x03, 0xc8,
+ 0xb9, 0x72, 0x32, 0xf9, 0xb2, 0x79, 0x39, 0xf2,
+ 0xaf, 0x64, 0x24, 0xef, 0xa4, 0x6f, 0x2f, 0xe4,
+ },
+ {
+ 0x00, 0xcc, 0x85, 0x49, 0x17, 0xdb, 0x92, 0x5e,
+ 0x2e, 0xe2, 0xab, 0x67, 0x39, 0xf5, 0xbc, 0x70,
+ 0x5c, 0x90, 0xd9, 0x15, 0x4b, 0x87, 0xce, 0x02,
+ 0x72, 0xbe, 0xf7, 0x3b, 0x65, 0xa9, 0xe0, 0x2c,
+ 0xb8, 0x74, 0x3d, 0xf1, 0xaf, 0x63, 0x2a, 0xe6,
+ 0x96, 0x5a, 0x13, 0xdf, 0x81, 0x4d, 0x04, 0xc8,
+ 0xe4, 0x28, 0x61, 0xad, 0xf3, 0x3f, 0x76, 0xba,
+ 0xca, 0x06, 0x4f, 0x83, 0xdd, 0x11, 0x58, 0x94,
+ 0x6d, 0xa1, 0xe8, 0x24, 0x7a, 0xb6, 0xff, 0x33,
+ 0x43, 0x8f, 0xc6, 0x0a, 0x54, 0x98, 0xd1, 0x1d,
+ 0x31, 0xfd, 0xb4, 0x78, 0x26, 0xea, 0xa3, 0x6f,
+ 0x1f, 0xd3, 0x9a, 0x56, 0x08, 0xc4, 0x8d, 0x41,
+ 0xd5, 0x19, 0x50, 0x9c, 0xc2, 0x0e, 0x47, 0x8b,
+ 0xfb, 0x37, 0x7e, 0xb2, 0xec, 0x20, 0x69, 0xa5,
+ 0x89, 0x45, 0x0c, 0xc0, 0x9e, 0x52, 0x1b, 0xd7,
+ 0xa7, 0x6b, 0x22, 0xee, 0xb0, 0x7c, 0x35, 0xf9,
+ 0xda, 0x16, 0x5f, 0x93, 0xcd, 0x01, 0x48, 0x84,
+ 0xf4, 0x38, 0x71, 0xbd, 0xe3, 0x2f, 0x66, 0xaa,
+ 0x86, 0x4a, 0x03, 0xcf, 0x91, 0x5d, 0x14, 0xd8,
+ 0xa8, 0x64, 0x2d, 0xe1, 0xbf, 0x73, 0x3a, 0xf6,
+ 0x62, 0xae, 0xe7, 0x2b, 0x75, 0xb9, 0xf0, 0x3c,
+ 0x4c, 0x80, 0xc9, 0x05, 0x5b, 0x97, 0xde, 0x12,
+ 0x3e, 0xf2, 0xbb, 0x77, 0x29, 0xe5, 0xac, 0x60,
+ 0x10, 0xdc, 0x95, 0x59, 0x07, 0xcb, 0x82, 0x4e,
+ 0xb7, 0x7b, 0x32, 0xfe, 0xa0, 0x6c, 0x25, 0xe9,
+ 0x99, 0x55, 0x1c, 0xd0, 0x8e, 0x42, 0x0b, 0xc7,
+ 0xeb, 0x27, 0x6e, 0xa2, 0xfc, 0x30, 0x79, 0xb5,
+ 0xc5, 0x09, 0x40, 0x8c, 0xd2, 0x1e, 0x57, 0x9b,
+ 0x0f, 0xc3, 0x8a, 0x46, 0x18, 0xd4, 0x9d, 0x51,
+ 0x21, 0xed, 0xa4, 0x68, 0x36, 0xfa, 0xb3, 0x7f,
+ 0x53, 0x9f, 0xd6, 0x1a, 0x44, 0x88, 0xc1, 0x0d,
+ 0x7d, 0xb1, 0xf8, 0x34, 0x6a, 0xa6, 0xef, 0x23,
+ },
+ {
+ 0x00, 0xcd, 0x87, 0x4a, 0x13, 0xde, 0x94, 0x59,
+ 0x26, 0xeb, 0xa1, 0x6c, 0x35, 0xf8, 0xb2, 0x7f,
+ 0x4c, 0x81, 0xcb, 0x06, 0x5f, 0x92, 0xd8, 0x15,
+ 0x6a, 0xa7, 0xed, 0x20, 0x79, 0xb4, 0xfe, 0x33,
+ 0x98, 0x55, 0x1f, 0xd2, 0x8b, 0x46, 0x0c, 0xc1,
+ 0xbe, 0x73, 0x39, 0xf4, 0xad, 0x60, 0x2a, 0xe7,
+ 0xd4, 0x19, 0x53, 0x9e, 0xc7, 0x0a, 0x40, 0x8d,
+ 0xf2, 0x3f, 0x75, 0xb8, 0xe1, 0x2c, 0x66, 0xab,
+ 0x2d, 0xe0, 0xaa, 0x67, 0x3e, 0xf3, 0xb9, 0x74,
+ 0x0b, 0xc6, 0x8c, 0x41, 0x18, 0xd5, 0x9f, 0x52,
+ 0x61, 0xac, 0xe6, 0x2b, 0x72, 0xbf, 0xf5, 0x38,
+ 0x47, 0x8a, 0xc0, 0x0d, 0x54, 0x99, 0xd3, 0x1e,
+ 0xb5, 0x78, 0x32, 0xff, 0xa6, 0x6b, 0x21, 0xec,
+ 0x93, 0x5e, 0x14, 0xd9, 0x80, 0x4d, 0x07, 0xca,
+ 0xf9, 0x34, 0x7e, 0xb3, 0xea, 0x27, 0x6d, 0xa0,
+ 0xdf, 0x12, 0x58, 0x95, 0xcc, 0x01, 0x4b, 0x86,
+ 0x5a, 0x97, 0xdd, 0x10, 0x49, 0x84, 0xce, 0x03,
+ 0x7c, 0xb1, 0xfb, 0x36, 0x6f, 0xa2, 0xe8, 0x25,
+ 0x16, 0xdb, 0x91, 0x5c, 0x05, 0xc8, 0x82, 0x4f,
+ 0x30, 0xfd, 0xb7, 0x7a, 0x23, 0xee, 0xa4, 0x69,
+ 0xc2, 0x0f, 0x45, 0x88, 0xd1, 0x1c, 0x56, 0x9b,
+ 0xe4, 0x29, 0x63, 0xae, 0xf7, 0x3a, 0x70, 0xbd,
+ 0x8e, 0x43, 0x09, 0xc4, 0x9d, 0x50, 0x1a, 0xd7,
+ 0xa8, 0x65, 0x2f, 0xe2, 0xbb, 0x76, 0x3c, 0xf1,
+ 0x77, 0xba, 0xf0, 0x3d, 0x64, 0xa9, 0xe3, 0x2e,
+ 0x51, 0x9c, 0xd6, 0x1b, 0x42, 0x8f, 0xc5, 0x08,
+ 0x3b, 0xf6, 0xbc, 0x71, 0x28, 0xe5, 0xaf, 0x62,
+ 0x1d, 0xd0, 0x9a, 0x57, 0x0e, 0xc3, 0x89, 0x44,
+ 0xef, 0x22, 0x68, 0xa5, 0xfc, 0x31, 0x7b, 0xb6,
+ 0xc9, 0x04, 0x4e, 0x83, 0xda, 0x17, 0x5d, 0x90,
+ 0xa3, 0x6e, 0x24, 0xe9, 0xb0, 0x7d, 0x37, 0xfa,
+ 0x85, 0x48, 0x02, 0xcf, 0x96, 0x5b, 0x11, 0xdc,
+ },
+ {
+ 0x00, 0xce, 0x81, 0x4f, 0x1f, 0xd1, 0x9e, 0x50,
+ 0x3e, 0xf0, 0xbf, 0x71, 0x21, 0xef, 0xa0, 0x6e,
+ 0x7c, 0xb2, 0xfd, 0x33, 0x63, 0xad, 0xe2, 0x2c,
+ 0x42, 0x8c, 0xc3, 0x0d, 0x5d, 0x93, 0xdc, 0x12,
+ 0xf8, 0x36, 0x79, 0xb7, 0xe7, 0x29, 0x66, 0xa8,
+ 0xc6, 0x08, 0x47, 0x89, 0xd9, 0x17, 0x58, 0x96,
+ 0x84, 0x4a, 0x05, 0xcb, 0x9b, 0x55, 0x1a, 0xd4,
+ 0xba, 0x74, 0x3b, 0xf5, 0xa5, 0x6b, 0x24, 0xea,
+ 0xed, 0x23, 0x6c, 0xa2, 0xf2, 0x3c, 0x73, 0xbd,
+ 0xd3, 0x1d, 0x52, 0x9c, 0xcc, 0x02, 0x4d, 0x83,
+ 0x91, 0x5f, 0x10, 0xde, 0x8e, 0x40, 0x0f, 0xc1,
+ 0xaf, 0x61, 0x2e, 0xe0, 0xb0, 0x7e, 0x31, 0xff,
+ 0x15, 0xdb, 0x94, 0x5a, 0x0a, 0xc4, 0x8b, 0x45,
+ 0x2b, 0xe5, 0xaa, 0x64, 0x34, 0xfa, 0xb5, 0x7b,
+ 0x69, 0xa7, 0xe8, 0x26, 0x76, 0xb8, 0xf7, 0x39,
+ 0x57, 0x99, 0xd6, 0x18, 0x48, 0x86, 0xc9, 0x07,
+ 0xc7, 0x09, 0x46, 0x88, 0xd8, 0x16, 0x59, 0x97,
+ 0xf9, 0x37, 0x78, 0xb6, 0xe6, 0x28, 0x67, 0xa9,
+ 0xbb, 0x75, 0x3a, 0xf4, 0xa4, 0x6a, 0x25, 0xeb,
+ 0x85, 0x4b, 0x04, 0xca, 0x9a, 0x54, 0x1b, 0xd5,
+ 0x3f, 0xf1, 0xbe, 0x70, 0x20, 0xee, 0xa1, 0x6f,
+ 0x01, 0xcf, 0x80, 0x4e, 0x1e, 0xd0, 0x9f, 0x51,
+ 0x43, 0x8d, 0xc2, 0x0c, 0x5c, 0x92, 0xdd, 0x13,
+ 0x7d, 0xb3, 0xfc, 0x32, 0x62, 0xac, 0xe3, 0x2d,
+ 0x2a, 0xe4, 0xab, 0x65, 0x35, 0xfb, 0xb4, 0x7a,
+ 0x14, 0xda, 0x95, 0x5b, 0x0b, 0xc5, 0x8a, 0x44,
+ 0x56, 0x98, 0xd7, 0x19, 0x49, 0x87, 0xc8, 0x06,
+ 0x68, 0xa6, 0xe9, 0x27, 0x77, 0xb9, 0xf6, 0x38,
+ 0xd2, 0x1c, 0x53, 0x9d, 0xcd, 0x03, 0x4c, 0x82,
+ 0xec, 0x22, 0x6d, 0xa3, 0xf3, 0x3d, 0x72, 0xbc,
+ 0xae, 0x60, 0x2f, 0xe1, 0xb1, 0x7f, 0x30, 0xfe,
+ 0x90, 0x5e, 0x11, 0xdf, 0x8f, 0x41, 0x0e, 0xc0,
+ },
+ {
+ 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61,
+ 0x6c, 0xa3, 0xef, 0x20, 0x77, 0xb8, 0xf4, 0x3b,
+ 0x5a, 0x95, 0xd9, 0x16, 0x41, 0x8e, 0xc2, 0x0d,
+ 0xd8, 0x17, 0x5b, 0x94, 0xc3, 0x0c, 0x40, 0x8f,
+ 0xee, 0x21, 0x6d, 0xa2, 0xf5, 0x3a, 0x76, 0xb9,
+ 0xb4, 0x7b, 0x37, 0xf8, 0xaf, 0x60, 0x2c, 0xe3,
+ 0x82, 0x4d, 0x01, 0xce, 0x99, 0x56, 0x1a, 0xd5,
+ 0xad, 0x62, 0x2e, 0xe1, 0xb6, 0x79, 0x35, 0xfa,
+ 0x9b, 0x54, 0x18, 0xd7, 0x80, 0x4f, 0x03, 0xcc,
+ 0xc1, 0x0e, 0x42, 0x8d, 0xda, 0x15, 0x59, 0x96,
+ 0xf7, 0x38, 0x74, 0xbb, 0xec, 0x23, 0x6f, 0xa0,
+ 0x75, 0xba, 0xf6, 0x39, 0x6e, 0xa1, 0xed, 0x22,
+ 0x43, 0x8c, 0xc0, 0x0f, 0x58, 0x97, 0xdb, 0x14,
+ 0x19, 0xd6, 0x9a, 0x55, 0x02, 0xcd, 0x81, 0x4e,
+ 0x2f, 0xe0, 0xac, 0x63, 0x34, 0xfb, 0xb7, 0x78,
+ 0x47, 0x88, 0xc4, 0x0b, 0x5c, 0x93, 0xdf, 0x10,
+ 0x71, 0xbe, 0xf2, 0x3d, 0x6a, 0xa5, 0xe9, 0x26,
+ 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c,
+ 0x1d, 0xd2, 0x9e, 0x51, 0x06, 0xc9, 0x85, 0x4a,
+ 0x9f, 0x50, 0x1c, 0xd3, 0x84, 0x4b, 0x07, 0xc8,
+ 0xa9, 0x66, 0x2a, 0xe5, 0xb2, 0x7d, 0x31, 0xfe,
+ 0xf3, 0x3c, 0x70, 0xbf, 0xe8, 0x27, 0x6b, 0xa4,
+ 0xc5, 0x0a, 0x46, 0x89, 0xde, 0x11, 0x5d, 0x92,
+ 0xea, 0x25, 0x69, 0xa6, 0xf1, 0x3e, 0x72, 0xbd,
+ 0xdc, 0x13, 0x5f, 0x90, 0xc7, 0x08, 0x44, 0x8b,
+ 0x86, 0x49, 0x05, 0xca, 0x9d, 0x52, 0x1e, 0xd1,
+ 0xb0, 0x7f, 0x33, 0xfc, 0xab, 0x64, 0x28, 0xe7,
+ 0x32, 0xfd, 0xb1, 0x7e, 0x29, 0xe6, 0xaa, 0x65,
+ 0x04, 0xcb, 0x87, 0x48, 0x1f, 0xd0, 0x9c, 0x53,
+ 0x5e, 0x91, 0xdd, 0x12, 0x45, 0x8a, 0xc6, 0x09,
+ 0x68, 0xa7, 0xeb, 0x24, 0x73, 0xbc, 0xf0, 0x3f,
+ },
+ {
+ 0x00, 0xd0, 0xbd, 0x6d, 0x67, 0xb7, 0xda, 0x0a,
+ 0xce, 0x1e, 0x73, 0xa3, 0xa9, 0x79, 0x14, 0xc4,
+ 0x81, 0x51, 0x3c, 0xec, 0xe6, 0x36, 0x5b, 0x8b,
+ 0x4f, 0x9f, 0xf2, 0x22, 0x28, 0xf8, 0x95, 0x45,
+ 0x1f, 0xcf, 0xa2, 0x72, 0x78, 0xa8, 0xc5, 0x15,
+ 0xd1, 0x01, 0x6c, 0xbc, 0xb6, 0x66, 0x0b, 0xdb,
+ 0x9e, 0x4e, 0x23, 0xf3, 0xf9, 0x29, 0x44, 0x94,
+ 0x50, 0x80, 0xed, 0x3d, 0x37, 0xe7, 0x8a, 0x5a,
+ 0x3e, 0xee, 0x83, 0x53, 0x59, 0x89, 0xe4, 0x34,
+ 0xf0, 0x20, 0x4d, 0x9d, 0x97, 0x47, 0x2a, 0xfa,
+ 0xbf, 0x6f, 0x02, 0xd2, 0xd8, 0x08, 0x65, 0xb5,
+ 0x71, 0xa1, 0xcc, 0x1c, 0x16, 0xc6, 0xab, 0x7b,
+ 0x21, 0xf1, 0x9c, 0x4c, 0x46, 0x96, 0xfb, 0x2b,
+ 0xef, 0x3f, 0x52, 0x82, 0x88, 0x58, 0x35, 0xe5,
+ 0xa0, 0x70, 0x1d, 0xcd, 0xc7, 0x17, 0x7a, 0xaa,
+ 0x6e, 0xbe, 0xd3, 0x03, 0x09, 0xd9, 0xb4, 0x64,
+ 0x7c, 0xac, 0xc1, 0x11, 0x1b, 0xcb, 0xa6, 0x76,
+ 0xb2, 0x62, 0x0f, 0xdf, 0xd5, 0x05, 0x68, 0xb8,
+ 0xfd, 0x2d, 0x40, 0x90, 0x9a, 0x4a, 0x27, 0xf7,
+ 0x33, 0xe3, 0x8e, 0x5e, 0x54, 0x84, 0xe9, 0x39,
+ 0x63, 0xb3, 0xde, 0x0e, 0x04, 0xd4, 0xb9, 0x69,
+ 0xad, 0x7d, 0x10, 0xc0, 0xca, 0x1a, 0x77, 0xa7,
+ 0xe2, 0x32, 0x5f, 0x8f, 0x85, 0x55, 0x38, 0xe8,
+ 0x2c, 0xfc, 0x91, 0x41, 0x4b, 0x9b, 0xf6, 0x26,
+ 0x42, 0x92, 0xff, 0x2f, 0x25, 0xf5, 0x98, 0x48,
+ 0x8c, 0x5c, 0x31, 0xe1, 0xeb, 0x3b, 0x56, 0x86,
+ 0xc3, 0x13, 0x7e, 0xae, 0xa4, 0x74, 0x19, 0xc9,
+ 0x0d, 0xdd, 0xb0, 0x60, 0x6a, 0xba, 0xd7, 0x07,
+ 0x5d, 0x8d, 0xe0, 0x30, 0x3a, 0xea, 0x87, 0x57,
+ 0x93, 0x43, 0x2e, 0xfe, 0xf4, 0x24, 0x49, 0x99,
+ 0xdc, 0x0c, 0x61, 0xb1, 0xbb, 0x6b, 0x06, 0xd6,
+ 0x12, 0xc2, 0xaf, 0x7f, 0x75, 0xa5, 0xc8, 0x18,
+ },
+ {
+ 0x00, 0xd1, 0xbf, 0x6e, 0x63, 0xb2, 0xdc, 0x0d,
+ 0xc6, 0x17, 0x79, 0xa8, 0xa5, 0x74, 0x1a, 0xcb,
+ 0x91, 0x40, 0x2e, 0xff, 0xf2, 0x23, 0x4d, 0x9c,
+ 0x57, 0x86, 0xe8, 0x39, 0x34, 0xe5, 0x8b, 0x5a,
+ 0x3f, 0xee, 0x80, 0x51, 0x5c, 0x8d, 0xe3, 0x32,
+ 0xf9, 0x28, 0x46, 0x97, 0x9a, 0x4b, 0x25, 0xf4,
+ 0xae, 0x7f, 0x11, 0xc0, 0xcd, 0x1c, 0x72, 0xa3,
+ 0x68, 0xb9, 0xd7, 0x06, 0x0b, 0xda, 0xb4, 0x65,
+ 0x7e, 0xaf, 0xc1, 0x10, 0x1d, 0xcc, 0xa2, 0x73,
+ 0xb8, 0x69, 0x07, 0xd6, 0xdb, 0x0a, 0x64, 0xb5,
+ 0xef, 0x3e, 0x50, 0x81, 0x8c, 0x5d, 0x33, 0xe2,
+ 0x29, 0xf8, 0x96, 0x47, 0x4a, 0x9b, 0xf5, 0x24,
+ 0x41, 0x90, 0xfe, 0x2f, 0x22, 0xf3, 0x9d, 0x4c,
+ 0x87, 0x56, 0x38, 0xe9, 0xe4, 0x35, 0x5b, 0x8a,
+ 0xd0, 0x01, 0x6f, 0xbe, 0xb3, 0x62, 0x0c, 0xdd,
+ 0x16, 0xc7, 0xa9, 0x78, 0x75, 0xa4, 0xca, 0x1b,
+ 0xfc, 0x2d, 0x43, 0x92, 0x9f, 0x4e, 0x20, 0xf1,
+ 0x3a, 0xeb, 0x85, 0x54, 0x59, 0x88, 0xe6, 0x37,
+ 0x6d, 0xbc, 0xd2, 0x03, 0x0e, 0xdf, 0xb1, 0x60,
+ 0xab, 0x7a, 0x14, 0xc5, 0xc8, 0x19, 0x77, 0xa6,
+ 0xc3, 0x12, 0x7c, 0xad, 0xa0, 0x71, 0x1f, 0xce,
+ 0x05, 0xd4, 0xba, 0x6b, 0x66, 0xb7, 0xd9, 0x08,
+ 0x52, 0x83, 0xed, 0x3c, 0x31, 0xe0, 0x8e, 0x5f,
+ 0x94, 0x45, 0x2b, 0xfa, 0xf7, 0x26, 0x48, 0x99,
+ 0x82, 0x53, 0x3d, 0xec, 0xe1, 0x30, 0x5e, 0x8f,
+ 0x44, 0x95, 0xfb, 0x2a, 0x27, 0xf6, 0x98, 0x49,
+ 0x13, 0xc2, 0xac, 0x7d, 0x70, 0xa1, 0xcf, 0x1e,
+ 0xd5, 0x04, 0x6a, 0xbb, 0xb6, 0x67, 0x09, 0xd8,
+ 0xbd, 0x6c, 0x02, 0xd3, 0xde, 0x0f, 0x61, 0xb0,
+ 0x7b, 0xaa, 0xc4, 0x15, 0x18, 0xc9, 0xa7, 0x76,
+ 0x2c, 0xfd, 0x93, 0x42, 0x4f, 0x9e, 0xf0, 0x21,
+ 0xea, 0x3b, 0x55, 0x84, 0x89, 0x58, 0x36, 0xe7,
+ },
+ {
+ 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda,
+ 0xa1, 0x73, 0x18, 0xca, 0xce, 0x1c, 0x77, 0xa5,
+ 0x7f, 0xad, 0xc6, 0x14, 0x10, 0xc2, 0xa9, 0x7b,
+ 0x5f, 0x8d, 0xe6, 0x34, 0x30, 0xe2, 0x89, 0x5b,
+ 0x81, 0x53, 0x38, 0xea, 0xee, 0x3c, 0x57, 0x85,
+ 0xfe, 0x2c, 0x47, 0x95, 0x91, 0x43, 0x28, 0xfa,
+ 0x20, 0xf2, 0x99, 0x4b, 0x4f, 0x9d, 0xf6, 0x24,
+ 0xbe, 0x6c, 0x07, 0xd5, 0xd1, 0x03, 0x68, 0xba,
+ 0x60, 0xb2, 0xd9, 0x0b, 0x0f, 0xdd, 0xb6, 0x64,
+ 0x1f, 0xcd, 0xa6, 0x74, 0x70, 0xa2, 0xc9, 0x1b,
+ 0xc1, 0x13, 0x78, 0xaa, 0xae, 0x7c, 0x17, 0xc5,
+ 0xe1, 0x33, 0x58, 0x8a, 0x8e, 0x5c, 0x37, 0xe5,
+ 0x3f, 0xed, 0x86, 0x54, 0x50, 0x82, 0xe9, 0x3b,
+ 0x40, 0x92, 0xf9, 0x2b, 0x2f, 0xfd, 0x96, 0x44,
+ 0x9e, 0x4c, 0x27, 0xf5, 0xf1, 0x23, 0x48, 0x9a,
+ 0x61, 0xb3, 0xd8, 0x0a, 0x0e, 0xdc, 0xb7, 0x65,
+ 0xbf, 0x6d, 0x06, 0xd4, 0xd0, 0x02, 0x69, 0xbb,
+ 0xc0, 0x12, 0x79, 0xab, 0xaf, 0x7d, 0x16, 0xc4,
+ 0x1e, 0xcc, 0xa7, 0x75, 0x71, 0xa3, 0xc8, 0x1a,
+ 0x3e, 0xec, 0x87, 0x55, 0x51, 0x83, 0xe8, 0x3a,
+ 0xe0, 0x32, 0x59, 0x8b, 0x8f, 0x5d, 0x36, 0xe4,
+ 0x9f, 0x4d, 0x26, 0xf4, 0xf0, 0x22, 0x49, 0x9b,
+ 0x41, 0x93, 0xf8, 0x2a, 0x2e, 0xfc, 0x97, 0x45,
+ 0xdf, 0x0d, 0x66, 0xb4, 0xb0, 0x62, 0x09, 0xdb,
+ 0x01, 0xd3, 0xb8, 0x6a, 0x6e, 0xbc, 0xd7, 0x05,
+ 0x7e, 0xac, 0xc7, 0x15, 0x11, 0xc3, 0xa8, 0x7a,
+ 0xa0, 0x72, 0x19, 0xcb, 0xcf, 0x1d, 0x76, 0xa4,
+ 0x80, 0x52, 0x39, 0xeb, 0xef, 0x3d, 0x56, 0x84,
+ 0x5e, 0x8c, 0xe7, 0x35, 0x31, 0xe3, 0x88, 0x5a,
+ 0x21, 0xf3, 0x98, 0x4a, 0x4e, 0x9c, 0xf7, 0x25,
+ 0xff, 0x2d, 0x46, 0x94, 0x90, 0x42, 0x29, 0xfb,
+ },
+ {
+ 0x00, 0xd3, 0xbb, 0x68, 0x6b, 0xb8, 0xd0, 0x03,
+ 0xd6, 0x05, 0x6d, 0xbe, 0xbd, 0x6e, 0x06, 0xd5,
+ 0xb1, 0x62, 0x0a, 0xd9, 0xda, 0x09, 0x61, 0xb2,
+ 0x67, 0xb4, 0xdc, 0x0f, 0x0c, 0xdf, 0xb7, 0x64,
+ 0x7f, 0xac, 0xc4, 0x17, 0x14, 0xc7, 0xaf, 0x7c,
+ 0xa9, 0x7a, 0x12, 0xc1, 0xc2, 0x11, 0x79, 0xaa,
+ 0xce, 0x1d, 0x75, 0xa6, 0xa5, 0x76, 0x1e, 0xcd,
+ 0x18, 0xcb, 0xa3, 0x70, 0x73, 0xa0, 0xc8, 0x1b,
+ 0xfe, 0x2d, 0x45, 0x96, 0x95, 0x46, 0x2e, 0xfd,
+ 0x28, 0xfb, 0x93, 0x40, 0x43, 0x90, 0xf8, 0x2b,
+ 0x4f, 0x9c, 0xf4, 0x27, 0x24, 0xf7, 0x9f, 0x4c,
+ 0x99, 0x4a, 0x22, 0xf1, 0xf2, 0x21, 0x49, 0x9a,
+ 0x81, 0x52, 0x3a, 0xe9, 0xea, 0x39, 0x51, 0x82,
+ 0x57, 0x84, 0xec, 0x3f, 0x3c, 0xef, 0x87, 0x54,
+ 0x30, 0xe3, 0x8b, 0x58, 0x5b, 0x88, 0xe0, 0x33,
+ 0xe6, 0x35, 0x5d, 0x8e, 0x8d, 0x5e, 0x36, 0xe5,
+ 0xe1, 0x32, 0x5a, 0x89, 0x8a, 0x59, 0x31, 0xe2,
+ 0x37, 0xe4, 0x8c, 0x5f, 0x5c, 0x8f, 0xe7, 0x34,
+ 0x50, 0x83, 0xeb, 0x38, 0x3b, 0xe8, 0x80, 0x53,
+ 0x86, 0x55, 0x3d, 0xee, 0xed, 0x3e, 0x56, 0x85,
+ 0x9e, 0x4d, 0x25, 0xf6, 0xf5, 0x26, 0x4e, 0x9d,
+ 0x48, 0x9b, 0xf3, 0x20, 0x23, 0xf0, 0x98, 0x4b,
+ 0x2f, 0xfc, 0x94, 0x47, 0x44, 0x97, 0xff, 0x2c,
+ 0xf9, 0x2a, 0x42, 0x91, 0x92, 0x41, 0x29, 0xfa,
+ 0x1f, 0xcc, 0xa4, 0x77, 0x74, 0xa7, 0xcf, 0x1c,
+ 0xc9, 0x1a, 0x72, 0xa1, 0xa2, 0x71, 0x19, 0xca,
+ 0xae, 0x7d, 0x15, 0xc6, 0xc5, 0x16, 0x7e, 0xad,
+ 0x78, 0xab, 0xc3, 0x10, 0x13, 0xc0, 0xa8, 0x7b,
+ 0x60, 0xb3, 0xdb, 0x08, 0x0b, 0xd8, 0xb0, 0x63,
+ 0xb6, 0x65, 0x0d, 0xde, 0xdd, 0x0e, 0x66, 0xb5,
+ 0xd1, 0x02, 0x6a, 0xb9, 0xba, 0x69, 0x01, 0xd2,
+ 0x07, 0xd4, 0xbc, 0x6f, 0x6c, 0xbf, 0xd7, 0x04,
+ },
+ {
+ 0x00, 0xd4, 0xb5, 0x61, 0x77, 0xa3, 0xc2, 0x16,
+ 0xee, 0x3a, 0x5b, 0x8f, 0x99, 0x4d, 0x2c, 0xf8,
+ 0xc1, 0x15, 0x74, 0xa0, 0xb6, 0x62, 0x03, 0xd7,
+ 0x2f, 0xfb, 0x9a, 0x4e, 0x58, 0x8c, 0xed, 0x39,
+ 0x9f, 0x4b, 0x2a, 0xfe, 0xe8, 0x3c, 0x5d, 0x89,
+ 0x71, 0xa5, 0xc4, 0x10, 0x06, 0xd2, 0xb3, 0x67,
+ 0x5e, 0x8a, 0xeb, 0x3f, 0x29, 0xfd, 0x9c, 0x48,
+ 0xb0, 0x64, 0x05, 0xd1, 0xc7, 0x13, 0x72, 0xa6,
+ 0x23, 0xf7, 0x96, 0x42, 0x54, 0x80, 0xe1, 0x35,
+ 0xcd, 0x19, 0x78, 0xac, 0xba, 0x6e, 0x0f, 0xdb,
+ 0xe2, 0x36, 0x57, 0x83, 0x95, 0x41, 0x20, 0xf4,
+ 0x0c, 0xd8, 0xb9, 0x6d, 0x7b, 0xaf, 0xce, 0x1a,
+ 0xbc, 0x68, 0x09, 0xdd, 0xcb, 0x1f, 0x7e, 0xaa,
+ 0x52, 0x86, 0xe7, 0x33, 0x25, 0xf1, 0x90, 0x44,
+ 0x7d, 0xa9, 0xc8, 0x1c, 0x0a, 0xde, 0xbf, 0x6b,
+ 0x93, 0x47, 0x26, 0xf2, 0xe4, 0x30, 0x51, 0x85,
+ 0x46, 0x92, 0xf3, 0x27, 0x31, 0xe5, 0x84, 0x50,
+ 0xa8, 0x7c, 0x1d, 0xc9, 0xdf, 0x0b, 0x6a, 0xbe,
+ 0x87, 0x53, 0x32, 0xe6, 0xf0, 0x24, 0x45, 0x91,
+ 0x69, 0xbd, 0xdc, 0x08, 0x1e, 0xca, 0xab, 0x7f,
+ 0xd9, 0x0d, 0x6c, 0xb8, 0xae, 0x7a, 0x1b, 0xcf,
+ 0x37, 0xe3, 0x82, 0x56, 0x40, 0x94, 0xf5, 0x21,
+ 0x18, 0xcc, 0xad, 0x79, 0x6f, 0xbb, 0xda, 0x0e,
+ 0xf6, 0x22, 0x43, 0x97, 0x81, 0x55, 0x34, 0xe0,
+ 0x65, 0xb1, 0xd0, 0x04, 0x12, 0xc6, 0xa7, 0x73,
+ 0x8b, 0x5f, 0x3e, 0xea, 0xfc, 0x28, 0x49, 0x9d,
+ 0xa4, 0x70, 0x11, 0xc5, 0xd3, 0x07, 0x66, 0xb2,
+ 0x4a, 0x9e, 0xff, 0x2b, 0x3d, 0xe9, 0x88, 0x5c,
+ 0xfa, 0x2e, 0x4f, 0x9b, 0x8d, 0x59, 0x38, 0xec,
+ 0x14, 0xc0, 0xa1, 0x75, 0x63, 0xb7, 0xd6, 0x02,
+ 0x3b, 0xef, 0x8e, 0x5a, 0x4c, 0x98, 0xf9, 0x2d,
+ 0xd5, 0x01, 0x60, 0xb4, 0xa2, 0x76, 0x17, 0xc3,
+ },
+ {
+ 0x00, 0xd5, 0xb7, 0x62, 0x73, 0xa6, 0xc4, 0x11,
+ 0xe6, 0x33, 0x51, 0x84, 0x95, 0x40, 0x22, 0xf7,
+ 0xd1, 0x04, 0x66, 0xb3, 0xa2, 0x77, 0x15, 0xc0,
+ 0x37, 0xe2, 0x80, 0x55, 0x44, 0x91, 0xf3, 0x26,
+ 0xbf, 0x6a, 0x08, 0xdd, 0xcc, 0x19, 0x7b, 0xae,
+ 0x59, 0x8c, 0xee, 0x3b, 0x2a, 0xff, 0x9d, 0x48,
+ 0x6e, 0xbb, 0xd9, 0x0c, 0x1d, 0xc8, 0xaa, 0x7f,
+ 0x88, 0x5d, 0x3f, 0xea, 0xfb, 0x2e, 0x4c, 0x99,
+ 0x63, 0xb6, 0xd4, 0x01, 0x10, 0xc5, 0xa7, 0x72,
+ 0x85, 0x50, 0x32, 0xe7, 0xf6, 0x23, 0x41, 0x94,
+ 0xb2, 0x67, 0x05, 0xd0, 0xc1, 0x14, 0x76, 0xa3,
+ 0x54, 0x81, 0xe3, 0x36, 0x27, 0xf2, 0x90, 0x45,
+ 0xdc, 0x09, 0x6b, 0xbe, 0xaf, 0x7a, 0x18, 0xcd,
+ 0x3a, 0xef, 0x8d, 0x58, 0x49, 0x9c, 0xfe, 0x2b,
+ 0x0d, 0xd8, 0xba, 0x6f, 0x7e, 0xab, 0xc9, 0x1c,
+ 0xeb, 0x3e, 0x5c, 0x89, 0x98, 0x4d, 0x2f, 0xfa,
+ 0xc6, 0x13, 0x71, 0xa4, 0xb5, 0x60, 0x02, 0xd7,
+ 0x20, 0xf5, 0x97, 0x42, 0x53, 0x86, 0xe4, 0x31,
+ 0x17, 0xc2, 0xa0, 0x75, 0x64, 0xb1, 0xd3, 0x06,
+ 0xf1, 0x24, 0x46, 0x93, 0x82, 0x57, 0x35, 0xe0,
+ 0x79, 0xac, 0xce, 0x1b, 0x0a, 0xdf, 0xbd, 0x68,
+ 0x9f, 0x4a, 0x28, 0xfd, 0xec, 0x39, 0x5b, 0x8e,
+ 0xa8, 0x7d, 0x1f, 0xca, 0xdb, 0x0e, 0x6c, 0xb9,
+ 0x4e, 0x9b, 0xf9, 0x2c, 0x3d, 0xe8, 0x8a, 0x5f,
+ 0xa5, 0x70, 0x12, 0xc7, 0xd6, 0x03, 0x61, 0xb4,
+ 0x43, 0x96, 0xf4, 0x21, 0x30, 0xe5, 0x87, 0x52,
+ 0x74, 0xa1, 0xc3, 0x16, 0x07, 0xd2, 0xb0, 0x65,
+ 0x92, 0x47, 0x25, 0xf0, 0xe1, 0x34, 0x56, 0x83,
+ 0x1a, 0xcf, 0xad, 0x78, 0x69, 0xbc, 0xde, 0x0b,
+ 0xfc, 0x29, 0x4b, 0x9e, 0x8f, 0x5a, 0x38, 0xed,
+ 0xcb, 0x1e, 0x7c, 0xa9, 0xb8, 0x6d, 0x0f, 0xda,
+ 0x2d, 0xf8, 0x9a, 0x4f, 0x5e, 0x8b, 0xe9, 0x3c,
+ },
+ {
+ 0x00, 0xd6, 0xb1, 0x67, 0x7f, 0xa9, 0xce, 0x18,
+ 0xfe, 0x28, 0x4f, 0x99, 0x81, 0x57, 0x30, 0xe6,
+ 0xe1, 0x37, 0x50, 0x86, 0x9e, 0x48, 0x2f, 0xf9,
+ 0x1f, 0xc9, 0xae, 0x78, 0x60, 0xb6, 0xd1, 0x07,
+ 0xdf, 0x09, 0x6e, 0xb8, 0xa0, 0x76, 0x11, 0xc7,
+ 0x21, 0xf7, 0x90, 0x46, 0x5e, 0x88, 0xef, 0x39,
+ 0x3e, 0xe8, 0x8f, 0x59, 0x41, 0x97, 0xf0, 0x26,
+ 0xc0, 0x16, 0x71, 0xa7, 0xbf, 0x69, 0x0e, 0xd8,
+ 0xa3, 0x75, 0x12, 0xc4, 0xdc, 0x0a, 0x6d, 0xbb,
+ 0x5d, 0x8b, 0xec, 0x3a, 0x22, 0xf4, 0x93, 0x45,
+ 0x42, 0x94, 0xf3, 0x25, 0x3d, 0xeb, 0x8c, 0x5a,
+ 0xbc, 0x6a, 0x0d, 0xdb, 0xc3, 0x15, 0x72, 0xa4,
+ 0x7c, 0xaa, 0xcd, 0x1b, 0x03, 0xd5, 0xb2, 0x64,
+ 0x82, 0x54, 0x33, 0xe5, 0xfd, 0x2b, 0x4c, 0x9a,
+ 0x9d, 0x4b, 0x2c, 0xfa, 0xe2, 0x34, 0x53, 0x85,
+ 0x63, 0xb5, 0xd2, 0x04, 0x1c, 0xca, 0xad, 0x7b,
+ 0x5b, 0x8d, 0xea, 0x3c, 0x24, 0xf2, 0x95, 0x43,
+ 0xa5, 0x73, 0x14, 0xc2, 0xda, 0x0c, 0x6b, 0xbd,
+ 0xba, 0x6c, 0x0b, 0xdd, 0xc5, 0x13, 0x74, 0xa2,
+ 0x44, 0x92, 0xf5, 0x23, 0x3b, 0xed, 0x8a, 0x5c,
+ 0x84, 0x52, 0x35, 0xe3, 0xfb, 0x2d, 0x4a, 0x9c,
+ 0x7a, 0xac, 0xcb, 0x1d, 0x05, 0xd3, 0xb4, 0x62,
+ 0x65, 0xb3, 0xd4, 0x02, 0x1a, 0xcc, 0xab, 0x7d,
+ 0x9b, 0x4d, 0x2a, 0xfc, 0xe4, 0x32, 0x55, 0x83,
+ 0xf8, 0x2e, 0x49, 0x9f, 0x87, 0x51, 0x36, 0xe0,
+ 0x06, 0xd0, 0xb7, 0x61, 0x79, 0xaf, 0xc8, 0x1e,
+ 0x19, 0xcf, 0xa8, 0x7e, 0x66, 0xb0, 0xd7, 0x01,
+ 0xe7, 0x31, 0x56, 0x80, 0x98, 0x4e, 0x29, 0xff,
+ 0x27, 0xf1, 0x96, 0x40, 0x58, 0x8e, 0xe9, 0x3f,
+ 0xd9, 0x0f, 0x68, 0xbe, 0xa6, 0x70, 0x17, 0xc1,
+ 0xc6, 0x10, 0x77, 0xa1, 0xb9, 0x6f, 0x08, 0xde,
+ 0x38, 0xee, 0x89, 0x5f, 0x47, 0x91, 0xf6, 0x20,
+ },
+ {
+ 0x00, 0xd7, 0xb3, 0x64, 0x7b, 0xac, 0xc8, 0x1f,
+ 0xf6, 0x21, 0x45, 0x92, 0x8d, 0x5a, 0x3e, 0xe9,
+ 0xf1, 0x26, 0x42, 0x95, 0x8a, 0x5d, 0x39, 0xee,
+ 0x07, 0xd0, 0xb4, 0x63, 0x7c, 0xab, 0xcf, 0x18,
+ 0xff, 0x28, 0x4c, 0x9b, 0x84, 0x53, 0x37, 0xe0,
+ 0x09, 0xde, 0xba, 0x6d, 0x72, 0xa5, 0xc1, 0x16,
+ 0x0e, 0xd9, 0xbd, 0x6a, 0x75, 0xa2, 0xc6, 0x11,
+ 0xf8, 0x2f, 0x4b, 0x9c, 0x83, 0x54, 0x30, 0xe7,
+ 0xe3, 0x34, 0x50, 0x87, 0x98, 0x4f, 0x2b, 0xfc,
+ 0x15, 0xc2, 0xa6, 0x71, 0x6e, 0xb9, 0xdd, 0x0a,
+ 0x12, 0xc5, 0xa1, 0x76, 0x69, 0xbe, 0xda, 0x0d,
+ 0xe4, 0x33, 0x57, 0x80, 0x9f, 0x48, 0x2c, 0xfb,
+ 0x1c, 0xcb, 0xaf, 0x78, 0x67, 0xb0, 0xd4, 0x03,
+ 0xea, 0x3d, 0x59, 0x8e, 0x91, 0x46, 0x22, 0xf5,
+ 0xed, 0x3a, 0x5e, 0x89, 0x96, 0x41, 0x25, 0xf2,
+ 0x1b, 0xcc, 0xa8, 0x7f, 0x60, 0xb7, 0xd3, 0x04,
+ 0xdb, 0x0c, 0x68, 0xbf, 0xa0, 0x77, 0x13, 0xc4,
+ 0x2d, 0xfa, 0x9e, 0x49, 0x56, 0x81, 0xe5, 0x32,
+ 0x2a, 0xfd, 0x99, 0x4e, 0x51, 0x86, 0xe2, 0x35,
+ 0xdc, 0x0b, 0x6f, 0xb8, 0xa7, 0x70, 0x14, 0xc3,
+ 0x24, 0xf3, 0x97, 0x40, 0x5f, 0x88, 0xec, 0x3b,
+ 0xd2, 0x05, 0x61, 0xb6, 0xa9, 0x7e, 0x1a, 0xcd,
+ 0xd5, 0x02, 0x66, 0xb1, 0xae, 0x79, 0x1d, 0xca,
+ 0x23, 0xf4, 0x90, 0x47, 0x58, 0x8f, 0xeb, 0x3c,
+ 0x38, 0xef, 0x8b, 0x5c, 0x43, 0x94, 0xf0, 0x27,
+ 0xce, 0x19, 0x7d, 0xaa, 0xb5, 0x62, 0x06, 0xd1,
+ 0xc9, 0x1e, 0x7a, 0xad, 0xb2, 0x65, 0x01, 0xd6,
+ 0x3f, 0xe8, 0x8c, 0x5b, 0x44, 0x93, 0xf7, 0x20,
+ 0xc7, 0x10, 0x74, 0xa3, 0xbc, 0x6b, 0x0f, 0xd8,
+ 0x31, 0xe6, 0x82, 0x55, 0x4a, 0x9d, 0xf9, 0x2e,
+ 0x36, 0xe1, 0x85, 0x52, 0x4d, 0x9a, 0xfe, 0x29,
+ 0xc0, 0x17, 0x73, 0xa4, 0xbb, 0x6c, 0x08, 0xdf,
+ },
+ {
+ 0x00, 0xd8, 0xad, 0x75, 0x47, 0x9f, 0xea, 0x32,
+ 0x8e, 0x56, 0x23, 0xfb, 0xc9, 0x11, 0x64, 0xbc,
+ 0x01, 0xd9, 0xac, 0x74, 0x46, 0x9e, 0xeb, 0x33,
+ 0x8f, 0x57, 0x22, 0xfa, 0xc8, 0x10, 0x65, 0xbd,
+ 0x02, 0xda, 0xaf, 0x77, 0x45, 0x9d, 0xe8, 0x30,
+ 0x8c, 0x54, 0x21, 0xf9, 0xcb, 0x13, 0x66, 0xbe,
+ 0x03, 0xdb, 0xae, 0x76, 0x44, 0x9c, 0xe9, 0x31,
+ 0x8d, 0x55, 0x20, 0xf8, 0xca, 0x12, 0x67, 0xbf,
+ 0x04, 0xdc, 0xa9, 0x71, 0x43, 0x9b, 0xee, 0x36,
+ 0x8a, 0x52, 0x27, 0xff, 0xcd, 0x15, 0x60, 0xb8,
+ 0x05, 0xdd, 0xa8, 0x70, 0x42, 0x9a, 0xef, 0x37,
+ 0x8b, 0x53, 0x26, 0xfe, 0xcc, 0x14, 0x61, 0xb9,
+ 0x06, 0xde, 0xab, 0x73, 0x41, 0x99, 0xec, 0x34,
+ 0x88, 0x50, 0x25, 0xfd, 0xcf, 0x17, 0x62, 0xba,
+ 0x07, 0xdf, 0xaa, 0x72, 0x40, 0x98, 0xed, 0x35,
+ 0x89, 0x51, 0x24, 0xfc, 0xce, 0x16, 0x63, 0xbb,
+ 0x08, 0xd0, 0xa5, 0x7d, 0x4f, 0x97, 0xe2, 0x3a,
+ 0x86, 0x5e, 0x2b, 0xf3, 0xc1, 0x19, 0x6c, 0xb4,
+ 0x09, 0xd1, 0xa4, 0x7c, 0x4e, 0x96, 0xe3, 0x3b,
+ 0x87, 0x5f, 0x2a, 0xf2, 0xc0, 0x18, 0x6d, 0xb5,
+ 0x0a, 0xd2, 0xa7, 0x7f, 0x4d, 0x95, 0xe0, 0x38,
+ 0x84, 0x5c, 0x29, 0xf1, 0xc3, 0x1b, 0x6e, 0xb6,
+ 0x0b, 0xd3, 0xa6, 0x7e, 0x4c, 0x94, 0xe1, 0x39,
+ 0x85, 0x5d, 0x28, 0xf0, 0xc2, 0x1a, 0x6f, 0xb7,
+ 0x0c, 0xd4, 0xa1, 0x79, 0x4b, 0x93, 0xe6, 0x3e,
+ 0x82, 0x5a, 0x2f, 0xf7, 0xc5, 0x1d, 0x68, 0xb0,
+ 0x0d, 0xd5, 0xa0, 0x78, 0x4a, 0x92, 0xe7, 0x3f,
+ 0x83, 0x5b, 0x2e, 0xf6, 0xc4, 0x1c, 0x69, 0xb1,
+ 0x0e, 0xd6, 0xa3, 0x7b, 0x49, 0x91, 0xe4, 0x3c,
+ 0x80, 0x58, 0x2d, 0xf5, 0xc7, 0x1f, 0x6a, 0xb2,
+ 0x0f, 0xd7, 0xa2, 0x7a, 0x48, 0x90, 0xe5, 0x3d,
+ 0x81, 0x59, 0x2c, 0xf4, 0xc6, 0x1e, 0x6b, 0xb3,
+ },
+ {
+ 0x00, 0xd9, 0xaf, 0x76, 0x43, 0x9a, 0xec, 0x35,
+ 0x86, 0x5f, 0x29, 0xf0, 0xc5, 0x1c, 0x6a, 0xb3,
+ 0x11, 0xc8, 0xbe, 0x67, 0x52, 0x8b, 0xfd, 0x24,
+ 0x97, 0x4e, 0x38, 0xe1, 0xd4, 0x0d, 0x7b, 0xa2,
+ 0x22, 0xfb, 0x8d, 0x54, 0x61, 0xb8, 0xce, 0x17,
+ 0xa4, 0x7d, 0x0b, 0xd2, 0xe7, 0x3e, 0x48, 0x91,
+ 0x33, 0xea, 0x9c, 0x45, 0x70, 0xa9, 0xdf, 0x06,
+ 0xb5, 0x6c, 0x1a, 0xc3, 0xf6, 0x2f, 0x59, 0x80,
+ 0x44, 0x9d, 0xeb, 0x32, 0x07, 0xde, 0xa8, 0x71,
+ 0xc2, 0x1b, 0x6d, 0xb4, 0x81, 0x58, 0x2e, 0xf7,
+ 0x55, 0x8c, 0xfa, 0x23, 0x16, 0xcf, 0xb9, 0x60,
+ 0xd3, 0x0a, 0x7c, 0xa5, 0x90, 0x49, 0x3f, 0xe6,
+ 0x66, 0xbf, 0xc9, 0x10, 0x25, 0xfc, 0x8a, 0x53,
+ 0xe0, 0x39, 0x4f, 0x96, 0xa3, 0x7a, 0x0c, 0xd5,
+ 0x77, 0xae, 0xd8, 0x01, 0x34, 0xed, 0x9b, 0x42,
+ 0xf1, 0x28, 0x5e, 0x87, 0xb2, 0x6b, 0x1d, 0xc4,
+ 0x88, 0x51, 0x27, 0xfe, 0xcb, 0x12, 0x64, 0xbd,
+ 0x0e, 0xd7, 0xa1, 0x78, 0x4d, 0x94, 0xe2, 0x3b,
+ 0x99, 0x40, 0x36, 0xef, 0xda, 0x03, 0x75, 0xac,
+ 0x1f, 0xc6, 0xb0, 0x69, 0x5c, 0x85, 0xf3, 0x2a,
+ 0xaa, 0x73, 0x05, 0xdc, 0xe9, 0x30, 0x46, 0x9f,
+ 0x2c, 0xf5, 0x83, 0x5a, 0x6f, 0xb6, 0xc0, 0x19,
+ 0xbb, 0x62, 0x14, 0xcd, 0xf8, 0x21, 0x57, 0x8e,
+ 0x3d, 0xe4, 0x92, 0x4b, 0x7e, 0xa7, 0xd1, 0x08,
+ 0xcc, 0x15, 0x63, 0xba, 0x8f, 0x56, 0x20, 0xf9,
+ 0x4a, 0x93, 0xe5, 0x3c, 0x09, 0xd0, 0xa6, 0x7f,
+ 0xdd, 0x04, 0x72, 0xab, 0x9e, 0x47, 0x31, 0xe8,
+ 0x5b, 0x82, 0xf4, 0x2d, 0x18, 0xc1, 0xb7, 0x6e,
+ 0xee, 0x37, 0x41, 0x98, 0xad, 0x74, 0x02, 0xdb,
+ 0x68, 0xb1, 0xc7, 0x1e, 0x2b, 0xf2, 0x84, 0x5d,
+ 0xff, 0x26, 0x50, 0x89, 0xbc, 0x65, 0x13, 0xca,
+ 0x79, 0xa0, 0xd6, 0x0f, 0x3a, 0xe3, 0x95, 0x4c,
+ },
+ {
+ 0x00, 0xda, 0xa9, 0x73, 0x4f, 0x95, 0xe6, 0x3c,
+ 0x9e, 0x44, 0x37, 0xed, 0xd1, 0x0b, 0x78, 0xa2,
+ 0x21, 0xfb, 0x88, 0x52, 0x6e, 0xb4, 0xc7, 0x1d,
+ 0xbf, 0x65, 0x16, 0xcc, 0xf0, 0x2a, 0x59, 0x83,
+ 0x42, 0x98, 0xeb, 0x31, 0x0d, 0xd7, 0xa4, 0x7e,
+ 0xdc, 0x06, 0x75, 0xaf, 0x93, 0x49, 0x3a, 0xe0,
+ 0x63, 0xb9, 0xca, 0x10, 0x2c, 0xf6, 0x85, 0x5f,
+ 0xfd, 0x27, 0x54, 0x8e, 0xb2, 0x68, 0x1b, 0xc1,
+ 0x84, 0x5e, 0x2d, 0xf7, 0xcb, 0x11, 0x62, 0xb8,
+ 0x1a, 0xc0, 0xb3, 0x69, 0x55, 0x8f, 0xfc, 0x26,
+ 0xa5, 0x7f, 0x0c, 0xd6, 0xea, 0x30, 0x43, 0x99,
+ 0x3b, 0xe1, 0x92, 0x48, 0x74, 0xae, 0xdd, 0x07,
+ 0xc6, 0x1c, 0x6f, 0xb5, 0x89, 0x53, 0x20, 0xfa,
+ 0x58, 0x82, 0xf1, 0x2b, 0x17, 0xcd, 0xbe, 0x64,
+ 0xe7, 0x3d, 0x4e, 0x94, 0xa8, 0x72, 0x01, 0xdb,
+ 0x79, 0xa3, 0xd0, 0x0a, 0x36, 0xec, 0x9f, 0x45,
+ 0x15, 0xcf, 0xbc, 0x66, 0x5a, 0x80, 0xf3, 0x29,
+ 0x8b, 0x51, 0x22, 0xf8, 0xc4, 0x1e, 0x6d, 0xb7,
+ 0x34, 0xee, 0x9d, 0x47, 0x7b, 0xa1, 0xd2, 0x08,
+ 0xaa, 0x70, 0x03, 0xd9, 0xe5, 0x3f, 0x4c, 0x96,
+ 0x57, 0x8d, 0xfe, 0x24, 0x18, 0xc2, 0xb1, 0x6b,
+ 0xc9, 0x13, 0x60, 0xba, 0x86, 0x5c, 0x2f, 0xf5,
+ 0x76, 0xac, 0xdf, 0x05, 0x39, 0xe3, 0x90, 0x4a,
+ 0xe8, 0x32, 0x41, 0x9b, 0xa7, 0x7d, 0x0e, 0xd4,
+ 0x91, 0x4b, 0x38, 0xe2, 0xde, 0x04, 0x77, 0xad,
+ 0x0f, 0xd5, 0xa6, 0x7c, 0x40, 0x9a, 0xe9, 0x33,
+ 0xb0, 0x6a, 0x19, 0xc3, 0xff, 0x25, 0x56, 0x8c,
+ 0x2e, 0xf4, 0x87, 0x5d, 0x61, 0xbb, 0xc8, 0x12,
+ 0xd3, 0x09, 0x7a, 0xa0, 0x9c, 0x46, 0x35, 0xef,
+ 0x4d, 0x97, 0xe4, 0x3e, 0x02, 0xd8, 0xab, 0x71,
+ 0xf2, 0x28, 0x5b, 0x81, 0xbd, 0x67, 0x14, 0xce,
+ 0x6c, 0xb6, 0xc5, 0x1f, 0x23, 0xf9, 0x8a, 0x50,
+ },
+ {
+ 0x00, 0xdb, 0xab, 0x70, 0x4b, 0x90, 0xe0, 0x3b,
+ 0x96, 0x4d, 0x3d, 0xe6, 0xdd, 0x06, 0x76, 0xad,
+ 0x31, 0xea, 0x9a, 0x41, 0x7a, 0xa1, 0xd1, 0x0a,
+ 0xa7, 0x7c, 0x0c, 0xd7, 0xec, 0x37, 0x47, 0x9c,
+ 0x62, 0xb9, 0xc9, 0x12, 0x29, 0xf2, 0x82, 0x59,
+ 0xf4, 0x2f, 0x5f, 0x84, 0xbf, 0x64, 0x14, 0xcf,
+ 0x53, 0x88, 0xf8, 0x23, 0x18, 0xc3, 0xb3, 0x68,
+ 0xc5, 0x1e, 0x6e, 0xb5, 0x8e, 0x55, 0x25, 0xfe,
+ 0xc4, 0x1f, 0x6f, 0xb4, 0x8f, 0x54, 0x24, 0xff,
+ 0x52, 0x89, 0xf9, 0x22, 0x19, 0xc2, 0xb2, 0x69,
+ 0xf5, 0x2e, 0x5e, 0x85, 0xbe, 0x65, 0x15, 0xce,
+ 0x63, 0xb8, 0xc8, 0x13, 0x28, 0xf3, 0x83, 0x58,
+ 0xa6, 0x7d, 0x0d, 0xd6, 0xed, 0x36, 0x46, 0x9d,
+ 0x30, 0xeb, 0x9b, 0x40, 0x7b, 0xa0, 0xd0, 0x0b,
+ 0x97, 0x4c, 0x3c, 0xe7, 0xdc, 0x07, 0x77, 0xac,
+ 0x01, 0xda, 0xaa, 0x71, 0x4a, 0x91, 0xe1, 0x3a,
+ 0x95, 0x4e, 0x3e, 0xe5, 0xde, 0x05, 0x75, 0xae,
+ 0x03, 0xd8, 0xa8, 0x73, 0x48, 0x93, 0xe3, 0x38,
+ 0xa4, 0x7f, 0x0f, 0xd4, 0xef, 0x34, 0x44, 0x9f,
+ 0x32, 0xe9, 0x99, 0x42, 0x79, 0xa2, 0xd2, 0x09,
+ 0xf7, 0x2c, 0x5c, 0x87, 0xbc, 0x67, 0x17, 0xcc,
+ 0x61, 0xba, 0xca, 0x11, 0x2a, 0xf1, 0x81, 0x5a,
+ 0xc6, 0x1d, 0x6d, 0xb6, 0x8d, 0x56, 0x26, 0xfd,
+ 0x50, 0x8b, 0xfb, 0x20, 0x1b, 0xc0, 0xb0, 0x6b,
+ 0x51, 0x8a, 0xfa, 0x21, 0x1a, 0xc1, 0xb1, 0x6a,
+ 0xc7, 0x1c, 0x6c, 0xb7, 0x8c, 0x57, 0x27, 0xfc,
+ 0x60, 0xbb, 0xcb, 0x10, 0x2b, 0xf0, 0x80, 0x5b,
+ 0xf6, 0x2d, 0x5d, 0x86, 0xbd, 0x66, 0x16, 0xcd,
+ 0x33, 0xe8, 0x98, 0x43, 0x78, 0xa3, 0xd3, 0x08,
+ 0xa5, 0x7e, 0x0e, 0xd5, 0xee, 0x35, 0x45, 0x9e,
+ 0x02, 0xd9, 0xa9, 0x72, 0x49, 0x92, 0xe2, 0x39,
+ 0x94, 0x4f, 0x3f, 0xe4, 0xdf, 0x04, 0x74, 0xaf,
+ },
+ {
+ 0x00, 0xdc, 0xa5, 0x79, 0x57, 0x8b, 0xf2, 0x2e,
+ 0xae, 0x72, 0x0b, 0xd7, 0xf9, 0x25, 0x5c, 0x80,
+ 0x41, 0x9d, 0xe4, 0x38, 0x16, 0xca, 0xb3, 0x6f,
+ 0xef, 0x33, 0x4a, 0x96, 0xb8, 0x64, 0x1d, 0xc1,
+ 0x82, 0x5e, 0x27, 0xfb, 0xd5, 0x09, 0x70, 0xac,
+ 0x2c, 0xf0, 0x89, 0x55, 0x7b, 0xa7, 0xde, 0x02,
+ 0xc3, 0x1f, 0x66, 0xba, 0x94, 0x48, 0x31, 0xed,
+ 0x6d, 0xb1, 0xc8, 0x14, 0x3a, 0xe6, 0x9f, 0x43,
+ 0x19, 0xc5, 0xbc, 0x60, 0x4e, 0x92, 0xeb, 0x37,
+ 0xb7, 0x6b, 0x12, 0xce, 0xe0, 0x3c, 0x45, 0x99,
+ 0x58, 0x84, 0xfd, 0x21, 0x0f, 0xd3, 0xaa, 0x76,
+ 0xf6, 0x2a, 0x53, 0x8f, 0xa1, 0x7d, 0x04, 0xd8,
+ 0x9b, 0x47, 0x3e, 0xe2, 0xcc, 0x10, 0x69, 0xb5,
+ 0x35, 0xe9, 0x90, 0x4c, 0x62, 0xbe, 0xc7, 0x1b,
+ 0xda, 0x06, 0x7f, 0xa3, 0x8d, 0x51, 0x28, 0xf4,
+ 0x74, 0xa8, 0xd1, 0x0d, 0x23, 0xff, 0x86, 0x5a,
+ 0x32, 0xee, 0x97, 0x4b, 0x65, 0xb9, 0xc0, 0x1c,
+ 0x9c, 0x40, 0x39, 0xe5, 0xcb, 0x17, 0x6e, 0xb2,
+ 0x73, 0xaf, 0xd6, 0x0a, 0x24, 0xf8, 0x81, 0x5d,
+ 0xdd, 0x01, 0x78, 0xa4, 0x8a, 0x56, 0x2f, 0xf3,
+ 0xb0, 0x6c, 0x15, 0xc9, 0xe7, 0x3b, 0x42, 0x9e,
+ 0x1e, 0xc2, 0xbb, 0x67, 0x49, 0x95, 0xec, 0x30,
+ 0xf1, 0x2d, 0x54, 0x88, 0xa6, 0x7a, 0x03, 0xdf,
+ 0x5f, 0x83, 0xfa, 0x26, 0x08, 0xd4, 0xad, 0x71,
+ 0x2b, 0xf7, 0x8e, 0x52, 0x7c, 0xa0, 0xd9, 0x05,
+ 0x85, 0x59, 0x20, 0xfc, 0xd2, 0x0e, 0x77, 0xab,
+ 0x6a, 0xb6, 0xcf, 0x13, 0x3d, 0xe1, 0x98, 0x44,
+ 0xc4, 0x18, 0x61, 0xbd, 0x93, 0x4f, 0x36, 0xea,
+ 0xa9, 0x75, 0x0c, 0xd0, 0xfe, 0x22, 0x5b, 0x87,
+ 0x07, 0xdb, 0xa2, 0x7e, 0x50, 0x8c, 0xf5, 0x29,
+ 0xe8, 0x34, 0x4d, 0x91, 0xbf, 0x63, 0x1a, 0xc6,
+ 0x46, 0x9a, 0xe3, 0x3f, 0x11, 0xcd, 0xb4, 0x68,
+ },
+ {
+ 0x00, 0xdd, 0xa7, 0x7a, 0x53, 0x8e, 0xf4, 0x29,
+ 0xa6, 0x7b, 0x01, 0xdc, 0xf5, 0x28, 0x52, 0x8f,
+ 0x51, 0x8c, 0xf6, 0x2b, 0x02, 0xdf, 0xa5, 0x78,
+ 0xf7, 0x2a, 0x50, 0x8d, 0xa4, 0x79, 0x03, 0xde,
+ 0xa2, 0x7f, 0x05, 0xd8, 0xf1, 0x2c, 0x56, 0x8b,
+ 0x04, 0xd9, 0xa3, 0x7e, 0x57, 0x8a, 0xf0, 0x2d,
+ 0xf3, 0x2e, 0x54, 0x89, 0xa0, 0x7d, 0x07, 0xda,
+ 0x55, 0x88, 0xf2, 0x2f, 0x06, 0xdb, 0xa1, 0x7c,
+ 0x59, 0x84, 0xfe, 0x23, 0x0a, 0xd7, 0xad, 0x70,
+ 0xff, 0x22, 0x58, 0x85, 0xac, 0x71, 0x0b, 0xd6,
+ 0x08, 0xd5, 0xaf, 0x72, 0x5b, 0x86, 0xfc, 0x21,
+ 0xae, 0x73, 0x09, 0xd4, 0xfd, 0x20, 0x5a, 0x87,
+ 0xfb, 0x26, 0x5c, 0x81, 0xa8, 0x75, 0x0f, 0xd2,
+ 0x5d, 0x80, 0xfa, 0x27, 0x0e, 0xd3, 0xa9, 0x74,
+ 0xaa, 0x77, 0x0d, 0xd0, 0xf9, 0x24, 0x5e, 0x83,
+ 0x0c, 0xd1, 0xab, 0x76, 0x5f, 0x82, 0xf8, 0x25,
+ 0xb2, 0x6f, 0x15, 0xc8, 0xe1, 0x3c, 0x46, 0x9b,
+ 0x14, 0xc9, 0xb3, 0x6e, 0x47, 0x9a, 0xe0, 0x3d,
+ 0xe3, 0x3e, 0x44, 0x99, 0xb0, 0x6d, 0x17, 0xca,
+ 0x45, 0x98, 0xe2, 0x3f, 0x16, 0xcb, 0xb1, 0x6c,
+ 0x10, 0xcd, 0xb7, 0x6a, 0x43, 0x9e, 0xe4, 0x39,
+ 0xb6, 0x6b, 0x11, 0xcc, 0xe5, 0x38, 0x42, 0x9f,
+ 0x41, 0x9c, 0xe6, 0x3b, 0x12, 0xcf, 0xb5, 0x68,
+ 0xe7, 0x3a, 0x40, 0x9d, 0xb4, 0x69, 0x13, 0xce,
+ 0xeb, 0x36, 0x4c, 0x91, 0xb8, 0x65, 0x1f, 0xc2,
+ 0x4d, 0x90, 0xea, 0x37, 0x1e, 0xc3, 0xb9, 0x64,
+ 0xba, 0x67, 0x1d, 0xc0, 0xe9, 0x34, 0x4e, 0x93,
+ 0x1c, 0xc1, 0xbb, 0x66, 0x4f, 0x92, 0xe8, 0x35,
+ 0x49, 0x94, 0xee, 0x33, 0x1a, 0xc7, 0xbd, 0x60,
+ 0xef, 0x32, 0x48, 0x95, 0xbc, 0x61, 0x1b, 0xc6,
+ 0x18, 0xc5, 0xbf, 0x62, 0x4b, 0x96, 0xec, 0x31,
+ 0xbe, 0x63, 0x19, 0xc4, 0xed, 0x30, 0x4a, 0x97,
+ },
+ {
+ 0x00, 0xde, 0xa1, 0x7f, 0x5f, 0x81, 0xfe, 0x20,
+ 0xbe, 0x60, 0x1f, 0xc1, 0xe1, 0x3f, 0x40, 0x9e,
+ 0x61, 0xbf, 0xc0, 0x1e, 0x3e, 0xe0, 0x9f, 0x41,
+ 0xdf, 0x01, 0x7e, 0xa0, 0x80, 0x5e, 0x21, 0xff,
+ 0xc2, 0x1c, 0x63, 0xbd, 0x9d, 0x43, 0x3c, 0xe2,
+ 0x7c, 0xa2, 0xdd, 0x03, 0x23, 0xfd, 0x82, 0x5c,
+ 0xa3, 0x7d, 0x02, 0xdc, 0xfc, 0x22, 0x5d, 0x83,
+ 0x1d, 0xc3, 0xbc, 0x62, 0x42, 0x9c, 0xe3, 0x3d,
+ 0x99, 0x47, 0x38, 0xe6, 0xc6, 0x18, 0x67, 0xb9,
+ 0x27, 0xf9, 0x86, 0x58, 0x78, 0xa6, 0xd9, 0x07,
+ 0xf8, 0x26, 0x59, 0x87, 0xa7, 0x79, 0x06, 0xd8,
+ 0x46, 0x98, 0xe7, 0x39, 0x19, 0xc7, 0xb8, 0x66,
+ 0x5b, 0x85, 0xfa, 0x24, 0x04, 0xda, 0xa5, 0x7b,
+ 0xe5, 0x3b, 0x44, 0x9a, 0xba, 0x64, 0x1b, 0xc5,
+ 0x3a, 0xe4, 0x9b, 0x45, 0x65, 0xbb, 0xc4, 0x1a,
+ 0x84, 0x5a, 0x25, 0xfb, 0xdb, 0x05, 0x7a, 0xa4,
+ 0x2f, 0xf1, 0x8e, 0x50, 0x70, 0xae, 0xd1, 0x0f,
+ 0x91, 0x4f, 0x30, 0xee, 0xce, 0x10, 0x6f, 0xb1,
+ 0x4e, 0x90, 0xef, 0x31, 0x11, 0xcf, 0xb0, 0x6e,
+ 0xf0, 0x2e, 0x51, 0x8f, 0xaf, 0x71, 0x0e, 0xd0,
+ 0xed, 0x33, 0x4c, 0x92, 0xb2, 0x6c, 0x13, 0xcd,
+ 0x53, 0x8d, 0xf2, 0x2c, 0x0c, 0xd2, 0xad, 0x73,
+ 0x8c, 0x52, 0x2d, 0xf3, 0xd3, 0x0d, 0x72, 0xac,
+ 0x32, 0xec, 0x93, 0x4d, 0x6d, 0xb3, 0xcc, 0x12,
+ 0xb6, 0x68, 0x17, 0xc9, 0xe9, 0x37, 0x48, 0x96,
+ 0x08, 0xd6, 0xa9, 0x77, 0x57, 0x89, 0xf6, 0x28,
+ 0xd7, 0x09, 0x76, 0xa8, 0x88, 0x56, 0x29, 0xf7,
+ 0x69, 0xb7, 0xc8, 0x16, 0x36, 0xe8, 0x97, 0x49,
+ 0x74, 0xaa, 0xd5, 0x0b, 0x2b, 0xf5, 0x8a, 0x54,
+ 0xca, 0x14, 0x6b, 0xb5, 0x95, 0x4b, 0x34, 0xea,
+ 0x15, 0xcb, 0xb4, 0x6a, 0x4a, 0x94, 0xeb, 0x35,
+ 0xab, 0x75, 0x0a, 0xd4, 0xf4, 0x2a, 0x55, 0x8b,
+ },
+ {
+ 0x00, 0xdf, 0xa3, 0x7c, 0x5b, 0x84, 0xf8, 0x27,
+ 0xb6, 0x69, 0x15, 0xca, 0xed, 0x32, 0x4e, 0x91,
+ 0x71, 0xae, 0xd2, 0x0d, 0x2a, 0xf5, 0x89, 0x56,
+ 0xc7, 0x18, 0x64, 0xbb, 0x9c, 0x43, 0x3f, 0xe0,
+ 0xe2, 0x3d, 0x41, 0x9e, 0xb9, 0x66, 0x1a, 0xc5,
+ 0x54, 0x8b, 0xf7, 0x28, 0x0f, 0xd0, 0xac, 0x73,
+ 0x93, 0x4c, 0x30, 0xef, 0xc8, 0x17, 0x6b, 0xb4,
+ 0x25, 0xfa, 0x86, 0x59, 0x7e, 0xa1, 0xdd, 0x02,
+ 0xd9, 0x06, 0x7a, 0xa5, 0x82, 0x5d, 0x21, 0xfe,
+ 0x6f, 0xb0, 0xcc, 0x13, 0x34, 0xeb, 0x97, 0x48,
+ 0xa8, 0x77, 0x0b, 0xd4, 0xf3, 0x2c, 0x50, 0x8f,
+ 0x1e, 0xc1, 0xbd, 0x62, 0x45, 0x9a, 0xe6, 0x39,
+ 0x3b, 0xe4, 0x98, 0x47, 0x60, 0xbf, 0xc3, 0x1c,
+ 0x8d, 0x52, 0x2e, 0xf1, 0xd6, 0x09, 0x75, 0xaa,
+ 0x4a, 0x95, 0xe9, 0x36, 0x11, 0xce, 0xb2, 0x6d,
+ 0xfc, 0x23, 0x5f, 0x80, 0xa7, 0x78, 0x04, 0xdb,
+ 0xaf, 0x70, 0x0c, 0xd3, 0xf4, 0x2b, 0x57, 0x88,
+ 0x19, 0xc6, 0xba, 0x65, 0x42, 0x9d, 0xe1, 0x3e,
+ 0xde, 0x01, 0x7d, 0xa2, 0x85, 0x5a, 0x26, 0xf9,
+ 0x68, 0xb7, 0xcb, 0x14, 0x33, 0xec, 0x90, 0x4f,
+ 0x4d, 0x92, 0xee, 0x31, 0x16, 0xc9, 0xb5, 0x6a,
+ 0xfb, 0x24, 0x58, 0x87, 0xa0, 0x7f, 0x03, 0xdc,
+ 0x3c, 0xe3, 0x9f, 0x40, 0x67, 0xb8, 0xc4, 0x1b,
+ 0x8a, 0x55, 0x29, 0xf6, 0xd1, 0x0e, 0x72, 0xad,
+ 0x76, 0xa9, 0xd5, 0x0a, 0x2d, 0xf2, 0x8e, 0x51,
+ 0xc0, 0x1f, 0x63, 0xbc, 0x9b, 0x44, 0x38, 0xe7,
+ 0x07, 0xd8, 0xa4, 0x7b, 0x5c, 0x83, 0xff, 0x20,
+ 0xb1, 0x6e, 0x12, 0xcd, 0xea, 0x35, 0x49, 0x96,
+ 0x94, 0x4b, 0x37, 0xe8, 0xcf, 0x10, 0x6c, 0xb3,
+ 0x22, 0xfd, 0x81, 0x5e, 0x79, 0xa6, 0xda, 0x05,
+ 0xe5, 0x3a, 0x46, 0x99, 0xbe, 0x61, 0x1d, 0xc2,
+ 0x53, 0x8c, 0xf0, 0x2f, 0x08, 0xd7, 0xab, 0x74,
+ },
+ {
+ 0x00, 0xe0, 0xdd, 0x3d, 0xa7, 0x47, 0x7a, 0x9a,
+ 0x53, 0xb3, 0x8e, 0x6e, 0xf4, 0x14, 0x29, 0xc9,
+ 0xa6, 0x46, 0x7b, 0x9b, 0x01, 0xe1, 0xdc, 0x3c,
+ 0xf5, 0x15, 0x28, 0xc8, 0x52, 0xb2, 0x8f, 0x6f,
+ 0x51, 0xb1, 0x8c, 0x6c, 0xf6, 0x16, 0x2b, 0xcb,
+ 0x02, 0xe2, 0xdf, 0x3f, 0xa5, 0x45, 0x78, 0x98,
+ 0xf7, 0x17, 0x2a, 0xca, 0x50, 0xb0, 0x8d, 0x6d,
+ 0xa4, 0x44, 0x79, 0x99, 0x03, 0xe3, 0xde, 0x3e,
+ 0xa2, 0x42, 0x7f, 0x9f, 0x05, 0xe5, 0xd8, 0x38,
+ 0xf1, 0x11, 0x2c, 0xcc, 0x56, 0xb6, 0x8b, 0x6b,
+ 0x04, 0xe4, 0xd9, 0x39, 0xa3, 0x43, 0x7e, 0x9e,
+ 0x57, 0xb7, 0x8a, 0x6a, 0xf0, 0x10, 0x2d, 0xcd,
+ 0xf3, 0x13, 0x2e, 0xce, 0x54, 0xb4, 0x89, 0x69,
+ 0xa0, 0x40, 0x7d, 0x9d, 0x07, 0xe7, 0xda, 0x3a,
+ 0x55, 0xb5, 0x88, 0x68, 0xf2, 0x12, 0x2f, 0xcf,
+ 0x06, 0xe6, 0xdb, 0x3b, 0xa1, 0x41, 0x7c, 0x9c,
+ 0x59, 0xb9, 0x84, 0x64, 0xfe, 0x1e, 0x23, 0xc3,
+ 0x0a, 0xea, 0xd7, 0x37, 0xad, 0x4d, 0x70, 0x90,
+ 0xff, 0x1f, 0x22, 0xc2, 0x58, 0xb8, 0x85, 0x65,
+ 0xac, 0x4c, 0x71, 0x91, 0x0b, 0xeb, 0xd6, 0x36,
+ 0x08, 0xe8, 0xd5, 0x35, 0xaf, 0x4f, 0x72, 0x92,
+ 0x5b, 0xbb, 0x86, 0x66, 0xfc, 0x1c, 0x21, 0xc1,
+ 0xae, 0x4e, 0x73, 0x93, 0x09, 0xe9, 0xd4, 0x34,
+ 0xfd, 0x1d, 0x20, 0xc0, 0x5a, 0xba, 0x87, 0x67,
+ 0xfb, 0x1b, 0x26, 0xc6, 0x5c, 0xbc, 0x81, 0x61,
+ 0xa8, 0x48, 0x75, 0x95, 0x0f, 0xef, 0xd2, 0x32,
+ 0x5d, 0xbd, 0x80, 0x60, 0xfa, 0x1a, 0x27, 0xc7,
+ 0x0e, 0xee, 0xd3, 0x33, 0xa9, 0x49, 0x74, 0x94,
+ 0xaa, 0x4a, 0x77, 0x97, 0x0d, 0xed, 0xd0, 0x30,
+ 0xf9, 0x19, 0x24, 0xc4, 0x5e, 0xbe, 0x83, 0x63,
+ 0x0c, 0xec, 0xd1, 0x31, 0xab, 0x4b, 0x76, 0x96,
+ 0x5f, 0xbf, 0x82, 0x62, 0xf8, 0x18, 0x25, 0xc5,
+ },
+ {
+ 0x00, 0xe1, 0xdf, 0x3e, 0xa3, 0x42, 0x7c, 0x9d,
+ 0x5b, 0xba, 0x84, 0x65, 0xf8, 0x19, 0x27, 0xc6,
+ 0xb6, 0x57, 0x69, 0x88, 0x15, 0xf4, 0xca, 0x2b,
+ 0xed, 0x0c, 0x32, 0xd3, 0x4e, 0xaf, 0x91, 0x70,
+ 0x71, 0x90, 0xae, 0x4f, 0xd2, 0x33, 0x0d, 0xec,
+ 0x2a, 0xcb, 0xf5, 0x14, 0x89, 0x68, 0x56, 0xb7,
+ 0xc7, 0x26, 0x18, 0xf9, 0x64, 0x85, 0xbb, 0x5a,
+ 0x9c, 0x7d, 0x43, 0xa2, 0x3f, 0xde, 0xe0, 0x01,
+ 0xe2, 0x03, 0x3d, 0xdc, 0x41, 0xa0, 0x9e, 0x7f,
+ 0xb9, 0x58, 0x66, 0x87, 0x1a, 0xfb, 0xc5, 0x24,
+ 0x54, 0xb5, 0x8b, 0x6a, 0xf7, 0x16, 0x28, 0xc9,
+ 0x0f, 0xee, 0xd0, 0x31, 0xac, 0x4d, 0x73, 0x92,
+ 0x93, 0x72, 0x4c, 0xad, 0x30, 0xd1, 0xef, 0x0e,
+ 0xc8, 0x29, 0x17, 0xf6, 0x6b, 0x8a, 0xb4, 0x55,
+ 0x25, 0xc4, 0xfa, 0x1b, 0x86, 0x67, 0x59, 0xb8,
+ 0x7e, 0x9f, 0xa1, 0x40, 0xdd, 0x3c, 0x02, 0xe3,
+ 0xd9, 0x38, 0x06, 0xe7, 0x7a, 0x9b, 0xa5, 0x44,
+ 0x82, 0x63, 0x5d, 0xbc, 0x21, 0xc0, 0xfe, 0x1f,
+ 0x6f, 0x8e, 0xb0, 0x51, 0xcc, 0x2d, 0x13, 0xf2,
+ 0x34, 0xd5, 0xeb, 0x0a, 0x97, 0x76, 0x48, 0xa9,
+ 0xa8, 0x49, 0x77, 0x96, 0x0b, 0xea, 0xd4, 0x35,
+ 0xf3, 0x12, 0x2c, 0xcd, 0x50, 0xb1, 0x8f, 0x6e,
+ 0x1e, 0xff, 0xc1, 0x20, 0xbd, 0x5c, 0x62, 0x83,
+ 0x45, 0xa4, 0x9a, 0x7b, 0xe6, 0x07, 0x39, 0xd8,
+ 0x3b, 0xda, 0xe4, 0x05, 0x98, 0x79, 0x47, 0xa6,
+ 0x60, 0x81, 0xbf, 0x5e, 0xc3, 0x22, 0x1c, 0xfd,
+ 0x8d, 0x6c, 0x52, 0xb3, 0x2e, 0xcf, 0xf1, 0x10,
+ 0xd6, 0x37, 0x09, 0xe8, 0x75, 0x94, 0xaa, 0x4b,
+ 0x4a, 0xab, 0x95, 0x74, 0xe9, 0x08, 0x36, 0xd7,
+ 0x11, 0xf0, 0xce, 0x2f, 0xb2, 0x53, 0x6d, 0x8c,
+ 0xfc, 0x1d, 0x23, 0xc2, 0x5f, 0xbe, 0x80, 0x61,
+ 0xa7, 0x46, 0x78, 0x99, 0x04, 0xe5, 0xdb, 0x3a,
+ },
+ {
+ 0x00, 0xe2, 0xd9, 0x3b, 0xaf, 0x4d, 0x76, 0x94,
+ 0x43, 0xa1, 0x9a, 0x78, 0xec, 0x0e, 0x35, 0xd7,
+ 0x86, 0x64, 0x5f, 0xbd, 0x29, 0xcb, 0xf0, 0x12,
+ 0xc5, 0x27, 0x1c, 0xfe, 0x6a, 0x88, 0xb3, 0x51,
+ 0x11, 0xf3, 0xc8, 0x2a, 0xbe, 0x5c, 0x67, 0x85,
+ 0x52, 0xb0, 0x8b, 0x69, 0xfd, 0x1f, 0x24, 0xc6,
+ 0x97, 0x75, 0x4e, 0xac, 0x38, 0xda, 0xe1, 0x03,
+ 0xd4, 0x36, 0x0d, 0xef, 0x7b, 0x99, 0xa2, 0x40,
+ 0x22, 0xc0, 0xfb, 0x19, 0x8d, 0x6f, 0x54, 0xb6,
+ 0x61, 0x83, 0xb8, 0x5a, 0xce, 0x2c, 0x17, 0xf5,
+ 0xa4, 0x46, 0x7d, 0x9f, 0x0b, 0xe9, 0xd2, 0x30,
+ 0xe7, 0x05, 0x3e, 0xdc, 0x48, 0xaa, 0x91, 0x73,
+ 0x33, 0xd1, 0xea, 0x08, 0x9c, 0x7e, 0x45, 0xa7,
+ 0x70, 0x92, 0xa9, 0x4b, 0xdf, 0x3d, 0x06, 0xe4,
+ 0xb5, 0x57, 0x6c, 0x8e, 0x1a, 0xf8, 0xc3, 0x21,
+ 0xf6, 0x14, 0x2f, 0xcd, 0x59, 0xbb, 0x80, 0x62,
+ 0x44, 0xa6, 0x9d, 0x7f, 0xeb, 0x09, 0x32, 0xd0,
+ 0x07, 0xe5, 0xde, 0x3c, 0xa8, 0x4a, 0x71, 0x93,
+ 0xc2, 0x20, 0x1b, 0xf9, 0x6d, 0x8f, 0xb4, 0x56,
+ 0x81, 0x63, 0x58, 0xba, 0x2e, 0xcc, 0xf7, 0x15,
+ 0x55, 0xb7, 0x8c, 0x6e, 0xfa, 0x18, 0x23, 0xc1,
+ 0x16, 0xf4, 0xcf, 0x2d, 0xb9, 0x5b, 0x60, 0x82,
+ 0xd3, 0x31, 0x0a, 0xe8, 0x7c, 0x9e, 0xa5, 0x47,
+ 0x90, 0x72, 0x49, 0xab, 0x3f, 0xdd, 0xe6, 0x04,
+ 0x66, 0x84, 0xbf, 0x5d, 0xc9, 0x2b, 0x10, 0xf2,
+ 0x25, 0xc7, 0xfc, 0x1e, 0x8a, 0x68, 0x53, 0xb1,
+ 0xe0, 0x02, 0x39, 0xdb, 0x4f, 0xad, 0x96, 0x74,
+ 0xa3, 0x41, 0x7a, 0x98, 0x0c, 0xee, 0xd5, 0x37,
+ 0x77, 0x95, 0xae, 0x4c, 0xd8, 0x3a, 0x01, 0xe3,
+ 0x34, 0xd6, 0xed, 0x0f, 0x9b, 0x79, 0x42, 0xa0,
+ 0xf1, 0x13, 0x28, 0xca, 0x5e, 0xbc, 0x87, 0x65,
+ 0xb2, 0x50, 0x6b, 0x89, 0x1d, 0xff, 0xc4, 0x26,
+ },
+ {
+ 0x00, 0xe3, 0xdb, 0x38, 0xab, 0x48, 0x70, 0x93,
+ 0x4b, 0xa8, 0x90, 0x73, 0xe0, 0x03, 0x3b, 0xd8,
+ 0x96, 0x75, 0x4d, 0xae, 0x3d, 0xde, 0xe6, 0x05,
+ 0xdd, 0x3e, 0x06, 0xe5, 0x76, 0x95, 0xad, 0x4e,
+ 0x31, 0xd2, 0xea, 0x09, 0x9a, 0x79, 0x41, 0xa2,
+ 0x7a, 0x99, 0xa1, 0x42, 0xd1, 0x32, 0x0a, 0xe9,
+ 0xa7, 0x44, 0x7c, 0x9f, 0x0c, 0xef, 0xd7, 0x34,
+ 0xec, 0x0f, 0x37, 0xd4, 0x47, 0xa4, 0x9c, 0x7f,
+ 0x62, 0x81, 0xb9, 0x5a, 0xc9, 0x2a, 0x12, 0xf1,
+ 0x29, 0xca, 0xf2, 0x11, 0x82, 0x61, 0x59, 0xba,
+ 0xf4, 0x17, 0x2f, 0xcc, 0x5f, 0xbc, 0x84, 0x67,
+ 0xbf, 0x5c, 0x64, 0x87, 0x14, 0xf7, 0xcf, 0x2c,
+ 0x53, 0xb0, 0x88, 0x6b, 0xf8, 0x1b, 0x23, 0xc0,
+ 0x18, 0xfb, 0xc3, 0x20, 0xb3, 0x50, 0x68, 0x8b,
+ 0xc5, 0x26, 0x1e, 0xfd, 0x6e, 0x8d, 0xb5, 0x56,
+ 0x8e, 0x6d, 0x55, 0xb6, 0x25, 0xc6, 0xfe, 0x1d,
+ 0xc4, 0x27, 0x1f, 0xfc, 0x6f, 0x8c, 0xb4, 0x57,
+ 0x8f, 0x6c, 0x54, 0xb7, 0x24, 0xc7, 0xff, 0x1c,
+ 0x52, 0xb1, 0x89, 0x6a, 0xf9, 0x1a, 0x22, 0xc1,
+ 0x19, 0xfa, 0xc2, 0x21, 0xb2, 0x51, 0x69, 0x8a,
+ 0xf5, 0x16, 0x2e, 0xcd, 0x5e, 0xbd, 0x85, 0x66,
+ 0xbe, 0x5d, 0x65, 0x86, 0x15, 0xf6, 0xce, 0x2d,
+ 0x63, 0x80, 0xb8, 0x5b, 0xc8, 0x2b, 0x13, 0xf0,
+ 0x28, 0xcb, 0xf3, 0x10, 0x83, 0x60, 0x58, 0xbb,
+ 0xa6, 0x45, 0x7d, 0x9e, 0x0d, 0xee, 0xd6, 0x35,
+ 0xed, 0x0e, 0x36, 0xd5, 0x46, 0xa5, 0x9d, 0x7e,
+ 0x30, 0xd3, 0xeb, 0x08, 0x9b, 0x78, 0x40, 0xa3,
+ 0x7b, 0x98, 0xa0, 0x43, 0xd0, 0x33, 0x0b, 0xe8,
+ 0x97, 0x74, 0x4c, 0xaf, 0x3c, 0xdf, 0xe7, 0x04,
+ 0xdc, 0x3f, 0x07, 0xe4, 0x77, 0x94, 0xac, 0x4f,
+ 0x01, 0xe2, 0xda, 0x39, 0xaa, 0x49, 0x71, 0x92,
+ 0x4a, 0xa9, 0x91, 0x72, 0xe1, 0x02, 0x3a, 0xd9,
+ },
+ {
+ 0x00, 0xe4, 0xd5, 0x31, 0xb7, 0x53, 0x62, 0x86,
+ 0x73, 0x97, 0xa6, 0x42, 0xc4, 0x20, 0x11, 0xf5,
+ 0xe6, 0x02, 0x33, 0xd7, 0x51, 0xb5, 0x84, 0x60,
+ 0x95, 0x71, 0x40, 0xa4, 0x22, 0xc6, 0xf7, 0x13,
+ 0xd1, 0x35, 0x04, 0xe0, 0x66, 0x82, 0xb3, 0x57,
+ 0xa2, 0x46, 0x77, 0x93, 0x15, 0xf1, 0xc0, 0x24,
+ 0x37, 0xd3, 0xe2, 0x06, 0x80, 0x64, 0x55, 0xb1,
+ 0x44, 0xa0, 0x91, 0x75, 0xf3, 0x17, 0x26, 0xc2,
+ 0xbf, 0x5b, 0x6a, 0x8e, 0x08, 0xec, 0xdd, 0x39,
+ 0xcc, 0x28, 0x19, 0xfd, 0x7b, 0x9f, 0xae, 0x4a,
+ 0x59, 0xbd, 0x8c, 0x68, 0xee, 0x0a, 0x3b, 0xdf,
+ 0x2a, 0xce, 0xff, 0x1b, 0x9d, 0x79, 0x48, 0xac,
+ 0x6e, 0x8a, 0xbb, 0x5f, 0xd9, 0x3d, 0x0c, 0xe8,
+ 0x1d, 0xf9, 0xc8, 0x2c, 0xaa, 0x4e, 0x7f, 0x9b,
+ 0x88, 0x6c, 0x5d, 0xb9, 0x3f, 0xdb, 0xea, 0x0e,
+ 0xfb, 0x1f, 0x2e, 0xca, 0x4c, 0xa8, 0x99, 0x7d,
+ 0x63, 0x87, 0xb6, 0x52, 0xd4, 0x30, 0x01, 0xe5,
+ 0x10, 0xf4, 0xc5, 0x21, 0xa7, 0x43, 0x72, 0x96,
+ 0x85, 0x61, 0x50, 0xb4, 0x32, 0xd6, 0xe7, 0x03,
+ 0xf6, 0x12, 0x23, 0xc7, 0x41, 0xa5, 0x94, 0x70,
+ 0xb2, 0x56, 0x67, 0x83, 0x05, 0xe1, 0xd0, 0x34,
+ 0xc1, 0x25, 0x14, 0xf0, 0x76, 0x92, 0xa3, 0x47,
+ 0x54, 0xb0, 0x81, 0x65, 0xe3, 0x07, 0x36, 0xd2,
+ 0x27, 0xc3, 0xf2, 0x16, 0x90, 0x74, 0x45, 0xa1,
+ 0xdc, 0x38, 0x09, 0xed, 0x6b, 0x8f, 0xbe, 0x5a,
+ 0xaf, 0x4b, 0x7a, 0x9e, 0x18, 0xfc, 0xcd, 0x29,
+ 0x3a, 0xde, 0xef, 0x0b, 0x8d, 0x69, 0x58, 0xbc,
+ 0x49, 0xad, 0x9c, 0x78, 0xfe, 0x1a, 0x2b, 0xcf,
+ 0x0d, 0xe9, 0xd8, 0x3c, 0xba, 0x5e, 0x6f, 0x8b,
+ 0x7e, 0x9a, 0xab, 0x4f, 0xc9, 0x2d, 0x1c, 0xf8,
+ 0xeb, 0x0f, 0x3e, 0xda, 0x5c, 0xb8, 0x89, 0x6d,
+ 0x98, 0x7c, 0x4d, 0xa9, 0x2f, 0xcb, 0xfa, 0x1e,
+ },
+ {
+ 0x00, 0xe5, 0xd7, 0x32, 0xb3, 0x56, 0x64, 0x81,
+ 0x7b, 0x9e, 0xac, 0x49, 0xc8, 0x2d, 0x1f, 0xfa,
+ 0xf6, 0x13, 0x21, 0xc4, 0x45, 0xa0, 0x92, 0x77,
+ 0x8d, 0x68, 0x5a, 0xbf, 0x3e, 0xdb, 0xe9, 0x0c,
+ 0xf1, 0x14, 0x26, 0xc3, 0x42, 0xa7, 0x95, 0x70,
+ 0x8a, 0x6f, 0x5d, 0xb8, 0x39, 0xdc, 0xee, 0x0b,
+ 0x07, 0xe2, 0xd0, 0x35, 0xb4, 0x51, 0x63, 0x86,
+ 0x7c, 0x99, 0xab, 0x4e, 0xcf, 0x2a, 0x18, 0xfd,
+ 0xff, 0x1a, 0x28, 0xcd, 0x4c, 0xa9, 0x9b, 0x7e,
+ 0x84, 0x61, 0x53, 0xb6, 0x37, 0xd2, 0xe0, 0x05,
+ 0x09, 0xec, 0xde, 0x3b, 0xba, 0x5f, 0x6d, 0x88,
+ 0x72, 0x97, 0xa5, 0x40, 0xc1, 0x24, 0x16, 0xf3,
+ 0x0e, 0xeb, 0xd9, 0x3c, 0xbd, 0x58, 0x6a, 0x8f,
+ 0x75, 0x90, 0xa2, 0x47, 0xc6, 0x23, 0x11, 0xf4,
+ 0xf8, 0x1d, 0x2f, 0xca, 0x4b, 0xae, 0x9c, 0x79,
+ 0x83, 0x66, 0x54, 0xb1, 0x30, 0xd5, 0xe7, 0x02,
+ 0xe3, 0x06, 0x34, 0xd1, 0x50, 0xb5, 0x87, 0x62,
+ 0x98, 0x7d, 0x4f, 0xaa, 0x2b, 0xce, 0xfc, 0x19,
+ 0x15, 0xf0, 0xc2, 0x27, 0xa6, 0x43, 0x71, 0x94,
+ 0x6e, 0x8b, 0xb9, 0x5c, 0xdd, 0x38, 0x0a, 0xef,
+ 0x12, 0xf7, 0xc5, 0x20, 0xa1, 0x44, 0x76, 0x93,
+ 0x69, 0x8c, 0xbe, 0x5b, 0xda, 0x3f, 0x0d, 0xe8,
+ 0xe4, 0x01, 0x33, 0xd6, 0x57, 0xb2, 0x80, 0x65,
+ 0x9f, 0x7a, 0x48, 0xad, 0x2c, 0xc9, 0xfb, 0x1e,
+ 0x1c, 0xf9, 0xcb, 0x2e, 0xaf, 0x4a, 0x78, 0x9d,
+ 0x67, 0x82, 0xb0, 0x55, 0xd4, 0x31, 0x03, 0xe6,
+ 0xea, 0x0f, 0x3d, 0xd8, 0x59, 0xbc, 0x8e, 0x6b,
+ 0x91, 0x74, 0x46, 0xa3, 0x22, 0xc7, 0xf5, 0x10,
+ 0xed, 0x08, 0x3a, 0xdf, 0x5e, 0xbb, 0x89, 0x6c,
+ 0x96, 0x73, 0x41, 0xa4, 0x25, 0xc0, 0xf2, 0x17,
+ 0x1b, 0xfe, 0xcc, 0x29, 0xa8, 0x4d, 0x7f, 0x9a,
+ 0x60, 0x85, 0xb7, 0x52, 0xd3, 0x36, 0x04, 0xe1,
+ },
+ {
+ 0x00, 0xe6, 0xd1, 0x37, 0xbf, 0x59, 0x6e, 0x88,
+ 0x63, 0x85, 0xb2, 0x54, 0xdc, 0x3a, 0x0d, 0xeb,
+ 0xc6, 0x20, 0x17, 0xf1, 0x79, 0x9f, 0xa8, 0x4e,
+ 0xa5, 0x43, 0x74, 0x92, 0x1a, 0xfc, 0xcb, 0x2d,
+ 0x91, 0x77, 0x40, 0xa6, 0x2e, 0xc8, 0xff, 0x19,
+ 0xf2, 0x14, 0x23, 0xc5, 0x4d, 0xab, 0x9c, 0x7a,
+ 0x57, 0xb1, 0x86, 0x60, 0xe8, 0x0e, 0x39, 0xdf,
+ 0x34, 0xd2, 0xe5, 0x03, 0x8b, 0x6d, 0x5a, 0xbc,
+ 0x3f, 0xd9, 0xee, 0x08, 0x80, 0x66, 0x51, 0xb7,
+ 0x5c, 0xba, 0x8d, 0x6b, 0xe3, 0x05, 0x32, 0xd4,
+ 0xf9, 0x1f, 0x28, 0xce, 0x46, 0xa0, 0x97, 0x71,
+ 0x9a, 0x7c, 0x4b, 0xad, 0x25, 0xc3, 0xf4, 0x12,
+ 0xae, 0x48, 0x7f, 0x99, 0x11, 0xf7, 0xc0, 0x26,
+ 0xcd, 0x2b, 0x1c, 0xfa, 0x72, 0x94, 0xa3, 0x45,
+ 0x68, 0x8e, 0xb9, 0x5f, 0xd7, 0x31, 0x06, 0xe0,
+ 0x0b, 0xed, 0xda, 0x3c, 0xb4, 0x52, 0x65, 0x83,
+ 0x7e, 0x98, 0xaf, 0x49, 0xc1, 0x27, 0x10, 0xf6,
+ 0x1d, 0xfb, 0xcc, 0x2a, 0xa2, 0x44, 0x73, 0x95,
+ 0xb8, 0x5e, 0x69, 0x8f, 0x07, 0xe1, 0xd6, 0x30,
+ 0xdb, 0x3d, 0x0a, 0xec, 0x64, 0x82, 0xb5, 0x53,
+ 0xef, 0x09, 0x3e, 0xd8, 0x50, 0xb6, 0x81, 0x67,
+ 0x8c, 0x6a, 0x5d, 0xbb, 0x33, 0xd5, 0xe2, 0x04,
+ 0x29, 0xcf, 0xf8, 0x1e, 0x96, 0x70, 0x47, 0xa1,
+ 0x4a, 0xac, 0x9b, 0x7d, 0xf5, 0x13, 0x24, 0xc2,
+ 0x41, 0xa7, 0x90, 0x76, 0xfe, 0x18, 0x2f, 0xc9,
+ 0x22, 0xc4, 0xf3, 0x15, 0x9d, 0x7b, 0x4c, 0xaa,
+ 0x87, 0x61, 0x56, 0xb0, 0x38, 0xde, 0xe9, 0x0f,
+ 0xe4, 0x02, 0x35, 0xd3, 0x5b, 0xbd, 0x8a, 0x6c,
+ 0xd0, 0x36, 0x01, 0xe7, 0x6f, 0x89, 0xbe, 0x58,
+ 0xb3, 0x55, 0x62, 0x84, 0x0c, 0xea, 0xdd, 0x3b,
+ 0x16, 0xf0, 0xc7, 0x21, 0xa9, 0x4f, 0x78, 0x9e,
+ 0x75, 0x93, 0xa4, 0x42, 0xca, 0x2c, 0x1b, 0xfd,
+ },
+ {
+ 0x00, 0xe7, 0xd3, 0x34, 0xbb, 0x5c, 0x68, 0x8f,
+ 0x6b, 0x8c, 0xb8, 0x5f, 0xd0, 0x37, 0x03, 0xe4,
+ 0xd6, 0x31, 0x05, 0xe2, 0x6d, 0x8a, 0xbe, 0x59,
+ 0xbd, 0x5a, 0x6e, 0x89, 0x06, 0xe1, 0xd5, 0x32,
+ 0xb1, 0x56, 0x62, 0x85, 0x0a, 0xed, 0xd9, 0x3e,
+ 0xda, 0x3d, 0x09, 0xee, 0x61, 0x86, 0xb2, 0x55,
+ 0x67, 0x80, 0xb4, 0x53, 0xdc, 0x3b, 0x0f, 0xe8,
+ 0x0c, 0xeb, 0xdf, 0x38, 0xb7, 0x50, 0x64, 0x83,
+ 0x7f, 0x98, 0xac, 0x4b, 0xc4, 0x23, 0x17, 0xf0,
+ 0x14, 0xf3, 0xc7, 0x20, 0xaf, 0x48, 0x7c, 0x9b,
+ 0xa9, 0x4e, 0x7a, 0x9d, 0x12, 0xf5, 0xc1, 0x26,
+ 0xc2, 0x25, 0x11, 0xf6, 0x79, 0x9e, 0xaa, 0x4d,
+ 0xce, 0x29, 0x1d, 0xfa, 0x75, 0x92, 0xa6, 0x41,
+ 0xa5, 0x42, 0x76, 0x91, 0x1e, 0xf9, 0xcd, 0x2a,
+ 0x18, 0xff, 0xcb, 0x2c, 0xa3, 0x44, 0x70, 0x97,
+ 0x73, 0x94, 0xa0, 0x47, 0xc8, 0x2f, 0x1b, 0xfc,
+ 0xfe, 0x19, 0x2d, 0xca, 0x45, 0xa2, 0x96, 0x71,
+ 0x95, 0x72, 0x46, 0xa1, 0x2e, 0xc9, 0xfd, 0x1a,
+ 0x28, 0xcf, 0xfb, 0x1c, 0x93, 0x74, 0x40, 0xa7,
+ 0x43, 0xa4, 0x90, 0x77, 0xf8, 0x1f, 0x2b, 0xcc,
+ 0x4f, 0xa8, 0x9c, 0x7b, 0xf4, 0x13, 0x27, 0xc0,
+ 0x24, 0xc3, 0xf7, 0x10, 0x9f, 0x78, 0x4c, 0xab,
+ 0x99, 0x7e, 0x4a, 0xad, 0x22, 0xc5, 0xf1, 0x16,
+ 0xf2, 0x15, 0x21, 0xc6, 0x49, 0xae, 0x9a, 0x7d,
+ 0x81, 0x66, 0x52, 0xb5, 0x3a, 0xdd, 0xe9, 0x0e,
+ 0xea, 0x0d, 0x39, 0xde, 0x51, 0xb6, 0x82, 0x65,
+ 0x57, 0xb0, 0x84, 0x63, 0xec, 0x0b, 0x3f, 0xd8,
+ 0x3c, 0xdb, 0xef, 0x08, 0x87, 0x60, 0x54, 0xb3,
+ 0x30, 0xd7, 0xe3, 0x04, 0x8b, 0x6c, 0x58, 0xbf,
+ 0x5b, 0xbc, 0x88, 0x6f, 0xe0, 0x07, 0x33, 0xd4,
+ 0xe6, 0x01, 0x35, 0xd2, 0x5d, 0xba, 0x8e, 0x69,
+ 0x8d, 0x6a, 0x5e, 0xb9, 0x36, 0xd1, 0xe5, 0x02,
+ },
+ {
+ 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1,
+ 0x26, 0xce, 0xeb, 0x03, 0xa1, 0x49, 0x6c, 0x84,
+ 0x35, 0xdd, 0xf8, 0x10, 0xb2, 0x5a, 0x7f, 0x97,
+ 0x4c, 0xa4, 0x81, 0x69, 0xcb, 0x23, 0x06, 0xee,
+ 0x5f, 0xb7, 0x92, 0x7a, 0xd8, 0x30, 0x15, 0xfd,
+ 0x6a, 0x82, 0xa7, 0x4f, 0xed, 0x05, 0x20, 0xc8,
+ 0x79, 0x91, 0xb4, 0x5c, 0xfe, 0x16, 0x33, 0xdb,
+ 0x98, 0x70, 0x55, 0xbd, 0x1f, 0xf7, 0xd2, 0x3a,
+ 0x8b, 0x63, 0x46, 0xae, 0x0c, 0xe4, 0xc1, 0x29,
+ 0xbe, 0x56, 0x73, 0x9b, 0x39, 0xd1, 0xf4, 0x1c,
+ 0xad, 0x45, 0x60, 0x88, 0x2a, 0xc2, 0xe7, 0x0f,
+ 0xd4, 0x3c, 0x19, 0xf1, 0x53, 0xbb, 0x9e, 0x76,
+ 0xc7, 0x2f, 0x0a, 0xe2, 0x40, 0xa8, 0x8d, 0x65,
+ 0xf2, 0x1a, 0x3f, 0xd7, 0x75, 0x9d, 0xb8, 0x50,
+ 0xe1, 0x09, 0x2c, 0xc4, 0x66, 0x8e, 0xab, 0x43,
+ 0x2d, 0xc5, 0xe0, 0x08, 0xaa, 0x42, 0x67, 0x8f,
+ 0x3e, 0xd6, 0xf3, 0x1b, 0xb9, 0x51, 0x74, 0x9c,
+ 0x0b, 0xe3, 0xc6, 0x2e, 0x8c, 0x64, 0x41, 0xa9,
+ 0x18, 0xf0, 0xd5, 0x3d, 0x9f, 0x77, 0x52, 0xba,
+ 0x61, 0x89, 0xac, 0x44, 0xe6, 0x0e, 0x2b, 0xc3,
+ 0x72, 0x9a, 0xbf, 0x57, 0xf5, 0x1d, 0x38, 0xd0,
+ 0x47, 0xaf, 0x8a, 0x62, 0xc0, 0x28, 0x0d, 0xe5,
+ 0x54, 0xbc, 0x99, 0x71, 0xd3, 0x3b, 0x1e, 0xf6,
+ 0xb5, 0x5d, 0x78, 0x90, 0x32, 0xda, 0xff, 0x17,
+ 0xa6, 0x4e, 0x6b, 0x83, 0x21, 0xc9, 0xec, 0x04,
+ 0x93, 0x7b, 0x5e, 0xb6, 0x14, 0xfc, 0xd9, 0x31,
+ 0x80, 0x68, 0x4d, 0xa5, 0x07, 0xef, 0xca, 0x22,
+ 0xf9, 0x11, 0x34, 0xdc, 0x7e, 0x96, 0xb3, 0x5b,
+ 0xea, 0x02, 0x27, 0xcf, 0x6d, 0x85, 0xa0, 0x48,
+ 0xdf, 0x37, 0x12, 0xfa, 0x58, 0xb0, 0x95, 0x7d,
+ 0xcc, 0x24, 0x01, 0xe9, 0x4b, 0xa3, 0x86, 0x6e,
+ },
+ {
+ 0x00, 0xe9, 0xcf, 0x26, 0x83, 0x6a, 0x4c, 0xa5,
+ 0x1b, 0xf2, 0xd4, 0x3d, 0x98, 0x71, 0x57, 0xbe,
+ 0x36, 0xdf, 0xf9, 0x10, 0xb5, 0x5c, 0x7a, 0x93,
+ 0x2d, 0xc4, 0xe2, 0x0b, 0xae, 0x47, 0x61, 0x88,
+ 0x6c, 0x85, 0xa3, 0x4a, 0xef, 0x06, 0x20, 0xc9,
+ 0x77, 0x9e, 0xb8, 0x51, 0xf4, 0x1d, 0x3b, 0xd2,
+ 0x5a, 0xb3, 0x95, 0x7c, 0xd9, 0x30, 0x16, 0xff,
+ 0x41, 0xa8, 0x8e, 0x67, 0xc2, 0x2b, 0x0d, 0xe4,
+ 0xd8, 0x31, 0x17, 0xfe, 0x5b, 0xb2, 0x94, 0x7d,
+ 0xc3, 0x2a, 0x0c, 0xe5, 0x40, 0xa9, 0x8f, 0x66,
+ 0xee, 0x07, 0x21, 0xc8, 0x6d, 0x84, 0xa2, 0x4b,
+ 0xf5, 0x1c, 0x3a, 0xd3, 0x76, 0x9f, 0xb9, 0x50,
+ 0xb4, 0x5d, 0x7b, 0x92, 0x37, 0xde, 0xf8, 0x11,
+ 0xaf, 0x46, 0x60, 0x89, 0x2c, 0xc5, 0xe3, 0x0a,
+ 0x82, 0x6b, 0x4d, 0xa4, 0x01, 0xe8, 0xce, 0x27,
+ 0x99, 0x70, 0x56, 0xbf, 0x1a, 0xf3, 0xd5, 0x3c,
+ 0xad, 0x44, 0x62, 0x8b, 0x2e, 0xc7, 0xe1, 0x08,
+ 0xb6, 0x5f, 0x79, 0x90, 0x35, 0xdc, 0xfa, 0x13,
+ 0x9b, 0x72, 0x54, 0xbd, 0x18, 0xf1, 0xd7, 0x3e,
+ 0x80, 0x69, 0x4f, 0xa6, 0x03, 0xea, 0xcc, 0x25,
+ 0xc1, 0x28, 0x0e, 0xe7, 0x42, 0xab, 0x8d, 0x64,
+ 0xda, 0x33, 0x15, 0xfc, 0x59, 0xb0, 0x96, 0x7f,
+ 0xf7, 0x1e, 0x38, 0xd1, 0x74, 0x9d, 0xbb, 0x52,
+ 0xec, 0x05, 0x23, 0xca, 0x6f, 0x86, 0xa0, 0x49,
+ 0x75, 0x9c, 0xba, 0x53, 0xf6, 0x1f, 0x39, 0xd0,
+ 0x6e, 0x87, 0xa1, 0x48, 0xed, 0x04, 0x22, 0xcb,
+ 0x43, 0xaa, 0x8c, 0x65, 0xc0, 0x29, 0x0f, 0xe6,
+ 0x58, 0xb1, 0x97, 0x7e, 0xdb, 0x32, 0x14, 0xfd,
+ 0x19, 0xf0, 0xd6, 0x3f, 0x9a, 0x73, 0x55, 0xbc,
+ 0x02, 0xeb, 0xcd, 0x24, 0x81, 0x68, 0x4e, 0xa7,
+ 0x2f, 0xc6, 0xe0, 0x09, 0xac, 0x45, 0x63, 0x8a,
+ 0x34, 0xdd, 0xfb, 0x12, 0xb7, 0x5e, 0x78, 0x91,
+ },
+ {
+ 0x00, 0xea, 0xc9, 0x23, 0x8f, 0x65, 0x46, 0xac,
+ 0x03, 0xe9, 0xca, 0x20, 0x8c, 0x66, 0x45, 0xaf,
+ 0x06, 0xec, 0xcf, 0x25, 0x89, 0x63, 0x40, 0xaa,
+ 0x05, 0xef, 0xcc, 0x26, 0x8a, 0x60, 0x43, 0xa9,
+ 0x0c, 0xe6, 0xc5, 0x2f, 0x83, 0x69, 0x4a, 0xa0,
+ 0x0f, 0xe5, 0xc6, 0x2c, 0x80, 0x6a, 0x49, 0xa3,
+ 0x0a, 0xe0, 0xc3, 0x29, 0x85, 0x6f, 0x4c, 0xa6,
+ 0x09, 0xe3, 0xc0, 0x2a, 0x86, 0x6c, 0x4f, 0xa5,
+ 0x18, 0xf2, 0xd1, 0x3b, 0x97, 0x7d, 0x5e, 0xb4,
+ 0x1b, 0xf1, 0xd2, 0x38, 0x94, 0x7e, 0x5d, 0xb7,
+ 0x1e, 0xf4, 0xd7, 0x3d, 0x91, 0x7b, 0x58, 0xb2,
+ 0x1d, 0xf7, 0xd4, 0x3e, 0x92, 0x78, 0x5b, 0xb1,
+ 0x14, 0xfe, 0xdd, 0x37, 0x9b, 0x71, 0x52, 0xb8,
+ 0x17, 0xfd, 0xde, 0x34, 0x98, 0x72, 0x51, 0xbb,
+ 0x12, 0xf8, 0xdb, 0x31, 0x9d, 0x77, 0x54, 0xbe,
+ 0x11, 0xfb, 0xd8, 0x32, 0x9e, 0x74, 0x57, 0xbd,
+ 0x30, 0xda, 0xf9, 0x13, 0xbf, 0x55, 0x76, 0x9c,
+ 0x33, 0xd9, 0xfa, 0x10, 0xbc, 0x56, 0x75, 0x9f,
+ 0x36, 0xdc, 0xff, 0x15, 0xb9, 0x53, 0x70, 0x9a,
+ 0x35, 0xdf, 0xfc, 0x16, 0xba, 0x50, 0x73, 0x99,
+ 0x3c, 0xd6, 0xf5, 0x1f, 0xb3, 0x59, 0x7a, 0x90,
+ 0x3f, 0xd5, 0xf6, 0x1c, 0xb0, 0x5a, 0x79, 0x93,
+ 0x3a, 0xd0, 0xf3, 0x19, 0xb5, 0x5f, 0x7c, 0x96,
+ 0x39, 0xd3, 0xf0, 0x1a, 0xb6, 0x5c, 0x7f, 0x95,
+ 0x28, 0xc2, 0xe1, 0x0b, 0xa7, 0x4d, 0x6e, 0x84,
+ 0x2b, 0xc1, 0xe2, 0x08, 0xa4, 0x4e, 0x6d, 0x87,
+ 0x2e, 0xc4, 0xe7, 0x0d, 0xa1, 0x4b, 0x68, 0x82,
+ 0x2d, 0xc7, 0xe4, 0x0e, 0xa2, 0x48, 0x6b, 0x81,
+ 0x24, 0xce, 0xed, 0x07, 0xab, 0x41, 0x62, 0x88,
+ 0x27, 0xcd, 0xee, 0x04, 0xa8, 0x42, 0x61, 0x8b,
+ 0x22, 0xc8, 0xeb, 0x01, 0xad, 0x47, 0x64, 0x8e,
+ 0x21, 0xcb, 0xe8, 0x02, 0xae, 0x44, 0x67, 0x8d,
+ },
+ {
+ 0x00, 0xeb, 0xcb, 0x20, 0x8b, 0x60, 0x40, 0xab,
+ 0x0b, 0xe0, 0xc0, 0x2b, 0x80, 0x6b, 0x4b, 0xa0,
+ 0x16, 0xfd, 0xdd, 0x36, 0x9d, 0x76, 0x56, 0xbd,
+ 0x1d, 0xf6, 0xd6, 0x3d, 0x96, 0x7d, 0x5d, 0xb6,
+ 0x2c, 0xc7, 0xe7, 0x0c, 0xa7, 0x4c, 0x6c, 0x87,
+ 0x27, 0xcc, 0xec, 0x07, 0xac, 0x47, 0x67, 0x8c,
+ 0x3a, 0xd1, 0xf1, 0x1a, 0xb1, 0x5a, 0x7a, 0x91,
+ 0x31, 0xda, 0xfa, 0x11, 0xba, 0x51, 0x71, 0x9a,
+ 0x58, 0xb3, 0x93, 0x78, 0xd3, 0x38, 0x18, 0xf3,
+ 0x53, 0xb8, 0x98, 0x73, 0xd8, 0x33, 0x13, 0xf8,
+ 0x4e, 0xa5, 0x85, 0x6e, 0xc5, 0x2e, 0x0e, 0xe5,
+ 0x45, 0xae, 0x8e, 0x65, 0xce, 0x25, 0x05, 0xee,
+ 0x74, 0x9f, 0xbf, 0x54, 0xff, 0x14, 0x34, 0xdf,
+ 0x7f, 0x94, 0xb4, 0x5f, 0xf4, 0x1f, 0x3f, 0xd4,
+ 0x62, 0x89, 0xa9, 0x42, 0xe9, 0x02, 0x22, 0xc9,
+ 0x69, 0x82, 0xa2, 0x49, 0xe2, 0x09, 0x29, 0xc2,
+ 0xb0, 0x5b, 0x7b, 0x90, 0x3b, 0xd0, 0xf0, 0x1b,
+ 0xbb, 0x50, 0x70, 0x9b, 0x30, 0xdb, 0xfb, 0x10,
+ 0xa6, 0x4d, 0x6d, 0x86, 0x2d, 0xc6, 0xe6, 0x0d,
+ 0xad, 0x46, 0x66, 0x8d, 0x26, 0xcd, 0xed, 0x06,
+ 0x9c, 0x77, 0x57, 0xbc, 0x17, 0xfc, 0xdc, 0x37,
+ 0x97, 0x7c, 0x5c, 0xb7, 0x1c, 0xf7, 0xd7, 0x3c,
+ 0x8a, 0x61, 0x41, 0xaa, 0x01, 0xea, 0xca, 0x21,
+ 0x81, 0x6a, 0x4a, 0xa1, 0x0a, 0xe1, 0xc1, 0x2a,
+ 0xe8, 0x03, 0x23, 0xc8, 0x63, 0x88, 0xa8, 0x43,
+ 0xe3, 0x08, 0x28, 0xc3, 0x68, 0x83, 0xa3, 0x48,
+ 0xfe, 0x15, 0x35, 0xde, 0x75, 0x9e, 0xbe, 0x55,
+ 0xf5, 0x1e, 0x3e, 0xd5, 0x7e, 0x95, 0xb5, 0x5e,
+ 0xc4, 0x2f, 0x0f, 0xe4, 0x4f, 0xa4, 0x84, 0x6f,
+ 0xcf, 0x24, 0x04, 0xef, 0x44, 0xaf, 0x8f, 0x64,
+ 0xd2, 0x39, 0x19, 0xf2, 0x59, 0xb2, 0x92, 0x79,
+ 0xd9, 0x32, 0x12, 0xf9, 0x52, 0xb9, 0x99, 0x72,
+ },
+ {
+ 0x00, 0xec, 0xc5, 0x29, 0x97, 0x7b, 0x52, 0xbe,
+ 0x33, 0xdf, 0xf6, 0x1a, 0xa4, 0x48, 0x61, 0x8d,
+ 0x66, 0x8a, 0xa3, 0x4f, 0xf1, 0x1d, 0x34, 0xd8,
+ 0x55, 0xb9, 0x90, 0x7c, 0xc2, 0x2e, 0x07, 0xeb,
+ 0xcc, 0x20, 0x09, 0xe5, 0x5b, 0xb7, 0x9e, 0x72,
+ 0xff, 0x13, 0x3a, 0xd6, 0x68, 0x84, 0xad, 0x41,
+ 0xaa, 0x46, 0x6f, 0x83, 0x3d, 0xd1, 0xf8, 0x14,
+ 0x99, 0x75, 0x5c, 0xb0, 0x0e, 0xe2, 0xcb, 0x27,
+ 0x85, 0x69, 0x40, 0xac, 0x12, 0xfe, 0xd7, 0x3b,
+ 0xb6, 0x5a, 0x73, 0x9f, 0x21, 0xcd, 0xe4, 0x08,
+ 0xe3, 0x0f, 0x26, 0xca, 0x74, 0x98, 0xb1, 0x5d,
+ 0xd0, 0x3c, 0x15, 0xf9, 0x47, 0xab, 0x82, 0x6e,
+ 0x49, 0xa5, 0x8c, 0x60, 0xde, 0x32, 0x1b, 0xf7,
+ 0x7a, 0x96, 0xbf, 0x53, 0xed, 0x01, 0x28, 0xc4,
+ 0x2f, 0xc3, 0xea, 0x06, 0xb8, 0x54, 0x7d, 0x91,
+ 0x1c, 0xf0, 0xd9, 0x35, 0x8b, 0x67, 0x4e, 0xa2,
+ 0x17, 0xfb, 0xd2, 0x3e, 0x80, 0x6c, 0x45, 0xa9,
+ 0x24, 0xc8, 0xe1, 0x0d, 0xb3, 0x5f, 0x76, 0x9a,
+ 0x71, 0x9d, 0xb4, 0x58, 0xe6, 0x0a, 0x23, 0xcf,
+ 0x42, 0xae, 0x87, 0x6b, 0xd5, 0x39, 0x10, 0xfc,
+ 0xdb, 0x37, 0x1e, 0xf2, 0x4c, 0xa0, 0x89, 0x65,
+ 0xe8, 0x04, 0x2d, 0xc1, 0x7f, 0x93, 0xba, 0x56,
+ 0xbd, 0x51, 0x78, 0x94, 0x2a, 0xc6, 0xef, 0x03,
+ 0x8e, 0x62, 0x4b, 0xa7, 0x19, 0xf5, 0xdc, 0x30,
+ 0x92, 0x7e, 0x57, 0xbb, 0x05, 0xe9, 0xc0, 0x2c,
+ 0xa1, 0x4d, 0x64, 0x88, 0x36, 0xda, 0xf3, 0x1f,
+ 0xf4, 0x18, 0x31, 0xdd, 0x63, 0x8f, 0xa6, 0x4a,
+ 0xc7, 0x2b, 0x02, 0xee, 0x50, 0xbc, 0x95, 0x79,
+ 0x5e, 0xb2, 0x9b, 0x77, 0xc9, 0x25, 0x0c, 0xe0,
+ 0x6d, 0x81, 0xa8, 0x44, 0xfa, 0x16, 0x3f, 0xd3,
+ 0x38, 0xd4, 0xfd, 0x11, 0xaf, 0x43, 0x6a, 0x86,
+ 0x0b, 0xe7, 0xce, 0x22, 0x9c, 0x70, 0x59, 0xb5,
+ },
+ {
+ 0x00, 0xed, 0xc7, 0x2a, 0x93, 0x7e, 0x54, 0xb9,
+ 0x3b, 0xd6, 0xfc, 0x11, 0xa8, 0x45, 0x6f, 0x82,
+ 0x76, 0x9b, 0xb1, 0x5c, 0xe5, 0x08, 0x22, 0xcf,
+ 0x4d, 0xa0, 0x8a, 0x67, 0xde, 0x33, 0x19, 0xf4,
+ 0xec, 0x01, 0x2b, 0xc6, 0x7f, 0x92, 0xb8, 0x55,
+ 0xd7, 0x3a, 0x10, 0xfd, 0x44, 0xa9, 0x83, 0x6e,
+ 0x9a, 0x77, 0x5d, 0xb0, 0x09, 0xe4, 0xce, 0x23,
+ 0xa1, 0x4c, 0x66, 0x8b, 0x32, 0xdf, 0xf5, 0x18,
+ 0xc5, 0x28, 0x02, 0xef, 0x56, 0xbb, 0x91, 0x7c,
+ 0xfe, 0x13, 0x39, 0xd4, 0x6d, 0x80, 0xaa, 0x47,
+ 0xb3, 0x5e, 0x74, 0x99, 0x20, 0xcd, 0xe7, 0x0a,
+ 0x88, 0x65, 0x4f, 0xa2, 0x1b, 0xf6, 0xdc, 0x31,
+ 0x29, 0xc4, 0xee, 0x03, 0xba, 0x57, 0x7d, 0x90,
+ 0x12, 0xff, 0xd5, 0x38, 0x81, 0x6c, 0x46, 0xab,
+ 0x5f, 0xb2, 0x98, 0x75, 0xcc, 0x21, 0x0b, 0xe6,
+ 0x64, 0x89, 0xa3, 0x4e, 0xf7, 0x1a, 0x30, 0xdd,
+ 0x97, 0x7a, 0x50, 0xbd, 0x04, 0xe9, 0xc3, 0x2e,
+ 0xac, 0x41, 0x6b, 0x86, 0x3f, 0xd2, 0xf8, 0x15,
+ 0xe1, 0x0c, 0x26, 0xcb, 0x72, 0x9f, 0xb5, 0x58,
+ 0xda, 0x37, 0x1d, 0xf0, 0x49, 0xa4, 0x8e, 0x63,
+ 0x7b, 0x96, 0xbc, 0x51, 0xe8, 0x05, 0x2f, 0xc2,
+ 0x40, 0xad, 0x87, 0x6a, 0xd3, 0x3e, 0x14, 0xf9,
+ 0x0d, 0xe0, 0xca, 0x27, 0x9e, 0x73, 0x59, 0xb4,
+ 0x36, 0xdb, 0xf1, 0x1c, 0xa5, 0x48, 0x62, 0x8f,
+ 0x52, 0xbf, 0x95, 0x78, 0xc1, 0x2c, 0x06, 0xeb,
+ 0x69, 0x84, 0xae, 0x43, 0xfa, 0x17, 0x3d, 0xd0,
+ 0x24, 0xc9, 0xe3, 0x0e, 0xb7, 0x5a, 0x70, 0x9d,
+ 0x1f, 0xf2, 0xd8, 0x35, 0x8c, 0x61, 0x4b, 0xa6,
+ 0xbe, 0x53, 0x79, 0x94, 0x2d, 0xc0, 0xea, 0x07,
+ 0x85, 0x68, 0x42, 0xaf, 0x16, 0xfb, 0xd1, 0x3c,
+ 0xc8, 0x25, 0x0f, 0xe2, 0x5b, 0xb6, 0x9c, 0x71,
+ 0xf3, 0x1e, 0x34, 0xd9, 0x60, 0x8d, 0xa7, 0x4a,
+ },
+ {
+ 0x00, 0xee, 0xc1, 0x2f, 0x9f, 0x71, 0x5e, 0xb0,
+ 0x23, 0xcd, 0xe2, 0x0c, 0xbc, 0x52, 0x7d, 0x93,
+ 0x46, 0xa8, 0x87, 0x69, 0xd9, 0x37, 0x18, 0xf6,
+ 0x65, 0x8b, 0xa4, 0x4a, 0xfa, 0x14, 0x3b, 0xd5,
+ 0x8c, 0x62, 0x4d, 0xa3, 0x13, 0xfd, 0xd2, 0x3c,
+ 0xaf, 0x41, 0x6e, 0x80, 0x30, 0xde, 0xf1, 0x1f,
+ 0xca, 0x24, 0x0b, 0xe5, 0x55, 0xbb, 0x94, 0x7a,
+ 0xe9, 0x07, 0x28, 0xc6, 0x76, 0x98, 0xb7, 0x59,
+ 0x05, 0xeb, 0xc4, 0x2a, 0x9a, 0x74, 0x5b, 0xb5,
+ 0x26, 0xc8, 0xe7, 0x09, 0xb9, 0x57, 0x78, 0x96,
+ 0x43, 0xad, 0x82, 0x6c, 0xdc, 0x32, 0x1d, 0xf3,
+ 0x60, 0x8e, 0xa1, 0x4f, 0xff, 0x11, 0x3e, 0xd0,
+ 0x89, 0x67, 0x48, 0xa6, 0x16, 0xf8, 0xd7, 0x39,
+ 0xaa, 0x44, 0x6b, 0x85, 0x35, 0xdb, 0xf4, 0x1a,
+ 0xcf, 0x21, 0x0e, 0xe0, 0x50, 0xbe, 0x91, 0x7f,
+ 0xec, 0x02, 0x2d, 0xc3, 0x73, 0x9d, 0xb2, 0x5c,
+ 0x0a, 0xe4, 0xcb, 0x25, 0x95, 0x7b, 0x54, 0xba,
+ 0x29, 0xc7, 0xe8, 0x06, 0xb6, 0x58, 0x77, 0x99,
+ 0x4c, 0xa2, 0x8d, 0x63, 0xd3, 0x3d, 0x12, 0xfc,
+ 0x6f, 0x81, 0xae, 0x40, 0xf0, 0x1e, 0x31, 0xdf,
+ 0x86, 0x68, 0x47, 0xa9, 0x19, 0xf7, 0xd8, 0x36,
+ 0xa5, 0x4b, 0x64, 0x8a, 0x3a, 0xd4, 0xfb, 0x15,
+ 0xc0, 0x2e, 0x01, 0xef, 0x5f, 0xb1, 0x9e, 0x70,
+ 0xe3, 0x0d, 0x22, 0xcc, 0x7c, 0x92, 0xbd, 0x53,
+ 0x0f, 0xe1, 0xce, 0x20, 0x90, 0x7e, 0x51, 0xbf,
+ 0x2c, 0xc2, 0xed, 0x03, 0xb3, 0x5d, 0x72, 0x9c,
+ 0x49, 0xa7, 0x88, 0x66, 0xd6, 0x38, 0x17, 0xf9,
+ 0x6a, 0x84, 0xab, 0x45, 0xf5, 0x1b, 0x34, 0xda,
+ 0x83, 0x6d, 0x42, 0xac, 0x1c, 0xf2, 0xdd, 0x33,
+ 0xa0, 0x4e, 0x61, 0x8f, 0x3f, 0xd1, 0xfe, 0x10,
+ 0xc5, 0x2b, 0x04, 0xea, 0x5a, 0xb4, 0x9b, 0x75,
+ 0xe6, 0x08, 0x27, 0xc9, 0x79, 0x97, 0xb8, 0x56,
+ },
+ {
+ 0x00, 0xef, 0xc3, 0x2c, 0x9b, 0x74, 0x58, 0xb7,
+ 0x2b, 0xc4, 0xe8, 0x07, 0xb0, 0x5f, 0x73, 0x9c,
+ 0x56, 0xb9, 0x95, 0x7a, 0xcd, 0x22, 0x0e, 0xe1,
+ 0x7d, 0x92, 0xbe, 0x51, 0xe6, 0x09, 0x25, 0xca,
+ 0xac, 0x43, 0x6f, 0x80, 0x37, 0xd8, 0xf4, 0x1b,
+ 0x87, 0x68, 0x44, 0xab, 0x1c, 0xf3, 0xdf, 0x30,
+ 0xfa, 0x15, 0x39, 0xd6, 0x61, 0x8e, 0xa2, 0x4d,
+ 0xd1, 0x3e, 0x12, 0xfd, 0x4a, 0xa5, 0x89, 0x66,
+ 0x45, 0xaa, 0x86, 0x69, 0xde, 0x31, 0x1d, 0xf2,
+ 0x6e, 0x81, 0xad, 0x42, 0xf5, 0x1a, 0x36, 0xd9,
+ 0x13, 0xfc, 0xd0, 0x3f, 0x88, 0x67, 0x4b, 0xa4,
+ 0x38, 0xd7, 0xfb, 0x14, 0xa3, 0x4c, 0x60, 0x8f,
+ 0xe9, 0x06, 0x2a, 0xc5, 0x72, 0x9d, 0xb1, 0x5e,
+ 0xc2, 0x2d, 0x01, 0xee, 0x59, 0xb6, 0x9a, 0x75,
+ 0xbf, 0x50, 0x7c, 0x93, 0x24, 0xcb, 0xe7, 0x08,
+ 0x94, 0x7b, 0x57, 0xb8, 0x0f, 0xe0, 0xcc, 0x23,
+ 0x8a, 0x65, 0x49, 0xa6, 0x11, 0xfe, 0xd2, 0x3d,
+ 0xa1, 0x4e, 0x62, 0x8d, 0x3a, 0xd5, 0xf9, 0x16,
+ 0xdc, 0x33, 0x1f, 0xf0, 0x47, 0xa8, 0x84, 0x6b,
+ 0xf7, 0x18, 0x34, 0xdb, 0x6c, 0x83, 0xaf, 0x40,
+ 0x26, 0xc9, 0xe5, 0x0a, 0xbd, 0x52, 0x7e, 0x91,
+ 0x0d, 0xe2, 0xce, 0x21, 0x96, 0x79, 0x55, 0xba,
+ 0x70, 0x9f, 0xb3, 0x5c, 0xeb, 0x04, 0x28, 0xc7,
+ 0x5b, 0xb4, 0x98, 0x77, 0xc0, 0x2f, 0x03, 0xec,
+ 0xcf, 0x20, 0x0c, 0xe3, 0x54, 0xbb, 0x97, 0x78,
+ 0xe4, 0x0b, 0x27, 0xc8, 0x7f, 0x90, 0xbc, 0x53,
+ 0x99, 0x76, 0x5a, 0xb5, 0x02, 0xed, 0xc1, 0x2e,
+ 0xb2, 0x5d, 0x71, 0x9e, 0x29, 0xc6, 0xea, 0x05,
+ 0x63, 0x8c, 0xa0, 0x4f, 0xf8, 0x17, 0x3b, 0xd4,
+ 0x48, 0xa7, 0x8b, 0x64, 0xd3, 0x3c, 0x10, 0xff,
+ 0x35, 0xda, 0xf6, 0x19, 0xae, 0x41, 0x6d, 0x82,
+ 0x1e, 0xf1, 0xdd, 0x32, 0x85, 0x6a, 0x46, 0xa9,
+ },
+ {
+ 0x00, 0xf0, 0xfd, 0x0d, 0xe7, 0x17, 0x1a, 0xea,
+ 0xd3, 0x23, 0x2e, 0xde, 0x34, 0xc4, 0xc9, 0x39,
+ 0xbb, 0x4b, 0x46, 0xb6, 0x5c, 0xac, 0xa1, 0x51,
+ 0x68, 0x98, 0x95, 0x65, 0x8f, 0x7f, 0x72, 0x82,
+ 0x6b, 0x9b, 0x96, 0x66, 0x8c, 0x7c, 0x71, 0x81,
+ 0xb8, 0x48, 0x45, 0xb5, 0x5f, 0xaf, 0xa2, 0x52,
+ 0xd0, 0x20, 0x2d, 0xdd, 0x37, 0xc7, 0xca, 0x3a,
+ 0x03, 0xf3, 0xfe, 0x0e, 0xe4, 0x14, 0x19, 0xe9,
+ 0xd6, 0x26, 0x2b, 0xdb, 0x31, 0xc1, 0xcc, 0x3c,
+ 0x05, 0xf5, 0xf8, 0x08, 0xe2, 0x12, 0x1f, 0xef,
+ 0x6d, 0x9d, 0x90, 0x60, 0x8a, 0x7a, 0x77, 0x87,
+ 0xbe, 0x4e, 0x43, 0xb3, 0x59, 0xa9, 0xa4, 0x54,
+ 0xbd, 0x4d, 0x40, 0xb0, 0x5a, 0xaa, 0xa7, 0x57,
+ 0x6e, 0x9e, 0x93, 0x63, 0x89, 0x79, 0x74, 0x84,
+ 0x06, 0xf6, 0xfb, 0x0b, 0xe1, 0x11, 0x1c, 0xec,
+ 0xd5, 0x25, 0x28, 0xd8, 0x32, 0xc2, 0xcf, 0x3f,
+ 0xb1, 0x41, 0x4c, 0xbc, 0x56, 0xa6, 0xab, 0x5b,
+ 0x62, 0x92, 0x9f, 0x6f, 0x85, 0x75, 0x78, 0x88,
+ 0x0a, 0xfa, 0xf7, 0x07, 0xed, 0x1d, 0x10, 0xe0,
+ 0xd9, 0x29, 0x24, 0xd4, 0x3e, 0xce, 0xc3, 0x33,
+ 0xda, 0x2a, 0x27, 0xd7, 0x3d, 0xcd, 0xc0, 0x30,
+ 0x09, 0xf9, 0xf4, 0x04, 0xee, 0x1e, 0x13, 0xe3,
+ 0x61, 0x91, 0x9c, 0x6c, 0x86, 0x76, 0x7b, 0x8b,
+ 0xb2, 0x42, 0x4f, 0xbf, 0x55, 0xa5, 0xa8, 0x58,
+ 0x67, 0x97, 0x9a, 0x6a, 0x80, 0x70, 0x7d, 0x8d,
+ 0xb4, 0x44, 0x49, 0xb9, 0x53, 0xa3, 0xae, 0x5e,
+ 0xdc, 0x2c, 0x21, 0xd1, 0x3b, 0xcb, 0xc6, 0x36,
+ 0x0f, 0xff, 0xf2, 0x02, 0xe8, 0x18, 0x15, 0xe5,
+ 0x0c, 0xfc, 0xf1, 0x01, 0xeb, 0x1b, 0x16, 0xe6,
+ 0xdf, 0x2f, 0x22, 0xd2, 0x38, 0xc8, 0xc5, 0x35,
+ 0xb7, 0x47, 0x4a, 0xba, 0x50, 0xa0, 0xad, 0x5d,
+ 0x64, 0x94, 0x99, 0x69, 0x83, 0x73, 0x7e, 0x8e,
+ },
+ {
+ 0x00, 0xf1, 0xff, 0x0e, 0xe3, 0x12, 0x1c, 0xed,
+ 0xdb, 0x2a, 0x24, 0xd5, 0x38, 0xc9, 0xc7, 0x36,
+ 0xab, 0x5a, 0x54, 0xa5, 0x48, 0xb9, 0xb7, 0x46,
+ 0x70, 0x81, 0x8f, 0x7e, 0x93, 0x62, 0x6c, 0x9d,
+ 0x4b, 0xba, 0xb4, 0x45, 0xa8, 0x59, 0x57, 0xa6,
+ 0x90, 0x61, 0x6f, 0x9e, 0x73, 0x82, 0x8c, 0x7d,
+ 0xe0, 0x11, 0x1f, 0xee, 0x03, 0xf2, 0xfc, 0x0d,
+ 0x3b, 0xca, 0xc4, 0x35, 0xd8, 0x29, 0x27, 0xd6,
+ 0x96, 0x67, 0x69, 0x98, 0x75, 0x84, 0x8a, 0x7b,
+ 0x4d, 0xbc, 0xb2, 0x43, 0xae, 0x5f, 0x51, 0xa0,
+ 0x3d, 0xcc, 0xc2, 0x33, 0xde, 0x2f, 0x21, 0xd0,
+ 0xe6, 0x17, 0x19, 0xe8, 0x05, 0xf4, 0xfa, 0x0b,
+ 0xdd, 0x2c, 0x22, 0xd3, 0x3e, 0xcf, 0xc1, 0x30,
+ 0x06, 0xf7, 0xf9, 0x08, 0xe5, 0x14, 0x1a, 0xeb,
+ 0x76, 0x87, 0x89, 0x78, 0x95, 0x64, 0x6a, 0x9b,
+ 0xad, 0x5c, 0x52, 0xa3, 0x4e, 0xbf, 0xb1, 0x40,
+ 0x31, 0xc0, 0xce, 0x3f, 0xd2, 0x23, 0x2d, 0xdc,
+ 0xea, 0x1b, 0x15, 0xe4, 0x09, 0xf8, 0xf6, 0x07,
+ 0x9a, 0x6b, 0x65, 0x94, 0x79, 0x88, 0x86, 0x77,
+ 0x41, 0xb0, 0xbe, 0x4f, 0xa2, 0x53, 0x5d, 0xac,
+ 0x7a, 0x8b, 0x85, 0x74, 0x99, 0x68, 0x66, 0x97,
+ 0xa1, 0x50, 0x5e, 0xaf, 0x42, 0xb3, 0xbd, 0x4c,
+ 0xd1, 0x20, 0x2e, 0xdf, 0x32, 0xc3, 0xcd, 0x3c,
+ 0x0a, 0xfb, 0xf5, 0x04, 0xe9, 0x18, 0x16, 0xe7,
+ 0xa7, 0x56, 0x58, 0xa9, 0x44, 0xb5, 0xbb, 0x4a,
+ 0x7c, 0x8d, 0x83, 0x72, 0x9f, 0x6e, 0x60, 0x91,
+ 0x0c, 0xfd, 0xf3, 0x02, 0xef, 0x1e, 0x10, 0xe1,
+ 0xd7, 0x26, 0x28, 0xd9, 0x34, 0xc5, 0xcb, 0x3a,
+ 0xec, 0x1d, 0x13, 0xe2, 0x0f, 0xfe, 0xf0, 0x01,
+ 0x37, 0xc6, 0xc8, 0x39, 0xd4, 0x25, 0x2b, 0xda,
+ 0x47, 0xb6, 0xb8, 0x49, 0xa4, 0x55, 0x5b, 0xaa,
+ 0x9c, 0x6d, 0x63, 0x92, 0x7f, 0x8e, 0x80, 0x71,
+ },
+ {
+ 0x00, 0xf2, 0xf9, 0x0b, 0xef, 0x1d, 0x16, 0xe4,
+ 0xc3, 0x31, 0x3a, 0xc8, 0x2c, 0xde, 0xd5, 0x27,
+ 0x9b, 0x69, 0x62, 0x90, 0x74, 0x86, 0x8d, 0x7f,
+ 0x58, 0xaa, 0xa1, 0x53, 0xb7, 0x45, 0x4e, 0xbc,
+ 0x2b, 0xd9, 0xd2, 0x20, 0xc4, 0x36, 0x3d, 0xcf,
+ 0xe8, 0x1a, 0x11, 0xe3, 0x07, 0xf5, 0xfe, 0x0c,
+ 0xb0, 0x42, 0x49, 0xbb, 0x5f, 0xad, 0xa6, 0x54,
+ 0x73, 0x81, 0x8a, 0x78, 0x9c, 0x6e, 0x65, 0x97,
+ 0x56, 0xa4, 0xaf, 0x5d, 0xb9, 0x4b, 0x40, 0xb2,
+ 0x95, 0x67, 0x6c, 0x9e, 0x7a, 0x88, 0x83, 0x71,
+ 0xcd, 0x3f, 0x34, 0xc6, 0x22, 0xd0, 0xdb, 0x29,
+ 0x0e, 0xfc, 0xf7, 0x05, 0xe1, 0x13, 0x18, 0xea,
+ 0x7d, 0x8f, 0x84, 0x76, 0x92, 0x60, 0x6b, 0x99,
+ 0xbe, 0x4c, 0x47, 0xb5, 0x51, 0xa3, 0xa8, 0x5a,
+ 0xe6, 0x14, 0x1f, 0xed, 0x09, 0xfb, 0xf0, 0x02,
+ 0x25, 0xd7, 0xdc, 0x2e, 0xca, 0x38, 0x33, 0xc1,
+ 0xac, 0x5e, 0x55, 0xa7, 0x43, 0xb1, 0xba, 0x48,
+ 0x6f, 0x9d, 0x96, 0x64, 0x80, 0x72, 0x79, 0x8b,
+ 0x37, 0xc5, 0xce, 0x3c, 0xd8, 0x2a, 0x21, 0xd3,
+ 0xf4, 0x06, 0x0d, 0xff, 0x1b, 0xe9, 0xe2, 0x10,
+ 0x87, 0x75, 0x7e, 0x8c, 0x68, 0x9a, 0x91, 0x63,
+ 0x44, 0xb6, 0xbd, 0x4f, 0xab, 0x59, 0x52, 0xa0,
+ 0x1c, 0xee, 0xe5, 0x17, 0xf3, 0x01, 0x0a, 0xf8,
+ 0xdf, 0x2d, 0x26, 0xd4, 0x30, 0xc2, 0xc9, 0x3b,
+ 0xfa, 0x08, 0x03, 0xf1, 0x15, 0xe7, 0xec, 0x1e,
+ 0x39, 0xcb, 0xc0, 0x32, 0xd6, 0x24, 0x2f, 0xdd,
+ 0x61, 0x93, 0x98, 0x6a, 0x8e, 0x7c, 0x77, 0x85,
+ 0xa2, 0x50, 0x5b, 0xa9, 0x4d, 0xbf, 0xb4, 0x46,
+ 0xd1, 0x23, 0x28, 0xda, 0x3e, 0xcc, 0xc7, 0x35,
+ 0x12, 0xe0, 0xeb, 0x19, 0xfd, 0x0f, 0x04, 0xf6,
+ 0x4a, 0xb8, 0xb3, 0x41, 0xa5, 0x57, 0x5c, 0xae,
+ 0x89, 0x7b, 0x70, 0x82, 0x66, 0x94, 0x9f, 0x6d,
+ },
+ {
+ 0x00, 0xf3, 0xfb, 0x08, 0xeb, 0x18, 0x10, 0xe3,
+ 0xcb, 0x38, 0x30, 0xc3, 0x20, 0xd3, 0xdb, 0x28,
+ 0x8b, 0x78, 0x70, 0x83, 0x60, 0x93, 0x9b, 0x68,
+ 0x40, 0xb3, 0xbb, 0x48, 0xab, 0x58, 0x50, 0xa3,
+ 0x0b, 0xf8, 0xf0, 0x03, 0xe0, 0x13, 0x1b, 0xe8,
+ 0xc0, 0x33, 0x3b, 0xc8, 0x2b, 0xd8, 0xd0, 0x23,
+ 0x80, 0x73, 0x7b, 0x88, 0x6b, 0x98, 0x90, 0x63,
+ 0x4b, 0xb8, 0xb0, 0x43, 0xa0, 0x53, 0x5b, 0xa8,
+ 0x16, 0xe5, 0xed, 0x1e, 0xfd, 0x0e, 0x06, 0xf5,
+ 0xdd, 0x2e, 0x26, 0xd5, 0x36, 0xc5, 0xcd, 0x3e,
+ 0x9d, 0x6e, 0x66, 0x95, 0x76, 0x85, 0x8d, 0x7e,
+ 0x56, 0xa5, 0xad, 0x5e, 0xbd, 0x4e, 0x46, 0xb5,
+ 0x1d, 0xee, 0xe6, 0x15, 0xf6, 0x05, 0x0d, 0xfe,
+ 0xd6, 0x25, 0x2d, 0xde, 0x3d, 0xce, 0xc6, 0x35,
+ 0x96, 0x65, 0x6d, 0x9e, 0x7d, 0x8e, 0x86, 0x75,
+ 0x5d, 0xae, 0xa6, 0x55, 0xb6, 0x45, 0x4d, 0xbe,
+ 0x2c, 0xdf, 0xd7, 0x24, 0xc7, 0x34, 0x3c, 0xcf,
+ 0xe7, 0x14, 0x1c, 0xef, 0x0c, 0xff, 0xf7, 0x04,
+ 0xa7, 0x54, 0x5c, 0xaf, 0x4c, 0xbf, 0xb7, 0x44,
+ 0x6c, 0x9f, 0x97, 0x64, 0x87, 0x74, 0x7c, 0x8f,
+ 0x27, 0xd4, 0xdc, 0x2f, 0xcc, 0x3f, 0x37, 0xc4,
+ 0xec, 0x1f, 0x17, 0xe4, 0x07, 0xf4, 0xfc, 0x0f,
+ 0xac, 0x5f, 0x57, 0xa4, 0x47, 0xb4, 0xbc, 0x4f,
+ 0x67, 0x94, 0x9c, 0x6f, 0x8c, 0x7f, 0x77, 0x84,
+ 0x3a, 0xc9, 0xc1, 0x32, 0xd1, 0x22, 0x2a, 0xd9,
+ 0xf1, 0x02, 0x0a, 0xf9, 0x1a, 0xe9, 0xe1, 0x12,
+ 0xb1, 0x42, 0x4a, 0xb9, 0x5a, 0xa9, 0xa1, 0x52,
+ 0x7a, 0x89, 0x81, 0x72, 0x91, 0x62, 0x6a, 0x99,
+ 0x31, 0xc2, 0xca, 0x39, 0xda, 0x29, 0x21, 0xd2,
+ 0xfa, 0x09, 0x01, 0xf2, 0x11, 0xe2, 0xea, 0x19,
+ 0xba, 0x49, 0x41, 0xb2, 0x51, 0xa2, 0xaa, 0x59,
+ 0x71, 0x82, 0x8a, 0x79, 0x9a, 0x69, 0x61, 0x92,
+ },
+ {
+ 0x00, 0xf4, 0xf5, 0x01, 0xf7, 0x03, 0x02, 0xf6,
+ 0xf3, 0x07, 0x06, 0xf2, 0x04, 0xf0, 0xf1, 0x05,
+ 0xfb, 0x0f, 0x0e, 0xfa, 0x0c, 0xf8, 0xf9, 0x0d,
+ 0x08, 0xfc, 0xfd, 0x09, 0xff, 0x0b, 0x0a, 0xfe,
+ 0xeb, 0x1f, 0x1e, 0xea, 0x1c, 0xe8, 0xe9, 0x1d,
+ 0x18, 0xec, 0xed, 0x19, 0xef, 0x1b, 0x1a, 0xee,
+ 0x10, 0xe4, 0xe5, 0x11, 0xe7, 0x13, 0x12, 0xe6,
+ 0xe3, 0x17, 0x16, 0xe2, 0x14, 0xe0, 0xe1, 0x15,
+ 0xcb, 0x3f, 0x3e, 0xca, 0x3c, 0xc8, 0xc9, 0x3d,
+ 0x38, 0xcc, 0xcd, 0x39, 0xcf, 0x3b, 0x3a, 0xce,
+ 0x30, 0xc4, 0xc5, 0x31, 0xc7, 0x33, 0x32, 0xc6,
+ 0xc3, 0x37, 0x36, 0xc2, 0x34, 0xc0, 0xc1, 0x35,
+ 0x20, 0xd4, 0xd5, 0x21, 0xd7, 0x23, 0x22, 0xd6,
+ 0xd3, 0x27, 0x26, 0xd2, 0x24, 0xd0, 0xd1, 0x25,
+ 0xdb, 0x2f, 0x2e, 0xda, 0x2c, 0xd8, 0xd9, 0x2d,
+ 0x28, 0xdc, 0xdd, 0x29, 0xdf, 0x2b, 0x2a, 0xde,
+ 0x8b, 0x7f, 0x7e, 0x8a, 0x7c, 0x88, 0x89, 0x7d,
+ 0x78, 0x8c, 0x8d, 0x79, 0x8f, 0x7b, 0x7a, 0x8e,
+ 0x70, 0x84, 0x85, 0x71, 0x87, 0x73, 0x72, 0x86,
+ 0x83, 0x77, 0x76, 0x82, 0x74, 0x80, 0x81, 0x75,
+ 0x60, 0x94, 0x95, 0x61, 0x97, 0x63, 0x62, 0x96,
+ 0x93, 0x67, 0x66, 0x92, 0x64, 0x90, 0x91, 0x65,
+ 0x9b, 0x6f, 0x6e, 0x9a, 0x6c, 0x98, 0x99, 0x6d,
+ 0x68, 0x9c, 0x9d, 0x69, 0x9f, 0x6b, 0x6a, 0x9e,
+ 0x40, 0xb4, 0xb5, 0x41, 0xb7, 0x43, 0x42, 0xb6,
+ 0xb3, 0x47, 0x46, 0xb2, 0x44, 0xb0, 0xb1, 0x45,
+ 0xbb, 0x4f, 0x4e, 0xba, 0x4c, 0xb8, 0xb9, 0x4d,
+ 0x48, 0xbc, 0xbd, 0x49, 0xbf, 0x4b, 0x4a, 0xbe,
+ 0xab, 0x5f, 0x5e, 0xaa, 0x5c, 0xa8, 0xa9, 0x5d,
+ 0x58, 0xac, 0xad, 0x59, 0xaf, 0x5b, 0x5a, 0xae,
+ 0x50, 0xa4, 0xa5, 0x51, 0xa7, 0x53, 0x52, 0xa6,
+ 0xa3, 0x57, 0x56, 0xa2, 0x54, 0xa0, 0xa1, 0x55,
+ },
+ {
+ 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a,
+ 0xeb, 0x1e, 0x1c, 0xe9, 0x18, 0xed, 0xef, 0x1a,
+ 0x10, 0xe5, 0xe7, 0x12, 0xe3, 0x16, 0x14, 0xe1,
+ 0xcb, 0x3e, 0x3c, 0xc9, 0x38, 0xcd, 0xcf, 0x3a,
+ 0x30, 0xc5, 0xc7, 0x32, 0xc3, 0x36, 0x34, 0xc1,
+ 0x20, 0xd5, 0xd7, 0x22, 0xd3, 0x26, 0x24, 0xd1,
+ 0xdb, 0x2e, 0x2c, 0xd9, 0x28, 0xdd, 0xdf, 0x2a,
+ 0x8b, 0x7e, 0x7c, 0x89, 0x78, 0x8d, 0x8f, 0x7a,
+ 0x70, 0x85, 0x87, 0x72, 0x83, 0x76, 0x74, 0x81,
+ 0x60, 0x95, 0x97, 0x62, 0x93, 0x66, 0x64, 0x91,
+ 0x9b, 0x6e, 0x6c, 0x99, 0x68, 0x9d, 0x9f, 0x6a,
+ 0x40, 0xb5, 0xb7, 0x42, 0xb3, 0x46, 0x44, 0xb1,
+ 0xbb, 0x4e, 0x4c, 0xb9, 0x48, 0xbd, 0xbf, 0x4a,
+ 0xab, 0x5e, 0x5c, 0xa9, 0x58, 0xad, 0xaf, 0x5a,
+ 0x50, 0xa5, 0xa7, 0x52, 0xa3, 0x56, 0x54, 0xa1,
+ 0x0b, 0xfe, 0xfc, 0x09, 0xf8, 0x0d, 0x0f, 0xfa,
+ 0xf0, 0x05, 0x07, 0xf2, 0x03, 0xf6, 0xf4, 0x01,
+ 0xe0, 0x15, 0x17, 0xe2, 0x13, 0xe6, 0xe4, 0x11,
+ 0x1b, 0xee, 0xec, 0x19, 0xe8, 0x1d, 0x1f, 0xea,
+ 0xc0, 0x35, 0x37, 0xc2, 0x33, 0xc6, 0xc4, 0x31,
+ 0x3b, 0xce, 0xcc, 0x39, 0xc8, 0x3d, 0x3f, 0xca,
+ 0x2b, 0xde, 0xdc, 0x29, 0xd8, 0x2d, 0x2f, 0xda,
+ 0xd0, 0x25, 0x27, 0xd2, 0x23, 0xd6, 0xd4, 0x21,
+ 0x80, 0x75, 0x77, 0x82, 0x73, 0x86, 0x84, 0x71,
+ 0x7b, 0x8e, 0x8c, 0x79, 0x88, 0x7d, 0x7f, 0x8a,
+ 0x6b, 0x9e, 0x9c, 0x69, 0x98, 0x6d, 0x6f, 0x9a,
+ 0x90, 0x65, 0x67, 0x92, 0x63, 0x96, 0x94, 0x61,
+ 0x4b, 0xbe, 0xbc, 0x49, 0xb8, 0x4d, 0x4f, 0xba,
+ 0xb0, 0x45, 0x47, 0xb2, 0x43, 0xb6, 0xb4, 0x41,
+ 0xa0, 0x55, 0x57, 0xa2, 0x53, 0xa6, 0xa4, 0x51,
+ 0x5b, 0xae, 0xac, 0x59, 0xa8, 0x5d, 0x5f, 0xaa,
+ },
+ {
+ 0x00, 0xf6, 0xf1, 0x07, 0xff, 0x09, 0x0e, 0xf8,
+ 0xe3, 0x15, 0x12, 0xe4, 0x1c, 0xea, 0xed, 0x1b,
+ 0xdb, 0x2d, 0x2a, 0xdc, 0x24, 0xd2, 0xd5, 0x23,
+ 0x38, 0xce, 0xc9, 0x3f, 0xc7, 0x31, 0x36, 0xc0,
+ 0xab, 0x5d, 0x5a, 0xac, 0x54, 0xa2, 0xa5, 0x53,
+ 0x48, 0xbe, 0xb9, 0x4f, 0xb7, 0x41, 0x46, 0xb0,
+ 0x70, 0x86, 0x81, 0x77, 0x8f, 0x79, 0x7e, 0x88,
+ 0x93, 0x65, 0x62, 0x94, 0x6c, 0x9a, 0x9d, 0x6b,
+ 0x4b, 0xbd, 0xba, 0x4c, 0xb4, 0x42, 0x45, 0xb3,
+ 0xa8, 0x5e, 0x59, 0xaf, 0x57, 0xa1, 0xa6, 0x50,
+ 0x90, 0x66, 0x61, 0x97, 0x6f, 0x99, 0x9e, 0x68,
+ 0x73, 0x85, 0x82, 0x74, 0x8c, 0x7a, 0x7d, 0x8b,
+ 0xe0, 0x16, 0x11, 0xe7, 0x1f, 0xe9, 0xee, 0x18,
+ 0x03, 0xf5, 0xf2, 0x04, 0xfc, 0x0a, 0x0d, 0xfb,
+ 0x3b, 0xcd, 0xca, 0x3c, 0xc4, 0x32, 0x35, 0xc3,
+ 0xd8, 0x2e, 0x29, 0xdf, 0x27, 0xd1, 0xd6, 0x20,
+ 0x96, 0x60, 0x67, 0x91, 0x69, 0x9f, 0x98, 0x6e,
+ 0x75, 0x83, 0x84, 0x72, 0x8a, 0x7c, 0x7b, 0x8d,
+ 0x4d, 0xbb, 0xbc, 0x4a, 0xb2, 0x44, 0x43, 0xb5,
+ 0xae, 0x58, 0x5f, 0xa9, 0x51, 0xa7, 0xa0, 0x56,
+ 0x3d, 0xcb, 0xcc, 0x3a, 0xc2, 0x34, 0x33, 0xc5,
+ 0xde, 0x28, 0x2f, 0xd9, 0x21, 0xd7, 0xd0, 0x26,
+ 0xe6, 0x10, 0x17, 0xe1, 0x19, 0xef, 0xe8, 0x1e,
+ 0x05, 0xf3, 0xf4, 0x02, 0xfa, 0x0c, 0x0b, 0xfd,
+ 0xdd, 0x2b, 0x2c, 0xda, 0x22, 0xd4, 0xd3, 0x25,
+ 0x3e, 0xc8, 0xcf, 0x39, 0xc1, 0x37, 0x30, 0xc6,
+ 0x06, 0xf0, 0xf7, 0x01, 0xf9, 0x0f, 0x08, 0xfe,
+ 0xe5, 0x13, 0x14, 0xe2, 0x1a, 0xec, 0xeb, 0x1d,
+ 0x76, 0x80, 0x87, 0x71, 0x89, 0x7f, 0x78, 0x8e,
+ 0x95, 0x63, 0x64, 0x92, 0x6a, 0x9c, 0x9b, 0x6d,
+ 0xad, 0x5b, 0x5c, 0xaa, 0x52, 0xa4, 0xa3, 0x55,
+ 0x4e, 0xb8, 0xbf, 0x49, 0xb1, 0x47, 0x40, 0xb6,
+ },
+ {
+ 0x00, 0xf7, 0xf3, 0x04, 0xfb, 0x0c, 0x08, 0xff,
+ 0xeb, 0x1c, 0x18, 0xef, 0x10, 0xe7, 0xe3, 0x14,
+ 0xcb, 0x3c, 0x38, 0xcf, 0x30, 0xc7, 0xc3, 0x34,
+ 0x20, 0xd7, 0xd3, 0x24, 0xdb, 0x2c, 0x28, 0xdf,
+ 0x8b, 0x7c, 0x78, 0x8f, 0x70, 0x87, 0x83, 0x74,
+ 0x60, 0x97, 0x93, 0x64, 0x9b, 0x6c, 0x68, 0x9f,
+ 0x40, 0xb7, 0xb3, 0x44, 0xbb, 0x4c, 0x48, 0xbf,
+ 0xab, 0x5c, 0x58, 0xaf, 0x50, 0xa7, 0xa3, 0x54,
+ 0x0b, 0xfc, 0xf8, 0x0f, 0xf0, 0x07, 0x03, 0xf4,
+ 0xe0, 0x17, 0x13, 0xe4, 0x1b, 0xec, 0xe8, 0x1f,
+ 0xc0, 0x37, 0x33, 0xc4, 0x3b, 0xcc, 0xc8, 0x3f,
+ 0x2b, 0xdc, 0xd8, 0x2f, 0xd0, 0x27, 0x23, 0xd4,
+ 0x80, 0x77, 0x73, 0x84, 0x7b, 0x8c, 0x88, 0x7f,
+ 0x6b, 0x9c, 0x98, 0x6f, 0x90, 0x67, 0x63, 0x94,
+ 0x4b, 0xbc, 0xb8, 0x4f, 0xb0, 0x47, 0x43, 0xb4,
+ 0xa0, 0x57, 0x53, 0xa4, 0x5b, 0xac, 0xa8, 0x5f,
+ 0x16, 0xe1, 0xe5, 0x12, 0xed, 0x1a, 0x1e, 0xe9,
+ 0xfd, 0x0a, 0x0e, 0xf9, 0x06, 0xf1, 0xf5, 0x02,
+ 0xdd, 0x2a, 0x2e, 0xd9, 0x26, 0xd1, 0xd5, 0x22,
+ 0x36, 0xc1, 0xc5, 0x32, 0xcd, 0x3a, 0x3e, 0xc9,
+ 0x9d, 0x6a, 0x6e, 0x99, 0x66, 0x91, 0x95, 0x62,
+ 0x76, 0x81, 0x85, 0x72, 0x8d, 0x7a, 0x7e, 0x89,
+ 0x56, 0xa1, 0xa5, 0x52, 0xad, 0x5a, 0x5e, 0xa9,
+ 0xbd, 0x4a, 0x4e, 0xb9, 0x46, 0xb1, 0xb5, 0x42,
+ 0x1d, 0xea, 0xee, 0x19, 0xe6, 0x11, 0x15, 0xe2,
+ 0xf6, 0x01, 0x05, 0xf2, 0x0d, 0xfa, 0xfe, 0x09,
+ 0xd6, 0x21, 0x25, 0xd2, 0x2d, 0xda, 0xde, 0x29,
+ 0x3d, 0xca, 0xce, 0x39, 0xc6, 0x31, 0x35, 0xc2,
+ 0x96, 0x61, 0x65, 0x92, 0x6d, 0x9a, 0x9e, 0x69,
+ 0x7d, 0x8a, 0x8e, 0x79, 0x86, 0x71, 0x75, 0x82,
+ 0x5d, 0xaa, 0xae, 0x59, 0xa6, 0x51, 0x55, 0xa2,
+ 0xb6, 0x41, 0x45, 0xb2, 0x4d, 0xba, 0xbe, 0x49,
+ },
+ {
+ 0x00, 0xf8, 0xed, 0x15, 0xc7, 0x3f, 0x2a, 0xd2,
+ 0x93, 0x6b, 0x7e, 0x86, 0x54, 0xac, 0xb9, 0x41,
+ 0x3b, 0xc3, 0xd6, 0x2e, 0xfc, 0x04, 0x11, 0xe9,
+ 0xa8, 0x50, 0x45, 0xbd, 0x6f, 0x97, 0x82, 0x7a,
+ 0x76, 0x8e, 0x9b, 0x63, 0xb1, 0x49, 0x5c, 0xa4,
+ 0xe5, 0x1d, 0x08, 0xf0, 0x22, 0xda, 0xcf, 0x37,
+ 0x4d, 0xb5, 0xa0, 0x58, 0x8a, 0x72, 0x67, 0x9f,
+ 0xde, 0x26, 0x33, 0xcb, 0x19, 0xe1, 0xf4, 0x0c,
+ 0xec, 0x14, 0x01, 0xf9, 0x2b, 0xd3, 0xc6, 0x3e,
+ 0x7f, 0x87, 0x92, 0x6a, 0xb8, 0x40, 0x55, 0xad,
+ 0xd7, 0x2f, 0x3a, 0xc2, 0x10, 0xe8, 0xfd, 0x05,
+ 0x44, 0xbc, 0xa9, 0x51, 0x83, 0x7b, 0x6e, 0x96,
+ 0x9a, 0x62, 0x77, 0x8f, 0x5d, 0xa5, 0xb0, 0x48,
+ 0x09, 0xf1, 0xe4, 0x1c, 0xce, 0x36, 0x23, 0xdb,
+ 0xa1, 0x59, 0x4c, 0xb4, 0x66, 0x9e, 0x8b, 0x73,
+ 0x32, 0xca, 0xdf, 0x27, 0xf5, 0x0d, 0x18, 0xe0,
+ 0xc5, 0x3d, 0x28, 0xd0, 0x02, 0xfa, 0xef, 0x17,
+ 0x56, 0xae, 0xbb, 0x43, 0x91, 0x69, 0x7c, 0x84,
+ 0xfe, 0x06, 0x13, 0xeb, 0x39, 0xc1, 0xd4, 0x2c,
+ 0x6d, 0x95, 0x80, 0x78, 0xaa, 0x52, 0x47, 0xbf,
+ 0xb3, 0x4b, 0x5e, 0xa6, 0x74, 0x8c, 0x99, 0x61,
+ 0x20, 0xd8, 0xcd, 0x35, 0xe7, 0x1f, 0x0a, 0xf2,
+ 0x88, 0x70, 0x65, 0x9d, 0x4f, 0xb7, 0xa2, 0x5a,
+ 0x1b, 0xe3, 0xf6, 0x0e, 0xdc, 0x24, 0x31, 0xc9,
+ 0x29, 0xd1, 0xc4, 0x3c, 0xee, 0x16, 0x03, 0xfb,
+ 0xba, 0x42, 0x57, 0xaf, 0x7d, 0x85, 0x90, 0x68,
+ 0x12, 0xea, 0xff, 0x07, 0xd5, 0x2d, 0x38, 0xc0,
+ 0x81, 0x79, 0x6c, 0x94, 0x46, 0xbe, 0xab, 0x53,
+ 0x5f, 0xa7, 0xb2, 0x4a, 0x98, 0x60, 0x75, 0x8d,
+ 0xcc, 0x34, 0x21, 0xd9, 0x0b, 0xf3, 0xe6, 0x1e,
+ 0x64, 0x9c, 0x89, 0x71, 0xa3, 0x5b, 0x4e, 0xb6,
+ 0xf7, 0x0f, 0x1a, 0xe2, 0x30, 0xc8, 0xdd, 0x25,
+ },
+ {
+ 0x00, 0xf9, 0xef, 0x16, 0xc3, 0x3a, 0x2c, 0xd5,
+ 0x9b, 0x62, 0x74, 0x8d, 0x58, 0xa1, 0xb7, 0x4e,
+ 0x2b, 0xd2, 0xc4, 0x3d, 0xe8, 0x11, 0x07, 0xfe,
+ 0xb0, 0x49, 0x5f, 0xa6, 0x73, 0x8a, 0x9c, 0x65,
+ 0x56, 0xaf, 0xb9, 0x40, 0x95, 0x6c, 0x7a, 0x83,
+ 0xcd, 0x34, 0x22, 0xdb, 0x0e, 0xf7, 0xe1, 0x18,
+ 0x7d, 0x84, 0x92, 0x6b, 0xbe, 0x47, 0x51, 0xa8,
+ 0xe6, 0x1f, 0x09, 0xf0, 0x25, 0xdc, 0xca, 0x33,
+ 0xac, 0x55, 0x43, 0xba, 0x6f, 0x96, 0x80, 0x79,
+ 0x37, 0xce, 0xd8, 0x21, 0xf4, 0x0d, 0x1b, 0xe2,
+ 0x87, 0x7e, 0x68, 0x91, 0x44, 0xbd, 0xab, 0x52,
+ 0x1c, 0xe5, 0xf3, 0x0a, 0xdf, 0x26, 0x30, 0xc9,
+ 0xfa, 0x03, 0x15, 0xec, 0x39, 0xc0, 0xd6, 0x2f,
+ 0x61, 0x98, 0x8e, 0x77, 0xa2, 0x5b, 0x4d, 0xb4,
+ 0xd1, 0x28, 0x3e, 0xc7, 0x12, 0xeb, 0xfd, 0x04,
+ 0x4a, 0xb3, 0xa5, 0x5c, 0x89, 0x70, 0x66, 0x9f,
+ 0x45, 0xbc, 0xaa, 0x53, 0x86, 0x7f, 0x69, 0x90,
+ 0xde, 0x27, 0x31, 0xc8, 0x1d, 0xe4, 0xf2, 0x0b,
+ 0x6e, 0x97, 0x81, 0x78, 0xad, 0x54, 0x42, 0xbb,
+ 0xf5, 0x0c, 0x1a, 0xe3, 0x36, 0xcf, 0xd9, 0x20,
+ 0x13, 0xea, 0xfc, 0x05, 0xd0, 0x29, 0x3f, 0xc6,
+ 0x88, 0x71, 0x67, 0x9e, 0x4b, 0xb2, 0xa4, 0x5d,
+ 0x38, 0xc1, 0xd7, 0x2e, 0xfb, 0x02, 0x14, 0xed,
+ 0xa3, 0x5a, 0x4c, 0xb5, 0x60, 0x99, 0x8f, 0x76,
+ 0xe9, 0x10, 0x06, 0xff, 0x2a, 0xd3, 0xc5, 0x3c,
+ 0x72, 0x8b, 0x9d, 0x64, 0xb1, 0x48, 0x5e, 0xa7,
+ 0xc2, 0x3b, 0x2d, 0xd4, 0x01, 0xf8, 0xee, 0x17,
+ 0x59, 0xa0, 0xb6, 0x4f, 0x9a, 0x63, 0x75, 0x8c,
+ 0xbf, 0x46, 0x50, 0xa9, 0x7c, 0x85, 0x93, 0x6a,
+ 0x24, 0xdd, 0xcb, 0x32, 0xe7, 0x1e, 0x08, 0xf1,
+ 0x94, 0x6d, 0x7b, 0x82, 0x57, 0xae, 0xb8, 0x41,
+ 0x0f, 0xf6, 0xe0, 0x19, 0xcc, 0x35, 0x23, 0xda,
+ },
+ {
+ 0x00, 0xfa, 0xe9, 0x13, 0xcf, 0x35, 0x26, 0xdc,
+ 0x83, 0x79, 0x6a, 0x90, 0x4c, 0xb6, 0xa5, 0x5f,
+ 0x1b, 0xe1, 0xf2, 0x08, 0xd4, 0x2e, 0x3d, 0xc7,
+ 0x98, 0x62, 0x71, 0x8b, 0x57, 0xad, 0xbe, 0x44,
+ 0x36, 0xcc, 0xdf, 0x25, 0xf9, 0x03, 0x10, 0xea,
+ 0xb5, 0x4f, 0x5c, 0xa6, 0x7a, 0x80, 0x93, 0x69,
+ 0x2d, 0xd7, 0xc4, 0x3e, 0xe2, 0x18, 0x0b, 0xf1,
+ 0xae, 0x54, 0x47, 0xbd, 0x61, 0x9b, 0x88, 0x72,
+ 0x6c, 0x96, 0x85, 0x7f, 0xa3, 0x59, 0x4a, 0xb0,
+ 0xef, 0x15, 0x06, 0xfc, 0x20, 0xda, 0xc9, 0x33,
+ 0x77, 0x8d, 0x9e, 0x64, 0xb8, 0x42, 0x51, 0xab,
+ 0xf4, 0x0e, 0x1d, 0xe7, 0x3b, 0xc1, 0xd2, 0x28,
+ 0x5a, 0xa0, 0xb3, 0x49, 0x95, 0x6f, 0x7c, 0x86,
+ 0xd9, 0x23, 0x30, 0xca, 0x16, 0xec, 0xff, 0x05,
+ 0x41, 0xbb, 0xa8, 0x52, 0x8e, 0x74, 0x67, 0x9d,
+ 0xc2, 0x38, 0x2b, 0xd1, 0x0d, 0xf7, 0xe4, 0x1e,
+ 0xd8, 0x22, 0x31, 0xcb, 0x17, 0xed, 0xfe, 0x04,
+ 0x5b, 0xa1, 0xb2, 0x48, 0x94, 0x6e, 0x7d, 0x87,
+ 0xc3, 0x39, 0x2a, 0xd0, 0x0c, 0xf6, 0xe5, 0x1f,
+ 0x40, 0xba, 0xa9, 0x53, 0x8f, 0x75, 0x66, 0x9c,
+ 0xee, 0x14, 0x07, 0xfd, 0x21, 0xdb, 0xc8, 0x32,
+ 0x6d, 0x97, 0x84, 0x7e, 0xa2, 0x58, 0x4b, 0xb1,
+ 0xf5, 0x0f, 0x1c, 0xe6, 0x3a, 0xc0, 0xd3, 0x29,
+ 0x76, 0x8c, 0x9f, 0x65, 0xb9, 0x43, 0x50, 0xaa,
+ 0xb4, 0x4e, 0x5d, 0xa7, 0x7b, 0x81, 0x92, 0x68,
+ 0x37, 0xcd, 0xde, 0x24, 0xf8, 0x02, 0x11, 0xeb,
+ 0xaf, 0x55, 0x46, 0xbc, 0x60, 0x9a, 0x89, 0x73,
+ 0x2c, 0xd6, 0xc5, 0x3f, 0xe3, 0x19, 0x0a, 0xf0,
+ 0x82, 0x78, 0x6b, 0x91, 0x4d, 0xb7, 0xa4, 0x5e,
+ 0x01, 0xfb, 0xe8, 0x12, 0xce, 0x34, 0x27, 0xdd,
+ 0x99, 0x63, 0x70, 0x8a, 0x56, 0xac, 0xbf, 0x45,
+ 0x1a, 0xe0, 0xf3, 0x09, 0xd5, 0x2f, 0x3c, 0xc6,
+ },
+ {
+ 0x00, 0xfb, 0xeb, 0x10, 0xcb, 0x30, 0x20, 0xdb,
+ 0x8b, 0x70, 0x60, 0x9b, 0x40, 0xbb, 0xab, 0x50,
+ 0x0b, 0xf0, 0xe0, 0x1b, 0xc0, 0x3b, 0x2b, 0xd0,
+ 0x80, 0x7b, 0x6b, 0x90, 0x4b, 0xb0, 0xa0, 0x5b,
+ 0x16, 0xed, 0xfd, 0x06, 0xdd, 0x26, 0x36, 0xcd,
+ 0x9d, 0x66, 0x76, 0x8d, 0x56, 0xad, 0xbd, 0x46,
+ 0x1d, 0xe6, 0xf6, 0x0d, 0xd6, 0x2d, 0x3d, 0xc6,
+ 0x96, 0x6d, 0x7d, 0x86, 0x5d, 0xa6, 0xb6, 0x4d,
+ 0x2c, 0xd7, 0xc7, 0x3c, 0xe7, 0x1c, 0x0c, 0xf7,
+ 0xa7, 0x5c, 0x4c, 0xb7, 0x6c, 0x97, 0x87, 0x7c,
+ 0x27, 0xdc, 0xcc, 0x37, 0xec, 0x17, 0x07, 0xfc,
+ 0xac, 0x57, 0x47, 0xbc, 0x67, 0x9c, 0x8c, 0x77,
+ 0x3a, 0xc1, 0xd1, 0x2a, 0xf1, 0x0a, 0x1a, 0xe1,
+ 0xb1, 0x4a, 0x5a, 0xa1, 0x7a, 0x81, 0x91, 0x6a,
+ 0x31, 0xca, 0xda, 0x21, 0xfa, 0x01, 0x11, 0xea,
+ 0xba, 0x41, 0x51, 0xaa, 0x71, 0x8a, 0x9a, 0x61,
+ 0x58, 0xa3, 0xb3, 0x48, 0x93, 0x68, 0x78, 0x83,
+ 0xd3, 0x28, 0x38, 0xc3, 0x18, 0xe3, 0xf3, 0x08,
+ 0x53, 0xa8, 0xb8, 0x43, 0x98, 0x63, 0x73, 0x88,
+ 0xd8, 0x23, 0x33, 0xc8, 0x13, 0xe8, 0xf8, 0x03,
+ 0x4e, 0xb5, 0xa5, 0x5e, 0x85, 0x7e, 0x6e, 0x95,
+ 0xc5, 0x3e, 0x2e, 0xd5, 0x0e, 0xf5, 0xe5, 0x1e,
+ 0x45, 0xbe, 0xae, 0x55, 0x8e, 0x75, 0x65, 0x9e,
+ 0xce, 0x35, 0x25, 0xde, 0x05, 0xfe, 0xee, 0x15,
+ 0x74, 0x8f, 0x9f, 0x64, 0xbf, 0x44, 0x54, 0xaf,
+ 0xff, 0x04, 0x14, 0xef, 0x34, 0xcf, 0xdf, 0x24,
+ 0x7f, 0x84, 0x94, 0x6f, 0xb4, 0x4f, 0x5f, 0xa4,
+ 0xf4, 0x0f, 0x1f, 0xe4, 0x3f, 0xc4, 0xd4, 0x2f,
+ 0x62, 0x99, 0x89, 0x72, 0xa9, 0x52, 0x42, 0xb9,
+ 0xe9, 0x12, 0x02, 0xf9, 0x22, 0xd9, 0xc9, 0x32,
+ 0x69, 0x92, 0x82, 0x79, 0xa2, 0x59, 0x49, 0xb2,
+ 0xe2, 0x19, 0x09, 0xf2, 0x29, 0xd2, 0xc2, 0x39,
+ },
+ {
+ 0x00, 0xfc, 0xe5, 0x19, 0xd7, 0x2b, 0x32, 0xce,
+ 0xb3, 0x4f, 0x56, 0xaa, 0x64, 0x98, 0x81, 0x7d,
+ 0x7b, 0x87, 0x9e, 0x62, 0xac, 0x50, 0x49, 0xb5,
+ 0xc8, 0x34, 0x2d, 0xd1, 0x1f, 0xe3, 0xfa, 0x06,
+ 0xf6, 0x0a, 0x13, 0xef, 0x21, 0xdd, 0xc4, 0x38,
+ 0x45, 0xb9, 0xa0, 0x5c, 0x92, 0x6e, 0x77, 0x8b,
+ 0x8d, 0x71, 0x68, 0x94, 0x5a, 0xa6, 0xbf, 0x43,
+ 0x3e, 0xc2, 0xdb, 0x27, 0xe9, 0x15, 0x0c, 0xf0,
+ 0xf1, 0x0d, 0x14, 0xe8, 0x26, 0xda, 0xc3, 0x3f,
+ 0x42, 0xbe, 0xa7, 0x5b, 0x95, 0x69, 0x70, 0x8c,
+ 0x8a, 0x76, 0x6f, 0x93, 0x5d, 0xa1, 0xb8, 0x44,
+ 0x39, 0xc5, 0xdc, 0x20, 0xee, 0x12, 0x0b, 0xf7,
+ 0x07, 0xfb, 0xe2, 0x1e, 0xd0, 0x2c, 0x35, 0xc9,
+ 0xb4, 0x48, 0x51, 0xad, 0x63, 0x9f, 0x86, 0x7a,
+ 0x7c, 0x80, 0x99, 0x65, 0xab, 0x57, 0x4e, 0xb2,
+ 0xcf, 0x33, 0x2a, 0xd6, 0x18, 0xe4, 0xfd, 0x01,
+ 0xff, 0x03, 0x1a, 0xe6, 0x28, 0xd4, 0xcd, 0x31,
+ 0x4c, 0xb0, 0xa9, 0x55, 0x9b, 0x67, 0x7e, 0x82,
+ 0x84, 0x78, 0x61, 0x9d, 0x53, 0xaf, 0xb6, 0x4a,
+ 0x37, 0xcb, 0xd2, 0x2e, 0xe0, 0x1c, 0x05, 0xf9,
+ 0x09, 0xf5, 0xec, 0x10, 0xde, 0x22, 0x3b, 0xc7,
+ 0xba, 0x46, 0x5f, 0xa3, 0x6d, 0x91, 0x88, 0x74,
+ 0x72, 0x8e, 0x97, 0x6b, 0xa5, 0x59, 0x40, 0xbc,
+ 0xc1, 0x3d, 0x24, 0xd8, 0x16, 0xea, 0xf3, 0x0f,
+ 0x0e, 0xf2, 0xeb, 0x17, 0xd9, 0x25, 0x3c, 0xc0,
+ 0xbd, 0x41, 0x58, 0xa4, 0x6a, 0x96, 0x8f, 0x73,
+ 0x75, 0x89, 0x90, 0x6c, 0xa2, 0x5e, 0x47, 0xbb,
+ 0xc6, 0x3a, 0x23, 0xdf, 0x11, 0xed, 0xf4, 0x08,
+ 0xf8, 0x04, 0x1d, 0xe1, 0x2f, 0xd3, 0xca, 0x36,
+ 0x4b, 0xb7, 0xae, 0x52, 0x9c, 0x60, 0x79, 0x85,
+ 0x83, 0x7f, 0x66, 0x9a, 0x54, 0xa8, 0xb1, 0x4d,
+ 0x30, 0xcc, 0xd5, 0x29, 0xe7, 0x1b, 0x02, 0xfe,
+ },
+ {
+ 0x00, 0xfd, 0xe7, 0x1a, 0xd3, 0x2e, 0x34, 0xc9,
+ 0xbb, 0x46, 0x5c, 0xa1, 0x68, 0x95, 0x8f, 0x72,
+ 0x6b, 0x96, 0x8c, 0x71, 0xb8, 0x45, 0x5f, 0xa2,
+ 0xd0, 0x2d, 0x37, 0xca, 0x03, 0xfe, 0xe4, 0x19,
+ 0xd6, 0x2b, 0x31, 0xcc, 0x05, 0xf8, 0xe2, 0x1f,
+ 0x6d, 0x90, 0x8a, 0x77, 0xbe, 0x43, 0x59, 0xa4,
+ 0xbd, 0x40, 0x5a, 0xa7, 0x6e, 0x93, 0x89, 0x74,
+ 0x06, 0xfb, 0xe1, 0x1c, 0xd5, 0x28, 0x32, 0xcf,
+ 0xb1, 0x4c, 0x56, 0xab, 0x62, 0x9f, 0x85, 0x78,
+ 0x0a, 0xf7, 0xed, 0x10, 0xd9, 0x24, 0x3e, 0xc3,
+ 0xda, 0x27, 0x3d, 0xc0, 0x09, 0xf4, 0xee, 0x13,
+ 0x61, 0x9c, 0x86, 0x7b, 0xb2, 0x4f, 0x55, 0xa8,
+ 0x67, 0x9a, 0x80, 0x7d, 0xb4, 0x49, 0x53, 0xae,
+ 0xdc, 0x21, 0x3b, 0xc6, 0x0f, 0xf2, 0xe8, 0x15,
+ 0x0c, 0xf1, 0xeb, 0x16, 0xdf, 0x22, 0x38, 0xc5,
+ 0xb7, 0x4a, 0x50, 0xad, 0x64, 0x99, 0x83, 0x7e,
+ 0x7f, 0x82, 0x98, 0x65, 0xac, 0x51, 0x4b, 0xb6,
+ 0xc4, 0x39, 0x23, 0xde, 0x17, 0xea, 0xf0, 0x0d,
+ 0x14, 0xe9, 0xf3, 0x0e, 0xc7, 0x3a, 0x20, 0xdd,
+ 0xaf, 0x52, 0x48, 0xb5, 0x7c, 0x81, 0x9b, 0x66,
+ 0xa9, 0x54, 0x4e, 0xb3, 0x7a, 0x87, 0x9d, 0x60,
+ 0x12, 0xef, 0xf5, 0x08, 0xc1, 0x3c, 0x26, 0xdb,
+ 0xc2, 0x3f, 0x25, 0xd8, 0x11, 0xec, 0xf6, 0x0b,
+ 0x79, 0x84, 0x9e, 0x63, 0xaa, 0x57, 0x4d, 0xb0,
+ 0xce, 0x33, 0x29, 0xd4, 0x1d, 0xe0, 0xfa, 0x07,
+ 0x75, 0x88, 0x92, 0x6f, 0xa6, 0x5b, 0x41, 0xbc,
+ 0xa5, 0x58, 0x42, 0xbf, 0x76, 0x8b, 0x91, 0x6c,
+ 0x1e, 0xe3, 0xf9, 0x04, 0xcd, 0x30, 0x2a, 0xd7,
+ 0x18, 0xe5, 0xff, 0x02, 0xcb, 0x36, 0x2c, 0xd1,
+ 0xa3, 0x5e, 0x44, 0xb9, 0x70, 0x8d, 0x97, 0x6a,
+ 0x73, 0x8e, 0x94, 0x69, 0xa0, 0x5d, 0x47, 0xba,
+ 0xc8, 0x35, 0x2f, 0xd2, 0x1b, 0xe6, 0xfc, 0x01,
+ },
+ {
+ 0x00, 0xfe, 0xe1, 0x1f, 0xdf, 0x21, 0x3e, 0xc0,
+ 0xa3, 0x5d, 0x42, 0xbc, 0x7c, 0x82, 0x9d, 0x63,
+ 0x5b, 0xa5, 0xba, 0x44, 0x84, 0x7a, 0x65, 0x9b,
+ 0xf8, 0x06, 0x19, 0xe7, 0x27, 0xd9, 0xc6, 0x38,
+ 0xb6, 0x48, 0x57, 0xa9, 0x69, 0x97, 0x88, 0x76,
+ 0x15, 0xeb, 0xf4, 0x0a, 0xca, 0x34, 0x2b, 0xd5,
+ 0xed, 0x13, 0x0c, 0xf2, 0x32, 0xcc, 0xd3, 0x2d,
+ 0x4e, 0xb0, 0xaf, 0x51, 0x91, 0x6f, 0x70, 0x8e,
+ 0x71, 0x8f, 0x90, 0x6e, 0xae, 0x50, 0x4f, 0xb1,
+ 0xd2, 0x2c, 0x33, 0xcd, 0x0d, 0xf3, 0xec, 0x12,
+ 0x2a, 0xd4, 0xcb, 0x35, 0xf5, 0x0b, 0x14, 0xea,
+ 0x89, 0x77, 0x68, 0x96, 0x56, 0xa8, 0xb7, 0x49,
+ 0xc7, 0x39, 0x26, 0xd8, 0x18, 0xe6, 0xf9, 0x07,
+ 0x64, 0x9a, 0x85, 0x7b, 0xbb, 0x45, 0x5a, 0xa4,
+ 0x9c, 0x62, 0x7d, 0x83, 0x43, 0xbd, 0xa2, 0x5c,
+ 0x3f, 0xc1, 0xde, 0x20, 0xe0, 0x1e, 0x01, 0xff,
+ 0xe2, 0x1c, 0x03, 0xfd, 0x3d, 0xc3, 0xdc, 0x22,
+ 0x41, 0xbf, 0xa0, 0x5e, 0x9e, 0x60, 0x7f, 0x81,
+ 0xb9, 0x47, 0x58, 0xa6, 0x66, 0x98, 0x87, 0x79,
+ 0x1a, 0xe4, 0xfb, 0x05, 0xc5, 0x3b, 0x24, 0xda,
+ 0x54, 0xaa, 0xb5, 0x4b, 0x8b, 0x75, 0x6a, 0x94,
+ 0xf7, 0x09, 0x16, 0xe8, 0x28, 0xd6, 0xc9, 0x37,
+ 0x0f, 0xf1, 0xee, 0x10, 0xd0, 0x2e, 0x31, 0xcf,
+ 0xac, 0x52, 0x4d, 0xb3, 0x73, 0x8d, 0x92, 0x6c,
+ 0x93, 0x6d, 0x72, 0x8c, 0x4c, 0xb2, 0xad, 0x53,
+ 0x30, 0xce, 0xd1, 0x2f, 0xef, 0x11, 0x0e, 0xf0,
+ 0xc8, 0x36, 0x29, 0xd7, 0x17, 0xe9, 0xf6, 0x08,
+ 0x6b, 0x95, 0x8a, 0x74, 0xb4, 0x4a, 0x55, 0xab,
+ 0x25, 0xdb, 0xc4, 0x3a, 0xfa, 0x04, 0x1b, 0xe5,
+ 0x86, 0x78, 0x67, 0x99, 0x59, 0xa7, 0xb8, 0x46,
+ 0x7e, 0x80, 0x9f, 0x61, 0xa1, 0x5f, 0x40, 0xbe,
+ 0xdd, 0x23, 0x3c, 0xc2, 0x02, 0xfc, 0xe3, 0x1d,
+ },
+ {
+ 0x00, 0xff, 0xe3, 0x1c, 0xdb, 0x24, 0x38, 0xc7,
+ 0xab, 0x54, 0x48, 0xb7, 0x70, 0x8f, 0x93, 0x6c,
+ 0x4b, 0xb4, 0xa8, 0x57, 0x90, 0x6f, 0x73, 0x8c,
+ 0xe0, 0x1f, 0x03, 0xfc, 0x3b, 0xc4, 0xd8, 0x27,
+ 0x96, 0x69, 0x75, 0x8a, 0x4d, 0xb2, 0xae, 0x51,
+ 0x3d, 0xc2, 0xde, 0x21, 0xe6, 0x19, 0x05, 0xfa,
+ 0xdd, 0x22, 0x3e, 0xc1, 0x06, 0xf9, 0xe5, 0x1a,
+ 0x76, 0x89, 0x95, 0x6a, 0xad, 0x52, 0x4e, 0xb1,
+ 0x31, 0xce, 0xd2, 0x2d, 0xea, 0x15, 0x09, 0xf6,
+ 0x9a, 0x65, 0x79, 0x86, 0x41, 0xbe, 0xa2, 0x5d,
+ 0x7a, 0x85, 0x99, 0x66, 0xa1, 0x5e, 0x42, 0xbd,
+ 0xd1, 0x2e, 0x32, 0xcd, 0x0a, 0xf5, 0xe9, 0x16,
+ 0xa7, 0x58, 0x44, 0xbb, 0x7c, 0x83, 0x9f, 0x60,
+ 0x0c, 0xf3, 0xef, 0x10, 0xd7, 0x28, 0x34, 0xcb,
+ 0xec, 0x13, 0x0f, 0xf0, 0x37, 0xc8, 0xd4, 0x2b,
+ 0x47, 0xb8, 0xa4, 0x5b, 0x9c, 0x63, 0x7f, 0x80,
+ 0x62, 0x9d, 0x81, 0x7e, 0xb9, 0x46, 0x5a, 0xa5,
+ 0xc9, 0x36, 0x2a, 0xd5, 0x12, 0xed, 0xf1, 0x0e,
+ 0x29, 0xd6, 0xca, 0x35, 0xf2, 0x0d, 0x11, 0xee,
+ 0x82, 0x7d, 0x61, 0x9e, 0x59, 0xa6, 0xba, 0x45,
+ 0xf4, 0x0b, 0x17, 0xe8, 0x2f, 0xd0, 0xcc, 0x33,
+ 0x5f, 0xa0, 0xbc, 0x43, 0x84, 0x7b, 0x67, 0x98,
+ 0xbf, 0x40, 0x5c, 0xa3, 0x64, 0x9b, 0x87, 0x78,
+ 0x14, 0xeb, 0xf7, 0x08, 0xcf, 0x30, 0x2c, 0xd3,
+ 0x53, 0xac, 0xb0, 0x4f, 0x88, 0x77, 0x6b, 0x94,
+ 0xf8, 0x07, 0x1b, 0xe4, 0x23, 0xdc, 0xc0, 0x3f,
+ 0x18, 0xe7, 0xfb, 0x04, 0xc3, 0x3c, 0x20, 0xdf,
+ 0xb3, 0x4c, 0x50, 0xaf, 0x68, 0x97, 0x8b, 0x74,
+ 0xc5, 0x3a, 0x26, 0xd9, 0x1e, 0xe1, 0xfd, 0x02,
+ 0x6e, 0x91, 0x8d, 0x72, 0xb5, 0x4a, 0x56, 0xa9,
+ 0x8e, 0x71, 0x6d, 0x92, 0x55, 0xaa, 0xb6, 0x49,
+ 0x25, 0xda, 0xc6, 0x39, 0xfe, 0x01, 0x1d, 0xe2,
+ },
+};
+
+const uint8_t __aligned(256) raid_gfexp[256] =
+{
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
+ 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
+ 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
+ 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
+ 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
+ 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
+ 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
+ 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
+ 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
+ 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
+ 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
+ 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
+ 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
+ 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
+ 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
+ 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
+ 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
+ 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
+ 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
+ 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
+ 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
+ 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
+ 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
+ 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
+ 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
+ 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
+ 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
+ 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
+ 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
+ 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
+ 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01,
+};
+
+const uint8_t __aligned(256) raid_gfinv[256] =
+{
+ /* note that the first element is not significant (zero has no inverse) */
+ 0x00, 0x01, 0x8e, 0xf4, 0x47, 0xa7, 0x7a, 0xba,
+ 0xad, 0x9d, 0xdd, 0x98, 0x3d, 0xaa, 0x5d, 0x96,
+ 0xd8, 0x72, 0xc0, 0x58, 0xe0, 0x3e, 0x4c, 0x66,
+ 0x90, 0xde, 0x55, 0x80, 0xa0, 0x83, 0x4b, 0x2a,
+ 0x6c, 0xed, 0x39, 0x51, 0x60, 0x56, 0x2c, 0x8a,
+ 0x70, 0xd0, 0x1f, 0x4a, 0x26, 0x8b, 0x33, 0x6e,
+ 0x48, 0x89, 0x6f, 0x2e, 0xa4, 0xc3, 0x40, 0x5e,
+ 0x50, 0x22, 0xcf, 0xa9, 0xab, 0x0c, 0x15, 0xe1,
+ 0x36, 0x5f, 0xf8, 0xd5, 0x92, 0x4e, 0xa6, 0x04,
+ 0x30, 0x88, 0x2b, 0x1e, 0x16, 0x67, 0x45, 0x93,
+ 0x38, 0x23, 0x68, 0x8c, 0x81, 0x1a, 0x25, 0x61,
+ 0x13, 0xc1, 0xcb, 0x63, 0x97, 0x0e, 0x37, 0x41,
+ 0x24, 0x57, 0xca, 0x5b, 0xb9, 0xc4, 0x17, 0x4d,
+ 0x52, 0x8d, 0xef, 0xb3, 0x20, 0xec, 0x2f, 0x32,
+ 0x28, 0xd1, 0x11, 0xd9, 0xe9, 0xfb, 0xda, 0x79,
+ 0xdb, 0x77, 0x06, 0xbb, 0x84, 0xcd, 0xfe, 0xfc,
+ 0x1b, 0x54, 0xa1, 0x1d, 0x7c, 0xcc, 0xe4, 0xb0,
+ 0x49, 0x31, 0x27, 0x2d, 0x53, 0x69, 0x02, 0xf5,
+ 0x18, 0xdf, 0x44, 0x4f, 0x9b, 0xbc, 0x0f, 0x5c,
+ 0x0b, 0xdc, 0xbd, 0x94, 0xac, 0x09, 0xc7, 0xa2,
+ 0x1c, 0x82, 0x9f, 0xc6, 0x34, 0xc2, 0x46, 0x05,
+ 0xce, 0x3b, 0x0d, 0x3c, 0x9c, 0x08, 0xbe, 0xb7,
+ 0x87, 0xe5, 0xee, 0x6b, 0xeb, 0xf2, 0xbf, 0xaf,
+ 0xc5, 0x64, 0x07, 0x7b, 0x95, 0x9a, 0xae, 0xb6,
+ 0x12, 0x59, 0xa5, 0x35, 0x65, 0xb8, 0xa3, 0x9e,
+ 0xd2, 0xf7, 0x62, 0x5a, 0x85, 0x7d, 0xa8, 0x3a,
+ 0x29, 0x71, 0xc8, 0xf6, 0xf9, 0x43, 0xd7, 0xd6,
+ 0x10, 0x73, 0x76, 0x78, 0x99, 0x0a, 0x19, 0x91,
+ 0x14, 0x3f, 0xe6, 0xf0, 0x86, 0xb1, 0xe2, 0xf1,
+ 0xfa, 0x74, 0xf3, 0xb4, 0x6d, 0x21, 0xb2, 0x6a,
+ 0xe3, 0xe7, 0xb5, 0xea, 0x03, 0x8f, 0xd3, 0xc9,
+ 0x42, 0xd4, 0xe8, 0x75, 0x7f, 0xff, 0x7e, 0xfd,
+};
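+
+/*
+ * Illustrative sketch, not part of the generated table data: raid_gfinv
+ * turns division into multiplication, i.e. a / b in GF(2^8) is
+ * raid_gfmul[a][raid_gfinv[b]], which is why the entry for zero above
+ * carries no information (zero has no inverse).  The helper name below
+ * is an assumption, not a function used by the real code.
+ */
+static inline uint8_t gf_div_sketch(uint8_t a, uint8_t b)
+{
+	/* callers must never pass b == 0 */
+	return raid_gfmul[a][raid_gfinv[b]];
+}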
+
+/**
+ * Power matrix used to generate parity.
+ * This matrix is valid for up to 3 parity disks with up to 251 data disks.
+ *
+ * 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
+ * 01 02 04 08 10 20 40 80 1d 3a 74 e8 cd 87 13 26 4c 98 2d 5a b4 75 ea c9 8f 03 06 0c 18 30 60 c0 9d 27 4e 9c 25 4a 94 35 6a d4 b5 77 ee c1 9f 23 46 8c 05 0a 14 28 50 a0 5d ba 69 d2 b9 6f de a1 5f be 61 c2 99 2f 5e bc 65 ca 89 0f 1e 3c 78 f0 fd e7 d3 bb 6b d6 b1 7f fe e1 df a3 5b b6 71 e2 d9 af 43 86 11 22 44 88 0d 1a 34 68 d0 bd 67 ce 81 1f 3e 7c f8 ed c7 93 3b 76 ec c5 97 33 66 cc 85 17 2e 5c b8 6d da a9 4f 9e 21 42 84 15 2a 54 a8 4d 9a 29 52 a4 55 aa 49 92 39 72 e4 d5 b7 73 e6 d1 bf 63 c6 91 3f 7e fc e5 d7 b3 7b f6 f1 ff e3 db ab 4b 96 31 62 c4 95 37 6e dc a5 57 ae 41 82 19 32 64 c8 8d 07 0e 1c 38 70 e0 dd a7 53 a6 51 a2 59 b2 79 f2 f9 ef c3 9b 2b 56 ac 45 8a 09 12 24 48 90 3d 7a f4 f5 f7 f3 fb eb cb 8b 0b 16 2c 58 b0 7d fa e9 cf 83 1b 36 6c
+ * 01 8e 47 ad d8 6c 36 1b 83 cf e9 fa 7d b0 58 2c 16 0b 8b cb eb fb f3 f7 f5 f4 7a 3d 90 48 24 12 09 8a 45 ac 56 2b 9b c3 ef f9 f2 79 b2 59 a2 51 a6 53 a7 dd e0 70 38 1c 0e 07 8d c8 64 32 19 82 41 ae 57 a5 dc 6e 37 95 c4 62 31 96 4b ab db e3 ff f1 f6 7b b3 d7 e5 fc 7e 3f 91 c6 63 bf d1 e6 73 b7 d5 e4 72 39 92 49 aa 55 a4 52 29 9a 4d a8 54 2a 15 84 42 21 9e 4f a9 da 6d b8 5c 2e 17 85 cc 66 33 97 c5 ec 76 3b 93 c7 ed f8 7c 3e 1f 81 ce 67 bd d0 68 34 1a 0d 88 44 22 11 86 43 af d9 e2 71 b6 5b a3 df e1 fe 7f b1 d6 6b bb d3 e7 fd f0 78 3c 1e 0f 89 ca 65 bc 5e 2f 99 c2 61 be 5f a1 de 6f b9 d2 69 ba 5d a0 50 28 14 0a 05 8c 46 23 9f c1 ee 77 b5 d4 6a 35 94 4a 25 9c 4e 27 9d c0 60 30 18 0c 06 03 8f c9 ea 75 b4 5a 2d 98 4c 26 13 87 cd e8 74 3a 1d 80 40 20
+ */
+const uint8_t __aligned(256) raid_gfvandermonde[3][256] =
+{
+ {
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01,
+ },
+ {
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
+ 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
+ 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
+ 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
+ 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
+ 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
+ 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
+ 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
+ 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
+ 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
+ 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
+ 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
+ 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
+ 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
+ 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
+ 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
+ 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
+ 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
+ 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
+ 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
+ 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
+ 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
+ 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
+ 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
+ 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
+ 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
+ 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
+ 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
+ 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
+ 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
+ 0x1b, 0x36, 0x6c,
+ },
+ {
+ 0x01, 0x8e, 0x47, 0xad, 0xd8, 0x6c, 0x36, 0x1b,
+ 0x83, 0xcf, 0xe9, 0xfa, 0x7d, 0xb0, 0x58, 0x2c,
+ 0x16, 0x0b, 0x8b, 0xcb, 0xeb, 0xfb, 0xf3, 0xf7,
+ 0xf5, 0xf4, 0x7a, 0x3d, 0x90, 0x48, 0x24, 0x12,
+ 0x09, 0x8a, 0x45, 0xac, 0x56, 0x2b, 0x9b, 0xc3,
+ 0xef, 0xf9, 0xf2, 0x79, 0xb2, 0x59, 0xa2, 0x51,
+ 0xa6, 0x53, 0xa7, 0xdd, 0xe0, 0x70, 0x38, 0x1c,
+ 0x0e, 0x07, 0x8d, 0xc8, 0x64, 0x32, 0x19, 0x82,
+ 0x41, 0xae, 0x57, 0xa5, 0xdc, 0x6e, 0x37, 0x95,
+ 0xc4, 0x62, 0x31, 0x96, 0x4b, 0xab, 0xdb, 0xe3,
+ 0xff, 0xf1, 0xf6, 0x7b, 0xb3, 0xd7, 0xe5, 0xfc,
+ 0x7e, 0x3f, 0x91, 0xc6, 0x63, 0xbf, 0xd1, 0xe6,
+ 0x73, 0xb7, 0xd5, 0xe4, 0x72, 0x39, 0x92, 0x49,
+ 0xaa, 0x55, 0xa4, 0x52, 0x29, 0x9a, 0x4d, 0xa8,
+ 0x54, 0x2a, 0x15, 0x84, 0x42, 0x21, 0x9e, 0x4f,
+ 0xa9, 0xda, 0x6d, 0xb8, 0x5c, 0x2e, 0x17, 0x85,
+ 0xcc, 0x66, 0x33, 0x97, 0xc5, 0xec, 0x76, 0x3b,
+ 0x93, 0xc7, 0xed, 0xf8, 0x7c, 0x3e, 0x1f, 0x81,
+ 0xce, 0x67, 0xbd, 0xd0, 0x68, 0x34, 0x1a, 0x0d,
+ 0x88, 0x44, 0x22, 0x11, 0x86, 0x43, 0xaf, 0xd9,
+ 0xe2, 0x71, 0xb6, 0x5b, 0xa3, 0xdf, 0xe1, 0xfe,
+ 0x7f, 0xb1, 0xd6, 0x6b, 0xbb, 0xd3, 0xe7, 0xfd,
+ 0xf0, 0x78, 0x3c, 0x1e, 0x0f, 0x89, 0xca, 0x65,
+ 0xbc, 0x5e, 0x2f, 0x99, 0xc2, 0x61, 0xbe, 0x5f,
+ 0xa1, 0xde, 0x6f, 0xb9, 0xd2, 0x69, 0xba, 0x5d,
+ 0xa0, 0x50, 0x28, 0x14, 0x0a, 0x05, 0x8c, 0x46,
+ 0x23, 0x9f, 0xc1, 0xee, 0x77, 0xb5, 0xd4, 0x6a,
+ 0x35, 0x94, 0x4a, 0x25, 0x9c, 0x4e, 0x27, 0x9d,
+ 0xc0, 0x60, 0x30, 0x18, 0x0c, 0x06, 0x03, 0x8f,
+ 0xc9, 0xea, 0x75, 0xb4, 0x5a, 0x2d, 0x98, 0x4c,
+ 0x26, 0x13, 0x87, 0xcd, 0xe8, 0x74, 0x3a, 0x1d,
+ 0x80, 0x40, 0x20,
+ },
+};
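+
+/*
+ * Illustrative sketch, not part of the generated table data: each parity
+ * byte is the GF(2^8) dot product of one row of the power matrix above
+ * with the data bytes at the same offset (row 0 gives plain XOR parity,
+ * row 1 multiplies data disk d by 2^d, row 2 by 2^-d).  Parameter and
+ * function names are assumptions made for the sketch only.
+ */
+static void vandermonde_parity_sketch(int nd, const uint8_t *dbyte,
+				      uint8_t pbyte[3])
+{
+	/* dbyte[d] is the byte of data disk d at one offset, nd <= 251 */
+	for (int p = 0; p < 3; p++) {
+		uint8_t x = 0;
+		for (int d = 0; d < nd; d++)
+			x ^= raid_gfmul[raid_gfvandermonde[p][d]][dbyte[d]];
+		pbyte[p] = x;
+	}
+}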
+
+/**
+ * Cauchy matrix used to generate parity.
+ * This matrix is valid for up to 6 parity disks with up to 251 data disks.
+ *
+ * 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
+ * 01 02 04 08 10 20 40 80 1d 3a 74 e8 cd 87 13 26 4c 98 2d 5a b4 75 ea c9 8f 03 06 0c 18 30 60 c0 9d 27 4e 9c 25 4a 94 35 6a d4 b5 77 ee c1 9f 23 46 8c 05 0a 14 28 50 a0 5d ba 69 d2 b9 6f de a1 5f be 61 c2 99 2f 5e bc 65 ca 89 0f 1e 3c 78 f0 fd e7 d3 bb 6b d6 b1 7f fe e1 df a3 5b b6 71 e2 d9 af 43 86 11 22 44 88 0d 1a 34 68 d0 bd 67 ce 81 1f 3e 7c f8 ed c7 93 3b 76 ec c5 97 33 66 cc 85 17 2e 5c b8 6d da a9 4f 9e 21 42 84 15 2a 54 a8 4d 9a 29 52 a4 55 aa 49 92 39 72 e4 d5 b7 73 e6 d1 bf 63 c6 91 3f 7e fc e5 d7 b3 7b f6 f1 ff e3 db ab 4b 96 31 62 c4 95 37 6e dc a5 57 ae 41 82 19 32 64 c8 8d 07 0e 1c 38 70 e0 dd a7 53 a6 51 a2 59 b2 79 f2 f9 ef c3 9b 2b 56 ac 45 8a 09 12 24 48 90 3d 7a f4 f5 f7 f3 fb eb cb 8b 0b 16 2c 58 b0 7d fa e9 cf 83 1b 36 6c
+ * 01 f5 d2 c4 9a 71 f1 7f fc 87 c1 c6 19 2f 40 55 3d ba 53 04 9c 61 34 8c 46 68 70 3e cc 7d 74 75 b5 db 0c df 9e 6d 79 eb 63 9f 38 d0 94 a5 24 89 5c 65 5b ae 37 33 4c dd 47 f4 02 a6 39 d8 9d 2d 62 b9 2e 0f 2b 60 58 e4 f8 6c 72 b0 85 4d 95 41 1c 23 05 99 32 c5 0e 82 91 14 d1 af f9 b3 07 97 6e 0b 67 3b 78 e6 28 22 4f a3 ca 48 de 1d a8 17 6f 90 aa 31 5a f3 e9 a9 44 30 56 09 59 6a 42 cd e5 d6 86 d9 bf cb 26 66 7c d5 be 25 1f e0 98 27 92 51 c7 45 2c c0 ad a7 69 f7 b4 e8 84 e1 18 88 3c 76 20 5e 9b 1e 0d 81 4a bd 16 8a ac 93 ce 1a c2 0a 3f fd e3 77 6b d7 ef a4 80 a1 36 ed a2 12 57 b6 29 8d 7b c8 52 c3 bc b8 21 d4 ea d3 06 ab 2a 1b 5f b7 10 ec 64 f6 e2 11 50 83 54 3a fa fb f2 43 b1 ff e7 c9 03 bb ee 13 8b dc 35 b2 da cf a0 96 49 4e 08 73 f0 7e fe 15 4b
+ * 01 bb a6 d7 c7 07 ce 82 4a 2f a5 9b b6 60 f1 ad e7 f4 06 d2 df 2e ca 65 5c 48 21 aa cd 4e c1 61 38 0a 3e d1 d5 cb 10 dc 5e 24 b8 de 79 36 43 72 d9 f8 f9 a2 a4 6a 3d ea 8e 03 f5 ab b4 5d b5 53 6b 39 86 b0 50 74 96 84 5a 4b e8 49 e5 51 ef 12 bc 89 5b 2b 29 09 c3 57 1e 37 76 0b 64 8a 52 59 80 da a8 44 95 3c 33 e6 7c af 6c b1 9d fc 92 d6 d8 ff a7 77 04 13 73 66 28 7d 83 fb 5f 63 25 19 bd c5 3b 6e 20 35 55 42 31 e1 b9 9e 90 d4 ba db f7 2a e9 3a a0 75 7a d3 02 ee 9c c6 1f 14 cc 22 4d 30 71 58 11 85 4f 6f 6d 1d cf fa 54 a9 17 a3 0f ae 0d 1c c2 d0 32 16 f6 c0 7f 2d 15 f3 1b f2 ed b3 45 c8 ac 7b 2c e2 e4 bf be 9f 34 05 70 3f 98 fe 62 18 9a 56 8d 93 97 78 4c 7e 27 87 08 8b ec 67 0e 1a 23 8c 68 99 94 40 b2 a1 eb b7 26 f0 dd e3 69 0c c4 88 41 81 91 e0 fd
+ * 01 97 7f 9c 7c 18 bd a2 58 1a da 74 70 a3 e5 47 29 07 f5 80 23 e9 fa 46 54 a0 99 95 53 9b 0b c7 09 c0 78 89 92 e3 0d b0 2a 8c fb 17 3f 26 65 87 27 5c 66 61 79 4d 32 b3 8d 52 e2 82 3d f9 c5 02 bc 4c 73 48 62 af ba 41 d9 c4 2f b1 33 b8 15 7d cf 3a a9 5f 84 6d 34 1b 44 94 72 81 42 be cc 4b 0a 6f 5a 22 36 b5 3c 9d 13 7e 08 dd d6 5e 04 fc 5b ec ef f1 6e 1e 77 24 e6 c6 aa cb fd 51 67 06 6a 4a 88 db b2 c2 5d 43 40 f7 50 a8 f2 7a 71 a4 d2 bf 31 90 19 9a 8e f6 c3 a6 e7 60 12 ee 2d de 38 e8 b7 98 c1 28 f3 05 96 63 d1 b9 14 9f 1d 83 68 75 ed 16 03 ce e4 df e0 10 ae 69 55 91 2e 4e fe 21 1f 9e e1 d5 cd ca f0 8b 2b c9 8a 93 bb 57 20 86 1c a1 4f 3e 25 d4 6c a5 6b a7 37 ff 39 35 0c f8 ea 56 45 8f 2c 59 ab 85 eb 49 0f dc d8 76 b6 f4 0e 11 b4 d0 30 d3 3b ad d7
+ * 01 2b 3f cf 73 2c d6 ed cb 74 15 78 8a c1 17 c9 89 68 21 ab 76 3b 4b 5a 6e 0e b9 d3 b6 3e 36 86 bf a2 a7 30 14 eb c7 2d 96 67 20 b5 9a e0 a8 c6 80 04 8d fe 75 5e 23 ca 8f 48 99 0d df 8e b8 70 29 9c 44 69 3d a5 c2 90 d2 1c 9b 02 1d 98 93 ec 84 e8 64 4c 3a 8b 97 f3 e5 c0 7d 26 c8 08 a0 62 82 55 f7 33 f6 51 63 4d 77 da fd c3 38 6d ee 09 47 a3 05 de a6 f1 22 25 6a 0c 81 b2 6b 58 d5 b3 fc fb 28 7f 07 dc 7a 9e d0 37 b4 e1 1a 24 03 ae 94 ba 88 2f ea 2e 8c 5b bb 79 d1 11 ff a4 19 3c 2a 4e 52 e3 95 bd 31 5d 35 4a 41 c4 db 42 c5 0b 49 1b 7c e4 b0 9d 45 f0 a9 61 57 06 d4 40 91 56 13 fa 87 ac 27 54 dd 59 1f 71 39 43 6c f9 be 4f f4 1e 32 cd e9 7e 7b 66 5f ef e7 6f 0a 60 d7 b7 83 92 e2 af 72 f8 b1 50 10 ce 18 53 a1 cc ad 12 34 0f f5 aa 16 e6 f2 d8 85 9f bc
+ */
+const uint8_t __aligned(256) raid_gfcauchy[6][256] =
+{
+ {
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01,
+ },
+ {
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
+ 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
+ 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
+ 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
+ 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
+ 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
+ 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
+ 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
+ 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
+ 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
+ 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
+ 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
+ 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
+ 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
+ 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
+ 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
+ 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
+ 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
+ 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
+ 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
+ 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
+ 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
+ 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
+ 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
+ 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
+ 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
+ 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
+ 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
+ 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
+ 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
+ 0x1b, 0x36, 0x6c,
+ },
+ {
+ 0x01, 0xf5, 0xd2, 0xc4, 0x9a, 0x71, 0xf1, 0x7f,
+ 0xfc, 0x87, 0xc1, 0xc6, 0x19, 0x2f, 0x40, 0x55,
+ 0x3d, 0xba, 0x53, 0x04, 0x9c, 0x61, 0x34, 0x8c,
+ 0x46, 0x68, 0x70, 0x3e, 0xcc, 0x7d, 0x74, 0x75,
+ 0xb5, 0xdb, 0x0c, 0xdf, 0x9e, 0x6d, 0x79, 0xeb,
+ 0x63, 0x9f, 0x38, 0xd0, 0x94, 0xa5, 0x24, 0x89,
+ 0x5c, 0x65, 0x5b, 0xae, 0x37, 0x33, 0x4c, 0xdd,
+ 0x47, 0xf4, 0x02, 0xa6, 0x39, 0xd8, 0x9d, 0x2d,
+ 0x62, 0xb9, 0x2e, 0x0f, 0x2b, 0x60, 0x58, 0xe4,
+ 0xf8, 0x6c, 0x72, 0xb0, 0x85, 0x4d, 0x95, 0x41,
+ 0x1c, 0x23, 0x05, 0x99, 0x32, 0xc5, 0x0e, 0x82,
+ 0x91, 0x14, 0xd1, 0xaf, 0xf9, 0xb3, 0x07, 0x97,
+ 0x6e, 0x0b, 0x67, 0x3b, 0x78, 0xe6, 0x28, 0x22,
+ 0x4f, 0xa3, 0xca, 0x48, 0xde, 0x1d, 0xa8, 0x17,
+ 0x6f, 0x90, 0xaa, 0x31, 0x5a, 0xf3, 0xe9, 0xa9,
+ 0x44, 0x30, 0x56, 0x09, 0x59, 0x6a, 0x42, 0xcd,
+ 0xe5, 0xd6, 0x86, 0xd9, 0xbf, 0xcb, 0x26, 0x66,
+ 0x7c, 0xd5, 0xbe, 0x25, 0x1f, 0xe0, 0x98, 0x27,
+ 0x92, 0x51, 0xc7, 0x45, 0x2c, 0xc0, 0xad, 0xa7,
+ 0x69, 0xf7, 0xb4, 0xe8, 0x84, 0xe1, 0x18, 0x88,
+ 0x3c, 0x76, 0x20, 0x5e, 0x9b, 0x1e, 0x0d, 0x81,
+ 0x4a, 0xbd, 0x16, 0x8a, 0xac, 0x93, 0xce, 0x1a,
+ 0xc2, 0x0a, 0x3f, 0xfd, 0xe3, 0x77, 0x6b, 0xd7,
+ 0xef, 0xa4, 0x80, 0xa1, 0x36, 0xed, 0xa2, 0x12,
+ 0x57, 0xb6, 0x29, 0x8d, 0x7b, 0xc8, 0x52, 0xc3,
+ 0xbc, 0xb8, 0x21, 0xd4, 0xea, 0xd3, 0x06, 0xab,
+ 0x2a, 0x1b, 0x5f, 0xb7, 0x10, 0xec, 0x64, 0xf6,
+ 0xe2, 0x11, 0x50, 0x83, 0x54, 0x3a, 0xfa, 0xfb,
+ 0xf2, 0x43, 0xb1, 0xff, 0xe7, 0xc9, 0x03, 0xbb,
+ 0xee, 0x13, 0x8b, 0xdc, 0x35, 0xb2, 0xda, 0xcf,
+ 0xa0, 0x96, 0x49, 0x4e, 0x08, 0x73, 0xf0, 0x7e,
+ 0xfe, 0x15, 0x4b,
+ },
+ {
+ 0x01, 0xbb, 0xa6, 0xd7, 0xc7, 0x07, 0xce, 0x82,
+ 0x4a, 0x2f, 0xa5, 0x9b, 0xb6, 0x60, 0xf1, 0xad,
+ 0xe7, 0xf4, 0x06, 0xd2, 0xdf, 0x2e, 0xca, 0x65,
+ 0x5c, 0x48, 0x21, 0xaa, 0xcd, 0x4e, 0xc1, 0x61,
+ 0x38, 0x0a, 0x3e, 0xd1, 0xd5, 0xcb, 0x10, 0xdc,
+ 0x5e, 0x24, 0xb8, 0xde, 0x79, 0x36, 0x43, 0x72,
+ 0xd9, 0xf8, 0xf9, 0xa2, 0xa4, 0x6a, 0x3d, 0xea,
+ 0x8e, 0x03, 0xf5, 0xab, 0xb4, 0x5d, 0xb5, 0x53,
+ 0x6b, 0x39, 0x86, 0xb0, 0x50, 0x74, 0x96, 0x84,
+ 0x5a, 0x4b, 0xe8, 0x49, 0xe5, 0x51, 0xef, 0x12,
+ 0xbc, 0x89, 0x5b, 0x2b, 0x29, 0x09, 0xc3, 0x57,
+ 0x1e, 0x37, 0x76, 0x0b, 0x64, 0x8a, 0x52, 0x59,
+ 0x80, 0xda, 0xa8, 0x44, 0x95, 0x3c, 0x33, 0xe6,
+ 0x7c, 0xaf, 0x6c, 0xb1, 0x9d, 0xfc, 0x92, 0xd6,
+ 0xd8, 0xff, 0xa7, 0x77, 0x04, 0x13, 0x73, 0x66,
+ 0x28, 0x7d, 0x83, 0xfb, 0x5f, 0x63, 0x25, 0x19,
+ 0xbd, 0xc5, 0x3b, 0x6e, 0x20, 0x35, 0x55, 0x42,
+ 0x31, 0xe1, 0xb9, 0x9e, 0x90, 0xd4, 0xba, 0xdb,
+ 0xf7, 0x2a, 0xe9, 0x3a, 0xa0, 0x75, 0x7a, 0xd3,
+ 0x02, 0xee, 0x9c, 0xc6, 0x1f, 0x14, 0xcc, 0x22,
+ 0x4d, 0x30, 0x71, 0x58, 0x11, 0x85, 0x4f, 0x6f,
+ 0x6d, 0x1d, 0xcf, 0xfa, 0x54, 0xa9, 0x17, 0xa3,
+ 0x0f, 0xae, 0x0d, 0x1c, 0xc2, 0xd0, 0x32, 0x16,
+ 0xf6, 0xc0, 0x7f, 0x2d, 0x15, 0xf3, 0x1b, 0xf2,
+ 0xed, 0xb3, 0x45, 0xc8, 0xac, 0x7b, 0x2c, 0xe2,
+ 0xe4, 0xbf, 0xbe, 0x9f, 0x34, 0x05, 0x70, 0x3f,
+ 0x98, 0xfe, 0x62, 0x18, 0x9a, 0x56, 0x8d, 0x93,
+ 0x97, 0x78, 0x4c, 0x7e, 0x27, 0x87, 0x08, 0x8b,
+ 0xec, 0x67, 0x0e, 0x1a, 0x23, 0x8c, 0x68, 0x99,
+ 0x94, 0x40, 0xb2, 0xa1, 0xeb, 0xb7, 0x26, 0xf0,
+ 0xdd, 0xe3, 0x69, 0x0c, 0xc4, 0x88, 0x41, 0x81,
+ 0x91, 0xe0, 0xfd,
+ },
+ {
+ 0x01, 0x97, 0x7f, 0x9c, 0x7c, 0x18, 0xbd, 0xa2,
+ 0x58, 0x1a, 0xda, 0x74, 0x70, 0xa3, 0xe5, 0x47,
+ 0x29, 0x07, 0xf5, 0x80, 0x23, 0xe9, 0xfa, 0x46,
+ 0x54, 0xa0, 0x99, 0x95, 0x53, 0x9b, 0x0b, 0xc7,
+ 0x09, 0xc0, 0x78, 0x89, 0x92, 0xe3, 0x0d, 0xb0,
+ 0x2a, 0x8c, 0xfb, 0x17, 0x3f, 0x26, 0x65, 0x87,
+ 0x27, 0x5c, 0x66, 0x61, 0x79, 0x4d, 0x32, 0xb3,
+ 0x8d, 0x52, 0xe2, 0x82, 0x3d, 0xf9, 0xc5, 0x02,
+ 0xbc, 0x4c, 0x73, 0x48, 0x62, 0xaf, 0xba, 0x41,
+ 0xd9, 0xc4, 0x2f, 0xb1, 0x33, 0xb8, 0x15, 0x7d,
+ 0xcf, 0x3a, 0xa9, 0x5f, 0x84, 0x6d, 0x34, 0x1b,
+ 0x44, 0x94, 0x72, 0x81, 0x42, 0xbe, 0xcc, 0x4b,
+ 0x0a, 0x6f, 0x5a, 0x22, 0x36, 0xb5, 0x3c, 0x9d,
+ 0x13, 0x7e, 0x08, 0xdd, 0xd6, 0x5e, 0x04, 0xfc,
+ 0x5b, 0xec, 0xef, 0xf1, 0x6e, 0x1e, 0x77, 0x24,
+ 0xe6, 0xc6, 0xaa, 0xcb, 0xfd, 0x51, 0x67, 0x06,
+ 0x6a, 0x4a, 0x88, 0xdb, 0xb2, 0xc2, 0x5d, 0x43,
+ 0x40, 0xf7, 0x50, 0xa8, 0xf2, 0x7a, 0x71, 0xa4,
+ 0xd2, 0xbf, 0x31, 0x90, 0x19, 0x9a, 0x8e, 0xf6,
+ 0xc3, 0xa6, 0xe7, 0x60, 0x12, 0xee, 0x2d, 0xde,
+ 0x38, 0xe8, 0xb7, 0x98, 0xc1, 0x28, 0xf3, 0x05,
+ 0x96, 0x63, 0xd1, 0xb9, 0x14, 0x9f, 0x1d, 0x83,
+ 0x68, 0x75, 0xed, 0x16, 0x03, 0xce, 0xe4, 0xdf,
+ 0xe0, 0x10, 0xae, 0x69, 0x55, 0x91, 0x2e, 0x4e,
+ 0xfe, 0x21, 0x1f, 0x9e, 0xe1, 0xd5, 0xcd, 0xca,
+ 0xf0, 0x8b, 0x2b, 0xc9, 0x8a, 0x93, 0xbb, 0x57,
+ 0x20, 0x86, 0x1c, 0xa1, 0x4f, 0x3e, 0x25, 0xd4,
+ 0x6c, 0xa5, 0x6b, 0xa7, 0x37, 0xff, 0x39, 0x35,
+ 0x0c, 0xf8, 0xea, 0x56, 0x45, 0x8f, 0x2c, 0x59,
+ 0xab, 0x85, 0xeb, 0x49, 0x0f, 0xdc, 0xd8, 0x76,
+ 0xb6, 0xf4, 0x0e, 0x11, 0xb4, 0xd0, 0x30, 0xd3,
+ 0x3b, 0xad, 0xd7,
+ },
+ {
+ 0x01, 0x2b, 0x3f, 0xcf, 0x73, 0x2c, 0xd6, 0xed,
+ 0xcb, 0x74, 0x15, 0x78, 0x8a, 0xc1, 0x17, 0xc9,
+ 0x89, 0x68, 0x21, 0xab, 0x76, 0x3b, 0x4b, 0x5a,
+ 0x6e, 0x0e, 0xb9, 0xd3, 0xb6, 0x3e, 0x36, 0x86,
+ 0xbf, 0xa2, 0xa7, 0x30, 0x14, 0xeb, 0xc7, 0x2d,
+ 0x96, 0x67, 0x20, 0xb5, 0x9a, 0xe0, 0xa8, 0xc6,
+ 0x80, 0x04, 0x8d, 0xfe, 0x75, 0x5e, 0x23, 0xca,
+ 0x8f, 0x48, 0x99, 0x0d, 0xdf, 0x8e, 0xb8, 0x70,
+ 0x29, 0x9c, 0x44, 0x69, 0x3d, 0xa5, 0xc2, 0x90,
+ 0xd2, 0x1c, 0x9b, 0x02, 0x1d, 0x98, 0x93, 0xec,
+ 0x84, 0xe8, 0x64, 0x4c, 0x3a, 0x8b, 0x97, 0xf3,
+ 0xe5, 0xc0, 0x7d, 0x26, 0xc8, 0x08, 0xa0, 0x62,
+ 0x82, 0x55, 0xf7, 0x33, 0xf6, 0x51, 0x63, 0x4d,
+ 0x77, 0xda, 0xfd, 0xc3, 0x38, 0x6d, 0xee, 0x09,
+ 0x47, 0xa3, 0x05, 0xde, 0xa6, 0xf1, 0x22, 0x25,
+ 0x6a, 0x0c, 0x81, 0xb2, 0x6b, 0x58, 0xd5, 0xb3,
+ 0xfc, 0xfb, 0x28, 0x7f, 0x07, 0xdc, 0x7a, 0x9e,
+ 0xd0, 0x37, 0xb4, 0xe1, 0x1a, 0x24, 0x03, 0xae,
+ 0x94, 0xba, 0x88, 0x2f, 0xea, 0x2e, 0x8c, 0x5b,
+ 0xbb, 0x79, 0xd1, 0x11, 0xff, 0xa4, 0x19, 0x3c,
+ 0x2a, 0x4e, 0x52, 0xe3, 0x95, 0xbd, 0x31, 0x5d,
+ 0x35, 0x4a, 0x41, 0xc4, 0xdb, 0x42, 0xc5, 0x0b,
+ 0x49, 0x1b, 0x7c, 0xe4, 0xb0, 0x9d, 0x45, 0xf0,
+ 0xa9, 0x61, 0x57, 0x06, 0xd4, 0x40, 0x91, 0x56,
+ 0x13, 0xfa, 0x87, 0xac, 0x27, 0x54, 0xdd, 0x59,
+ 0x1f, 0x71, 0x39, 0x43, 0x6c, 0xf9, 0xbe, 0x4f,
+ 0xf4, 0x1e, 0x32, 0xcd, 0xe9, 0x7e, 0x7b, 0x66,
+ 0x5f, 0xef, 0xe7, 0x6f, 0x0a, 0x60, 0xd7, 0xb7,
+ 0x83, 0x92, 0xe2, 0xaf, 0x72, 0xf8, 0xb1, 0x50,
+ 0x10, 0xce, 0x18, 0x53, 0xa1, 0xcc, 0xad, 0x12,
+ 0x34, 0x0f, 0xf5, 0xaa, 0x16, 0xe6, 0xf2, 0xd8,
+ 0x85, 0x9f, 0xbc,
+ },
+};
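+
+/*
+ * Illustrative sketch, not part of the generated table data: parity
+ * generation with the Cauchy matrix has the same dot-product form as the
+ * sketch after the power matrix, only with raid_gfcauchy[p][d] as the
+ * coefficient and up to 6 parity rows.  Its first two rows coincide with
+ * the power matrix, which keeps 1- and 2-parity layouts compatible; the
+ * check below (assuming <assert.h>) just demonstrates that property.
+ */
+static void cauchy_rows_check_sketch(void)
+{
+	for (int d = 0; d < 251; d++) {
+		assert(raid_gfcauchy[0][d] == 0x01);
+		assert(raid_gfcauchy[1][d] == raid_gfvandermonde[1][d]);
+	}
+}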
+
+#ifdef CONFIG_X86
+/**
+ * PSHUFB tables for the Cauchy matrix.
+ *
+ * Indexes are [DISK][PARITY - 2][LH], where DISK ranges from 0 to 250,
+ * PARITY from 2 to 5, and LH selects the low-nibble (0) or high-nibble (1)
+ * lookup table.
+ */
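+
+/*
+ * Illustrative sketch, not part of the generated table data: each [2][16]
+ * pair below is the usual SSSE3 nibble-split trick.  The first 16 bytes
+ * map the low nibble and the second 16 bytes map the high nibble of a
+ * data byte, so two PSHUFB lookups XORed together equal multiplication
+ * by the matching raid_gfcauchy coefficient.  Scalar equivalent (helper
+ * name is an assumption):
+ */
+static inline uint8_t pshufb_mul_sketch(const uint8_t tbl[2][16], uint8_t x)
+{
+	return tbl[0][x & 0x0f] ^ tbl[1][x >> 4];
+}
+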
+const uint8_t __aligned(256) raid_gfcauchypshufb[251][4][2][16] =
+{
+ {
+ {
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ },
+ {
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ },
+ {
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ },
+ {
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ { 0x00, 0xeb, 0xcb, 0x20, 0x8b, 0x60, 0x40, 0xab, 0x0b, 0xe0, 0xc0, 0x2b, 0x80, 0x6b, 0x4b, 0xa0 },
+ },
+ {
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ { 0x00, 0x7f, 0xfe, 0x81, 0xe1, 0x9e, 0x1f, 0x60, 0xdf, 0xa0, 0x21, 0x5e, 0x3e, 0x41, 0xc0, 0xbf },
+ },
+ {
+ { 0x00, 0x97, 0x33, 0xa4, 0x66, 0xf1, 0x55, 0xc2, 0xcc, 0x5b, 0xff, 0x68, 0xaa, 0x3d, 0x99, 0x0e },
+ { 0x00, 0x85, 0x17, 0x92, 0x2e, 0xab, 0x39, 0xbc, 0x5c, 0xd9, 0x4b, 0xce, 0x72, 0xf7, 0x65, 0xe0 },
+ },
+ {
+ { 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, 0x45, 0x6e, 0x13, 0x38, 0xe9, 0xc2, 0xbf, 0x94 },
+ { 0x00, 0x8a, 0x09, 0x83, 0x12, 0x98, 0x1b, 0x91, 0x24, 0xae, 0x2d, 0xa7, 0x36, 0xbc, 0x3f, 0xb5 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ { 0x00, 0xa1, 0x5f, 0xfe, 0xbe, 0x1f, 0xe1, 0x40, 0x61, 0xc0, 0x3e, 0x9f, 0xdf, 0x7e, 0x80, 0x21 },
+ },
+ {
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ { 0x00, 0xb2, 0x79, 0xcb, 0xf2, 0x40, 0x8b, 0x39, 0xf9, 0x4b, 0x80, 0x32, 0x0b, 0xb9, 0x72, 0xc0 },
+ },
+ {
+ { 0x00, 0x7f, 0xfe, 0x81, 0xe1, 0x9e, 0x1f, 0x60, 0xdf, 0xa0, 0x21, 0x5e, 0x3e, 0x41, 0xc0, 0xbf },
+ { 0x00, 0xa3, 0x5b, 0xf8, 0xb6, 0x15, 0xed, 0x4e, 0x71, 0xd2, 0x2a, 0x89, 0xc7, 0x64, 0x9c, 0x3f },
+ },
+ {
+ { 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, 0xe5, 0xda, 0x9b, 0xa4, 0x19, 0x26, 0x67, 0x58 },
+ { 0x00, 0xd7, 0xb3, 0x64, 0x7b, 0xac, 0xc8, 0x1f, 0xf6, 0x21, 0x45, 0x92, 0x8d, 0x5a, 0x3e, 0xe9 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0x66, 0x6e, 0xaa, 0xfb, 0x3f, 0x59, 0x9d, 0xcc, 0x08 },
+ { 0x00, 0xdc, 0xa5, 0x79, 0x57, 0x8b, 0xf2, 0x2e, 0xae, 0x72, 0x0b, 0xd7, 0xf9, 0x25, 0x5c, 0x80 },
+ },
+ {
+ { 0x00, 0xd7, 0xb3, 0x64, 0x7b, 0xac, 0xc8, 0x1f, 0xf6, 0x21, 0x45, 0x92, 0x8d, 0x5a, 0x3e, 0xe9 },
+ { 0x00, 0xf1, 0xff, 0x0e, 0xe3, 0x12, 0x1c, 0xed, 0xdb, 0x2a, 0x24, 0xd5, 0x38, 0xc9, 0xc7, 0x36 },
+ },
+ {
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ { 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, 0xb5, 0x80, 0xdf, 0xea, 0x61, 0x54, 0x0b, 0x3e },
+ },
+ {
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ { 0x00, 0x6c, 0xd8, 0xb4, 0xad, 0xc1, 0x75, 0x19, 0x47, 0x2b, 0x9f, 0xf3, 0xea, 0x86, 0x32, 0x5e },
+ },
+ },
+ {
+ {
+ { 0x00, 0x9a, 0x29, 0xb3, 0x52, 0xc8, 0x7b, 0xe1, 0xa4, 0x3e, 0x8d, 0x17, 0xf6, 0x6c, 0xdf, 0x45 },
+ { 0x00, 0x55, 0xaa, 0xff, 0x49, 0x1c, 0xe3, 0xb6, 0x92, 0xc7, 0x38, 0x6d, 0xdb, 0x8e, 0x71, 0x24 },
+ },
+ {
+ { 0x00, 0xc7, 0x93, 0x54, 0x3b, 0xfc, 0xa8, 0x6f, 0x76, 0xb1, 0xe5, 0x22, 0x4d, 0x8a, 0xde, 0x19 },
+ { 0x00, 0xec, 0xc5, 0x29, 0x97, 0x7b, 0x52, 0xbe, 0x33, 0xdf, 0xf6, 0x1a, 0xa4, 0x48, 0x61, 0x8d },
+ },
+ {
+ { 0x00, 0x7c, 0xf8, 0x84, 0xed, 0x91, 0x15, 0x69, 0xc7, 0xbb, 0x3f, 0x43, 0x2a, 0x56, 0xd2, 0xae },
+ { 0x00, 0x93, 0x3b, 0xa8, 0x76, 0xe5, 0x4d, 0xde, 0xec, 0x7f, 0xd7, 0x44, 0x9a, 0x09, 0xa1, 0x32 },
+ },
+ {
+ { 0x00, 0x73, 0xe6, 0x95, 0xd1, 0xa2, 0x37, 0x44, 0xbf, 0xcc, 0x59, 0x2a, 0x6e, 0x1d, 0x88, 0xfb },
+ { 0x00, 0x63, 0xc6, 0xa5, 0x91, 0xf2, 0x57, 0x34, 0x3f, 0x5c, 0xf9, 0x9a, 0xae, 0xcd, 0x68, 0x0b },
+ },
+ },
+ {
+ {
+ { 0x00, 0x71, 0xe2, 0x93, 0xd9, 0xa8, 0x3b, 0x4a, 0xaf, 0xde, 0x4d, 0x3c, 0x76, 0x07, 0x94, 0xe5 },
+ { 0x00, 0x43, 0x86, 0xc5, 0x11, 0x52, 0x97, 0xd4, 0x22, 0x61, 0xa4, 0xe7, 0x33, 0x70, 0xb5, 0xf6 },
+ },
+ {
+ { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15, 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d },
+ { 0x00, 0x70, 0xe0, 0x90, 0xdd, 0xad, 0x3d, 0x4d, 0xa7, 0xd7, 0x47, 0x37, 0x7a, 0x0a, 0x9a, 0xea },
+ },
+ {
+ { 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88 },
+ { 0x00, 0x9d, 0x27, 0xba, 0x4e, 0xd3, 0x69, 0xf4, 0x9c, 0x01, 0xbb, 0x26, 0xd2, 0x4f, 0xf5, 0x68 },
+ },
+ {
+ { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, 0x7d, 0x51, 0x25, 0x09, 0xcd, 0xe1, 0x95, 0xb9 },
+ { 0x00, 0xfa, 0xe9, 0x13, 0xcf, 0x35, 0x26, 0xdc, 0x83, 0x79, 0x6a, 0x90, 0x4c, 0xb6, 0xa5, 0x5f },
+ },
+ },
+ {
+ {
+ { 0x00, 0xf1, 0xff, 0x0e, 0xe3, 0x12, 0x1c, 0xed, 0xdb, 0x2a, 0x24, 0xd5, 0x38, 0xc9, 0xc7, 0x36 },
+ { 0x00, 0xab, 0x4b, 0xe0, 0x96, 0x3d, 0xdd, 0x76, 0x31, 0x9a, 0x7a, 0xd1, 0xa7, 0x0c, 0xec, 0x47 },
+ },
+ {
+ { 0x00, 0xce, 0x81, 0x4f, 0x1f, 0xd1, 0x9e, 0x50, 0x3e, 0xf0, 0xbf, 0x71, 0x21, 0xef, 0xa0, 0x6e },
+ { 0x00, 0x7c, 0xf8, 0x84, 0xed, 0x91, 0x15, 0x69, 0xc7, 0xbb, 0x3f, 0x43, 0x2a, 0x56, 0xd2, 0xae },
+ },
+ {
+ { 0x00, 0xbd, 0x67, 0xda, 0xce, 0x73, 0xa9, 0x14, 0x81, 0x3c, 0xe6, 0x5b, 0x4f, 0xf2, 0x28, 0x95 },
+ { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 },
+ },
+ {
+ { 0x00, 0xd6, 0xb1, 0x67, 0x7f, 0xa9, 0xce, 0x18, 0xfe, 0x28, 0x4f, 0x99, 0x81, 0x57, 0x30, 0xe6 },
+ { 0x00, 0xe1, 0xdf, 0x3e, 0xa3, 0x42, 0x7c, 0x9d, 0x5b, 0xba, 0x84, 0x65, 0xf8, 0x19, 0x27, 0xc6 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x7f, 0xfe, 0x81, 0xe1, 0x9e, 0x1f, 0x60, 0xdf, 0xa0, 0x21, 0x5e, 0x3e, 0x41, 0xc0, 0xbf },
+ { 0x00, 0xa3, 0x5b, 0xf8, 0xb6, 0x15, 0xed, 0x4e, 0x71, 0xd2, 0x2a, 0x89, 0xc7, 0x64, 0x9c, 0x3f },
+ },
+ {
+ { 0x00, 0x82, 0x19, 0x9b, 0x32, 0xb0, 0x2b, 0xa9, 0x64, 0xe6, 0x7d, 0xff, 0x56, 0xd4, 0x4f, 0xcd },
+ { 0x00, 0xc8, 0x8d, 0x45, 0x07, 0xcf, 0x8a, 0x42, 0x0e, 0xc6, 0x83, 0x4b, 0x09, 0xc1, 0x84, 0x4c },
+ },
+ {
+ { 0x00, 0xa2, 0x59, 0xfb, 0xb2, 0x10, 0xeb, 0x49, 0x79, 0xdb, 0x20, 0x82, 0xcb, 0x69, 0x92, 0x30 },
+ { 0x00, 0xf2, 0xf9, 0x0b, 0xef, 0x1d, 0x16, 0xe4, 0xc3, 0x31, 0x3a, 0xc8, 0x2c, 0xde, 0xd5, 0x27 },
+ },
+ {
+ { 0x00, 0xed, 0xc7, 0x2a, 0x93, 0x7e, 0x54, 0xb9, 0x3b, 0xd6, 0xfc, 0x11, 0xa8, 0x45, 0x6f, 0x82 },
+ { 0x00, 0x76, 0xec, 0x9a, 0xc5, 0xb3, 0x29, 0x5f, 0x97, 0xe1, 0x7b, 0x0d, 0x52, 0x24, 0xbe, 0xc8 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xfc, 0xe5, 0x19, 0xd7, 0x2b, 0x32, 0xce, 0xb3, 0x4f, 0x56, 0xaa, 0x64, 0x98, 0x81, 0x7d },
+ { 0x00, 0x7b, 0xf6, 0x8d, 0xf1, 0x8a, 0x07, 0x7c, 0xff, 0x84, 0x09, 0x72, 0x0e, 0x75, 0xf8, 0x83 },
+ },
+ {
+ { 0x00, 0x4a, 0x94, 0xde, 0x35, 0x7f, 0xa1, 0xeb, 0x6a, 0x20, 0xfe, 0xb4, 0x5f, 0x15, 0xcb, 0x81 },
+ { 0x00, 0xd4, 0xb5, 0x61, 0x77, 0xa3, 0xc2, 0x16, 0xee, 0x3a, 0x5b, 0x8f, 0x99, 0x4d, 0x2c, 0xf8 },
+ },
+ {
+ { 0x00, 0x58, 0xb0, 0xe8, 0x7d, 0x25, 0xcd, 0x95, 0xfa, 0xa2, 0x4a, 0x12, 0x87, 0xdf, 0x37, 0x6f },
+ { 0x00, 0xe9, 0xcf, 0x26, 0x83, 0x6a, 0x4c, 0xa5, 0x1b, 0xf2, 0xd4, 0x3d, 0x98, 0x71, 0x57, 0xbe },
+ },
+ {
+ { 0x00, 0xcb, 0x8b, 0x40, 0x0b, 0xc0, 0x80, 0x4b, 0x16, 0xdd, 0x9d, 0x56, 0x1d, 0xd6, 0x96, 0x5d },
+ { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, 0x7d, 0x51, 0x25, 0x09, 0xcd, 0xe1, 0x95, 0xb9 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x87, 0x13, 0x94, 0x26, 0xa1, 0x35, 0xb2, 0x4c, 0xcb, 0x5f, 0xd8, 0x6a, 0xed, 0x79, 0xfe },
+ { 0x00, 0x98, 0x2d, 0xb5, 0x5a, 0xc2, 0x77, 0xef, 0xb4, 0x2c, 0x99, 0x01, 0xee, 0x76, 0xc3, 0x5b },
+ },
+ {
+ { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, 0x65, 0x4a, 0x3b, 0x14, 0xd9, 0xf6, 0x87, 0xa8 },
+ { 0x00, 0xca, 0x89, 0x43, 0x0f, 0xc5, 0x86, 0x4c, 0x1e, 0xd4, 0x97, 0x5d, 0x11, 0xdb, 0x98, 0x52 },
+ },
+ {
+ { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 },
+ { 0x00, 0xbd, 0x67, 0xda, 0xce, 0x73, 0xa9, 0x14, 0x81, 0x3c, 0xe6, 0x5b, 0x4f, 0xf2, 0x28, 0x95 },
+ },
+ {
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ { 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xc1, 0x9f, 0x5e, 0x23, 0xe2, 0xbc, 0x7d, 0x46, 0x87, 0xd9, 0x18, 0x65, 0xa4, 0xfa, 0x3b },
+ { 0x00, 0x8c, 0x05, 0x89, 0x0a, 0x86, 0x0f, 0x83, 0x14, 0x98, 0x11, 0x9d, 0x1e, 0x92, 0x1b, 0x97 },
+ },
+ {
+ { 0x00, 0xa5, 0x57, 0xf2, 0xae, 0x0b, 0xf9, 0x5c, 0x41, 0xe4, 0x16, 0xb3, 0xef, 0x4a, 0xb8, 0x1d },
+ { 0x00, 0x82, 0x19, 0x9b, 0x32, 0xb0, 0x2b, 0xa9, 0x64, 0xe6, 0x7d, 0xff, 0x56, 0xd4, 0x4f, 0xcd },
+ },
+ {
+ { 0x00, 0xda, 0xa9, 0x73, 0x4f, 0x95, 0xe6, 0x3c, 0x9e, 0x44, 0x37, 0xed, 0xd1, 0x0b, 0x78, 0xa2 },
+ { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, 0x15, 0x34, 0x57, 0x76, 0x91, 0xb0, 0xd3, 0xf2 },
+ },
+ {
+ { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 },
+ { 0x00, 0x4d, 0x9a, 0xd7, 0x29, 0x64, 0xb3, 0xfe, 0x52, 0x1f, 0xc8, 0x85, 0x7b, 0x36, 0xe1, 0xac },
+ },
+ },
+ {
+ {
+ { 0x00, 0xc6, 0x91, 0x57, 0x3f, 0xf9, 0xae, 0x68, 0x7e, 0xb8, 0xef, 0x29, 0x41, 0x87, 0xd0, 0x16 },
+ { 0x00, 0xfc, 0xe5, 0x19, 0xd7, 0x2b, 0x32, 0xce, 0xb3, 0x4f, 0x56, 0xaa, 0x64, 0x98, 0x81, 0x7d },
+ },
+ {
+ { 0x00, 0x9b, 0x2b, 0xb0, 0x56, 0xcd, 0x7d, 0xe6, 0xac, 0x37, 0x87, 0x1c, 0xfa, 0x61, 0xd1, 0x4a },
+ { 0x00, 0x45, 0x8a, 0xcf, 0x09, 0x4c, 0x83, 0xc6, 0x12, 0x57, 0x98, 0xdd, 0x1b, 0x5e, 0x91, 0xd4 },
+ },
+ {
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ { 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 },
+ },
+ {
+ { 0x00, 0x78, 0xf0, 0x88, 0xfd, 0x85, 0x0d, 0x75, 0xe7, 0x9f, 0x17, 0x6f, 0x1a, 0x62, 0xea, 0x92 },
+ { 0x00, 0xd3, 0xbb, 0x68, 0x6b, 0xb8, 0xd0, 0x03, 0xd6, 0x05, 0x6d, 0xbe, 0xbd, 0x6e, 0x06, 0xd5 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 },
+ { 0x00, 0x8d, 0x07, 0x8a, 0x0e, 0x83, 0x09, 0x84, 0x1c, 0x91, 0x1b, 0x96, 0x12, 0x9f, 0x15, 0x98 },
+ },
+ {
+ { 0x00, 0xb6, 0x71, 0xc7, 0xe2, 0x54, 0x93, 0x25, 0xd9, 0x6f, 0xa8, 0x1e, 0x3b, 0x8d, 0x4a, 0xfc },
+ { 0x00, 0xaf, 0x43, 0xec, 0x86, 0x29, 0xc5, 0x6a, 0x11, 0xbe, 0x52, 0xfd, 0x97, 0x38, 0xd4, 0x7b },
+ },
+ {
+ { 0x00, 0x70, 0xe0, 0x90, 0xdd, 0xad, 0x3d, 0x4d, 0xa7, 0xd7, 0x47, 0x37, 0x7a, 0x0a, 0x9a, 0xea },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ },
+ {
+ { 0x00, 0x8a, 0x09, 0x83, 0x12, 0x98, 0x1b, 0x91, 0x24, 0xae, 0x2d, 0xa7, 0x36, 0xbc, 0x3f, 0xb5 },
+ { 0x00, 0x48, 0x90, 0xd8, 0x3d, 0x75, 0xad, 0xe5, 0x7a, 0x32, 0xea, 0xa2, 0x47, 0x0f, 0xd7, 0x9f },
+ },
+ },
+ {
+ {
+ { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, 0x65, 0x4a, 0x3b, 0x14, 0xd9, 0xf6, 0x87, 0xa8 },
+ { 0x00, 0xca, 0x89, 0x43, 0x0f, 0xc5, 0x86, 0x4c, 0x1e, 0xd4, 0x97, 0x5d, 0x11, 0xdb, 0x98, 0x52 },
+ },
+ {
+ { 0x00, 0x60, 0xc0, 0xa0, 0x9d, 0xfd, 0x5d, 0x3d, 0x27, 0x47, 0xe7, 0x87, 0xba, 0xda, 0x7a, 0x1a },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ },
+ {
+ { 0x00, 0xa3, 0x5b, 0xf8, 0xb6, 0x15, 0xed, 0x4e, 0x71, 0xd2, 0x2a, 0x89, 0xc7, 0x64, 0x9c, 0x3f },
+ { 0x00, 0xe2, 0xd9, 0x3b, 0xaf, 0x4d, 0x76, 0x94, 0x43, 0xa1, 0x9a, 0x78, 0xec, 0x0e, 0x35, 0xd7 },
+ },
+ {
+ { 0x00, 0xc1, 0x9f, 0x5e, 0x23, 0xe2, 0xbc, 0x7d, 0x46, 0x87, 0xd9, 0x18, 0x65, 0xa4, 0xfa, 0x3b },
+ { 0x00, 0x8c, 0x05, 0x89, 0x0a, 0x86, 0x0f, 0x83, 0x14, 0x98, 0x11, 0x9d, 0x1e, 0x92, 0x1b, 0x97 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x40, 0x80, 0xc0, 0x1d, 0x5d, 0x9d, 0xdd, 0x3a, 0x7a, 0xba, 0xfa, 0x27, 0x67, 0xa7, 0xe7 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ },
+ {
+ { 0x00, 0xf1, 0xff, 0x0e, 0xe3, 0x12, 0x1c, 0xed, 0xdb, 0x2a, 0x24, 0xd5, 0x38, 0xc9, 0xc7, 0x36 },
+ { 0x00, 0xab, 0x4b, 0xe0, 0x96, 0x3d, 0xdd, 0x76, 0x31, 0x9a, 0x7a, 0xd1, 0xa7, 0x0c, 0xec, 0x47 },
+ },
+ {
+ { 0x00, 0xe5, 0xd7, 0x32, 0xb3, 0x56, 0x64, 0x81, 0x7b, 0x9e, 0xac, 0x49, 0xc8, 0x2d, 0x1f, 0xfa },
+ { 0x00, 0xf6, 0xf1, 0x07, 0xff, 0x09, 0x0e, 0xf8, 0xe3, 0x15, 0x12, 0xe4, 0x1c, 0xea, 0xed, 0x1b },
+ },
+ {
+ { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd },
+ { 0x00, 0x6d, 0xda, 0xb7, 0xa9, 0xc4, 0x73, 0x1e, 0x4f, 0x22, 0x95, 0xf8, 0xe6, 0x8b, 0x3c, 0x51 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x55, 0xaa, 0xff, 0x49, 0x1c, 0xe3, 0xb6, 0x92, 0xc7, 0x38, 0x6d, 0xdb, 0x8e, 0x71, 0x24 },
+ { 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, 0xd5, 0xec, 0xa7, 0x9e, 0x31, 0x08, 0x43, 0x7a },
+ },
+ {
+ { 0x00, 0xad, 0x47, 0xea, 0x8e, 0x23, 0xc9, 0x64, 0x01, 0xac, 0x46, 0xeb, 0x8f, 0x22, 0xc8, 0x65 },
+ { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e },
+ },
+ {
+ { 0x00, 0x47, 0x8e, 0xc9, 0x01, 0x46, 0x8f, 0xc8, 0x02, 0x45, 0x8c, 0xcb, 0x03, 0x44, 0x8d, 0xca },
+ { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c, 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c },
+ },
+ {
+ { 0x00, 0xc9, 0x8f, 0x46, 0x03, 0xca, 0x8c, 0x45, 0x06, 0xcf, 0x89, 0x40, 0x05, 0xcc, 0x8a, 0x43 },
+ { 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, 0xf5, 0xc8, 0x8f, 0xb2, 0x01, 0x3c, 0x7b, 0x46 },
+ { 0x00, 0xf7, 0xf3, 0x04, 0xfb, 0x0c, 0x08, 0xff, 0xeb, 0x1c, 0x18, 0xef, 0x10, 0xe7, 0xe3, 0x14 },
+ },
+ {
+ { 0x00, 0xe7, 0xd3, 0x34, 0xbb, 0x5c, 0x68, 0x8f, 0x6b, 0x8c, 0xb8, 0x5f, 0xd0, 0x37, 0x03, 0xe4 },
+ { 0x00, 0xd6, 0xb1, 0x67, 0x7f, 0xa9, 0xce, 0x18, 0xfe, 0x28, 0x4f, 0x99, 0x81, 0x57, 0x30, 0xe6 },
+ },
+ {
+ { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, 0x55, 0x7c, 0x07, 0x2e, 0xf1, 0xd8, 0xa3, 0x8a },
+ { 0x00, 0xaa, 0x49, 0xe3, 0x92, 0x38, 0xdb, 0x71, 0x39, 0x93, 0x70, 0xda, 0xab, 0x01, 0xe2, 0x48 },
+ },
+ {
+ { 0x00, 0x89, 0x0f, 0x86, 0x1e, 0x97, 0x11, 0x98, 0x3c, 0xb5, 0x33, 0xba, 0x22, 0xab, 0x2d, 0xa4 },
+ { 0x00, 0x78, 0xf0, 0x88, 0xfd, 0x85, 0x0d, 0x75, 0xe7, 0x9f, 0x17, 0x6f, 0x1a, 0x62, 0xea, 0x92 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xba, 0x69, 0xd3, 0xd2, 0x68, 0xbb, 0x01, 0xb9, 0x03, 0xd0, 0x6a, 0x6b, 0xd1, 0x02, 0xb8 },
+ { 0x00, 0x6f, 0xde, 0xb1, 0xa1, 0xce, 0x7f, 0x10, 0x5f, 0x30, 0x81, 0xee, 0xfe, 0x91, 0x20, 0x4f },
+ },
+ {
+ { 0x00, 0xf4, 0xf5, 0x01, 0xf7, 0x03, 0x02, 0xf6, 0xf3, 0x07, 0x06, 0xf2, 0x04, 0xf0, 0xf1, 0x05 },
+ { 0x00, 0xfb, 0xeb, 0x10, 0xcb, 0x30, 0x20, 0xdb, 0x8b, 0x70, 0x60, 0x9b, 0x40, 0xbb, 0xab, 0x50 },
+ },
+ {
+ { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15, 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d },
+ { 0x00, 0x70, 0xe0, 0x90, 0xdd, 0xad, 0x3d, 0x4d, 0xa7, 0xd7, 0x47, 0x37, 0x7a, 0x0a, 0x9a, 0xea },
+ },
+ {
+ { 0x00, 0x68, 0xd0, 0xb8, 0xbd, 0xd5, 0x6d, 0x05, 0x67, 0x0f, 0xb7, 0xdf, 0xda, 0xb2, 0x0a, 0x62 },
+ { 0x00, 0xce, 0x81, 0x4f, 0x1f, 0xd1, 0x9e, 0x50, 0x3e, 0xf0, 0xbf, 0x71, 0x21, 0xef, 0xa0, 0x6e },
+ },
+ },
+ {
+ {
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ { 0x00, 0x59, 0xb2, 0xeb, 0x79, 0x20, 0xcb, 0x92, 0xf2, 0xab, 0x40, 0x19, 0x8b, 0xd2, 0x39, 0x60 },
+ },
+ {
+ { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12, 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x9d, 0xfd, 0x5d, 0x3d, 0x27, 0x47, 0xe7, 0x87, 0xba, 0xda, 0x7a, 0x1a },
+ },
+ {
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ { 0x00, 0xeb, 0xcb, 0x20, 0x8b, 0x60, 0x40, 0xab, 0x0b, 0xe0, 0xc0, 0x2b, 0x80, 0x6b, 0x4b, 0xa0 },
+ },
+ {
+ { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, 0x15, 0x34, 0x57, 0x76, 0x91, 0xb0, 0xd3, 0xf2 },
+ { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, 0x4d, 0x67, 0x19, 0x33, 0xe5, 0xcf, 0xb1, 0x9b },
+ },
+ },
+ {
+ {
+ { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c, 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c },
+ { 0x00, 0x40, 0x80, 0xc0, 0x1d, 0x5d, 0x9d, 0xdd, 0x3a, 0x7a, 0xba, 0xfa, 0x27, 0x67, 0xa7, 0xe7 },
+ },
+ {
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ { 0x00, 0xa1, 0x5f, 0xfe, 0xbe, 0x1f, 0xe1, 0x40, 0x61, 0xc0, 0x3e, 0x9f, 0xdf, 0x7e, 0x80, 0x21 },
+ },
+ {
+ { 0x00, 0x80, 0x1d, 0x9d, 0x3a, 0xba, 0x27, 0xa7, 0x74, 0xf4, 0x69, 0xe9, 0x4e, 0xce, 0x53, 0xd3 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ },
+ {
+ { 0x00, 0xab, 0x4b, 0xe0, 0x96, 0x3d, 0xdd, 0x76, 0x31, 0x9a, 0x7a, 0xd1, 0xa7, 0x0c, 0xec, 0x47 },
+ { 0x00, 0x62, 0xc4, 0xa6, 0x95, 0xf7, 0x51, 0x33, 0x37, 0x55, 0xf3, 0x91, 0xa2, 0xc0, 0x66, 0x04 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ { 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, 0xb5, 0x80, 0xdf, 0xea, 0x61, 0x54, 0x0b, 0x3e },
+ },
+ {
+ { 0x00, 0xdf, 0xa3, 0x7c, 0x5b, 0x84, 0xf8, 0x27, 0xb6, 0x69, 0x15, 0xca, 0xed, 0x32, 0x4e, 0x91 },
+ { 0x00, 0x71, 0xe2, 0x93, 0xd9, 0xa8, 0x3b, 0x4a, 0xaf, 0xde, 0x4d, 0x3c, 0x76, 0x07, 0x94, 0xe5 },
+ },
+ {
+ { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, 0x05, 0x26, 0x43, 0x60, 0x89, 0xaa, 0xcf, 0xec },
+ { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 },
+ },
+ {
+ { 0x00, 0x76, 0xec, 0x9a, 0xc5, 0xb3, 0x29, 0x5f, 0x97, 0xe1, 0x7b, 0x0d, 0x52, 0x24, 0xbe, 0xc8 },
+ { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, 0x85, 0xb6, 0xe3, 0xd0, 0x49, 0x7a, 0x2f, 0x1c },
+ },
+ },
+ {
+ {
+ { 0x00, 0x61, 0xc2, 0xa3, 0x99, 0xf8, 0x5b, 0x3a, 0x2f, 0x4e, 0xed, 0x8c, 0xb6, 0xd7, 0x74, 0x15 },
+ { 0x00, 0x5e, 0xbc, 0xe2, 0x65, 0x3b, 0xd9, 0x87, 0xca, 0x94, 0x76, 0x28, 0xaf, 0xf1, 0x13, 0x4d },
+ },
+ {
+ { 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, 0x6d, 0x43, 0x31, 0x1f, 0xd5, 0xfb, 0x89, 0xa7 },
+ { 0x00, 0xda, 0xa9, 0x73, 0x4f, 0x95, 0xe6, 0x3c, 0x9e, 0x44, 0x37, 0xed, 0xd1, 0x0b, 0x78, 0xa2 },
+ },
+ {
+ { 0x00, 0xe9, 0xcf, 0x26, 0x83, 0x6a, 0x4c, 0xa5, 0x1b, 0xf2, 0xd4, 0x3d, 0x98, 0x71, 0x57, 0xbe },
+ { 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, 0xad, 0x9b, 0xc1, 0xf7, 0x75, 0x43, 0x19, 0x2f },
+ },
+ {
+ { 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, 0xc5, 0xfe, 0xb3, 0x88, 0x29, 0x12, 0x5f, 0x64 },
+ { 0x00, 0x97, 0x33, 0xa4, 0x66, 0xf1, 0x55, 0xc2, 0xcc, 0x5b, 0xff, 0x68, 0xaa, 0x3d, 0x99, 0x0e },
+ },
+ },
+ {
+ {
+ { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, 0xbd, 0x89, 0xd5, 0xe1, 0x6d, 0x59, 0x05, 0x31 },
+ { 0x00, 0x67, 0xce, 0xa9, 0x81, 0xe6, 0x4f, 0x28, 0x1f, 0x78, 0xd1, 0xb6, 0x9e, 0xf9, 0x50, 0x37 },
+ },
+ {
+ { 0x00, 0xca, 0x89, 0x43, 0x0f, 0xc5, 0x86, 0x4c, 0x1e, 0xd4, 0x97, 0x5d, 0x11, 0xdb, 0x98, 0x52 },
+ { 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, 0xfd, 0xc1, 0x85, 0xb9, 0x0d, 0x31, 0x75, 0x49 },
+ },
+ {
+ { 0x00, 0xfa, 0xe9, 0x13, 0xcf, 0x35, 0x26, 0xdc, 0x83, 0x79, 0x6a, 0x90, 0x4c, 0xb6, 0xa5, 0x5f },
+ { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 },
+ },
+ {
+ { 0x00, 0x4b, 0x96, 0xdd, 0x31, 0x7a, 0xa7, 0xec, 0x62, 0x29, 0xf4, 0xbf, 0x53, 0x18, 0xc5, 0x8e },
+ { 0x00, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0x66, 0x6e, 0xaa, 0xfb, 0x3f, 0x59, 0x9d, 0xcc, 0x08 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x8c, 0x05, 0x89, 0x0a, 0x86, 0x0f, 0x83, 0x14, 0x98, 0x11, 0x9d, 0x1e, 0x92, 0x1b, 0x97 },
+ { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, 0x5d, 0x75, 0x0d, 0x25, 0xfd, 0xd5, 0xad, 0x85 },
+ },
+ {
+ { 0x00, 0x65, 0xca, 0xaf, 0x89, 0xec, 0x43, 0x26, 0x0f, 0x6a, 0xc5, 0xa0, 0x86, 0xe3, 0x4c, 0x29 },
+ { 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa },
+ },
+ {
+ { 0x00, 0x46, 0x8c, 0xca, 0x05, 0x43, 0x89, 0xcf, 0x0a, 0x4c, 0x86, 0xc0, 0x0f, 0x49, 0x83, 0xc5 },
+ { 0x00, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc },
+ },
+ {
+ { 0x00, 0x5a, 0xb4, 0xee, 0x75, 0x2f, 0xc1, 0x9b, 0xea, 0xb0, 0x5e, 0x04, 0x9f, 0xc5, 0x2b, 0x71 },
+ { 0x00, 0xc9, 0x8f, 0x46, 0x03, 0xca, 0x8c, 0x45, 0x06, 0xcf, 0x89, 0x40, 0x05, 0xcc, 0x8a, 0x43 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x46, 0x8c, 0xca, 0x05, 0x43, 0x89, 0xcf, 0x0a, 0x4c, 0x86, 0xc0, 0x0f, 0x49, 0x83, 0xc5 },
+ { 0x00, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc },
+ },
+ {
+ { 0x00, 0x5c, 0xb8, 0xe4, 0x6d, 0x31, 0xd5, 0x89, 0xda, 0x86, 0x62, 0x3e, 0xb7, 0xeb, 0x0f, 0x53 },
+ { 0x00, 0xa9, 0x4f, 0xe6, 0x9e, 0x37, 0xd1, 0x78, 0x21, 0x88, 0x6e, 0xc7, 0xbf, 0x16, 0xf0, 0x59 },
+ },
+ {
+ { 0x00, 0x54, 0xa8, 0xfc, 0x4d, 0x19, 0xe5, 0xb1, 0x9a, 0xce, 0x32, 0x66, 0xd7, 0x83, 0x7f, 0x2b },
+ { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, 0x55, 0x7c, 0x07, 0x2e, 0xf1, 0xd8, 0xa3, 0x8a },
+ },
+ {
+ { 0x00, 0x6e, 0xdc, 0xb2, 0xa5, 0xcb, 0x79, 0x17, 0x57, 0x39, 0x8b, 0xe5, 0xf2, 0x9c, 0x2e, 0x40 },
+ { 0x00, 0xae, 0x41, 0xef, 0x82, 0x2c, 0xc3, 0x6d, 0x19, 0xb7, 0x58, 0xf6, 0x9b, 0x35, 0xda, 0x74 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x68, 0xd0, 0xb8, 0xbd, 0xd5, 0x6d, 0x05, 0x67, 0x0f, 0xb7, 0xdf, 0xda, 0xb2, 0x0a, 0x62 },
+ { 0x00, 0xce, 0x81, 0x4f, 0x1f, 0xd1, 0x9e, 0x50, 0x3e, 0xf0, 0xbf, 0x71, 0x21, 0xef, 0xa0, 0x6e },
+ },
+ {
+ { 0x00, 0x48, 0x90, 0xd8, 0x3d, 0x75, 0xad, 0xe5, 0x7a, 0x32, 0xea, 0xa2, 0x47, 0x0f, 0xd7, 0x9f },
+ { 0x00, 0xf4, 0xf5, 0x01, 0xf7, 0x03, 0x02, 0xf6, 0xf3, 0x07, 0x06, 0xf2, 0x04, 0xf0, 0xf1, 0x05 },
+ },
+ {
+ { 0x00, 0xa0, 0x5d, 0xfd, 0xba, 0x1a, 0xe7, 0x47, 0x69, 0xc9, 0x34, 0x94, 0xd3, 0x73, 0x8e, 0x2e },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ },
+ {
+ { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a },
+ { 0x00, 0xe0, 0xdd, 0x3d, 0xa7, 0x47, 0x7a, 0x9a, 0x53, 0xb3, 0x8e, 0x6e, 0xf4, 0x14, 0x29, 0xc9 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x70, 0xe0, 0x90, 0xdd, 0xad, 0x3d, 0x4d, 0xa7, 0xd7, 0x47, 0x37, 0x7a, 0x0a, 0x9a, 0xea },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ },
+ {
+ { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, 0x15, 0x34, 0x57, 0x76, 0x91, 0xb0, 0xd3, 0xf2 },
+ { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, 0x4d, 0x67, 0x19, 0x33, 0xe5, 0xcf, 0xb1, 0x9b },
+ },
+ {
+ { 0x00, 0x99, 0x2f, 0xb6, 0x5e, 0xc7, 0x71, 0xe8, 0xbc, 0x25, 0x93, 0x0a, 0xe2, 0x7b, 0xcd, 0x54 },
+ { 0x00, 0x65, 0xca, 0xaf, 0x89, 0xec, 0x43, 0x26, 0x0f, 0x6a, 0xc5, 0xa0, 0x86, 0xe3, 0x4c, 0x29 },
+ },
+ {
+ { 0x00, 0xb9, 0x6f, 0xd6, 0xde, 0x67, 0xb1, 0x08, 0xa1, 0x18, 0xce, 0x77, 0x7f, 0xc6, 0x10, 0xa9 },
+ { 0x00, 0x5f, 0xbe, 0xe1, 0x61, 0x3e, 0xdf, 0x80, 0xc2, 0x9d, 0x7c, 0x23, 0xa3, 0xfc, 0x1d, 0x42 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0xed, 0xd3, 0x91, 0xaf, 0x15, 0x2b, 0x69, 0x57 },
+ { 0x00, 0xc7, 0x93, 0x54, 0x3b, 0xfc, 0xa8, 0x6f, 0x76, 0xb1, 0xe5, 0x22, 0x4d, 0x8a, 0xde, 0x19 },
+ },
+ {
+ { 0x00, 0xaa, 0x49, 0xe3, 0x92, 0x38, 0xdb, 0x71, 0x39, 0x93, 0x70, 0xda, 0xab, 0x01, 0xe2, 0x48 },
+ { 0x00, 0x72, 0xe4, 0x96, 0xd5, 0xa7, 0x31, 0x43, 0xb7, 0xc5, 0x53, 0x21, 0x62, 0x10, 0x86, 0xf4 },
+ },
+ {
+ { 0x00, 0x95, 0x37, 0xa2, 0x6e, 0xfb, 0x59, 0xcc, 0xdc, 0x49, 0xeb, 0x7e, 0xb2, 0x27, 0x85, 0x10 },
+ { 0x00, 0xa5, 0x57, 0xf2, 0xae, 0x0b, 0xf9, 0x5c, 0x41, 0xe4, 0x16, 0xb3, 0xef, 0x4a, 0xb8, 0x1d },
+ },
+ {
+ { 0x00, 0xd3, 0xbb, 0x68, 0x6b, 0xb8, 0xd0, 0x03, 0xd6, 0x05, 0x6d, 0xbe, 0xbd, 0x6e, 0x06, 0xd5 },
+ { 0x00, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x30, 0xe1, 0x50, 0x9e, 0x2f, 0x1f, 0xae, 0x60, 0xd1 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xcc, 0x85, 0x49, 0x17, 0xdb, 0x92, 0x5e, 0x2e, 0xe2, 0xab, 0x67, 0x39, 0xf5, 0xbc, 0x70 },
+ { 0x00, 0x5c, 0xb8, 0xe4, 0x6d, 0x31, 0xd5, 0x89, 0xda, 0x86, 0x62, 0x3e, 0xb7, 0xeb, 0x0f, 0x53 },
+ },
+ {
+ { 0x00, 0xcd, 0x87, 0x4a, 0x13, 0xde, 0x94, 0x59, 0x26, 0xeb, 0xa1, 0x6c, 0x35, 0xf8, 0xb2, 0x7f },
+ { 0x00, 0x4c, 0x98, 0xd4, 0x2d, 0x61, 0xb5, 0xf9, 0x5a, 0x16, 0xc2, 0x8e, 0x77, 0x3b, 0xef, 0xa3 },
+ },
+ {
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ { 0x00, 0x59, 0xb2, 0xeb, 0x79, 0x20, 0xcb, 0x92, 0xf2, 0xab, 0x40, 0x19, 0x8b, 0xd2, 0x39, 0x60 },
+ },
+ {
+ { 0x00, 0xb6, 0x71, 0xc7, 0xe2, 0x54, 0x93, 0x25, 0xd9, 0x6f, 0xa8, 0x1e, 0x3b, 0x8d, 0x4a, 0xfc },
+ { 0x00, 0xaf, 0x43, 0xec, 0x86, 0x29, 0xc5, 0x6a, 0x11, 0xbe, 0x52, 0xfd, 0x97, 0x38, 0xd4, 0x7b },
+ },
+ },
+ {
+ {
+ { 0x00, 0x7d, 0xfa, 0x87, 0xe9, 0x94, 0x13, 0x6e, 0xcf, 0xb2, 0x35, 0x48, 0x26, 0x5b, 0xdc, 0xa1 },
+ { 0x00, 0x83, 0x1b, 0x98, 0x36, 0xb5, 0x2d, 0xae, 0x6c, 0xef, 0x77, 0xf4, 0x5a, 0xd9, 0x41, 0xc2 },
+ },
+ {
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ { 0x00, 0x94, 0x35, 0xa1, 0x6a, 0xfe, 0x5f, 0xcb, 0xd4, 0x40, 0xe1, 0x75, 0xbe, 0x2a, 0x8b, 0x1f },
+ },
+ {
+ { 0x00, 0x9b, 0x2b, 0xb0, 0x56, 0xcd, 0x7d, 0xe6, 0xac, 0x37, 0x87, 0x1c, 0xfa, 0x61, 0xd1, 0x4a },
+ { 0x00, 0x45, 0x8a, 0xcf, 0x09, 0x4c, 0x83, 0xc6, 0x12, 0x57, 0x98, 0xdd, 0x1b, 0x5e, 0x91, 0xd4 },
+ },
+ {
+ { 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0xed, 0xd3, 0x91, 0xaf, 0x15, 0x2b, 0x69, 0x57 },
+ { 0x00, 0xc7, 0x93, 0x54, 0x3b, 0xfc, 0xa8, 0x6f, 0x76, 0xb1, 0xe5, 0x22, 0x4d, 0x8a, 0xde, 0x19 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ { 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 },
+ },
+ {
+ { 0x00, 0xc1, 0x9f, 0x5e, 0x23, 0xe2, 0xbc, 0x7d, 0x46, 0x87, 0xd9, 0x18, 0x65, 0xa4, 0xfa, 0x3b },
+ { 0x00, 0x8c, 0x05, 0x89, 0x0a, 0x86, 0x0f, 0x83, 0x14, 0x98, 0x11, 0x9d, 0x1e, 0x92, 0x1b, 0x97 },
+ },
+ {
+ { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 },
+ { 0x00, 0xb0, 0x7d, 0xcd, 0xfa, 0x4a, 0x87, 0x37, 0xe9, 0x59, 0x94, 0x24, 0x13, 0xa3, 0x6e, 0xde },
+ },
+ {
+ { 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, 0xad, 0x9b, 0xc1, 0xf7, 0x75, 0x43, 0x19, 0x2f },
+ { 0x00, 0x47, 0x8e, 0xc9, 0x01, 0x46, 0x8f, 0xc8, 0x02, 0x45, 0x8c, 0xcb, 0x03, 0x44, 0x8d, 0xca },
+ },
+ },
+ {
+ {
+ { 0x00, 0x75, 0xea, 0x9f, 0xc9, 0xbc, 0x23, 0x56, 0x8f, 0xfa, 0x65, 0x10, 0x46, 0x33, 0xac, 0xd9 },
+ { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 },
+ },
+ {
+ { 0x00, 0x61, 0xc2, 0xa3, 0x99, 0xf8, 0x5b, 0x3a, 0x2f, 0x4e, 0xed, 0x8c, 0xb6, 0xd7, 0x74, 0x15 },
+ { 0x00, 0x5e, 0xbc, 0xe2, 0x65, 0x3b, 0xd9, 0x87, 0xca, 0x94, 0x76, 0x28, 0xaf, 0xf1, 0x13, 0x4d },
+ },
+ {
+ { 0x00, 0xc7, 0x93, 0x54, 0x3b, 0xfc, 0xa8, 0x6f, 0x76, 0xb1, 0xe5, 0x22, 0x4d, 0x8a, 0xde, 0x19 },
+ { 0x00, 0xec, 0xc5, 0x29, 0x97, 0x7b, 0x52, 0xbe, 0x33, 0xdf, 0xf6, 0x1a, 0xa4, 0x48, 0x61, 0x8d },
+ },
+ {
+ { 0x00, 0x86, 0x11, 0x97, 0x22, 0xa4, 0x33, 0xb5, 0x44, 0xc2, 0x55, 0xd3, 0x66, 0xe0, 0x77, 0xf1 },
+ { 0x00, 0x88, 0x0d, 0x85, 0x1a, 0x92, 0x17, 0x9f, 0x34, 0xbc, 0x39, 0xb1, 0x2e, 0xa6, 0x23, 0xab },
+ },
+ },
+ {
+ {
+ { 0x00, 0xb5, 0x77, 0xc2, 0xee, 0x5b, 0x99, 0x2c, 0xc1, 0x74, 0xb6, 0x03, 0x2f, 0x9a, 0x58, 0xed },
+ { 0x00, 0x9f, 0x23, 0xbc, 0x46, 0xd9, 0x65, 0xfa, 0x8c, 0x13, 0xaf, 0x30, 0xca, 0x55, 0xe9, 0x76 },
+ },
+ {
+ { 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, 0xdd, 0xe5, 0xad, 0x95, 0x3d, 0x05, 0x4d, 0x75 },
+ { 0x00, 0xa7, 0x53, 0xf4, 0xa6, 0x01, 0xf5, 0x52, 0x51, 0xf6, 0x02, 0xa5, 0xf7, 0x50, 0xa4, 0x03 },
+ },
+ {
+ { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 },
+ { 0x00, 0x90, 0x3d, 0xad, 0x7a, 0xea, 0x47, 0xd7, 0xf4, 0x64, 0xc9, 0x59, 0x8e, 0x1e, 0xb3, 0x23 },
+ },
+ {
+ { 0x00, 0xbf, 0x63, 0xdc, 0xc6, 0x79, 0xa5, 0x1a, 0x91, 0x2e, 0xf2, 0x4d, 0x57, 0xe8, 0x34, 0x8b },
+ { 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, 0xe5, 0xda, 0x9b, 0xa4, 0x19, 0x26, 0x67, 0x58 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xdb, 0xab, 0x70, 0x4b, 0x90, 0xe0, 0x3b, 0x96, 0x4d, 0x3d, 0xe6, 0xdd, 0x06, 0x76, 0xad },
+ { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, 0x95, 0xa4, 0xf7, 0xc6, 0x51, 0x60, 0x33, 0x02 },
+ },
+ {
+ { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 },
+ { 0x00, 0xa0, 0x5d, 0xfd, 0xba, 0x1a, 0xe7, 0x47, 0x69, 0xc9, 0x34, 0x94, 0xd3, 0x73, 0x8e, 0x2e },
+ },
+ {
+ { 0x00, 0xc0, 0x9d, 0x5d, 0x27, 0xe7, 0xba, 0x7a, 0x4e, 0x8e, 0xd3, 0x13, 0x69, 0xa9, 0xf4, 0x34 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ },
+ {
+ { 0x00, 0xa2, 0x59, 0xfb, 0xb2, 0x10, 0xeb, 0x49, 0x79, 0xdb, 0x20, 0x82, 0xcb, 0x69, 0x92, 0x30 },
+ { 0x00, 0xf2, 0xf9, 0x0b, 0xef, 0x1d, 0x16, 0xe4, 0xc3, 0x31, 0x3a, 0xc8, 0x2c, 0xde, 0xd5, 0x27 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44 },
+ { 0x00, 0xc0, 0x9d, 0x5d, 0x27, 0xe7, 0xba, 0x7a, 0x4e, 0x8e, 0xd3, 0x13, 0x69, 0xa9, 0xf4, 0x34 },
+ },
+ {
+ { 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0xed, 0xd3, 0x91, 0xaf, 0x15, 0x2b, 0x69, 0x57 },
+ { 0x00, 0xc7, 0x93, 0x54, 0x3b, 0xfc, 0xa8, 0x6f, 0x76, 0xb1, 0xe5, 0x22, 0x4d, 0x8a, 0xde, 0x19 },
+ },
+ {
+ { 0x00, 0x78, 0xf0, 0x88, 0xfd, 0x85, 0x0d, 0x75, 0xe7, 0x9f, 0x17, 0x6f, 0x1a, 0x62, 0xea, 0x92 },
+ { 0x00, 0xd3, 0xbb, 0x68, 0x6b, 0xb8, 0xd0, 0x03, 0xd6, 0x05, 0x6d, 0xbe, 0xbd, 0x6e, 0x06, 0xd5 },
+ },
+ {
+ { 0x00, 0xa7, 0x53, 0xf4, 0xa6, 0x01, 0xf5, 0x52, 0x51, 0xf6, 0x02, 0xa5, 0xf7, 0x50, 0xa4, 0x03 },
+ { 0x00, 0xa2, 0x59, 0xfb, 0xb2, 0x10, 0xeb, 0x49, 0x79, 0xdb, 0x20, 0x82, 0xcb, 0x69, 0x92, 0x30 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xdf, 0xa3, 0x7c, 0x5b, 0x84, 0xf8, 0x27, 0xb6, 0x69, 0x15, 0xca, 0xed, 0x32, 0x4e, 0x91 },
+ { 0x00, 0x71, 0xe2, 0x93, 0xd9, 0xa8, 0x3b, 0x4a, 0xaf, 0xde, 0x4d, 0x3c, 0x76, 0x07, 0x94, 0xe5 },
+ },
+ {
+ { 0x00, 0xd1, 0xbf, 0x6e, 0x63, 0xb2, 0xdc, 0x0d, 0xc6, 0x17, 0x79, 0xa8, 0xa5, 0x74, 0x1a, 0xcb },
+ { 0x00, 0x91, 0x3f, 0xae, 0x7e, 0xef, 0x41, 0xd0, 0xfc, 0x6d, 0xc3, 0x52, 0x82, 0x13, 0xbd, 0x2c },
+ },
+ {
+ { 0x00, 0x89, 0x0f, 0x86, 0x1e, 0x97, 0x11, 0x98, 0x3c, 0xb5, 0x33, 0xba, 0x22, 0xab, 0x2d, 0xa4 },
+ { 0x00, 0x78, 0xf0, 0x88, 0xfd, 0x85, 0x0d, 0x75, 0xe7, 0x9f, 0x17, 0x6f, 0x1a, 0x62, 0xea, 0x92 },
+ },
+ {
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x9d, 0xad, 0xfd, 0xcd, 0x5d, 0x6d, 0x3d, 0x0d },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x9e, 0x21, 0xbf, 0x42, 0xdc, 0x63, 0xfd, 0x84, 0x1a, 0xa5, 0x3b, 0xc6, 0x58, 0xe7, 0x79 },
+ { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 },
+ },
+ {
+ { 0x00, 0xd5, 0xb7, 0x62, 0x73, 0xa6, 0xc4, 0x11, 0xe6, 0x33, 0x51, 0x84, 0x95, 0x40, 0x22, 0xf7 },
+ { 0x00, 0xd1, 0xbf, 0x6e, 0x63, 0xb2, 0xdc, 0x0d, 0xc6, 0x17, 0x79, 0xa8, 0xa5, 0x74, 0x1a, 0xcb },
+ },
+ {
+ { 0x00, 0x92, 0x39, 0xab, 0x72, 0xe0, 0x4b, 0xd9, 0xe4, 0x76, 0xdd, 0x4f, 0x96, 0x04, 0xaf, 0x3d },
+ { 0x00, 0xd5, 0xb7, 0x62, 0x73, 0xa6, 0xc4, 0x11, 0xe6, 0x33, 0x51, 0x84, 0x95, 0x40, 0x22, 0xf7 },
+ },
+ {
+ { 0x00, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc },
+ { 0x00, 0x5d, 0xba, 0xe7, 0x69, 0x34, 0xd3, 0x8e, 0xd2, 0x8f, 0x68, 0x35, 0xbb, 0xe6, 0x01, 0x5c },
+ },
+ },
+ {
+ {
+ { 0x00, 0x6d, 0xda, 0xb7, 0xa9, 0xc4, 0x73, 0x1e, 0x4f, 0x22, 0x95, 0xf8, 0xe6, 0x8b, 0x3c, 0x51 },
+ { 0x00, 0x9e, 0x21, 0xbf, 0x42, 0xdc, 0x63, 0xfd, 0x84, 0x1a, 0xa5, 0x3b, 0xc6, 0x58, 0xe7, 0x79 },
+ },
+ {
+ { 0x00, 0xcb, 0x8b, 0x40, 0x0b, 0xc0, 0x80, 0x4b, 0x16, 0xdd, 0x9d, 0x56, 0x1d, 0xd6, 0x96, 0x5d },
+ { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, 0x7d, 0x51, 0x25, 0x09, 0xcd, 0xe1, 0x95, 0xb9 },
+ },
+ {
+ { 0x00, 0xe3, 0xdb, 0x38, 0xab, 0x48, 0x70, 0x93, 0x4b, 0xa8, 0x90, 0x73, 0xe0, 0x03, 0x3b, 0xd8 },
+ { 0x00, 0x96, 0x31, 0xa7, 0x62, 0xf4, 0x53, 0xc5, 0xc4, 0x52, 0xf5, 0x63, 0xa6, 0x30, 0x97, 0x01 },
+ },
+ {
+ { 0x00, 0xeb, 0xcb, 0x20, 0x8b, 0x60, 0x40, 0xab, 0x0b, 0xe0, 0xc0, 0x2b, 0x80, 0x6b, 0x4b, 0xa0 },
+ { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x79, 0xf2, 0x8b, 0xf9, 0x80, 0x0b, 0x72, 0xef, 0x96, 0x1d, 0x64, 0x16, 0x6f, 0xe4, 0x9d },
+ { 0x00, 0xc3, 0x9b, 0x58, 0x2b, 0xe8, 0xb0, 0x73, 0x56, 0x95, 0xcd, 0x0e, 0x7d, 0xbe, 0xe6, 0x25 },
+ },
+ {
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ },
+ {
+ { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b },
+ { 0x00, 0xd0, 0xbd, 0x6d, 0x67, 0xb7, 0xda, 0x0a, 0xce, 0x1e, 0x73, 0xa3, 0xa9, 0x79, 0x14, 0xc4 },
+ },
+ {
+ { 0x00, 0xc7, 0x93, 0x54, 0x3b, 0xfc, 0xa8, 0x6f, 0x76, 0xb1, 0xe5, 0x22, 0x4d, 0x8a, 0xde, 0x19 },
+ { 0x00, 0xec, 0xc5, 0x29, 0x97, 0x7b, 0x52, 0xbe, 0x33, 0xdf, 0xf6, 0x1a, 0xa4, 0x48, 0x61, 0x8d },
+ },
+ },
+ {
+ {
+ { 0x00, 0xeb, 0xcb, 0x20, 0x8b, 0x60, 0x40, 0xab, 0x0b, 0xe0, 0xc0, 0x2b, 0x80, 0x6b, 0x4b, 0xa0 },
+ { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 },
+ },
+ {
+ { 0x00, 0xdc, 0xa5, 0x79, 0x57, 0x8b, 0xf2, 0x2e, 0xae, 0x72, 0x0b, 0xd7, 0xf9, 0x25, 0x5c, 0x80 },
+ { 0x00, 0x41, 0x82, 0xc3, 0x19, 0x58, 0x9b, 0xda, 0x32, 0x73, 0xb0, 0xf1, 0x2b, 0x6a, 0xa9, 0xe8 },
+ },
+ {
+ { 0x00, 0xb0, 0x7d, 0xcd, 0xfa, 0x4a, 0x87, 0x37, 0xe9, 0x59, 0x94, 0x24, 0x13, 0xa3, 0x6e, 0xde },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ },
+ {
+ { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, 0x75, 0x58, 0x2f, 0x02, 0xc1, 0xec, 0x9b, 0xb6 },
+ { 0x00, 0xea, 0xc9, 0x23, 0x8f, 0x65, 0x46, 0xac, 0x03, 0xe9, 0xca, 0x20, 0x8c, 0x66, 0x45, 0xaf },
+ },
+ },
+ {
+ {
+ { 0x00, 0x63, 0xc6, 0xa5, 0x91, 0xf2, 0x57, 0x34, 0x3f, 0x5c, 0xf9, 0x9a, 0xae, 0xcd, 0x68, 0x0b },
+ { 0x00, 0x7e, 0xfc, 0x82, 0xe5, 0x9b, 0x19, 0x67, 0xd7, 0xa9, 0x2b, 0x55, 0x32, 0x4c, 0xce, 0xb0 },
+ },
+ {
+ { 0x00, 0x5e, 0xbc, 0xe2, 0x65, 0x3b, 0xd9, 0x87, 0xca, 0x94, 0x76, 0x28, 0xaf, 0xf1, 0x13, 0x4d },
+ { 0x00, 0x89, 0x0f, 0x86, 0x1e, 0x97, 0x11, 0x98, 0x3c, 0xb5, 0x33, 0xba, 0x22, 0xab, 0x2d, 0xa4 },
+ },
+ {
+ { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, 0x4d, 0x67, 0x19, 0x33, 0xe5, 0xcf, 0xb1, 0x9b },
+ { 0x00, 0x9a, 0x29, 0xb3, 0x52, 0xc8, 0x7b, 0xe1, 0xa4, 0x3e, 0x8d, 0x17, 0xf6, 0x6c, 0xdf, 0x45 },
+ },
+ {
+ { 0x00, 0x96, 0x31, 0xa7, 0x62, 0xf4, 0x53, 0xc5, 0xc4, 0x52, 0xf5, 0x63, 0xa6, 0x30, 0x97, 0x01 },
+ { 0x00, 0x95, 0x37, 0xa2, 0x6e, 0xfb, 0x59, 0xcc, 0xdc, 0x49, 0xeb, 0x7e, 0xb2, 0x27, 0x85, 0x10 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x9f, 0x23, 0xbc, 0x46, 0xd9, 0x65, 0xfa, 0x8c, 0x13, 0xaf, 0x30, 0xca, 0x55, 0xe9, 0x76 },
+ { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b, 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 },
+ },
+ {
+ { 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, 0x3d, 0x19, 0x75, 0x51, 0xad, 0x89, 0xe5, 0xc1 },
+ { 0x00, 0x7a, 0xf4, 0x8e, 0xf5, 0x8f, 0x01, 0x7b, 0xf7, 0x8d, 0x03, 0x79, 0x02, 0x78, 0xf6, 0x8c },
+ },
+ {
+ { 0x00, 0x8c, 0x05, 0x89, 0x0a, 0x86, 0x0f, 0x83, 0x14, 0x98, 0x11, 0x9d, 0x1e, 0x92, 0x1b, 0x97 },
+ { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, 0x5d, 0x75, 0x0d, 0x25, 0xfd, 0xd5, 0xad, 0x85 },
+ },
+ {
+ { 0x00, 0x67, 0xce, 0xa9, 0x81, 0xe6, 0x4f, 0x28, 0x1f, 0x78, 0xd1, 0xb6, 0x9e, 0xf9, 0x50, 0x37 },
+ { 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0xed, 0xd3, 0x91, 0xaf, 0x15, 0x2b, 0x69, 0x57 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, 0xdd, 0xe5, 0xad, 0x95, 0x3d, 0x05, 0x4d, 0x75 },
+ { 0x00, 0xa7, 0x53, 0xf4, 0xa6, 0x01, 0xf5, 0x52, 0x51, 0xf6, 0x02, 0xa5, 0xf7, 0x50, 0xa4, 0x03 },
+ },
+ {
+ { 0x00, 0xb8, 0x6d, 0xd5, 0xda, 0x62, 0xb7, 0x0f, 0xa9, 0x11, 0xc4, 0x7c, 0x73, 0xcb, 0x1e, 0xa6 },
+ { 0x00, 0x4f, 0x9e, 0xd1, 0x21, 0x6e, 0xbf, 0xf0, 0x42, 0x0d, 0xdc, 0x93, 0x63, 0x2c, 0xfd, 0xb2 },
+ },
+ {
+ { 0x00, 0xfb, 0xeb, 0x10, 0xcb, 0x30, 0x20, 0xdb, 0x8b, 0x70, 0x60, 0x9b, 0x40, 0xbb, 0xab, 0x50 },
+ { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 },
+ },
+ {
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x1d, 0x3d, 0x5d, 0x7d, 0x9d, 0xbd, 0xdd, 0xfd },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ },
+ },
+ {
+ {
+ { 0x00, 0xd0, 0xbd, 0x6d, 0x67, 0xb7, 0xda, 0x0a, 0xce, 0x1e, 0x73, 0xa3, 0xa9, 0x79, 0x14, 0xc4 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ },
+ {
+ { 0x00, 0xde, 0xa1, 0x7f, 0x5f, 0x81, 0xfe, 0x20, 0xbe, 0x60, 0x1f, 0xc1, 0xe1, 0x3f, 0x40, 0x9e },
+ { 0x00, 0x61, 0xc2, 0xa3, 0x99, 0xf8, 0x5b, 0x3a, 0x2f, 0x4e, 0xed, 0x8c, 0xb6, 0xd7, 0x74, 0x15 },
+ },
+ {
+ { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd },
+ { 0x00, 0x6d, 0xda, 0xb7, 0xa9, 0xc4, 0x73, 0x1e, 0x4f, 0x22, 0x95, 0xf8, 0xe6, 0x8b, 0x3c, 0x51 },
+ },
+ {
+ { 0x00, 0xb5, 0x77, 0xc2, 0xee, 0x5b, 0x99, 0x2c, 0xc1, 0x74, 0xb6, 0x03, 0x2f, 0x9a, 0x58, 0xed },
+ { 0x00, 0x9f, 0x23, 0xbc, 0x46, 0xd9, 0x65, 0xfa, 0x8c, 0x13, 0xaf, 0x30, 0xca, 0x55, 0xe9, 0x76 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x94, 0x35, 0xa1, 0x6a, 0xfe, 0x5f, 0xcb, 0xd4, 0x40, 0xe1, 0x75, 0xbe, 0x2a, 0x8b, 0x1f },
+ { 0x00, 0xb5, 0x77, 0xc2, 0xee, 0x5b, 0x99, 0x2c, 0xc1, 0x74, 0xb6, 0x03, 0x2f, 0x9a, 0x58, 0xed },
+ },
+ {
+ { 0x00, 0x79, 0xf2, 0x8b, 0xf9, 0x80, 0x0b, 0x72, 0xef, 0x96, 0x1d, 0x64, 0x16, 0x6f, 0xe4, 0x9d },
+ { 0x00, 0xc3, 0x9b, 0x58, 0x2b, 0xe8, 0xb0, 0x73, 0x56, 0x95, 0xcd, 0x0e, 0x7d, 0xbe, 0xe6, 0x25 },
+ },
+ {
+ { 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, 0xe5, 0xda, 0x9b, 0xa4, 0x19, 0x26, 0x67, 0x58 },
+ { 0x00, 0xd7, 0xb3, 0x64, 0x7b, 0xac, 0xc8, 0x1f, 0xf6, 0x21, 0x45, 0x92, 0x8d, 0x5a, 0x3e, 0xe9 },
+ },
+ {
+ { 0x00, 0x9a, 0x29, 0xb3, 0x52, 0xc8, 0x7b, 0xe1, 0xa4, 0x3e, 0x8d, 0x17, 0xf6, 0x6c, 0xdf, 0x45 },
+ { 0x00, 0x55, 0xaa, 0xff, 0x49, 0x1c, 0xe3, 0xb6, 0x92, 0xc7, 0x38, 0x6d, 0xdb, 0x8e, 0x71, 0x24 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xa5, 0x57, 0xf2, 0xae, 0x0b, 0xf9, 0x5c, 0x41, 0xe4, 0x16, 0xb3, 0xef, 0x4a, 0xb8, 0x1d },
+ { 0x00, 0x82, 0x19, 0x9b, 0x32, 0xb0, 0x2b, 0xa9, 0x64, 0xe6, 0x7d, 0xff, 0x56, 0xd4, 0x4f, 0xcd },
+ },
+ {
+ { 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, 0xad, 0x9b, 0xc1, 0xf7, 0x75, 0x43, 0x19, 0x2f },
+ { 0x00, 0x47, 0x8e, 0xc9, 0x01, 0x46, 0x8f, 0xc8, 0x02, 0x45, 0x8c, 0xcb, 0x03, 0x44, 0x8d, 0xca },
+ },
+ {
+ { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, 0x2d, 0x0b, 0x61, 0x47, 0xb5, 0x93, 0xf9, 0xdf },
+ { 0x00, 0x5a, 0xb4, 0xee, 0x75, 0x2f, 0xc1, 0x9b, 0xea, 0xb0, 0x5e, 0x04, 0x9f, 0xc5, 0x2b, 0x71 },
+ },
+ {
+ { 0x00, 0xe0, 0xdd, 0x3d, 0xa7, 0x47, 0x7a, 0x9a, 0x53, 0xb3, 0x8e, 0x6e, 0xf4, 0x14, 0x29, 0xc9 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ },
+ },
+ {
+ {
+ { 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, 0x3d, 0x19, 0x75, 0x51, 0xad, 0x89, 0xe5, 0xc1 },
+ { 0x00, 0x7a, 0xf4, 0x8e, 0xf5, 0x8f, 0x01, 0x7b, 0xf7, 0x8d, 0x03, 0x79, 0x02, 0x78, 0xf6, 0x8c },
+ },
+ {
+ { 0x00, 0x43, 0x86, 0xc5, 0x11, 0x52, 0x97, 0xd4, 0x22, 0x61, 0xa4, 0xe7, 0x33, 0x70, 0xb5, 0xf6 },
+ { 0x00, 0x44, 0x88, 0xcc, 0x0d, 0x49, 0x85, 0xc1, 0x1a, 0x5e, 0x92, 0xd6, 0x17, 0x53, 0x9f, 0xdb },
+ },
+ {
+ { 0x00, 0x65, 0xca, 0xaf, 0x89, 0xec, 0x43, 0x26, 0x0f, 0x6a, 0xc5, 0xa0, 0x86, 0xe3, 0x4c, 0x29 },
+ { 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa },
+ },
+ {
+ { 0x00, 0xa8, 0x4d, 0xe5, 0x9a, 0x32, 0xd7, 0x7f, 0x29, 0x81, 0x64, 0xcc, 0xb3, 0x1b, 0xfe, 0x56 },
+ { 0x00, 0x52, 0xa4, 0xf6, 0x55, 0x07, 0xf1, 0xa3, 0xaa, 0xf8, 0x0e, 0x5c, 0xff, 0xad, 0x5b, 0x09 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x89, 0x0f, 0x86, 0x1e, 0x97, 0x11, 0x98, 0x3c, 0xb5, 0x33, 0xba, 0x22, 0xab, 0x2d, 0xa4 },
+ { 0x00, 0x78, 0xf0, 0x88, 0xfd, 0x85, 0x0d, 0x75, 0xe7, 0x9f, 0x17, 0x6f, 0x1a, 0x62, 0xea, 0x92 },
+ },
+ {
+ { 0x00, 0x72, 0xe4, 0x96, 0xd5, 0xa7, 0x31, 0x43, 0xb7, 0xc5, 0x53, 0x21, 0x62, 0x10, 0x86, 0xf4 },
+ { 0x00, 0x73, 0xe6, 0x95, 0xd1, 0xa2, 0x37, 0x44, 0xbf, 0xcc, 0x59, 0x2a, 0x6e, 0x1d, 0x88, 0xfb },
+ },
+ {
+ { 0x00, 0x87, 0x13, 0x94, 0x26, 0xa1, 0x35, 0xb2, 0x4c, 0xcb, 0x5f, 0xd8, 0x6a, 0xed, 0x79, 0xfe },
+ { 0x00, 0x98, 0x2d, 0xb5, 0x5a, 0xc2, 0x77, 0xef, 0xb4, 0x2c, 0x99, 0x01, 0xee, 0x76, 0xc3, 0x5b },
+ },
+ {
+ { 0x00, 0xc6, 0x91, 0x57, 0x3f, 0xf9, 0xae, 0x68, 0x7e, 0xb8, 0xef, 0x29, 0x41, 0x87, 0xd0, 0x16 },
+ { 0x00, 0xfc, 0xe5, 0x19, 0xd7, 0x2b, 0x32, 0xce, 0xb3, 0x4f, 0x56, 0xaa, 0x64, 0x98, 0x81, 0x7d },
+ },
+ },
+ {
+ {
+ { 0x00, 0x5c, 0xb8, 0xe4, 0x6d, 0x31, 0xd5, 0x89, 0xda, 0x86, 0x62, 0x3e, 0xb7, 0xeb, 0x0f, 0x53 },
+ { 0x00, 0xa9, 0x4f, 0xe6, 0x9e, 0x37, 0xd1, 0x78, 0x21, 0x88, 0x6e, 0xc7, 0xbf, 0x16, 0xf0, 0x59 },
+ },
+ {
+ { 0x00, 0xd9, 0xaf, 0x76, 0x43, 0x9a, 0xec, 0x35, 0x86, 0x5f, 0x29, 0xf0, 0xc5, 0x1c, 0x6a, 0xb3 },
+ { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff },
+ },
+ {
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ { 0x00, 0x4a, 0x94, 0xde, 0x35, 0x7f, 0xa1, 0xeb, 0x6a, 0x20, 0xfe, 0xb4, 0x5f, 0x15, 0xcb, 0x81 },
+ },
+ {
+ { 0x00, 0x80, 0x1d, 0x9d, 0x3a, 0xba, 0x27, 0xa7, 0x74, 0xf4, 0x69, 0xe9, 0x4e, 0xce, 0x53, 0xd3 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x65, 0xca, 0xaf, 0x89, 0xec, 0x43, 0x26, 0x0f, 0x6a, 0xc5, 0xa0, 0x86, 0xe3, 0x4c, 0x29 },
+ { 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa },
+ },
+ {
+ { 0x00, 0xf8, 0xed, 0x15, 0xc7, 0x3f, 0x2a, 0xd2, 0x93, 0x6b, 0x7e, 0x86, 0x54, 0xac, 0xb9, 0x41 },
+ { 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, 0xc5, 0xfe, 0xb3, 0x88, 0x29, 0x12, 0x5f, 0x64 },
+ },
+ {
+ { 0x00, 0x5c, 0xb8, 0xe4, 0x6d, 0x31, 0xd5, 0x89, 0xda, 0x86, 0x62, 0x3e, 0xb7, 0xeb, 0x0f, 0x53 },
+ { 0x00, 0xa9, 0x4f, 0xe6, 0x9e, 0x37, 0xd1, 0x78, 0x21, 0x88, 0x6e, 0xc7, 0xbf, 0x16, 0xf0, 0x59 },
+ },
+ {
+ { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c, 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c },
+ { 0x00, 0x40, 0x80, 0xc0, 0x1d, 0x5d, 0x9d, 0xdd, 0x3a, 0x7a, 0xba, 0xfa, 0x27, 0x67, 0xa7, 0xe7 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x5b, 0xb6, 0xed, 0x71, 0x2a, 0xc7, 0x9c, 0xe2, 0xb9, 0x54, 0x0f, 0x93, 0xc8, 0x25, 0x7e },
+ { 0x00, 0xd9, 0xaf, 0x76, 0x43, 0x9a, 0xec, 0x35, 0x86, 0x5f, 0x29, 0xf0, 0xc5, 0x1c, 0x6a, 0xb3 },
+ },
+ {
+ { 0x00, 0xf9, 0xef, 0x16, 0xc3, 0x3a, 0x2c, 0xd5, 0x9b, 0x62, 0x74, 0x8d, 0x58, 0xa1, 0xb7, 0x4e },
+ { 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, 0x45, 0x6e, 0x13, 0x38, 0xe9, 0xc2, 0xbf, 0x94 },
+ },
+ {
+ { 0x00, 0x66, 0xcc, 0xaa, 0x85, 0xe3, 0x49, 0x2f, 0x17, 0x71, 0xdb, 0xbd, 0x92, 0xf4, 0x5e, 0x38 },
+ { 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, 0x6d, 0x43, 0x31, 0x1f, 0xd5, 0xfb, 0x89, 0xa7 },
+ },
+ {
+ { 0x00, 0x8d, 0x07, 0x8a, 0x0e, 0x83, 0x09, 0x84, 0x1c, 0x91, 0x1b, 0x96, 0x12, 0x9f, 0x15, 0x98 },
+ { 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, 0xdd, 0xe5, 0xad, 0x95, 0x3d, 0x05, 0x4d, 0x75 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xae, 0x41, 0xef, 0x82, 0x2c, 0xc3, 0x6d, 0x19, 0xb7, 0x58, 0xf6, 0x9b, 0x35, 0xda, 0x74 },
+ { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, 0x8d, 0xbf, 0xe9, 0xdb, 0x45, 0x77, 0x21, 0x13 },
+ },
+ {
+ { 0x00, 0xa2, 0x59, 0xfb, 0xb2, 0x10, 0xeb, 0x49, 0x79, 0xdb, 0x20, 0x82, 0xcb, 0x69, 0x92, 0x30 },
+ { 0x00, 0xf2, 0xf9, 0x0b, 0xef, 0x1d, 0x16, 0xe4, 0xc3, 0x31, 0x3a, 0xc8, 0x2c, 0xde, 0xd5, 0x27 },
+ },
+ {
+ { 0x00, 0x61, 0xc2, 0xa3, 0x99, 0xf8, 0x5b, 0x3a, 0x2f, 0x4e, 0xed, 0x8c, 0xb6, 0xd7, 0x74, 0x15 },
+ { 0x00, 0x5e, 0xbc, 0xe2, 0x65, 0x3b, 0xd9, 0x87, 0xca, 0x94, 0x76, 0x28, 0xaf, 0xf1, 0x13, 0x4d },
+ },
+ {
+ { 0x00, 0xfe, 0xe1, 0x1f, 0xdf, 0x21, 0x3e, 0xc0, 0xa3, 0x5d, 0x42, 0xbc, 0x7c, 0x82, 0x9d, 0x63 },
+ { 0x00, 0x5b, 0xb6, 0xed, 0x71, 0x2a, 0xc7, 0x9c, 0xe2, 0xb9, 0x54, 0x0f, 0x93, 0xc8, 0x25, 0x7e },
+ },
+ },
+ {
+ {
+ { 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, 0xa5, 0x92, 0xcb, 0xfc, 0x79, 0x4e, 0x17, 0x20 },
+ { 0x00, 0x57, 0xae, 0xf9, 0x41, 0x16, 0xef, 0xb8, 0x82, 0xd5, 0x2c, 0x7b, 0xc3, 0x94, 0x6d, 0x3a },
+ },
+ {
+ { 0x00, 0xa4, 0x55, 0xf1, 0xaa, 0x0e, 0xff, 0x5b, 0x49, 0xed, 0x1c, 0xb8, 0xe3, 0x47, 0xb6, 0x12 },
+ { 0x00, 0x92, 0x39, 0xab, 0x72, 0xe0, 0x4b, 0xd9, 0xe4, 0x76, 0xdd, 0x4f, 0x96, 0x04, 0xaf, 0x3d },
+ },
+ {
+ { 0x00, 0x79, 0xf2, 0x8b, 0xf9, 0x80, 0x0b, 0x72, 0xef, 0x96, 0x1d, 0x64, 0x16, 0x6f, 0xe4, 0x9d },
+ { 0x00, 0xc3, 0x9b, 0x58, 0x2b, 0xe8, 0xb0, 0x73, 0x56, 0x95, 0xcd, 0x0e, 0x7d, 0xbe, 0xe6, 0x25 },
+ },
+ {
+ { 0x00, 0x75, 0xea, 0x9f, 0xc9, 0xbc, 0x23, 0x56, 0x8f, 0xfa, 0x65, 0x10, 0x46, 0x33, 0xac, 0xd9 },
+ { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, 0x85, 0xb6, 0xe3, 0xd0, 0x49, 0x7a, 0x2f, 0x1c },
+ { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd },
+ },
+ {
+ { 0x00, 0x6a, 0xd4, 0xbe, 0xb5, 0xdf, 0x61, 0x0b, 0x77, 0x1d, 0xa3, 0xc9, 0xc2, 0xa8, 0x16, 0x7c },
+ { 0x00, 0xee, 0xc1, 0x2f, 0x9f, 0x71, 0x5e, 0xb0, 0x23, 0xcd, 0xe2, 0x0c, 0xbc, 0x52, 0x7d, 0x93 },
+ },
+ {
+ { 0x00, 0x4d, 0x9a, 0xd7, 0x29, 0x64, 0xb3, 0xfe, 0x52, 0x1f, 0xc8, 0x85, 0x7b, 0x36, 0xe1, 0xac },
+ { 0x00, 0xa4, 0x55, 0xf1, 0xaa, 0x0e, 0xff, 0x5b, 0x49, 0xed, 0x1c, 0xb8, 0xe3, 0x47, 0xb6, 0x12 },
+ },
+ {
+ { 0x00, 0x5e, 0xbc, 0xe2, 0x65, 0x3b, 0xd9, 0x87, 0xca, 0x94, 0x76, 0x28, 0xaf, 0xf1, 0x13, 0x4d },
+ { 0x00, 0x89, 0x0f, 0x86, 0x1e, 0x97, 0x11, 0x98, 0x3c, 0xb5, 0x33, 0xba, 0x22, 0xab, 0x2d, 0xa4 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x4c, 0x98, 0xd4, 0x2d, 0x61, 0xb5, 0xf9, 0x5a, 0x16, 0xc2, 0x8e, 0x77, 0x3b, 0xef, 0xa3 },
+ { 0x00, 0xb4, 0x75, 0xc1, 0xea, 0x5e, 0x9f, 0x2b, 0xc9, 0x7d, 0xbc, 0x08, 0x23, 0x97, 0x56, 0xe2 },
+ },
+ {
+ { 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, 0xf5, 0xc8, 0x8f, 0xb2, 0x01, 0x3c, 0x7b, 0x46 },
+ { 0x00, 0xf7, 0xf3, 0x04, 0xfb, 0x0c, 0x08, 0xff, 0xeb, 0x1c, 0x18, 0xef, 0x10, 0xe7, 0xe3, 0x14 },
+ },
+ {
+ { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, 0x8d, 0xbf, 0xe9, 0xdb, 0x45, 0x77, 0x21, 0x13 },
+ { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15, 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d },
+ },
+ {
+ { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, 0x05, 0x26, 0x43, 0x60, 0x89, 0xaa, 0xcf, 0xec },
+ { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xdd, 0xa7, 0x7a, 0x53, 0x8e, 0xf4, 0x29, 0xa6, 0x7b, 0x01, 0xdc, 0xf5, 0x28, 0x52, 0x8f },
+ { 0x00, 0x51, 0xa2, 0xf3, 0x59, 0x08, 0xfb, 0xaa, 0xb2, 0xe3, 0x10, 0x41, 0xeb, 0xba, 0x49, 0x18 },
+ },
+ {
+ { 0x00, 0xea, 0xc9, 0x23, 0x8f, 0x65, 0x46, 0xac, 0x03, 0xe9, 0xca, 0x20, 0x8c, 0x66, 0x45, 0xaf },
+ { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12, 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 },
+ },
+ {
+ { 0x00, 0xb3, 0x7b, 0xc8, 0xf6, 0x45, 0x8d, 0x3e, 0xf1, 0x42, 0x8a, 0x39, 0x07, 0xb4, 0x7c, 0xcf },
+ { 0x00, 0xff, 0xe3, 0x1c, 0xdb, 0x24, 0x38, 0xc7, 0xab, 0x54, 0x48, 0xb7, 0x70, 0x8f, 0x93, 0x6c },
+ },
+ {
+ { 0x00, 0xca, 0x89, 0x43, 0x0f, 0xc5, 0x86, 0x4c, 0x1e, 0xd4, 0x97, 0x5d, 0x11, 0xdb, 0x98, 0x52 },
+ { 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, 0xfd, 0xc1, 0x85, 0xb9, 0x0d, 0x31, 0x75, 0x49 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x47, 0x8e, 0xc9, 0x01, 0x46, 0x8f, 0xc8, 0x02, 0x45, 0x8c, 0xcb, 0x03, 0x44, 0x8d, 0xca },
+ { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c, 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c },
+ },
+ {
+ { 0x00, 0x8e, 0x01, 0x8f, 0x02, 0x8c, 0x03, 0x8d, 0x04, 0x8a, 0x05, 0x8b, 0x06, 0x88, 0x07, 0x89 },
+ { 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 },
+ },
+ {
+ { 0x00, 0x8d, 0x07, 0x8a, 0x0e, 0x83, 0x09, 0x84, 0x1c, 0x91, 0x1b, 0x96, 0x12, 0x9f, 0x15, 0x98 },
+ { 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, 0xdd, 0xe5, 0xad, 0x95, 0x3d, 0x05, 0x4d, 0x75 },
+ },
+ {
+ { 0x00, 0x8f, 0x03, 0x8c, 0x06, 0x89, 0x05, 0x8a, 0x0c, 0x83, 0x0f, 0x80, 0x0a, 0x85, 0x09, 0x86 },
+ { 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xf4, 0xf5, 0x01, 0xf7, 0x03, 0x02, 0xf6, 0xf3, 0x07, 0x06, 0xf2, 0x04, 0xf0, 0xf1, 0x05 },
+ { 0x00, 0xfb, 0xeb, 0x10, 0xcb, 0x30, 0x20, 0xdb, 0x8b, 0x70, 0x60, 0x9b, 0x40, 0xbb, 0xab, 0x50 },
+ },
+ {
+ { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x9d, 0xad, 0xfd, 0xcd, 0x5d, 0x6d, 0x3d, 0x0d },
+ },
+ {
+ { 0x00, 0x52, 0xa4, 0xf6, 0x55, 0x07, 0xf1, 0xa3, 0xaa, 0xf8, 0x0e, 0x5c, 0xff, 0xad, 0x5b, 0x09 },
+ { 0x00, 0x49, 0x92, 0xdb, 0x39, 0x70, 0xab, 0xe2, 0x72, 0x3b, 0xe0, 0xa9, 0x4b, 0x02, 0xd9, 0x90 },
+ },
+ {
+ { 0x00, 0x48, 0x90, 0xd8, 0x3d, 0x75, 0xad, 0xe5, 0x7a, 0x32, 0xea, 0xa2, 0x47, 0x0f, 0xd7, 0x9f },
+ { 0x00, 0xf4, 0xf5, 0x01, 0xf7, 0x03, 0x02, 0xf6, 0xf3, 0x07, 0x06, 0xf2, 0x04, 0xf0, 0xf1, 0x05 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x1d, 0x3d, 0x5d, 0x7d, 0x9d, 0xbd, 0xdd, 0xfd },
+ },
+ {
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ { 0x00, 0xeb, 0xcb, 0x20, 0x8b, 0x60, 0x40, 0xab, 0x0b, 0xe0, 0xc0, 0x2b, 0x80, 0x6b, 0x4b, 0xa0 },
+ },
+ {
+ { 0x00, 0xe2, 0xd9, 0x3b, 0xaf, 0x4d, 0x76, 0x94, 0x43, 0xa1, 0x9a, 0x78, 0xec, 0x0e, 0x35, 0xd7 },
+ { 0x00, 0x86, 0x11, 0x97, 0x22, 0xa4, 0x33, 0xb5, 0x44, 0xc2, 0x55, 0xd3, 0x66, 0xe0, 0x77, 0xf1 },
+ },
+ {
+ { 0x00, 0x99, 0x2f, 0xb6, 0x5e, 0xc7, 0x71, 0xe8, 0xbc, 0x25, 0x93, 0x0a, 0xe2, 0x7b, 0xcd, 0x54 },
+ { 0x00, 0x65, 0xca, 0xaf, 0x89, 0xec, 0x43, 0x26, 0x0f, 0x6a, 0xc5, 0xa0, 0x86, 0xe3, 0x4c, 0x29 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ { 0x00, 0xb2, 0x79, 0xcb, 0xf2, 0x40, 0x8b, 0x39, 0xf9, 0x4b, 0x80, 0x32, 0x0b, 0xb9, 0x72, 0xc0 },
+ },
+ {
+ { 0x00, 0xab, 0x4b, 0xe0, 0x96, 0x3d, 0xdd, 0x76, 0x31, 0x9a, 0x7a, 0xd1, 0xa7, 0x0c, 0xec, 0x47 },
+ { 0x00, 0x62, 0xc4, 0xa6, 0x95, 0xf7, 0x51, 0x33, 0x37, 0x55, 0xf3, 0x91, 0xa2, 0xc0, 0x66, 0x04 },
+ },
+ {
+ { 0x00, 0x82, 0x19, 0x9b, 0x32, 0xb0, 0x2b, 0xa9, 0x64, 0xe6, 0x7d, 0xff, 0x56, 0xd4, 0x4f, 0xcd },
+ { 0x00, 0xc8, 0x8d, 0x45, 0x07, 0xcf, 0x8a, 0x42, 0x0e, 0xc6, 0x83, 0x4b, 0x09, 0xc1, 0x84, 0x4c },
+ },
+ {
+ { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b },
+ { 0x00, 0xd0, 0xbd, 0x6d, 0x67, 0xb7, 0xda, 0x0a, 0xce, 0x1e, 0x73, 0xa3, 0xa9, 0x79, 0x14, 0xc4 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, 0xd5, 0xec, 0xa7, 0x9e, 0x31, 0x08, 0x43, 0x7a },
+ { 0x00, 0xb7, 0x73, 0xc4, 0xe6, 0x51, 0x95, 0x22, 0xd1, 0x66, 0xa2, 0x15, 0x37, 0x80, 0x44, 0xf3 },
+ },
+ {
+ { 0x00, 0xb4, 0x75, 0xc1, 0xea, 0x5e, 0x9f, 0x2b, 0xc9, 0x7d, 0xbc, 0x08, 0x23, 0x97, 0x56, 0xe2 },
+ { 0x00, 0x8f, 0x03, 0x8c, 0x06, 0x89, 0x05, 0x8a, 0x0c, 0x83, 0x0f, 0x80, 0x0a, 0x85, 0x09, 0x86 },
+ },
+ {
+ { 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, 0xf5, 0xc8, 0x8f, 0xb2, 0x01, 0x3c, 0x7b, 0x46 },
+ { 0x00, 0xf7, 0xf3, 0x04, 0xfb, 0x0c, 0x08, 0xff, 0xeb, 0x1c, 0x18, 0xef, 0x10, 0xe7, 0xe3, 0x14 },
+ },
+ {
+ { 0x00, 0xdf, 0xa3, 0x7c, 0x5b, 0x84, 0xf8, 0x27, 0xb6, 0x69, 0x15, 0xca, 0xed, 0x32, 0x4e, 0x91 },
+ { 0x00, 0x71, 0xe2, 0x93, 0xd9, 0xa8, 0x3b, 0x4a, 0xaf, 0xde, 0x4d, 0x3c, 0x76, 0x07, 0x94, 0xe5 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xd8, 0xad, 0x75, 0x47, 0x9f, 0xea, 0x32, 0x8e, 0x56, 0x23, 0xfb, 0xc9, 0x11, 0x64, 0xbc },
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+ },
+ {
+ { 0x00, 0x5d, 0xba, 0xe7, 0x69, 0x34, 0xd3, 0x8e, 0xd2, 0x8f, 0x68, 0x35, 0xbb, 0xe6, 0x01, 0x5c },
+ { 0x00, 0xb9, 0x6f, 0xd6, 0xde, 0x67, 0xb1, 0x08, 0xa1, 0x18, 0xce, 0x77, 0x7f, 0xc6, 0x10, 0xa9 },
+ },
+ {
+ { 0x00, 0xf9, 0xef, 0x16, 0xc3, 0x3a, 0x2c, 0xd5, 0x9b, 0x62, 0x74, 0x8d, 0x58, 0xa1, 0xb7, 0x4e },
+ { 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, 0x45, 0x6e, 0x13, 0x38, 0xe9, 0xc2, 0xbf, 0x94 },
+ },
+ {
+ { 0x00, 0x8e, 0x01, 0x8f, 0x02, 0x8c, 0x03, 0x8d, 0x04, 0x8a, 0x05, 0x8b, 0x06, 0x88, 0x07, 0x89 },
+ { 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x9d, 0x27, 0xba, 0x4e, 0xd3, 0x69, 0xf4, 0x9c, 0x01, 0xbb, 0x26, 0xd2, 0x4f, 0xf5, 0x68 },
+ { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, 0x35, 0x10, 0x7f, 0x5a, 0xa1, 0x84, 0xeb, 0xce },
+ },
+ {
+ { 0x00, 0xb5, 0x77, 0xc2, 0xee, 0x5b, 0x99, 0x2c, 0xc1, 0x74, 0xb6, 0x03, 0x2f, 0x9a, 0x58, 0xed },
+ { 0x00, 0x9f, 0x23, 0xbc, 0x46, 0xd9, 0x65, 0xfa, 0x8c, 0x13, 0xaf, 0x30, 0xca, 0x55, 0xe9, 0x76 },
+ },
+ {
+ { 0x00, 0xc5, 0x97, 0x52, 0x33, 0xf6, 0xa4, 0x61, 0x66, 0xa3, 0xf1, 0x34, 0x55, 0x90, 0xc2, 0x07 },
+ { 0x00, 0xcc, 0x85, 0x49, 0x17, 0xdb, 0x92, 0x5e, 0x2e, 0xe2, 0xab, 0x67, 0x39, 0xf5, 0xbc, 0x70 },
+ },
+ {
+ { 0x00, 0xb8, 0x6d, 0xd5, 0xda, 0x62, 0xb7, 0x0f, 0xa9, 0x11, 0xc4, 0x7c, 0x73, 0xcb, 0x1e, 0xa6 },
+ { 0x00, 0x4f, 0x9e, 0xd1, 0x21, 0x6e, 0xbf, 0xf0, 0x42, 0x0d, 0xdc, 0x93, 0x63, 0x2c, 0xfd, 0xb2 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, 0x75, 0x58, 0x2f, 0x02, 0xc1, 0xec, 0x9b, 0xb6 },
+ { 0x00, 0xea, 0xc9, 0x23, 0x8f, 0x65, 0x46, 0xac, 0x03, 0xe9, 0xca, 0x20, 0x8c, 0x66, 0x45, 0xaf },
+ },
+ {
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ { 0x00, 0x59, 0xb2, 0xeb, 0x79, 0x20, 0xcb, 0x92, 0xf2, 0xab, 0x40, 0x19, 0x8b, 0xd2, 0x39, 0x60 },
+ },
+ {
+ { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x1d, 0x3d, 0x5d, 0x7d, 0x9d, 0xbd, 0xdd, 0xfd },
+ },
+ {
+ { 0x00, 0x70, 0xe0, 0x90, 0xdd, 0xad, 0x3d, 0x4d, 0xa7, 0xd7, 0x47, 0x37, 0x7a, 0x0a, 0x9a, 0xea },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x62, 0xc4, 0xa6, 0x95, 0xf7, 0x51, 0x33, 0x37, 0x55, 0xf3, 0x91, 0xa2, 0xc0, 0x66, 0x04 },
+ { 0x00, 0x6e, 0xdc, 0xb2, 0xa5, 0xcb, 0x79, 0x17, 0x57, 0x39, 0x8b, 0xe5, 0xf2, 0x9c, 0x2e, 0x40 },
+ },
+ {
+ { 0x00, 0x6b, 0xd6, 0xbd, 0xb1, 0xda, 0x67, 0x0c, 0x7f, 0x14, 0xa9, 0xc2, 0xce, 0xa5, 0x18, 0x73 },
+ { 0x00, 0xfe, 0xe1, 0x1f, 0xdf, 0x21, 0x3e, 0xc0, 0xa3, 0x5d, 0x42, 0xbc, 0x7c, 0x82, 0x9d, 0x63 },
+ },
+ {
+ { 0x00, 0xbc, 0x65, 0xd9, 0xca, 0x76, 0xaf, 0x13, 0x89, 0x35, 0xec, 0x50, 0x43, 0xff, 0x26, 0x9a },
+ { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 },
+ },
+ {
+ { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, 0x55, 0x7c, 0x07, 0x2e, 0xf1, 0xd8, 0xa3, 0x8a },
+ { 0x00, 0xaa, 0x49, 0xe3, 0x92, 0x38, 0xdb, 0x71, 0x39, 0x93, 0x70, 0xda, 0xab, 0x01, 0xe2, 0x48 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xb9, 0x6f, 0xd6, 0xde, 0x67, 0xb1, 0x08, 0xa1, 0x18, 0xce, 0x77, 0x7f, 0xc6, 0x10, 0xa9 },
+ { 0x00, 0x5f, 0xbe, 0xe1, 0x61, 0x3e, 0xdf, 0x80, 0xc2, 0x9d, 0x7c, 0x23, 0xa3, 0xfc, 0x1d, 0x42 },
+ },
+ {
+ { 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, 0xd5, 0xec, 0xa7, 0x9e, 0x31, 0x08, 0x43, 0x7a },
+ { 0x00, 0xb7, 0x73, 0xc4, 0xe6, 0x51, 0x95, 0x22, 0xd1, 0x66, 0xa2, 0x15, 0x37, 0x80, 0x44, 0xf3 },
+ },
+ {
+ { 0x00, 0x4c, 0x98, 0xd4, 0x2d, 0x61, 0xb5, 0xf9, 0x5a, 0x16, 0xc2, 0x8e, 0x77, 0x3b, 0xef, 0xa3 },
+ { 0x00, 0xb4, 0x75, 0xc1, 0xea, 0x5e, 0x9f, 0x2b, 0xc9, 0x7d, 0xbc, 0x08, 0x23, 0x97, 0x56, 0xe2 },
+ },
+ {
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ { 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, 0xb5, 0x80, 0xdf, 0xea, 0x61, 0x54, 0x0b, 0x3e },
+ },
+ },
+ {
+ {
+ { 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, 0x6d, 0x43, 0x31, 0x1f, 0xd5, 0xfb, 0x89, 0xa7 },
+ { 0x00, 0xda, 0xa9, 0x73, 0x4f, 0x95, 0xe6, 0x3c, 0x9e, 0x44, 0x37, 0xed, 0xd1, 0x0b, 0x78, 0xa2 },
+ },
+ {
+ { 0x00, 0x86, 0x11, 0x97, 0x22, 0xa4, 0x33, 0xb5, 0x44, 0xc2, 0x55, 0xd3, 0x66, 0xe0, 0x77, 0xf1 },
+ { 0x00, 0x88, 0x0d, 0x85, 0x1a, 0x92, 0x17, 0x9f, 0x34, 0xbc, 0x39, 0xb1, 0x2e, 0xa6, 0x23, 0xab },
+ },
+ {
+ { 0x00, 0x73, 0xe6, 0x95, 0xd1, 0xa2, 0x37, 0x44, 0xbf, 0xcc, 0x59, 0x2a, 0x6e, 0x1d, 0x88, 0xfb },
+ { 0x00, 0x63, 0xc6, 0xa5, 0x91, 0xf2, 0x57, 0x34, 0x3f, 0x5c, 0xf9, 0x9a, 0xae, 0xcd, 0x68, 0x0b },
+ },
+ {
+ { 0x00, 0x44, 0x88, 0xcc, 0x0d, 0x49, 0x85, 0xc1, 0x1a, 0x5e, 0x92, 0xd6, 0x17, 0x53, 0x9f, 0xdb },
+ { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, 0xbd, 0x89, 0xd5, 0xe1, 0x6d, 0x59, 0x05, 0x31 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 },
+ { 0x00, 0xf0, 0xfd, 0x0d, 0xe7, 0x17, 0x1a, 0xea, 0xd3, 0x23, 0x2e, 0xde, 0x34, 0xc4, 0xc9, 0x39 },
+ },
+ {
+ { 0x00, 0xb0, 0x7d, 0xcd, 0xfa, 0x4a, 0x87, 0x37, 0xe9, 0x59, 0x94, 0x24, 0x13, 0xa3, 0x6e, 0xde },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ },
+ {
+ { 0x00, 0x48, 0x90, 0xd8, 0x3d, 0x75, 0xad, 0xe5, 0x7a, 0x32, 0xea, 0xa2, 0x47, 0x0f, 0xd7, 0x9f },
+ { 0x00, 0xf4, 0xf5, 0x01, 0xf7, 0x03, 0x02, 0xf6, 0xf3, 0x07, 0x06, 0xf2, 0x04, 0xf0, 0xf1, 0x05 },
+ },
+ {
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ { 0x00, 0xde, 0xa1, 0x7f, 0x5f, 0x81, 0xfe, 0x20, 0xbe, 0x60, 0x1f, 0xc1, 0xe1, 0x3f, 0x40, 0x9e },
+ },
+ },
+ {
+ {
+ { 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, 0x45, 0x6e, 0x13, 0x38, 0xe9, 0xc2, 0xbf, 0x94 },
+ { 0x00, 0x8a, 0x09, 0x83, 0x12, 0x98, 0x1b, 0x91, 0x24, 0xae, 0x2d, 0xa7, 0x36, 0xbc, 0x3f, 0xb5 },
+ },
+ {
+ { 0x00, 0x50, 0xa0, 0xf0, 0x5d, 0x0d, 0xfd, 0xad, 0xba, 0xea, 0x1a, 0x4a, 0xe7, 0xb7, 0x47, 0x17 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ },
+ {
+ { 0x00, 0x62, 0xc4, 0xa6, 0x95, 0xf7, 0x51, 0x33, 0x37, 0x55, 0xf3, 0x91, 0xa2, 0xc0, 0x66, 0x04 },
+ { 0x00, 0x6e, 0xdc, 0xb2, 0xa5, 0xcb, 0x79, 0x17, 0x57, 0x39, 0x8b, 0xe5, 0xf2, 0x9c, 0x2e, 0x40 },
+ },
+ {
+ { 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, 0xf5, 0xc8, 0x8f, 0xb2, 0x01, 0x3c, 0x7b, 0x46 },
+ { 0x00, 0xf7, 0xf3, 0x04, 0xfb, 0x0c, 0x08, 0xff, 0xeb, 0x1c, 0x18, 0xef, 0x10, 0xe7, 0xe3, 0x14 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x60, 0xc0, 0xa0, 0x9d, 0xfd, 0x5d, 0x3d, 0x27, 0x47, 0xe7, 0x87, 0xba, 0xda, 0x7a, 0x1a },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ },
+ {
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ { 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 },
+ },
+ {
+ { 0x00, 0xaf, 0x43, 0xec, 0x86, 0x29, 0xc5, 0x6a, 0x11, 0xbe, 0x52, 0xfd, 0x97, 0x38, 0xd4, 0x7b },
+ { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0x0d, 0x2f, 0x49, 0x6b, 0x85, 0xa7, 0xc1, 0xe3 },
+ },
+ {
+ { 0x00, 0xa5, 0x57, 0xf2, 0xae, 0x0b, 0xf9, 0x5c, 0x41, 0xe4, 0x16, 0xb3, 0xef, 0x4a, 0xb8, 0x1d },
+ { 0x00, 0x82, 0x19, 0x9b, 0x32, 0xb0, 0x2b, 0xa9, 0x64, 0xe6, 0x7d, 0xff, 0x56, 0xd4, 0x4f, 0xcd },
+ },
+ },
+ {
+ {
+ { 0x00, 0x58, 0xb0, 0xe8, 0x7d, 0x25, 0xcd, 0x95, 0xfa, 0xa2, 0x4a, 0x12, 0x87, 0xdf, 0x37, 0x6f },
+ { 0x00, 0xe9, 0xcf, 0x26, 0x83, 0x6a, 0x4c, 0xa5, 0x1b, 0xf2, 0xd4, 0x3d, 0x98, 0x71, 0x57, 0xbe },
+ },
+ {
+ { 0x00, 0x96, 0x31, 0xa7, 0x62, 0xf4, 0x53, 0xc5, 0xc4, 0x52, 0xf5, 0x63, 0xa6, 0x30, 0x97, 0x01 },
+ { 0x00, 0x95, 0x37, 0xa2, 0x6e, 0xfb, 0x59, 0xcc, 0xdc, 0x49, 0xeb, 0x7e, 0xb2, 0x27, 0x85, 0x10 },
+ },
+ {
+ { 0x00, 0xba, 0x69, 0xd3, 0xd2, 0x68, 0xbb, 0x01, 0xb9, 0x03, 0xd0, 0x6a, 0x6b, 0xd1, 0x02, 0xb8 },
+ { 0x00, 0x6f, 0xde, 0xb1, 0xa1, 0xce, 0x7f, 0x10, 0x5f, 0x30, 0x81, 0xee, 0xfe, 0x91, 0x20, 0x4f },
+ },
+ {
+ { 0x00, 0xc2, 0x99, 0x5b, 0x2f, 0xed, 0xb6, 0x74, 0x5e, 0x9c, 0xc7, 0x05, 0x71, 0xb3, 0xe8, 0x2a },
+ { 0x00, 0xbc, 0x65, 0xd9, 0xca, 0x76, 0xaf, 0x13, 0x89, 0x35, 0xec, 0x50, 0x43, 0xff, 0x26, 0x9a },
+ },
+ },
+ {
+ {
+ { 0x00, 0xe4, 0xd5, 0x31, 0xb7, 0x53, 0x62, 0x86, 0x73, 0x97, 0xa6, 0x42, 0xc4, 0x20, 0x11, 0xf5 },
+ { 0x00, 0xe6, 0xd1, 0x37, 0xbf, 0x59, 0x6e, 0x88, 0x63, 0x85, 0xb2, 0x54, 0xdc, 0x3a, 0x0d, 0xeb },
+ },
+ {
+ { 0x00, 0x84, 0x15, 0x91, 0x2a, 0xae, 0x3f, 0xbb, 0x54, 0xd0, 0x41, 0xc5, 0x7e, 0xfa, 0x6b, 0xef },
+ { 0x00, 0xa8, 0x4d, 0xe5, 0x9a, 0x32, 0xd7, 0x7f, 0x29, 0x81, 0x64, 0xcc, 0xb3, 0x1b, 0xfe, 0x56 },
+ },
+ {
+ { 0x00, 0x41, 0x82, 0xc3, 0x19, 0x58, 0x9b, 0xda, 0x32, 0x73, 0xb0, 0xf1, 0x2b, 0x6a, 0xa9, 0xe8 },
+ { 0x00, 0x64, 0xc8, 0xac, 0x8d, 0xe9, 0x45, 0x21, 0x07, 0x63, 0xcf, 0xab, 0x8a, 0xee, 0x42, 0x26 },
+ },
+ {
+ { 0x00, 0x90, 0x3d, 0xad, 0x7a, 0xea, 0x47, 0xd7, 0xf4, 0x64, 0xc9, 0x59, 0x8e, 0x1e, 0xb3, 0x23 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ },
+ },
+ {
+ {
+ { 0x00, 0xf8, 0xed, 0x15, 0xc7, 0x3f, 0x2a, 0xd2, 0x93, 0x6b, 0x7e, 0x86, 0x54, 0xac, 0xb9, 0x41 },
+ { 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, 0xc5, 0xfe, 0xb3, 0x88, 0x29, 0x12, 0x5f, 0x64 },
+ },
+ {
+ { 0x00, 0x5a, 0xb4, 0xee, 0x75, 0x2f, 0xc1, 0x9b, 0xea, 0xb0, 0x5e, 0x04, 0x9f, 0xc5, 0x2b, 0x71 },
+ { 0x00, 0xc9, 0x8f, 0x46, 0x03, 0xca, 0x8c, 0x45, 0x06, 0xcf, 0x89, 0x40, 0x05, 0xcc, 0x8a, 0x43 },
+ },
+ {
+ { 0x00, 0xd9, 0xaf, 0x76, 0x43, 0x9a, 0xec, 0x35, 0x86, 0x5f, 0x29, 0xf0, 0xc5, 0x1c, 0x6a, 0xb3 },
+ { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff },
+ },
+ {
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ { 0x00, 0xa1, 0x5f, 0xfe, 0xbe, 0x1f, 0xe1, 0x40, 0x61, 0xc0, 0x3e, 0x9f, 0xdf, 0x7e, 0x80, 0x21 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x6c, 0xd8, 0xb4, 0xad, 0xc1, 0x75, 0x19, 0x47, 0x2b, 0x9f, 0xf3, 0xea, 0x86, 0x32, 0x5e },
+ { 0x00, 0x8e, 0x01, 0x8f, 0x02, 0x8c, 0x03, 0x8d, 0x04, 0x8a, 0x05, 0x8b, 0x06, 0x88, 0x07, 0x89 },
+ },
+ {
+ { 0x00, 0x4b, 0x96, 0xdd, 0x31, 0x7a, 0xa7, 0xec, 0x62, 0x29, 0xf4, 0xbf, 0x53, 0x18, 0xc5, 0x8e },
+ { 0x00, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0x66, 0x6e, 0xaa, 0xfb, 0x3f, 0x59, 0x9d, 0xcc, 0x08 },
+ },
+ {
+ { 0x00, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0x66, 0x6e, 0xaa, 0xfb, 0x3f, 0x59, 0x9d, 0xcc, 0x08 },
+ { 0x00, 0xdc, 0xa5, 0x79, 0x57, 0x8b, 0xf2, 0x2e, 0xae, 0x72, 0x0b, 0xd7, 0xf9, 0x25, 0x5c, 0x80 },
+ },
+ {
+ { 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4 },
+ { 0x00, 0xdd, 0xa7, 0x7a, 0x53, 0x8e, 0xf4, 0x29, 0xa6, 0x7b, 0x01, 0xdc, 0xf5, 0x28, 0x52, 0x8f },
+ },
+ },
+ {
+ {
+ { 0x00, 0x72, 0xe4, 0x96, 0xd5, 0xa7, 0x31, 0x43, 0xb7, 0xc5, 0x53, 0x21, 0x62, 0x10, 0x86, 0xf4 },
+ { 0x00, 0x73, 0xe6, 0x95, 0xd1, 0xa2, 0x37, 0x44, 0xbf, 0xcc, 0x59, 0x2a, 0x6e, 0x1d, 0x88, 0xfb },
+ },
+ {
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, 0x2d, 0x0b, 0x61, 0x47, 0xb5, 0x93, 0xf9, 0xdf },
+ },
+ {
+ { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, 0x65, 0x4a, 0x3b, 0x14, 0xd9, 0xf6, 0x87, 0xa8 },
+ { 0x00, 0xca, 0x89, 0x43, 0x0f, 0xc5, 0x86, 0x4c, 0x1e, 0xd4, 0x97, 0x5d, 0x11, 0xdb, 0x98, 0x52 },
+ },
+ {
+ { 0x00, 0x9b, 0x2b, 0xb0, 0x56, 0xcd, 0x7d, 0xe6, 0xac, 0x37, 0x87, 0x1c, 0xfa, 0x61, 0xd1, 0x4a },
+ { 0x00, 0x45, 0x8a, 0xcf, 0x09, 0x4c, 0x83, 0xc6, 0x12, 0x57, 0x98, 0xdd, 0x1b, 0x5e, 0x91, 0xd4 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xb0, 0x7d, 0xcd, 0xfa, 0x4a, 0x87, 0x37, 0xe9, 0x59, 0x94, 0x24, 0x13, 0xa3, 0x6e, 0xde },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ },
+ {
+ { 0x00, 0x49, 0x92, 0xdb, 0x39, 0x70, 0xab, 0xe2, 0x72, 0x3b, 0xe0, 0xa9, 0x4b, 0x02, 0xd9, 0x90 },
+ { 0x00, 0xe4, 0xd5, 0x31, 0xb7, 0x53, 0x62, 0x86, 0x73, 0x97, 0xa6, 0x42, 0xc4, 0x20, 0x11, 0xf5 },
+ },
+ {
+ { 0x00, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x30, 0xe1, 0x50, 0x9e, 0x2f, 0x1f, 0xae, 0x60, 0xd1 },
+ { 0x00, 0xdf, 0xa3, 0x7c, 0x5b, 0x84, 0xf8, 0x27, 0xb6, 0x69, 0x15, 0xca, 0xed, 0x32, 0x4e, 0x91 },
+ },
+ {
+ { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x1d, 0x3d, 0x5d, 0x7d, 0x9d, 0xbd, 0xdd, 0xfd },
+ },
+ },
+ {
+ {
+ { 0x00, 0x85, 0x17, 0x92, 0x2e, 0xab, 0x39, 0xbc, 0x5c, 0xd9, 0x4b, 0xce, 0x72, 0xf7, 0x65, 0xe0 },
+ { 0x00, 0xb8, 0x6d, 0xd5, 0xda, 0x62, 0xb7, 0x0f, 0xa9, 0x11, 0xc4, 0x7c, 0x73, 0xcb, 0x1e, 0xa6 },
+ },
+ {
+ { 0x00, 0xe5, 0xd7, 0x32, 0xb3, 0x56, 0x64, 0x81, 0x7b, 0x9e, 0xac, 0x49, 0xc8, 0x2d, 0x1f, 0xfa },
+ { 0x00, 0xf6, 0xf1, 0x07, 0xff, 0x09, 0x0e, 0xf8, 0xe3, 0x15, 0x12, 0xe4, 0x1c, 0xea, 0xed, 0x1b },
+ },
+ {
+ { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, 0x85, 0xb6, 0xe3, 0xd0, 0x49, 0x7a, 0x2f, 0x1c },
+ { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd },
+ },
+ {
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0xcd, 0x87, 0x4a, 0x13, 0xde, 0x94, 0x59, 0x26, 0xeb, 0xa1, 0x6c, 0x35, 0xf8, 0xb2, 0x7f },
+ },
+ },
+ {
+ {
+ { 0x00, 0x4d, 0x9a, 0xd7, 0x29, 0x64, 0xb3, 0xfe, 0x52, 0x1f, 0xc8, 0x85, 0x7b, 0x36, 0xe1, 0xac },
+ { 0x00, 0xa4, 0x55, 0xf1, 0xaa, 0x0e, 0xff, 0x5b, 0x49, 0xed, 0x1c, 0xb8, 0xe3, 0x47, 0xb6, 0x12 },
+ },
+ {
+ { 0x00, 0x51, 0xa2, 0xf3, 0x59, 0x08, 0xfb, 0xaa, 0xb2, 0xe3, 0x10, 0x41, 0xeb, 0xba, 0x49, 0x18 },
+ { 0x00, 0x79, 0xf2, 0x8b, 0xf9, 0x80, 0x0b, 0x72, 0xef, 0x96, 0x1d, 0x64, 0x16, 0x6f, 0xe4, 0x9d },
+ },
+ {
+ { 0x00, 0xb8, 0x6d, 0xd5, 0xda, 0x62, 0xb7, 0x0f, 0xa9, 0x11, 0xc4, 0x7c, 0x73, 0xcb, 0x1e, 0xa6 },
+ { 0x00, 0x4f, 0x9e, 0xd1, 0x21, 0x6e, 0xbf, 0xf0, 0x42, 0x0d, 0xdc, 0x93, 0x63, 0x2c, 0xfd, 0xb2 },
+ },
+ {
+ { 0x00, 0x98, 0x2d, 0xb5, 0x5a, 0xc2, 0x77, 0xef, 0xb4, 0x2c, 0x99, 0x01, 0xee, 0x76, 0xc3, 0x5b },
+ { 0x00, 0x75, 0xea, 0x9f, 0xc9, 0xbc, 0x23, 0x56, 0x8f, 0xfa, 0x65, 0x10, 0x46, 0x33, 0xac, 0xd9 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x95, 0x37, 0xa2, 0x6e, 0xfb, 0x59, 0xcc, 0xdc, 0x49, 0xeb, 0x7e, 0xb2, 0x27, 0x85, 0x10 },
+ { 0x00, 0xa5, 0x57, 0xf2, 0xae, 0x0b, 0xf9, 0x5c, 0x41, 0xe4, 0x16, 0xb3, 0xef, 0x4a, 0xb8, 0x1d },
+ },
+ {
+ { 0x00, 0xef, 0xc3, 0x2c, 0x9b, 0x74, 0x58, 0xb7, 0x2b, 0xc4, 0xe8, 0x07, 0xb0, 0x5f, 0x73, 0x9c },
+ { 0x00, 0x56, 0xac, 0xfa, 0x45, 0x13, 0xe9, 0xbf, 0x8a, 0xdc, 0x26, 0x70, 0xcf, 0x99, 0x63, 0x35 },
+ },
+ {
+ { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 },
+ { 0x00, 0x4d, 0x9a, 0xd7, 0x29, 0x64, 0xb3, 0xfe, 0x52, 0x1f, 0xc8, 0x85, 0x7b, 0x36, 0xe1, 0xac },
+ },
+ {
+ { 0x00, 0x93, 0x3b, 0xa8, 0x76, 0xe5, 0x4d, 0xde, 0xec, 0x7f, 0xd7, 0x44, 0x9a, 0x09, 0xa1, 0x32 },
+ { 0x00, 0xc5, 0x97, 0x52, 0x33, 0xf6, 0xa4, 0x61, 0x66, 0xa3, 0xf1, 0x34, 0x55, 0x90, 0xc2, 0x07 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x41, 0x82, 0xc3, 0x19, 0x58, 0x9b, 0xda, 0x32, 0x73, 0xb0, 0xf1, 0x2b, 0x6a, 0xa9, 0xe8 },
+ { 0x00, 0x64, 0xc8, 0xac, 0x8d, 0xe9, 0x45, 0x21, 0x07, 0x63, 0xcf, 0xab, 0x8a, 0xee, 0x42, 0x26 },
+ },
+ {
+ { 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee },
+ { 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, 0xf5, 0xc8, 0x8f, 0xb2, 0x01, 0x3c, 0x7b, 0x46 },
+ },
+ {
+ { 0x00, 0x7d, 0xfa, 0x87, 0xe9, 0x94, 0x13, 0x6e, 0xcf, 0xb2, 0x35, 0x48, 0x26, 0x5b, 0xdc, 0xa1 },
+ { 0x00, 0x83, 0x1b, 0x98, 0x36, 0xb5, 0x2d, 0xae, 0x6c, 0xef, 0x77, 0xf4, 0x5a, 0xd9, 0x41, 0xc2 },
+ },
+ {
+ { 0x00, 0xec, 0xc5, 0x29, 0x97, 0x7b, 0x52, 0xbe, 0x33, 0xdf, 0xf6, 0x1a, 0xa4, 0x48, 0x61, 0x8d },
+ { 0x00, 0x66, 0xcc, 0xaa, 0x85, 0xe3, 0x49, 0x2f, 0x17, 0x71, 0xdb, 0xbd, 0x92, 0xf4, 0x5e, 0x38 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4 },
+ { 0x00, 0xdd, 0xa7, 0x7a, 0x53, 0x8e, 0xf4, 0x29, 0xa6, 0x7b, 0x01, 0xdc, 0xf5, 0x28, 0x52, 0x8f },
+ },
+ {
+ { 0x00, 0xbc, 0x65, 0xd9, 0xca, 0x76, 0xaf, 0x13, 0x89, 0x35, 0xec, 0x50, 0x43, 0xff, 0x26, 0x9a },
+ { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 },
+ },
+ {
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ { 0x00, 0x6c, 0xd8, 0xb4, 0xad, 0xc1, 0x75, 0x19, 0x47, 0x2b, 0x9f, 0xf3, 0xea, 0x86, 0x32, 0x5e },
+ },
+ {
+ { 0x00, 0x84, 0x15, 0x91, 0x2a, 0xae, 0x3f, 0xbb, 0x54, 0xd0, 0x41, 0xc5, 0x7e, 0xfa, 0x6b, 0xef },
+ { 0x00, 0xa8, 0x4d, 0xe5, 0x9a, 0x32, 0xd7, 0x7f, 0x29, 0x81, 0x64, 0xcc, 0xb3, 0x1b, 0xfe, 0x56 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, 0x05, 0x26, 0x43, 0x60, 0x89, 0xaa, 0xcf, 0xec },
+ { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 },
+ },
+ {
+ { 0x00, 0x89, 0x0f, 0x86, 0x1e, 0x97, 0x11, 0x98, 0x3c, 0xb5, 0x33, 0xba, 0x22, 0xab, 0x2d, 0xa4 },
+ { 0x00, 0x78, 0xf0, 0x88, 0xfd, 0x85, 0x0d, 0x75, 0xe7, 0x9f, 0x17, 0x6f, 0x1a, 0x62, 0xea, 0x92 },
+ },
+ {
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ { 0x00, 0x87, 0x13, 0x94, 0x26, 0xa1, 0x35, 0xb2, 0x4c, 0xcb, 0x5f, 0xd8, 0x6a, 0xed, 0x79, 0xfe },
+ },
+ {
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, 0x2d, 0x0b, 0x61, 0x47, 0xb5, 0x93, 0xf9, 0xdf },
+ },
+ },
+ {
+ {
+ { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b, 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x5d, 0x0d, 0xfd, 0xad, 0xba, 0xea, 0x1a, 0x4a, 0xe7, 0xb7, 0x47, 0x17 },
+ },
+ {
+ { 0x00, 0x5b, 0xb6, 0xed, 0x71, 0x2a, 0xc7, 0x9c, 0xe2, 0xb9, 0x54, 0x0f, 0x93, 0xc8, 0x25, 0x7e },
+ { 0x00, 0xd9, 0xaf, 0x76, 0x43, 0x9a, 0xec, 0x35, 0x86, 0x5f, 0x29, 0xf0, 0xc5, 0x1c, 0x6a, 0xb3 },
+ },
+ {
+ { 0x00, 0xa9, 0x4f, 0xe6, 0x9e, 0x37, 0xd1, 0x78, 0x21, 0x88, 0x6e, 0xc7, 0xbf, 0x16, 0xf0, 0x59 },
+ { 0x00, 0x42, 0x84, 0xc6, 0x15, 0x57, 0x91, 0xd3, 0x2a, 0x68, 0xae, 0xec, 0x3f, 0x7d, 0xbb, 0xf9 },
+ },
+ {
+ { 0x00, 0x64, 0xc8, 0xac, 0x8d, 0xe9, 0x45, 0x21, 0x07, 0x63, 0xcf, 0xab, 0x8a, 0xee, 0x42, 0x26 },
+ { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a },
+ },
+ },
+ {
+ {
+ { 0x00, 0x99, 0x2f, 0xb6, 0x5e, 0xc7, 0x71, 0xe8, 0xbc, 0x25, 0x93, 0x0a, 0xe2, 0x7b, 0xcd, 0x54 },
+ { 0x00, 0x65, 0xca, 0xaf, 0x89, 0xec, 0x43, 0x26, 0x0f, 0x6a, 0xc5, 0xa0, 0x86, 0xe3, 0x4c, 0x29 },
+ },
+ {
+ { 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, 0x45, 0x6e, 0x13, 0x38, 0xe9, 0xc2, 0xbf, 0x94 },
+ { 0x00, 0x8a, 0x09, 0x83, 0x12, 0x98, 0x1b, 0x91, 0x24, 0xae, 0x2d, 0xa7, 0x36, 0xbc, 0x3f, 0xb5 },
+ },
+ {
+ { 0x00, 0x5f, 0xbe, 0xe1, 0x61, 0x3e, 0xdf, 0x80, 0xc2, 0x9d, 0x7c, 0x23, 0xa3, 0xfc, 0x1d, 0x42 },
+ { 0x00, 0x99, 0x2f, 0xb6, 0x5e, 0xc7, 0x71, 0xe8, 0xbc, 0x25, 0x93, 0x0a, 0xe2, 0x7b, 0xcd, 0x54 },
+ },
+ {
+ { 0x00, 0x4c, 0x98, 0xd4, 0x2d, 0x61, 0xb5, 0xf9, 0x5a, 0x16, 0xc2, 0x8e, 0x77, 0x3b, 0xef, 0xa3 },
+ { 0x00, 0xb4, 0x75, 0xc1, 0xea, 0x5e, 0x9f, 0x2b, 0xc9, 0x7d, 0xbc, 0x08, 0x23, 0x97, 0x56, 0xe2 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, 0x8d, 0xbf, 0xe9, 0xdb, 0x45, 0x77, 0x21, 0x13 },
+ { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15, 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d },
+ },
+ {
+ { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, 0x55, 0x7c, 0x07, 0x2e, 0xf1, 0xd8, 0xa3, 0x8a },
+ { 0x00, 0xaa, 0x49, 0xe3, 0x92, 0x38, 0xdb, 0x71, 0x39, 0x93, 0x70, 0xda, 0xab, 0x01, 0xe2, 0x48 },
+ },
+ {
+ { 0x00, 0x84, 0x15, 0x91, 0x2a, 0xae, 0x3f, 0xbb, 0x54, 0xd0, 0x41, 0xc5, 0x7e, 0xfa, 0x6b, 0xef },
+ { 0x00, 0xa8, 0x4d, 0xe5, 0x9a, 0x32, 0xd7, 0x7f, 0x29, 0x81, 0x64, 0xcc, 0xb3, 0x1b, 0xfe, 0x56 },
+ },
+ {
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ { 0x00, 0x87, 0x13, 0x94, 0x26, 0xa1, 0x35, 0xb2, 0x4c, 0xcb, 0x5f, 0xd8, 0x6a, 0xed, 0x79, 0xfe },
+ },
+ },
+ {
+ {
+ { 0x00, 0xc5, 0x97, 0x52, 0x33, 0xf6, 0xa4, 0x61, 0x66, 0xa3, 0xf1, 0x34, 0x55, 0x90, 0xc2, 0x07 },
+ { 0x00, 0xcc, 0x85, 0x49, 0x17, 0xdb, 0x92, 0x5e, 0x2e, 0xe2, 0xab, 0x67, 0x39, 0xf5, 0xbc, 0x70 },
+ },
+ {
+ { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 },
+ { 0x00, 0x90, 0x3d, 0xad, 0x7a, 0xea, 0x47, 0xd7, 0xf4, 0x64, 0xc9, 0x59, 0x8e, 0x1e, 0xb3, 0x23 },
+ },
+ {
+ { 0x00, 0x6d, 0xda, 0xb7, 0xa9, 0xc4, 0x73, 0x1e, 0x4f, 0x22, 0x95, 0xf8, 0xe6, 0x8b, 0x3c, 0x51 },
+ { 0x00, 0x9e, 0x21, 0xbf, 0x42, 0xdc, 0x63, 0xfd, 0x84, 0x1a, 0xa5, 0x3b, 0xc6, 0x58, 0xe7, 0x79 },
+ },
+ {
+ { 0x00, 0x8b, 0x0b, 0x80, 0x16, 0x9d, 0x1d, 0x96, 0x2c, 0xa7, 0x27, 0xac, 0x3a, 0xb1, 0x31, 0xba },
+ { 0x00, 0x58, 0xb0, 0xe8, 0x7d, 0x25, 0xcd, 0x95, 0xfa, 0xa2, 0x4a, 0x12, 0x87, 0xdf, 0x37, 0x6f },
+ },
+ },
+ {
+ {
+ { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a },
+ { 0x00, 0xe0, 0xdd, 0x3d, 0xa7, 0x47, 0x7a, 0x9a, 0x53, 0xb3, 0x8e, 0x6e, 0xf4, 0x14, 0x29, 0xc9 },
+ },
+ {
+ { 0x00, 0xc3, 0x9b, 0x58, 0x2b, 0xe8, 0xb0, 0x73, 0x56, 0x95, 0xcd, 0x0e, 0x7d, 0xbe, 0xe6, 0x25 },
+ { 0x00, 0xac, 0x45, 0xe9, 0x8a, 0x26, 0xcf, 0x63, 0x09, 0xa5, 0x4c, 0xe0, 0x83, 0x2f, 0xc6, 0x6a },
+ },
+ {
+ { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, 0xbd, 0x89, 0xd5, 0xe1, 0x6d, 0x59, 0x05, 0x31 },
+ { 0x00, 0x67, 0xce, 0xa9, 0x81, 0xe6, 0x4f, 0x28, 0x1f, 0x78, 0xd1, 0xb6, 0x9e, 0xf9, 0x50, 0x37 },
+ },
+ {
+ { 0x00, 0x97, 0x33, 0xa4, 0x66, 0xf1, 0x55, 0xc2, 0xcc, 0x5b, 0xff, 0x68, 0xaa, 0x3d, 0x99, 0x0e },
+ { 0x00, 0x85, 0x17, 0x92, 0x2e, 0xab, 0x39, 0xbc, 0x5c, 0xd9, 0x4b, 0xce, 0x72, 0xf7, 0x65, 0xe0 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x82, 0x19, 0x9b, 0x32, 0xb0, 0x2b, 0xa9, 0x64, 0xe6, 0x7d, 0xff, 0x56, 0xd4, 0x4f, 0xcd },
+ { 0x00, 0xc8, 0x8d, 0x45, 0x07, 0xcf, 0x8a, 0x42, 0x0e, 0xc6, 0x83, 0x4b, 0x09, 0xc1, 0x84, 0x4c },
+ },
+ {
+ { 0x00, 0x57, 0xae, 0xf9, 0x41, 0x16, 0xef, 0xb8, 0x82, 0xd5, 0x2c, 0x7b, 0xc3, 0x94, 0x6d, 0x3a },
+ { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 },
+ },
+ {
+ { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 },
+ { 0x00, 0xad, 0x47, 0xea, 0x8e, 0x23, 0xc9, 0x64, 0x01, 0xac, 0x46, 0xeb, 0x8f, 0x22, 0xc8, 0x65 },
+ },
+ {
+ { 0x00, 0xf3, 0xfb, 0x08, 0xeb, 0x18, 0x10, 0xe3, 0xcb, 0x38, 0x30, 0xc3, 0x20, 0xd3, 0xdb, 0x28 },
+ { 0x00, 0x8b, 0x0b, 0x80, 0x16, 0x9d, 0x1d, 0x96, 0x2c, 0xa7, 0x27, 0xac, 0x3a, 0xb1, 0x31, 0xba },
+ },
+ },
+ {
+ {
+ { 0x00, 0x91, 0x3f, 0xae, 0x7e, 0xef, 0x41, 0xd0, 0xfc, 0x6d, 0xc3, 0x52, 0x82, 0x13, 0xbd, 0x2c },
+ { 0x00, 0xe5, 0xd7, 0x32, 0xb3, 0x56, 0x64, 0x81, 0x7b, 0x9e, 0xac, 0x49, 0xc8, 0x2d, 0x1f, 0xfa },
+ },
+ {
+ { 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa },
+ { 0x00, 0xfd, 0xe7, 0x1a, 0xd3, 0x2e, 0x34, 0xc9, 0xbb, 0x46, 0x5c, 0xa1, 0x68, 0x95, 0x8f, 0x72 },
+ },
+ {
+ { 0x00, 0x44, 0x88, 0xcc, 0x0d, 0x49, 0x85, 0xc1, 0x1a, 0x5e, 0x92, 0xd6, 0x17, 0x53, 0x9f, 0xdb },
+ { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, 0xbd, 0x89, 0xd5, 0xe1, 0x6d, 0x59, 0x05, 0x31 },
+ },
+ {
+ { 0x00, 0xe5, 0xd7, 0x32, 0xb3, 0x56, 0x64, 0x81, 0x7b, 0x9e, 0xac, 0x49, 0xc8, 0x2d, 0x1f, 0xfa },
+ { 0x00, 0xf6, 0xf1, 0x07, 0xff, 0x09, 0x0e, 0xf8, 0xe3, 0x15, 0x12, 0xe4, 0x1c, 0xea, 0xed, 0x1b },
+ },
+ },
+ {
+ {
+ { 0x00, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc },
+ { 0x00, 0x5d, 0xba, 0xe7, 0x69, 0x34, 0xd3, 0x8e, 0xd2, 0x8f, 0x68, 0x35, 0xbb, 0xe6, 0x01, 0x5c },
+ },
+ {
+ { 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, 0xa5, 0x92, 0xcb, 0xfc, 0x79, 0x4e, 0x17, 0x20 },
+ { 0x00, 0x57, 0xae, 0xf9, 0x41, 0x16, 0xef, 0xb8, 0x82, 0xd5, 0x2c, 0x7b, 0xc3, 0x94, 0x6d, 0x3a },
+ },
+ {
+ { 0x00, 0x94, 0x35, 0xa1, 0x6a, 0xfe, 0x5f, 0xcb, 0xd4, 0x40, 0xe1, 0x75, 0xbe, 0x2a, 0x8b, 0x1f },
+ { 0x00, 0xb5, 0x77, 0xc2, 0xee, 0x5b, 0x99, 0x2c, 0xc1, 0x74, 0xb6, 0x03, 0x2f, 0x9a, 0x58, 0xed },
+ },
+ {
+ { 0x00, 0xc0, 0x9d, 0x5d, 0x27, 0xe7, 0xba, 0x7a, 0x4e, 0x8e, 0xd3, 0x13, 0x69, 0xa9, 0xf4, 0x34 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xd1, 0xbf, 0x6e, 0x63, 0xb2, 0xdc, 0x0d, 0xc6, 0x17, 0x79, 0xa8, 0xa5, 0x74, 0x1a, 0xcb },
+ { 0x00, 0x91, 0x3f, 0xae, 0x7e, 0xef, 0x41, 0xd0, 0xfc, 0x6d, 0xc3, 0x52, 0x82, 0x13, 0xbd, 0x2c },
+ },
+ {
+ { 0x00, 0x76, 0xec, 0x9a, 0xc5, 0xb3, 0x29, 0x5f, 0x97, 0xe1, 0x7b, 0x0d, 0x52, 0x24, 0xbe, 0xc8 },
+ { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, 0x85, 0xb6, 0xe3, 0xd0, 0x49, 0x7a, 0x2f, 0x1c },
+ },
+ {
+ { 0x00, 0x72, 0xe4, 0x96, 0xd5, 0xa7, 0x31, 0x43, 0xb7, 0xc5, 0x53, 0x21, 0x62, 0x10, 0x86, 0xf4 },
+ { 0x00, 0x73, 0xe6, 0x95, 0xd1, 0xa2, 0x37, 0x44, 0xbf, 0xcc, 0x59, 0x2a, 0x6e, 0x1d, 0x88, 0xfb },
+ },
+ {
+ { 0x00, 0x7d, 0xfa, 0x87, 0xe9, 0x94, 0x13, 0x6e, 0xcf, 0xb2, 0x35, 0x48, 0x26, 0x5b, 0xdc, 0xa1 },
+ { 0x00, 0x83, 0x1b, 0x98, 0x36, 0xb5, 0x2d, 0xae, 0x6c, 0xef, 0x77, 0xf4, 0x5a, 0xd9, 0x41, 0xc2 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xaf, 0x43, 0xec, 0x86, 0x29, 0xc5, 0x6a, 0x11, 0xbe, 0x52, 0xfd, 0x97, 0x38, 0xd4, 0x7b },
+ { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0x0d, 0x2f, 0x49, 0x6b, 0x85, 0xa7, 0xc1, 0xe3 },
+ },
+ {
+ { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 },
+ { 0x00, 0xb0, 0x7d, 0xcd, 0xfa, 0x4a, 0x87, 0x37, 0xe9, 0x59, 0x94, 0x24, 0x13, 0xa3, 0x6e, 0xde },
+ },
+ {
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ { 0x00, 0xf8, 0xed, 0x15, 0xc7, 0x3f, 0x2a, 0xd2, 0x93, 0x6b, 0x7e, 0x86, 0x54, 0xac, 0xb9, 0x41 },
+ },
+ {
+ { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, 0x2d, 0x0b, 0x61, 0x47, 0xb5, 0x93, 0xf9, 0xdf },
+ { 0x00, 0x5a, 0xb4, 0xee, 0x75, 0x2f, 0xc1, 0x9b, 0xea, 0xb0, 0x5e, 0x04, 0x9f, 0xc5, 0x2b, 0x71 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xf9, 0xef, 0x16, 0xc3, 0x3a, 0x2c, 0xd5, 0x9b, 0x62, 0x74, 0x8d, 0x58, 0xa1, 0xb7, 0x4e },
+ { 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, 0x45, 0x6e, 0x13, 0x38, 0xe9, 0xc2, 0xbf, 0x94 },
+ },
+ {
+ { 0x00, 0x64, 0xc8, 0xac, 0x8d, 0xe9, 0x45, 0x21, 0x07, 0x63, 0xcf, 0xab, 0x8a, 0xee, 0x42, 0x26 },
+ { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a },
+ },
+ {
+ { 0x00, 0x42, 0x84, 0xc6, 0x15, 0x57, 0x91, 0xd3, 0x2a, 0x68, 0xae, 0xec, 0x3f, 0x7d, 0xbb, 0xf9 },
+ { 0x00, 0x54, 0xa8, 0xfc, 0x4d, 0x19, 0xe5, 0xb1, 0x9a, 0xce, 0x32, 0x66, 0xd7, 0x83, 0x7f, 0x2b },
+ },
+ {
+ { 0x00, 0xc8, 0x8d, 0x45, 0x07, 0xcf, 0x8a, 0x42, 0x0e, 0xc6, 0x83, 0x4b, 0x09, 0xc1, 0x84, 0x4c },
+ { 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xb3, 0x7b, 0xc8, 0xf6, 0x45, 0x8d, 0x3e, 0xf1, 0x42, 0x8a, 0x39, 0x07, 0xb4, 0x7c, 0xcf },
+ { 0x00, 0xff, 0xe3, 0x1c, 0xdb, 0x24, 0x38, 0xc7, 0xab, 0x54, 0x48, 0xb7, 0x70, 0x8f, 0x93, 0x6c },
+ },
+ {
+ { 0x00, 0x8a, 0x09, 0x83, 0x12, 0x98, 0x1b, 0x91, 0x24, 0xae, 0x2d, 0xa7, 0x36, 0xbc, 0x3f, 0xb5 },
+ { 0x00, 0x48, 0x90, 0xd8, 0x3d, 0x75, 0xad, 0xe5, 0x7a, 0x32, 0xea, 0xa2, 0x47, 0x0f, 0xd7, 0x9f },
+ },
+ {
+ { 0x00, 0xbe, 0x61, 0xdf, 0xc2, 0x7c, 0xa3, 0x1d, 0x99, 0x27, 0xf8, 0x46, 0x5b, 0xe5, 0x3a, 0x84 },
+ { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, 0x65, 0x4a, 0x3b, 0x14, 0xd9, 0xf6, 0x87, 0xa8 },
+ },
+ {
+ { 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 },
+ { 0x00, 0x80, 0x1d, 0x9d, 0x3a, 0xba, 0x27, 0xa7, 0x74, 0xf4, 0x69, 0xe9, 0x4e, 0xce, 0x53, 0xd3 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15, 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d },
+ { 0x00, 0x70, 0xe0, 0x90, 0xdd, 0xad, 0x3d, 0x4d, 0xa7, 0xd7, 0x47, 0x37, 0x7a, 0x0a, 0x9a, 0xea },
+ },
+ {
+ { 0x00, 0x52, 0xa4, 0xf6, 0x55, 0x07, 0xf1, 0xa3, 0xaa, 0xf8, 0x0e, 0x5c, 0xff, 0xad, 0x5b, 0x09 },
+ { 0x00, 0x49, 0x92, 0xdb, 0x39, 0x70, 0xab, 0xe2, 0x72, 0x3b, 0xe0, 0xa9, 0x4b, 0x02, 0xd9, 0x90 },
+ },
+ {
+ { 0x00, 0xcc, 0x85, 0x49, 0x17, 0xdb, 0x92, 0x5e, 0x2e, 0xe2, 0xab, 0x67, 0x39, 0xf5, 0xbc, 0x70 },
+ { 0x00, 0x5c, 0xb8, 0xe4, 0x6d, 0x31, 0xd5, 0x89, 0xda, 0x86, 0x62, 0x3e, 0xb7, 0xeb, 0x0f, 0x53 },
+ },
+ {
+ { 0x00, 0xa0, 0x5d, 0xfd, 0xba, 0x1a, 0xe7, 0x47, 0x69, 0xc9, 0x34, 0x94, 0xd3, 0x73, 0x8e, 0x2e },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ },
+ },
+ {
+ {
+ { 0x00, 0x97, 0x33, 0xa4, 0x66, 0xf1, 0x55, 0xc2, 0xcc, 0x5b, 0xff, 0x68, 0xaa, 0x3d, 0x99, 0x0e },
+ { 0x00, 0x85, 0x17, 0x92, 0x2e, 0xab, 0x39, 0xbc, 0x5c, 0xd9, 0x4b, 0xce, 0x72, 0xf7, 0x65, 0xe0 },
+ },
+ {
+ { 0x00, 0x59, 0xb2, 0xeb, 0x79, 0x20, 0xcb, 0x92, 0xf2, 0xab, 0x40, 0x19, 0x8b, 0xd2, 0x39, 0x60 },
+ { 0x00, 0xf9, 0xef, 0x16, 0xc3, 0x3a, 0x2c, 0xd5, 0x9b, 0x62, 0x74, 0x8d, 0x58, 0xa1, 0xb7, 0x4e },
+ },
+ {
+ { 0x00, 0x4b, 0x96, 0xdd, 0x31, 0x7a, 0xa7, 0xec, 0x62, 0x29, 0xf4, 0xbf, 0x53, 0x18, 0xc5, 0x8e },
+ { 0x00, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0x66, 0x6e, 0xaa, 0xfb, 0x3f, 0x59, 0x9d, 0xcc, 0x08 },
+ },
+ {
+ { 0x00, 0x62, 0xc4, 0xa6, 0x95, 0xf7, 0x51, 0x33, 0x37, 0x55, 0xf3, 0x91, 0xa2, 0xc0, 0x66, 0x04 },
+ { 0x00, 0x6e, 0xdc, 0xb2, 0xa5, 0xcb, 0x79, 0x17, 0x57, 0x39, 0x8b, 0xe5, 0xf2, 0x9c, 0x2e, 0x40 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x6e, 0xdc, 0xb2, 0xa5, 0xcb, 0x79, 0x17, 0x57, 0x39, 0x8b, 0xe5, 0xf2, 0x9c, 0x2e, 0x40 },
+ { 0x00, 0xae, 0x41, 0xef, 0x82, 0x2c, 0xc3, 0x6d, 0x19, 0xb7, 0x58, 0xf6, 0x9b, 0x35, 0xda, 0x74 },
+ },
+ {
+ { 0x00, 0x80, 0x1d, 0x9d, 0x3a, 0xba, 0x27, 0xa7, 0x74, 0xf4, 0x69, 0xe9, 0x4e, 0xce, 0x53, 0xd3 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ },
+ {
+ { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 },
+ { 0x00, 0xa0, 0x5d, 0xfd, 0xba, 0x1a, 0xe7, 0x47, 0x69, 0xc9, 0x34, 0x94, 0xd3, 0x73, 0x8e, 0x2e },
+ },
+ {
+ { 0x00, 0x82, 0x19, 0x9b, 0x32, 0xb0, 0x2b, 0xa9, 0x64, 0xe6, 0x7d, 0xff, 0x56, 0xd4, 0x4f, 0xcd },
+ { 0x00, 0xc8, 0x8d, 0x45, 0x07, 0xcf, 0x8a, 0x42, 0x0e, 0xc6, 0x83, 0x4b, 0x09, 0xc1, 0x84, 0x4c },
+ },
+ },
+ {
+ {
+ { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 },
+ { 0x00, 0xb0, 0x7d, 0xcd, 0xfa, 0x4a, 0x87, 0x37, 0xe9, 0x59, 0x94, 0x24, 0x13, 0xa3, 0x6e, 0xde },
+ },
+ {
+ { 0x00, 0xda, 0xa9, 0x73, 0x4f, 0x95, 0xe6, 0x3c, 0x9e, 0x44, 0x37, 0xed, 0xd1, 0x0b, 0x78, 0xa2 },
+ { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, 0x15, 0x34, 0x57, 0x76, 0x91, 0xb0, 0xd3, 0xf2 },
+ },
+ {
+ { 0x00, 0x6f, 0xde, 0xb1, 0xa1, 0xce, 0x7f, 0x10, 0x5f, 0x30, 0x81, 0xee, 0xfe, 0x91, 0x20, 0x4f },
+ { 0x00, 0xbe, 0x61, 0xdf, 0xc2, 0x7c, 0xa3, 0x1d, 0x99, 0x27, 0xf8, 0x46, 0x5b, 0xe5, 0x3a, 0x84 },
+ },
+ {
+ { 0x00, 0x55, 0xaa, 0xff, 0x49, 0x1c, 0xe3, 0xb6, 0x92, 0xc7, 0x38, 0x6d, 0xdb, 0x8e, 0x71, 0x24 },
+ { 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, 0xd5, 0xec, 0xa7, 0x9e, 0x31, 0x08, 0x43, 0x7a },
+ },
+ },
+ {
+ {
+ { 0x00, 0x67, 0xce, 0xa9, 0x81, 0xe6, 0x4f, 0x28, 0x1f, 0x78, 0xd1, 0xb6, 0x9e, 0xf9, 0x50, 0x37 },
+ { 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0xed, 0xd3, 0x91, 0xaf, 0x15, 0x2b, 0x69, 0x57 },
+ },
+ {
+ { 0x00, 0xa8, 0x4d, 0xe5, 0x9a, 0x32, 0xd7, 0x7f, 0x29, 0x81, 0x64, 0xcc, 0xb3, 0x1b, 0xfe, 0x56 },
+ { 0x00, 0x52, 0xa4, 0xf6, 0x55, 0x07, 0xf1, 0xa3, 0xaa, 0xf8, 0x0e, 0x5c, 0xff, 0xad, 0x5b, 0x09 },
+ },
+ {
+ { 0x00, 0x5a, 0xb4, 0xee, 0x75, 0x2f, 0xc1, 0x9b, 0xea, 0xb0, 0x5e, 0x04, 0x9f, 0xc5, 0x2b, 0x71 },
+ { 0x00, 0xc9, 0x8f, 0x46, 0x03, 0xca, 0x8c, 0x45, 0x06, 0xcf, 0x89, 0x40, 0x05, 0xcc, 0x8a, 0x43 },
+ },
+ {
+ { 0x00, 0xf7, 0xf3, 0x04, 0xfb, 0x0c, 0x08, 0xff, 0xeb, 0x1c, 0x18, 0xef, 0x10, 0xe7, 0xe3, 0x14 },
+ { 0x00, 0xcb, 0x8b, 0x40, 0x0b, 0xc0, 0x80, 0x4b, 0x16, 0xdd, 0x9d, 0x56, 0x1d, 0xd6, 0x96, 0x5d },
+ },
+ },
+ {
+ {
+ { 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, 0xc5, 0xfe, 0xb3, 0x88, 0x29, 0x12, 0x5f, 0x64 },
+ { 0x00, 0x97, 0x33, 0xa4, 0x66, 0xf1, 0x55, 0xc2, 0xcc, 0x5b, 0xff, 0x68, 0xaa, 0x3d, 0x99, 0x0e },
+ },
+ {
+ { 0x00, 0x44, 0x88, 0xcc, 0x0d, 0x49, 0x85, 0xc1, 0x1a, 0x5e, 0x92, 0xd6, 0x17, 0x53, 0x9f, 0xdb },
+ { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, 0xbd, 0x89, 0xd5, 0xe1, 0x6d, 0x59, 0x05, 0x31 },
+ },
+ {
+ { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0x0d, 0x2f, 0x49, 0x6b, 0x85, 0xa7, 0xc1, 0xe3 },
+ { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 },
+ },
+ {
+ { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, 0x85, 0xb6, 0xe3, 0xd0, 0x49, 0x7a, 0x2f, 0x1c },
+ { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd },
+ },
+ },
+ {
+ {
+ { 0x00, 0x78, 0xf0, 0x88, 0xfd, 0x85, 0x0d, 0x75, 0xe7, 0x9f, 0x17, 0x6f, 0x1a, 0x62, 0xea, 0x92 },
+ { 0x00, 0xd3, 0xbb, 0x68, 0x6b, 0xb8, 0xd0, 0x03, 0xd6, 0x05, 0x6d, 0xbe, 0xbd, 0x6e, 0x06, 0xd5 },
+ },
+ {
+ { 0x00, 0x95, 0x37, 0xa2, 0x6e, 0xfb, 0x59, 0xcc, 0xdc, 0x49, 0xeb, 0x7e, 0xb2, 0x27, 0x85, 0x10 },
+ { 0x00, 0xa5, 0x57, 0xf2, 0xae, 0x0b, 0xf9, 0x5c, 0x41, 0xe4, 0x16, 0xb3, 0xef, 0x4a, 0xb8, 0x1d },
+ },
+ {
+ { 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, 0xad, 0x9b, 0xc1, 0xf7, 0x75, 0x43, 0x19, 0x2f },
+ { 0x00, 0x47, 0x8e, 0xc9, 0x01, 0x46, 0x8f, 0xc8, 0x02, 0x45, 0x8c, 0xcb, 0x03, 0x44, 0x8d, 0xca },
+ },
+ {
+ { 0x00, 0xf6, 0xf1, 0x07, 0xff, 0x09, 0x0e, 0xf8, 0xe3, 0x15, 0x12, 0xe4, 0x1c, 0xea, 0xed, 0x1b },
+ { 0x00, 0xdb, 0xab, 0x70, 0x4b, 0x90, 0xe0, 0x3b, 0x96, 0x4d, 0x3d, 0xe6, 0xdd, 0x06, 0x76, 0xad },
+ },
+ },
+ {
+ {
+ { 0x00, 0xe6, 0xd1, 0x37, 0xbf, 0x59, 0x6e, 0x88, 0x63, 0x85, 0xb2, 0x54, 0xdc, 0x3a, 0x0d, 0xeb },
+ { 0x00, 0xc6, 0x91, 0x57, 0x3f, 0xf9, 0xae, 0x68, 0x7e, 0xb8, 0xef, 0x29, 0x41, 0x87, 0xd0, 0x16 },
+ },
+ {
+ { 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, 0xfd, 0xc1, 0x85, 0xb9, 0x0d, 0x31, 0x75, 0x49 },
+ { 0x00, 0xe7, 0xd3, 0x34, 0xbb, 0x5c, 0x68, 0x8f, 0x6b, 0x8c, 0xb8, 0x5f, 0xd0, 0x37, 0x03, 0xe4 },
+ },
+ {
+ { 0x00, 0xb5, 0x77, 0xc2, 0xee, 0x5b, 0x99, 0x2c, 0xc1, 0x74, 0xb6, 0x03, 0x2f, 0x9a, 0x58, 0xed },
+ { 0x00, 0x9f, 0x23, 0xbc, 0x46, 0xd9, 0x65, 0xfa, 0x8c, 0x13, 0xaf, 0x30, 0xca, 0x55, 0xe9, 0x76 },
+ },
+ {
+ { 0x00, 0x51, 0xa2, 0xf3, 0x59, 0x08, 0xfb, 0xaa, 0xb2, 0xe3, 0x10, 0x41, 0xeb, 0xba, 0x49, 0x18 },
+ { 0x00, 0x79, 0xf2, 0x8b, 0xf9, 0x80, 0x0b, 0x72, 0xef, 0x96, 0x1d, 0x64, 0x16, 0x6f, 0xe4, 0x9d },
+ },
+ },
+ {
+ {
+ { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, 0x5d, 0x75, 0x0d, 0x25, 0xfd, 0xd5, 0xad, 0x85 },
+ { 0x00, 0xba, 0x69, 0xd3, 0xd2, 0x68, 0xbb, 0x01, 0xb9, 0x03, 0xd0, 0x6a, 0x6b, 0xd1, 0x02, 0xb8 },
+ },
+ {
+ { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, 0x85, 0xb6, 0xe3, 0xd0, 0x49, 0x7a, 0x2f, 0x1c },
+ { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd },
+ },
+ {
+ { 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, 0xfd, 0xc1, 0x85, 0xb9, 0x0d, 0x31, 0x75, 0x49 },
+ { 0x00, 0xe7, 0xd3, 0x34, 0xbb, 0x5c, 0x68, 0x8f, 0x6b, 0x8c, 0xb8, 0x5f, 0xd0, 0x37, 0x03, 0xe4 },
+ },
+ {
+ { 0x00, 0x63, 0xc6, 0xa5, 0x91, 0xf2, 0x57, 0x34, 0x3f, 0x5c, 0xf9, 0x9a, 0xae, 0xcd, 0x68, 0x0b },
+ { 0x00, 0x7e, 0xfc, 0x82, 0xe5, 0x9b, 0x19, 0x67, 0xd7, 0xa9, 0x2b, 0x55, 0x32, 0x4c, 0xce, 0xb0 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0x0d, 0x2f, 0x49, 0x6b, 0x85, 0xa7, 0xc1, 0xe3 },
+ { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 },
+ },
+ {
+ { 0x00, 0xe6, 0xd1, 0x37, 0xbf, 0x59, 0x6e, 0x88, 0x63, 0x85, 0xb2, 0x54, 0xdc, 0x3a, 0x0d, 0xeb },
+ { 0x00, 0xc6, 0x91, 0x57, 0x3f, 0xf9, 0xae, 0x68, 0x7e, 0xb8, 0xef, 0x29, 0x41, 0x87, 0xd0, 0x16 },
+ },
+ {
+ { 0x00, 0x9d, 0x27, 0xba, 0x4e, 0xd3, 0x69, 0xf4, 0x9c, 0x01, 0xbb, 0x26, 0xd2, 0x4f, 0xf5, 0x68 },
+ { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, 0x35, 0x10, 0x7f, 0x5a, 0xa1, 0x84, 0xeb, 0xce },
+ },
+ {
+ { 0x00, 0x4d, 0x9a, 0xd7, 0x29, 0x64, 0xb3, 0xfe, 0x52, 0x1f, 0xc8, 0x85, 0x7b, 0x36, 0xe1, 0xac },
+ { 0x00, 0xa4, 0x55, 0xf1, 0xaa, 0x0e, 0xff, 0x5b, 0x49, 0xed, 0x1c, 0xb8, 0xe3, 0x47, 0xb6, 0x12 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x4f, 0x9e, 0xd1, 0x21, 0x6e, 0xbf, 0xf0, 0x42, 0x0d, 0xdc, 0x93, 0x63, 0x2c, 0xfd, 0xb2 },
+ { 0x00, 0x84, 0x15, 0x91, 0x2a, 0xae, 0x3f, 0xbb, 0x54, 0xd0, 0x41, 0xc5, 0x7e, 0xfa, 0x6b, 0xef },
+ },
+ {
+ { 0x00, 0x7c, 0xf8, 0x84, 0xed, 0x91, 0x15, 0x69, 0xc7, 0xbb, 0x3f, 0x43, 0x2a, 0x56, 0xd2, 0xae },
+ { 0x00, 0x93, 0x3b, 0xa8, 0x76, 0xe5, 0x4d, 0xde, 0xec, 0x7f, 0xd7, 0x44, 0x9a, 0x09, 0xa1, 0x32 },
+ },
+ {
+ { 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 },
+ { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, 0x75, 0x58, 0x2f, 0x02, 0xc1, 0xec, 0x9b, 0xb6 },
+ },
+ {
+ { 0x00, 0x77, 0xee, 0x99, 0xc1, 0xb6, 0x2f, 0x58, 0x9f, 0xe8, 0x71, 0x06, 0x5e, 0x29, 0xb0, 0xc7 },
+ { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, 0x05, 0x26, 0x43, 0x60, 0x89, 0xaa, 0xcf, 0xec },
+ },
+ },
+ {
+ {
+ { 0x00, 0xa3, 0x5b, 0xf8, 0xb6, 0x15, 0xed, 0x4e, 0x71, 0xd2, 0x2a, 0x89, 0xc7, 0x64, 0x9c, 0x3f },
+ { 0x00, 0xe2, 0xd9, 0x3b, 0xaf, 0x4d, 0x76, 0x94, 0x43, 0xa1, 0x9a, 0x78, 0xec, 0x0e, 0x35, 0xd7 },
+ },
+ {
+ { 0x00, 0xaf, 0x43, 0xec, 0x86, 0x29, 0xc5, 0x6a, 0x11, 0xbe, 0x52, 0xfd, 0x97, 0x38, 0xd4, 0x7b },
+ { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0x0d, 0x2f, 0x49, 0x6b, 0x85, 0xa7, 0xc1, 0xe3 },
+ },
+ {
+ { 0x00, 0x7e, 0xfc, 0x82, 0xe5, 0x9b, 0x19, 0x67, 0xd7, 0xa9, 0x2b, 0x55, 0x32, 0x4c, 0xce, 0xb0 },
+ { 0x00, 0xb3, 0x7b, 0xc8, 0xf6, 0x45, 0x8d, 0x3e, 0xf1, 0x42, 0x8a, 0x39, 0x07, 0xb4, 0x7c, 0xcf },
+ },
+ {
+ { 0x00, 0xda, 0xa9, 0x73, 0x4f, 0x95, 0xe6, 0x3c, 0x9e, 0x44, 0x37, 0xed, 0xd1, 0x0b, 0x78, 0xa2 },
+ { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, 0x15, 0x34, 0x57, 0x76, 0x91, 0xb0, 0xd3, 0xf2 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xca, 0x89, 0x43, 0x0f, 0xc5, 0x86, 0x4c, 0x1e, 0xd4, 0x97, 0x5d, 0x11, 0xdb, 0x98, 0x52 },
+ { 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, 0xfd, 0xc1, 0x85, 0xb9, 0x0d, 0x31, 0x75, 0x49 },
+ },
+ {
+ { 0x00, 0x6c, 0xd8, 0xb4, 0xad, 0xc1, 0x75, 0x19, 0x47, 0x2b, 0x9f, 0xf3, 0xea, 0x86, 0x32, 0x5e },
+ { 0x00, 0x8e, 0x01, 0x8f, 0x02, 0x8c, 0x03, 0x8d, 0x04, 0x8a, 0x05, 0x8b, 0x06, 0x88, 0x07, 0x89 },
+ },
+ {
+ { 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 },
+ { 0x00, 0x80, 0x1d, 0x9d, 0x3a, 0xba, 0x27, 0xa7, 0x74, 0xf4, 0x69, 0xe9, 0x4e, 0xce, 0x53, 0xd3 },
+ },
+ {
+ { 0x00, 0xfd, 0xe7, 0x1a, 0xd3, 0x2e, 0x34, 0xc9, 0xbb, 0x46, 0x5c, 0xa1, 0x68, 0x95, 0x8f, 0x72 },
+ { 0x00, 0x6b, 0xd6, 0xbd, 0xb1, 0xda, 0x67, 0x0c, 0x7f, 0x14, 0xa9, 0xc2, 0xce, 0xa5, 0x18, 0x73 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x48, 0x90, 0xd8, 0x3d, 0x75, 0xad, 0xe5, 0x7a, 0x32, 0xea, 0xa2, 0x47, 0x0f, 0xd7, 0x9f },
+ { 0x00, 0xf4, 0xf5, 0x01, 0xf7, 0x03, 0x02, 0xf6, 0xf3, 0x07, 0x06, 0xf2, 0x04, 0xf0, 0xf1, 0x05 },
+ },
+ {
+ { 0x00, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x30, 0xe1, 0x50, 0x9e, 0x2f, 0x1f, 0xae, 0x60, 0xd1 },
+ { 0x00, 0xdf, 0xa3, 0x7c, 0x5b, 0x84, 0xf8, 0x27, 0xb6, 0x69, 0x15, 0xca, 0xed, 0x32, 0x4e, 0x91 },
+ },
+ {
+ { 0x00, 0xdd, 0xa7, 0x7a, 0x53, 0x8e, 0xf4, 0x29, 0xa6, 0x7b, 0x01, 0xdc, 0xf5, 0x28, 0x52, 0x8f },
+ { 0x00, 0x51, 0xa2, 0xf3, 0x59, 0x08, 0xfb, 0xaa, 0xb2, 0xe3, 0x10, 0x41, 0xeb, 0xba, 0x49, 0x18 },
+ },
+ {
+ { 0x00, 0xc3, 0x9b, 0x58, 0x2b, 0xe8, 0xb0, 0x73, 0x56, 0x95, 0xcd, 0x0e, 0x7d, 0xbe, 0xe6, 0x25 },
+ { 0x00, 0xac, 0x45, 0xe9, 0x8a, 0x26, 0xcf, 0x63, 0x09, 0xa5, 0x4c, 0xe0, 0x83, 0x2f, 0xc6, 0x6a },
+ },
+ },
+ {
+ {
+ { 0x00, 0xde, 0xa1, 0x7f, 0x5f, 0x81, 0xfe, 0x20, 0xbe, 0x60, 0x1f, 0xc1, 0xe1, 0x3f, 0x40, 0x9e },
+ { 0x00, 0x61, 0xc2, 0xa3, 0x99, 0xf8, 0x5b, 0x3a, 0x2f, 0x4e, 0xed, 0x8c, 0xb6, 0xd7, 0x74, 0x15 },
+ },
+ {
+ { 0x00, 0x9d, 0x27, 0xba, 0x4e, 0xd3, 0x69, 0xf4, 0x9c, 0x01, 0xbb, 0x26, 0xd2, 0x4f, 0xf5, 0x68 },
+ { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, 0x35, 0x10, 0x7f, 0x5a, 0xa1, 0x84, 0xeb, 0xce },
+ },
+ {
+ { 0x00, 0xd6, 0xb1, 0x67, 0x7f, 0xa9, 0xce, 0x18, 0xfe, 0x28, 0x4f, 0x99, 0x81, 0x57, 0x30, 0xe6 },
+ { 0x00, 0xe1, 0xdf, 0x3e, 0xa3, 0x42, 0x7c, 0x9d, 0x5b, 0xba, 0x84, 0x65, 0xf8, 0x19, 0x27, 0xc6 },
+ },
+ {
+ { 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, 0xdd, 0xe5, 0xad, 0x95, 0x3d, 0x05, 0x4d, 0x75 },
+ { 0x00, 0xa7, 0x53, 0xf4, 0xa6, 0x01, 0xf5, 0x52, 0x51, 0xf6, 0x02, 0xa5, 0xf7, 0x50, 0xa4, 0x03 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0xcd, 0x87, 0x4a, 0x13, 0xde, 0x94, 0x59, 0x26, 0xeb, 0xa1, 0x6c, 0x35, 0xf8, 0xb2, 0x7f },
+ },
+ {
+ { 0x00, 0xfc, 0xe5, 0x19, 0xd7, 0x2b, 0x32, 0xce, 0xb3, 0x4f, 0x56, 0xaa, 0x64, 0x98, 0x81, 0x7d },
+ { 0x00, 0x7b, 0xf6, 0x8d, 0xf1, 0x8a, 0x07, 0x7c, 0xff, 0x84, 0x09, 0x72, 0x0e, 0x75, 0xf8, 0x83 },
+ },
+ {
+ { 0x00, 0x5e, 0xbc, 0xe2, 0x65, 0x3b, 0xd9, 0x87, 0xca, 0x94, 0x76, 0x28, 0xaf, 0xf1, 0x13, 0x4d },
+ { 0x00, 0x89, 0x0f, 0x86, 0x1e, 0x97, 0x11, 0x98, 0x3c, 0xb5, 0x33, 0xba, 0x22, 0xab, 0x2d, 0xa4 },
+ },
+ {
+ { 0x00, 0x6d, 0xda, 0xb7, 0xa9, 0xc4, 0x73, 0x1e, 0x4f, 0x22, 0x95, 0xf8, 0xe6, 0x8b, 0x3c, 0x51 },
+ { 0x00, 0x9e, 0x21, 0xbf, 0x42, 0xdc, 0x63, 0xfd, 0x84, 0x1a, 0xa5, 0x3b, 0xc6, 0x58, 0xe7, 0x79 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xa8, 0x4d, 0xe5, 0x9a, 0x32, 0xd7, 0x7f, 0x29, 0x81, 0x64, 0xcc, 0xb3, 0x1b, 0xfe, 0x56 },
+ { 0x00, 0x52, 0xa4, 0xf6, 0x55, 0x07, 0xf1, 0xa3, 0xaa, 0xf8, 0x0e, 0x5c, 0xff, 0xad, 0x5b, 0x09 },
+ },
+ {
+ { 0x00, 0x92, 0x39, 0xab, 0x72, 0xe0, 0x4b, 0xd9, 0xe4, 0x76, 0xdd, 0x4f, 0x96, 0x04, 0xaf, 0x3d },
+ { 0x00, 0xd5, 0xb7, 0x62, 0x73, 0xa6, 0xc4, 0x11, 0xe6, 0x33, 0x51, 0x84, 0x95, 0x40, 0x22, 0xf7 },
+ },
+ {
+ { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c, 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c },
+ { 0x00, 0x40, 0x80, 0xc0, 0x1d, 0x5d, 0x9d, 0xdd, 0x3a, 0x7a, 0xba, 0xfa, 0x27, 0x67, 0xa7, 0xe7 },
+ },
+ {
+ { 0x00, 0xee, 0xc1, 0x2f, 0x9f, 0x71, 0x5e, 0xb0, 0x23, 0xcd, 0xe2, 0x0c, 0xbc, 0x52, 0x7d, 0x93 },
+ { 0x00, 0x46, 0x8c, 0xca, 0x05, 0x43, 0x89, 0xcf, 0x0a, 0x4c, 0x86, 0xc0, 0x0f, 0x49, 0x83, 0xc5 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd },
+ { 0x00, 0x6d, 0xda, 0xb7, 0xa9, 0xc4, 0x73, 0x1e, 0x4f, 0x22, 0x95, 0xf8, 0xe6, 0x8b, 0x3c, 0x51 },
+ },
+ {
+ { 0x00, 0xd6, 0xb1, 0x67, 0x7f, 0xa9, 0xce, 0x18, 0xfe, 0x28, 0x4f, 0x99, 0x81, 0x57, 0x30, 0xe6 },
+ { 0x00, 0xe1, 0xdf, 0x3e, 0xa3, 0x42, 0x7c, 0x9d, 0x5b, 0xba, 0x84, 0x65, 0xf8, 0x19, 0x27, 0xc6 },
+ },
+ {
+ { 0x00, 0xfc, 0xe5, 0x19, 0xd7, 0x2b, 0x32, 0xce, 0xb3, 0x4f, 0x56, 0xaa, 0x64, 0x98, 0x81, 0x7d },
+ { 0x00, 0x7b, 0xf6, 0x8d, 0xf1, 0x8a, 0x07, 0x7c, 0xff, 0x84, 0x09, 0x72, 0x0e, 0x75, 0xf8, 0x83 },
+ },
+ {
+ { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 },
+ { 0x00, 0x90, 0x3d, 0xad, 0x7a, 0xea, 0x47, 0xd7, 0xf4, 0x64, 0xc9, 0x59, 0x8e, 0x1e, 0xb3, 0x23 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x6f, 0xde, 0xb1, 0xa1, 0xce, 0x7f, 0x10, 0x5f, 0x30, 0x81, 0xee, 0xfe, 0x91, 0x20, 0x4f },
+ { 0x00, 0xbe, 0x61, 0xdf, 0xc2, 0x7c, 0xa3, 0x1d, 0x99, 0x27, 0xf8, 0x46, 0x5b, 0xe5, 0x3a, 0x84 },
+ },
+ {
+ { 0x00, 0xd8, 0xad, 0x75, 0x47, 0x9f, 0xea, 0x32, 0x8e, 0x56, 0x23, 0xfb, 0xc9, 0x11, 0x64, 0xbc },
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+ },
+ {
+ { 0x00, 0x5b, 0xb6, 0xed, 0x71, 0x2a, 0xc7, 0x9c, 0xe2, 0xb9, 0x54, 0x0f, 0x93, 0xc8, 0x25, 0x7e },
+ { 0x00, 0xd9, 0xaf, 0x76, 0x43, 0x9a, 0xec, 0x35, 0x86, 0x5f, 0x29, 0xf0, 0xc5, 0x1c, 0x6a, 0xb3 },
+ },
+ {
+ { 0x00, 0x47, 0x8e, 0xc9, 0x01, 0x46, 0x8f, 0xc8, 0x02, 0x45, 0x8c, 0xcb, 0x03, 0x44, 0x8d, 0xca },
+ { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c, 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c },
+ },
+ },
+ {
+ {
+ { 0x00, 0x90, 0x3d, 0xad, 0x7a, 0xea, 0x47, 0xd7, 0xf4, 0x64, 0xc9, 0x59, 0x8e, 0x1e, 0xb3, 0x23 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ },
+ {
+ { 0x00, 0xff, 0xe3, 0x1c, 0xdb, 0x24, 0x38, 0xc7, 0xab, 0x54, 0x48, 0xb7, 0x70, 0x8f, 0x93, 0x6c },
+ { 0x00, 0x4b, 0x96, 0xdd, 0x31, 0x7a, 0xa7, 0xec, 0x62, 0x29, 0xf4, 0xbf, 0x53, 0x18, 0xc5, 0x8e },
+ },
+ {
+ { 0x00, 0xec, 0xc5, 0x29, 0x97, 0x7b, 0x52, 0xbe, 0x33, 0xdf, 0xf6, 0x1a, 0xa4, 0x48, 0x61, 0x8d },
+ { 0x00, 0x66, 0xcc, 0xaa, 0x85, 0xe3, 0x49, 0x2f, 0x17, 0x71, 0xdb, 0xbd, 0x92, 0xf4, 0x5e, 0x38 },
+ },
+ {
+ { 0x00, 0xa3, 0x5b, 0xf8, 0xb6, 0x15, 0xed, 0x4e, 0x71, 0xd2, 0x2a, 0x89, 0xc7, 0x64, 0x9c, 0x3f },
+ { 0x00, 0xe2, 0xd9, 0x3b, 0xaf, 0x4d, 0x76, 0x94, 0x43, 0xa1, 0x9a, 0x78, 0xec, 0x0e, 0x35, 0xd7 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xaa, 0x49, 0xe3, 0x92, 0x38, 0xdb, 0x71, 0x39, 0x93, 0x70, 0xda, 0xab, 0x01, 0xe2, 0x48 },
+ { 0x00, 0x72, 0xe4, 0x96, 0xd5, 0xa7, 0x31, 0x43, 0xb7, 0xc5, 0x53, 0x21, 0x62, 0x10, 0x86, 0xf4 },
+ },
+ {
+ { 0x00, 0xa7, 0x53, 0xf4, 0xa6, 0x01, 0xf5, 0x52, 0x51, 0xf6, 0x02, 0xa5, 0xf7, 0x50, 0xa4, 0x03 },
+ { 0x00, 0xa2, 0x59, 0xfb, 0xb2, 0x10, 0xeb, 0x49, 0x79, 0xdb, 0x20, 0x82, 0xcb, 0x69, 0x92, 0x30 },
+ },
+ {
+ { 0x00, 0xef, 0xc3, 0x2c, 0x9b, 0x74, 0x58, 0xb7, 0x2b, 0xc4, 0xe8, 0x07, 0xb0, 0x5f, 0x73, 0x9c },
+ { 0x00, 0x56, 0xac, 0xfa, 0x45, 0x13, 0xe9, 0xbf, 0x8a, 0xdc, 0x26, 0x70, 0xcf, 0x99, 0x63, 0x35 },
+ },
+ {
+ { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b, 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x5d, 0x0d, 0xfd, 0xad, 0xba, 0xea, 0x1a, 0x4a, 0xe7, 0xb7, 0x47, 0x17 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, 0x95, 0xa4, 0xf7, 0xc6, 0x51, 0x60, 0x33, 0x02 },
+ { 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, 0xa5, 0x92, 0xcb, 0xfc, 0x79, 0x4e, 0x17, 0x20 },
+ },
+ {
+ { 0x00, 0x77, 0xee, 0x99, 0xc1, 0xb6, 0x2f, 0x58, 0x9f, 0xe8, 0x71, 0x06, 0x5e, 0x29, 0xb0, 0xc7 },
+ { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, 0x05, 0x26, 0x43, 0x60, 0x89, 0xaa, 0xcf, 0xec },
+ },
+ {
+ { 0x00, 0xf1, 0xff, 0x0e, 0xe3, 0x12, 0x1c, 0xed, 0xdb, 0x2a, 0x24, 0xd5, 0x38, 0xc9, 0xc7, 0x36 },
+ { 0x00, 0xab, 0x4b, 0xe0, 0x96, 0x3d, 0xdd, 0x76, 0x31, 0x9a, 0x7a, 0xd1, 0xa7, 0x0c, 0xec, 0x47 },
+ },
+ {
+ { 0x00, 0xde, 0xa1, 0x7f, 0x5f, 0x81, 0xfe, 0x20, 0xbe, 0x60, 0x1f, 0xc1, 0xe1, 0x3f, 0x40, 0x9e },
+ { 0x00, 0x61, 0xc2, 0xa3, 0x99, 0xf8, 0x5b, 0x3a, 0x2f, 0x4e, 0xed, 0x8c, 0xb6, 0xd7, 0x74, 0x15 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x5a, 0xb4, 0xee, 0x75, 0x2f, 0xc1, 0x9b, 0xea, 0xb0, 0x5e, 0x04, 0x9f, 0xc5, 0x2b, 0x71 },
+ { 0x00, 0xc9, 0x8f, 0x46, 0x03, 0xca, 0x8c, 0x45, 0x06, 0xcf, 0x89, 0x40, 0x05, 0xcc, 0x8a, 0x43 },
+ },
+ {
+ { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c, 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c },
+ { 0x00, 0x40, 0x80, 0xc0, 0x1d, 0x5d, 0x9d, 0xdd, 0x3a, 0x7a, 0xba, 0xfa, 0x27, 0x67, 0xa7, 0xe7 },
+ },
+ {
+ { 0x00, 0x6e, 0xdc, 0xb2, 0xa5, 0xcb, 0x79, 0x17, 0x57, 0x39, 0x8b, 0xe5, 0xf2, 0x9c, 0x2e, 0x40 },
+ { 0x00, 0xae, 0x41, 0xef, 0x82, 0x2c, 0xc3, 0x6d, 0x19, 0xb7, 0x58, 0xf6, 0x9b, 0x35, 0xda, 0x74 },
+ },
+ {
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ { 0x00, 0xb2, 0x79, 0xcb, 0xf2, 0x40, 0x8b, 0x39, 0xf9, 0x4b, 0x80, 0x32, 0x0b, 0xb9, 0x72, 0xc0 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xf3, 0xfb, 0x08, 0xeb, 0x18, 0x10, 0xe3, 0xcb, 0x38, 0x30, 0xc3, 0x20, 0xd3, 0xdb, 0x28 },
+ { 0x00, 0x8b, 0x0b, 0x80, 0x16, 0x9d, 0x1d, 0x96, 0x2c, 0xa7, 0x27, 0xac, 0x3a, 0xb1, 0x31, 0xba },
+ },
+ {
+ { 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 },
+ { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, 0x75, 0x58, 0x2f, 0x02, 0xc1, 0xec, 0x9b, 0xb6 },
+ },
+ {
+ { 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa },
+ { 0x00, 0xfd, 0xe7, 0x1a, 0xd3, 0x2e, 0x34, 0xc9, 0xbb, 0x46, 0x5c, 0xa1, 0x68, 0x95, 0x8f, 0x72 },
+ },
+ {
+ { 0x00, 0xf1, 0xff, 0x0e, 0xe3, 0x12, 0x1c, 0xed, 0xdb, 0x2a, 0x24, 0xd5, 0x38, 0xc9, 0xc7, 0x36 },
+ { 0x00, 0xab, 0x4b, 0xe0, 0x96, 0x3d, 0xdd, 0x76, 0x31, 0x9a, 0x7a, 0xd1, 0xa7, 0x0c, 0xec, 0x47 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xe9, 0xcf, 0x26, 0x83, 0x6a, 0x4c, 0xa5, 0x1b, 0xf2, 0xd4, 0x3d, 0x98, 0x71, 0x57, 0xbe },
+ { 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, 0xad, 0x9b, 0xc1, 0xf7, 0x75, 0x43, 0x19, 0x2f },
+ },
+ {
+ { 0x00, 0x73, 0xe6, 0x95, 0xd1, 0xa2, 0x37, 0x44, 0xbf, 0xcc, 0x59, 0x2a, 0x6e, 0x1d, 0x88, 0xfb },
+ { 0x00, 0x63, 0xc6, 0xa5, 0x91, 0xf2, 0x57, 0x34, 0x3f, 0x5c, 0xf9, 0x9a, 0xae, 0xcd, 0x68, 0x0b },
+ },
+ {
+ { 0x00, 0x77, 0xee, 0x99, 0xc1, 0xb6, 0x2f, 0x58, 0x9f, 0xe8, 0x71, 0x06, 0x5e, 0x29, 0xb0, 0xc7 },
+ { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, 0x05, 0x26, 0x43, 0x60, 0x89, 0xaa, 0xcf, 0xec },
+ },
+ {
+ { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0x0d, 0x2f, 0x49, 0x6b, 0x85, 0xa7, 0xc1, 0xe3 },
+ { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xa9, 0x4f, 0xe6, 0x9e, 0x37, 0xd1, 0x78, 0x21, 0x88, 0x6e, 0xc7, 0xbf, 0x16, 0xf0, 0x59 },
+ { 0x00, 0x42, 0x84, 0xc6, 0x15, 0x57, 0x91, 0xd3, 0x2a, 0x68, 0xae, 0xec, 0x3f, 0x7d, 0xbb, 0xf9 },
+ },
+ {
+ { 0x00, 0x66, 0xcc, 0xaa, 0x85, 0xe3, 0x49, 0x2f, 0x17, 0x71, 0xdb, 0xbd, 0x92, 0xf4, 0x5e, 0x38 },
+ { 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, 0x6d, 0x43, 0x31, 0x1f, 0xd5, 0xfb, 0x89, 0xa7 },
+ },
+ {
+ { 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, 0x3d, 0x19, 0x75, 0x51, 0xad, 0x89, 0xe5, 0xc1 },
+ { 0x00, 0x7a, 0xf4, 0x8e, 0xf5, 0x8f, 0x01, 0x7b, 0xf7, 0x8d, 0x03, 0x79, 0x02, 0x78, 0xf6, 0x8c },
+ },
+ {
+ { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, 0x35, 0x10, 0x7f, 0x5a, 0xa1, 0x84, 0xeb, 0xce },
+ { 0x00, 0x6a, 0xd4, 0xbe, 0xb5, 0xdf, 0x61, 0x0b, 0x77, 0x1d, 0xa3, 0xc9, 0xc2, 0xa8, 0x16, 0x7c },
+ },
+ },
+ {
+ {
+ { 0x00, 0x44, 0x88, 0xcc, 0x0d, 0x49, 0x85, 0xc1, 0x1a, 0x5e, 0x92, 0xd6, 0x17, 0x53, 0x9f, 0xdb },
+ { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, 0xbd, 0x89, 0xd5, 0xe1, 0x6d, 0x59, 0x05, 0x31 },
+ },
+ {
+ { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, 0x5d, 0x75, 0x0d, 0x25, 0xfd, 0xd5, 0xad, 0x85 },
+ { 0x00, 0xba, 0x69, 0xd3, 0xd2, 0x68, 0xbb, 0x01, 0xb9, 0x03, 0xd0, 0x6a, 0x6b, 0xd1, 0x02, 0xb8 },
+ },
+ {
+ { 0x00, 0xe6, 0xd1, 0x37, 0xbf, 0x59, 0x6e, 0x88, 0x63, 0x85, 0xb2, 0x54, 0xdc, 0x3a, 0x0d, 0xeb },
+ { 0x00, 0xc6, 0x91, 0x57, 0x3f, 0xf9, 0xae, 0x68, 0x7e, 0xb8, 0xef, 0x29, 0x41, 0x87, 0xd0, 0x16 },
+ },
+ {
+ { 0x00, 0x6a, 0xd4, 0xbe, 0xb5, 0xdf, 0x61, 0x0b, 0x77, 0x1d, 0xa3, 0xc9, 0xc2, 0xa8, 0x16, 0x7c },
+ { 0x00, 0xee, 0xc1, 0x2f, 0x9f, 0x71, 0x5e, 0xb0, 0x23, 0xcd, 0xe2, 0x0c, 0xbc, 0x52, 0x7d, 0x93 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x9d, 0xad, 0xfd, 0xcd, 0x5d, 0x6d, 0x3d, 0x0d },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ },
+ {
+ { 0x00, 0x7d, 0xfa, 0x87, 0xe9, 0x94, 0x13, 0x6e, 0xcf, 0xb2, 0x35, 0x48, 0x26, 0x5b, 0xdc, 0xa1 },
+ { 0x00, 0x83, 0x1b, 0x98, 0x36, 0xb5, 0x2d, 0xae, 0x6c, 0xef, 0x77, 0xf4, 0x5a, 0xd9, 0x41, 0xc2 },
+ },
+ {
+ { 0x00, 0xc6, 0x91, 0x57, 0x3f, 0xf9, 0xae, 0x68, 0x7e, 0xb8, 0xef, 0x29, 0x41, 0x87, 0xd0, 0x16 },
+ { 0x00, 0xfc, 0xe5, 0x19, 0xd7, 0x2b, 0x32, 0xce, 0xb3, 0x4f, 0x56, 0xaa, 0x64, 0x98, 0x81, 0x7d },
+ },
+ {
+ { 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44 },
+ { 0x00, 0xc0, 0x9d, 0x5d, 0x27, 0xe7, 0xba, 0x7a, 0x4e, 0x8e, 0xd3, 0x13, 0x69, 0xa9, 0xf4, 0x34 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x56, 0xac, 0xfa, 0x45, 0x13, 0xe9, 0xbf, 0x8a, 0xdc, 0x26, 0x70, 0xcf, 0x99, 0x63, 0x35 },
+ { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 },
+ },
+ {
+ { 0x00, 0x83, 0x1b, 0x98, 0x36, 0xb5, 0x2d, 0xae, 0x6c, 0xef, 0x77, 0xf4, 0x5a, 0xd9, 0x41, 0xc2 },
+ { 0x00, 0xd8, 0xad, 0x75, 0x47, 0x9f, 0xea, 0x32, 0x8e, 0x56, 0x23, 0xfb, 0xc9, 0x11, 0x64, 0xbc },
+ },
+ {
+ { 0x00, 0xaa, 0x49, 0xe3, 0x92, 0x38, 0xdb, 0x71, 0x39, 0x93, 0x70, 0xda, 0xab, 0x01, 0xe2, 0x48 },
+ { 0x00, 0x72, 0xe4, 0x96, 0xd5, 0xa7, 0x31, 0x43, 0xb7, 0xc5, 0x53, 0x21, 0x62, 0x10, 0x86, 0xf4 },
+ },
+ {
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ { 0x00, 0xf8, 0xed, 0x15, 0xc7, 0x3f, 0x2a, 0xd2, 0x93, 0x6b, 0x7e, 0x86, 0x54, 0xac, 0xb9, 0x41 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 },
+ { 0x00, 0x90, 0x3d, 0xad, 0x7a, 0xea, 0x47, 0xd7, 0xf4, 0x64, 0xc9, 0x59, 0x8e, 0x1e, 0xb3, 0x23 },
+ },
+ {
+ { 0x00, 0xfb, 0xeb, 0x10, 0xcb, 0x30, 0x20, 0xdb, 0x8b, 0x70, 0x60, 0x9b, 0x40, 0xbb, 0xab, 0x50 },
+ { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 },
+ },
+ {
+ { 0x00, 0xcb, 0x8b, 0x40, 0x0b, 0xc0, 0x80, 0x4b, 0x16, 0xdd, 0x9d, 0x56, 0x1d, 0xd6, 0x96, 0x5d },
+ { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, 0x7d, 0x51, 0x25, 0x09, 0xcd, 0xe1, 0x95, 0xb9 },
+ },
+ {
+ { 0x00, 0xb2, 0x79, 0xcb, 0xf2, 0x40, 0x8b, 0x39, 0xf9, 0x4b, 0x80, 0x32, 0x0b, 0xb9, 0x72, 0xc0 },
+ { 0x00, 0xef, 0xc3, 0x2c, 0x9b, 0x74, 0x58, 0xb7, 0x2b, 0xc4, 0xe8, 0x07, 0xb0, 0x5f, 0x73, 0x9c },
+ },
+ },
+ {
+ {
+ { 0x00, 0x59, 0xb2, 0xeb, 0x79, 0x20, 0xcb, 0x92, 0xf2, 0xab, 0x40, 0x19, 0x8b, 0xd2, 0x39, 0x60 },
+ { 0x00, 0xf9, 0xef, 0x16, 0xc3, 0x3a, 0x2c, 0xd5, 0x9b, 0x62, 0x74, 0x8d, 0x58, 0xa1, 0xb7, 0x4e },
+ },
+ {
+ { 0x00, 0x5f, 0xbe, 0xe1, 0x61, 0x3e, 0xdf, 0x80, 0xc2, 0x9d, 0x7c, 0x23, 0xa3, 0xfc, 0x1d, 0x42 },
+ { 0x00, 0x99, 0x2f, 0xb6, 0x5e, 0xc7, 0x71, 0xe8, 0xbc, 0x25, 0x93, 0x0a, 0xe2, 0x7b, 0xcd, 0x54 },
+ },
+ {
+ { 0x00, 0xfd, 0xe7, 0x1a, 0xd3, 0x2e, 0x34, 0xc9, 0xbb, 0x46, 0x5c, 0xa1, 0x68, 0x95, 0x8f, 0x72 },
+ { 0x00, 0x6b, 0xd6, 0xbd, 0xb1, 0xda, 0x67, 0x0c, 0x7f, 0x14, 0xa9, 0xc2, 0xce, 0xa5, 0x18, 0x73 },
+ },
+ {
+ { 0x00, 0x6b, 0xd6, 0xbd, 0xb1, 0xda, 0x67, 0x0c, 0x7f, 0x14, 0xa9, 0xc2, 0xce, 0xa5, 0x18, 0x73 },
+ { 0x00, 0xfe, 0xe1, 0x1f, 0xdf, 0x21, 0x3e, 0xc0, 0xa3, 0x5d, 0x42, 0xbc, 0x7c, 0x82, 0x9d, 0x63 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x6a, 0xd4, 0xbe, 0xb5, 0xdf, 0x61, 0x0b, 0x77, 0x1d, 0xa3, 0xc9, 0xc2, 0xa8, 0x16, 0x7c },
+ { 0x00, 0xee, 0xc1, 0x2f, 0x9f, 0x71, 0x5e, 0xb0, 0x23, 0xcd, 0xe2, 0x0c, 0xbc, 0x52, 0x7d, 0x93 },
+ },
+ {
+ { 0x00, 0x63, 0xc6, 0xa5, 0x91, 0xf2, 0x57, 0x34, 0x3f, 0x5c, 0xf9, 0x9a, 0xae, 0xcd, 0x68, 0x0b },
+ { 0x00, 0x7e, 0xfc, 0x82, 0xe5, 0x9b, 0x19, 0x67, 0xd7, 0xa9, 0x2b, 0x55, 0x32, 0x4c, 0xce, 0xb0 },
+ },
+ {
+ { 0x00, 0x51, 0xa2, 0xf3, 0x59, 0x08, 0xfb, 0xaa, 0xb2, 0xe3, 0x10, 0x41, 0xeb, 0xba, 0x49, 0x18 },
+ { 0x00, 0x79, 0xf2, 0x8b, 0xf9, 0x80, 0x0b, 0x72, 0xef, 0x96, 0x1d, 0x64, 0x16, 0x6f, 0xe4, 0x9d },
+ },
+ {
+ { 0x00, 0x58, 0xb0, 0xe8, 0x7d, 0x25, 0xcd, 0x95, 0xfa, 0xa2, 0x4a, 0x12, 0x87, 0xdf, 0x37, 0x6f },
+ { 0x00, 0xe9, 0xcf, 0x26, 0x83, 0x6a, 0x4c, 0xa5, 0x1b, 0xf2, 0xd4, 0x3d, 0x98, 0x71, 0x57, 0xbe },
+ },
+ },
+ {
+ {
+ { 0x00, 0x42, 0x84, 0xc6, 0x15, 0x57, 0x91, 0xd3, 0x2a, 0x68, 0xae, 0xec, 0x3f, 0x7d, 0xbb, 0xf9 },
+ { 0x00, 0x54, 0xa8, 0xfc, 0x4d, 0x19, 0xe5, 0xb1, 0x9a, 0xce, 0x32, 0x66, 0xd7, 0x83, 0x7f, 0x2b },
+ },
+ {
+ { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, 0x35, 0x10, 0x7f, 0x5a, 0xa1, 0x84, 0xeb, 0xce },
+ { 0x00, 0x6a, 0xd4, 0xbe, 0xb5, 0xdf, 0x61, 0x0b, 0x77, 0x1d, 0xa3, 0xc9, 0xc2, 0xa8, 0x16, 0x7c },
+ },
+ {
+ { 0x00, 0x67, 0xce, 0xa9, 0x81, 0xe6, 0x4f, 0x28, 0x1f, 0x78, 0xd1, 0xb6, 0x9e, 0xf9, 0x50, 0x37 },
+ { 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0xed, 0xd3, 0x91, 0xaf, 0x15, 0x2b, 0x69, 0x57 },
+ },
+ {
+ { 0x00, 0xd5, 0xb7, 0x62, 0x73, 0xa6, 0xc4, 0x11, 0xe6, 0x33, 0x51, 0x84, 0x95, 0x40, 0x22, 0xf7 },
+ { 0x00, 0xd1, 0xbf, 0x6e, 0x63, 0xb2, 0xdc, 0x0d, 0xc6, 0x17, 0x79, 0xa8, 0xa5, 0x74, 0x1a, 0xcb },
+ },
+ },
+ {
+ {
+ { 0x00, 0xcd, 0x87, 0x4a, 0x13, 0xde, 0x94, 0x59, 0x26, 0xeb, 0xa1, 0x6c, 0x35, 0xf8, 0xb2, 0x7f },
+ { 0x00, 0x4c, 0x98, 0xd4, 0x2d, 0x61, 0xb5, 0xf9, 0x5a, 0x16, 0xc2, 0x8e, 0x77, 0x3b, 0xef, 0xa3 },
+ },
+ {
+ { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 },
+ { 0x00, 0x8d, 0x07, 0x8a, 0x0e, 0x83, 0x09, 0x84, 0x1c, 0x91, 0x1b, 0x96, 0x12, 0x9f, 0x15, 0x98 },
+ },
+ {
+ { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12, 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x9d, 0xfd, 0x5d, 0x3d, 0x27, 0x47, 0xe7, 0x87, 0xba, 0xda, 0x7a, 0x1a },
+ },
+ {
+ { 0x00, 0xb3, 0x7b, 0xc8, 0xf6, 0x45, 0x8d, 0x3e, 0xf1, 0x42, 0x8a, 0x39, 0x07, 0xb4, 0x7c, 0xcf },
+ { 0x00, 0xff, 0xe3, 0x1c, 0xdb, 0x24, 0x38, 0xc7, 0xab, 0x54, 0x48, 0xb7, 0x70, 0x8f, 0x93, 0x6c },
+ },
+ },
+ {
+ {
+ { 0x00, 0xe5, 0xd7, 0x32, 0xb3, 0x56, 0x64, 0x81, 0x7b, 0x9e, 0xac, 0x49, 0xc8, 0x2d, 0x1f, 0xfa },
+ { 0x00, 0xf6, 0xf1, 0x07, 0xff, 0x09, 0x0e, 0xf8, 0xe3, 0x15, 0x12, 0xe4, 0x1c, 0xea, 0xed, 0x1b },
+ },
+ {
+ { 0x00, 0xbd, 0x67, 0xda, 0xce, 0x73, 0xa9, 0x14, 0x81, 0x3c, 0xe6, 0x5b, 0x4f, 0xf2, 0x28, 0x95 },
+ { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 },
+ },
+ {
+ { 0x00, 0x6a, 0xd4, 0xbe, 0xb5, 0xdf, 0x61, 0x0b, 0x77, 0x1d, 0xa3, 0xc9, 0xc2, 0xa8, 0x16, 0x7c },
+ { 0x00, 0xee, 0xc1, 0x2f, 0x9f, 0x71, 0x5e, 0xb0, 0x23, 0xcd, 0xe2, 0x0c, 0xbc, 0x52, 0x7d, 0x93 },
+ },
+ {
+ { 0x00, 0xfc, 0xe5, 0x19, 0xd7, 0x2b, 0x32, 0xce, 0xb3, 0x4f, 0x56, 0xaa, 0x64, 0x98, 0x81, 0x7d },
+ { 0x00, 0x7b, 0xf6, 0x8d, 0xf1, 0x8a, 0x07, 0x7c, 0xff, 0x84, 0x09, 0x72, 0x0e, 0x75, 0xf8, 0x83 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xd6, 0xb1, 0x67, 0x7f, 0xa9, 0xce, 0x18, 0xfe, 0x28, 0x4f, 0x99, 0x81, 0x57, 0x30, 0xe6 },
+ { 0x00, 0xe1, 0xdf, 0x3e, 0xa3, 0x42, 0x7c, 0x9d, 0x5b, 0xba, 0x84, 0x65, 0xf8, 0x19, 0x27, 0xc6 },
+ },
+ {
+ { 0x00, 0xc5, 0x97, 0x52, 0x33, 0xf6, 0xa4, 0x61, 0x66, 0xa3, 0xf1, 0x34, 0x55, 0x90, 0xc2, 0x07 },
+ { 0x00, 0xcc, 0x85, 0x49, 0x17, 0xdb, 0x92, 0x5e, 0x2e, 0xe2, 0xab, 0x67, 0x39, 0xf5, 0xbc, 0x70 },
+ },
+ {
+ { 0x00, 0x4a, 0x94, 0xde, 0x35, 0x7f, 0xa1, 0xeb, 0x6a, 0x20, 0xfe, 0xb4, 0x5f, 0x15, 0xcb, 0x81 },
+ { 0x00, 0xd4, 0xb5, 0x61, 0x77, 0xa3, 0xc2, 0x16, 0xee, 0x3a, 0x5b, 0x8f, 0x99, 0x4d, 0x2c, 0xf8 },
+ },
+ {
+ { 0x00, 0xfb, 0xeb, 0x10, 0xcb, 0x30, 0x20, 0xdb, 0x8b, 0x70, 0x60, 0x9b, 0x40, 0xbb, 0xab, 0x50 },
+ { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x86, 0x11, 0x97, 0x22, 0xa4, 0x33, 0xb5, 0x44, 0xc2, 0x55, 0xd3, 0x66, 0xe0, 0x77, 0xf1 },
+ { 0x00, 0x88, 0x0d, 0x85, 0x1a, 0x92, 0x17, 0x9f, 0x34, 0xbc, 0x39, 0xb1, 0x2e, 0xa6, 0x23, 0xab },
+ },
+ {
+ { 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, 0xc5, 0xfe, 0xb3, 0x88, 0x29, 0x12, 0x5f, 0x64 },
+ { 0x00, 0x97, 0x33, 0xa4, 0x66, 0xf1, 0x55, 0xc2, 0xcc, 0x5b, 0xff, 0x68, 0xaa, 0x3d, 0x99, 0x0e },
+ },
+ {
+ { 0x00, 0x88, 0x0d, 0x85, 0x1a, 0x92, 0x17, 0x9f, 0x34, 0xbc, 0x39, 0xb1, 0x2e, 0xa6, 0x23, 0xab },
+ { 0x00, 0x68, 0xd0, 0xb8, 0xbd, 0xd5, 0x6d, 0x05, 0x67, 0x0f, 0xb7, 0xdf, 0xda, 0xb2, 0x0a, 0x62 },
+ },
+ {
+ { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, 0x5d, 0x75, 0x0d, 0x25, 0xfd, 0xd5, 0xad, 0x85 },
+ { 0x00, 0xba, 0x69, 0xd3, 0xd2, 0x68, 0xbb, 0x01, 0xb9, 0x03, 0xd0, 0x6a, 0x6b, 0xd1, 0x02, 0xb8 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xd9, 0xaf, 0x76, 0x43, 0x9a, 0xec, 0x35, 0x86, 0x5f, 0x29, 0xf0, 0xc5, 0x1c, 0x6a, 0xb3 },
+ { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff },
+ },
+ {
+ { 0x00, 0x6e, 0xdc, 0xb2, 0xa5, 0xcb, 0x79, 0x17, 0x57, 0x39, 0x8b, 0xe5, 0xf2, 0x9c, 0x2e, 0x40 },
+ { 0x00, 0xae, 0x41, 0xef, 0x82, 0x2c, 0xc3, 0x6d, 0x19, 0xb7, 0x58, 0xf6, 0x9b, 0x35, 0xda, 0x74 },
+ },
+ {
+ { 0x00, 0xdb, 0xab, 0x70, 0x4b, 0x90, 0xe0, 0x3b, 0x96, 0x4d, 0x3d, 0xe6, 0xdd, 0x06, 0x76, 0xad },
+ { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, 0x95, 0xa4, 0xf7, 0xc6, 0x51, 0x60, 0x33, 0x02 },
+ },
+ {
+ { 0x00, 0x7f, 0xfe, 0x81, 0xe1, 0x9e, 0x1f, 0x60, 0xdf, 0xa0, 0x21, 0x5e, 0x3e, 0x41, 0xc0, 0xbf },
+ { 0x00, 0xa3, 0x5b, 0xf8, 0xb6, 0x15, 0xed, 0x4e, 0x71, 0xd2, 0x2a, 0x89, 0xc7, 0x64, 0x9c, 0x3f },
+ },
+ },
+ {
+ {
+ { 0x00, 0xbf, 0x63, 0xdc, 0xc6, 0x79, 0xa5, 0x1a, 0x91, 0x2e, 0xf2, 0x4d, 0x57, 0xe8, 0x34, 0x8b },
+ { 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, 0xe5, 0xda, 0x9b, 0xa4, 0x19, 0x26, 0x67, 0x58 },
+ },
+ {
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x1d, 0x3d, 0x5d, 0x7d, 0x9d, 0xbd, 0xdd, 0xfd },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ },
+ {
+ { 0x00, 0xb2, 0x79, 0xcb, 0xf2, 0x40, 0x8b, 0x39, 0xf9, 0x4b, 0x80, 0x32, 0x0b, 0xb9, 0x72, 0xc0 },
+ { 0x00, 0xef, 0xc3, 0x2c, 0x9b, 0x74, 0x58, 0xb7, 0x2b, 0xc4, 0xe8, 0x07, 0xb0, 0x5f, 0x73, 0x9c },
+ },
+ {
+ { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15, 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d },
+ { 0x00, 0x70, 0xe0, 0x90, 0xdd, 0xad, 0x3d, 0x4d, 0xa7, 0xd7, 0x47, 0x37, 0x7a, 0x0a, 0x9a, 0xea },
+ },
+ },
+ {
+ {
+ { 0x00, 0xcb, 0x8b, 0x40, 0x0b, 0xc0, 0x80, 0x4b, 0x16, 0xdd, 0x9d, 0x56, 0x1d, 0xd6, 0x96, 0x5d },
+ { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, 0x7d, 0x51, 0x25, 0x09, 0xcd, 0xe1, 0x95, 0xb9 },
+ },
+ {
+ { 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, 0xb5, 0x80, 0xdf, 0xea, 0x61, 0x54, 0x0b, 0x3e },
+ { 0x00, 0x77, 0xee, 0x99, 0xc1, 0xb6, 0x2f, 0x58, 0x9f, 0xe8, 0x71, 0x06, 0x5e, 0x29, 0xb0, 0xc7 },
+ },
+ {
+ { 0x00, 0xc2, 0x99, 0x5b, 0x2f, 0xed, 0xb6, 0x74, 0x5e, 0x9c, 0xc7, 0x05, 0x71, 0xb3, 0xe8, 0x2a },
+ { 0x00, 0xbc, 0x65, 0xd9, 0xca, 0x76, 0xaf, 0x13, 0x89, 0x35, 0xec, 0x50, 0x43, 0xff, 0x26, 0x9a },
+ },
+ {
+ { 0x00, 0xdc, 0xa5, 0x79, 0x57, 0x8b, 0xf2, 0x2e, 0xae, 0x72, 0x0b, 0xd7, 0xf9, 0x25, 0x5c, 0x80 },
+ { 0x00, 0x41, 0x82, 0xc3, 0x19, 0x58, 0x9b, 0xda, 0x32, 0x73, 0xb0, 0xf1, 0x2b, 0x6a, 0xa9, 0xe8 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, 0x2d, 0x0b, 0x61, 0x47, 0xb5, 0x93, 0xf9, 0xdf },
+ { 0x00, 0x5a, 0xb4, 0xee, 0x75, 0x2f, 0xc1, 0x9b, 0xea, 0xb0, 0x5e, 0x04, 0x9f, 0xc5, 0x2b, 0x71 },
+ },
+ {
+ { 0x00, 0x55, 0xaa, 0xff, 0x49, 0x1c, 0xe3, 0xb6, 0x92, 0xc7, 0x38, 0x6d, 0xdb, 0x8e, 0x71, 0x24 },
+ { 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, 0xd5, 0xec, 0xa7, 0x9e, 0x31, 0x08, 0x43, 0x7a },
+ },
+ {
+ { 0x00, 0x5d, 0xba, 0xe7, 0x69, 0x34, 0xd3, 0x8e, 0xd2, 0x8f, 0x68, 0x35, 0xbb, 0xe6, 0x01, 0x5c },
+ { 0x00, 0xb9, 0x6f, 0xd6, 0xde, 0x67, 0xb1, 0x08, 0xa1, 0x18, 0xce, 0x77, 0x7f, 0xc6, 0x10, 0xa9 },
+ },
+ {
+ { 0x00, 0x7a, 0xf4, 0x8e, 0xf5, 0x8f, 0x01, 0x7b, 0xf7, 0x8d, 0x03, 0x79, 0x02, 0x78, 0xf6, 0x8c },
+ { 0x00, 0xf3, 0xfb, 0x08, 0xeb, 0x18, 0x10, 0xe3, 0xcb, 0x38, 0x30, 0xc3, 0x20, 0xd3, 0xdb, 0x28 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x66, 0xcc, 0xaa, 0x85, 0xe3, 0x49, 0x2f, 0x17, 0x71, 0xdb, 0xbd, 0x92, 0xf4, 0x5e, 0x38 },
+ { 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, 0x6d, 0x43, 0x31, 0x1f, 0xd5, 0xfb, 0x89, 0xa7 },
+ },
+ {
+ { 0x00, 0x42, 0x84, 0xc6, 0x15, 0x57, 0x91, 0xd3, 0x2a, 0x68, 0xae, 0xec, 0x3f, 0x7d, 0xbb, 0xf9 },
+ { 0x00, 0x54, 0xa8, 0xfc, 0x4d, 0x19, 0xe5, 0xb1, 0x9a, 0xce, 0x32, 0x66, 0xd7, 0x83, 0x7f, 0x2b },
+ },
+ {
+ { 0x00, 0x43, 0x86, 0xc5, 0x11, 0x52, 0x97, 0xd4, 0x22, 0x61, 0xa4, 0xe7, 0x33, 0x70, 0xb5, 0xf6 },
+ { 0x00, 0x44, 0x88, 0xcc, 0x0d, 0x49, 0x85, 0xc1, 0x1a, 0x5e, 0x92, 0xd6, 0x17, 0x53, 0x9f, 0xdb },
+ },
+ {
+ { 0x00, 0x9e, 0x21, 0xbf, 0x42, 0xdc, 0x63, 0xfd, 0x84, 0x1a, 0xa5, 0x3b, 0xc6, 0x58, 0xe7, 0x79 },
+ { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x7c, 0xf8, 0x84, 0xed, 0x91, 0x15, 0x69, 0xc7, 0xbb, 0x3f, 0x43, 0x2a, 0x56, 0xd2, 0xae },
+ { 0x00, 0x93, 0x3b, 0xa8, 0x76, 0xe5, 0x4d, 0xde, 0xec, 0x7f, 0xd7, 0x44, 0x9a, 0x09, 0xa1, 0x32 },
+ },
+ {
+ { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, 0x95, 0xa4, 0xf7, 0xc6, 0x51, 0x60, 0x33, 0x02 },
+ { 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, 0xa5, 0x92, 0xcb, 0xfc, 0x79, 0x4e, 0x17, 0x20 },
+ },
+ {
+ { 0x00, 0x40, 0x80, 0xc0, 0x1d, 0x5d, 0x9d, 0xdd, 0x3a, 0x7a, 0xba, 0xfa, 0x27, 0x67, 0xa7, 0xe7 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ },
+ {
+ { 0x00, 0xd0, 0xbd, 0x6d, 0x67, 0xb7, 0xda, 0x0a, 0xce, 0x1e, 0x73, 0xa3, 0xa9, 0x79, 0x14, 0xc4 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ },
+ },
+ {
+ {
+ { 0x00, 0xd5, 0xb7, 0x62, 0x73, 0xa6, 0xc4, 0x11, 0xe6, 0x33, 0x51, 0x84, 0x95, 0x40, 0x22, 0xf7 },
+ { 0x00, 0xd1, 0xbf, 0x6e, 0x63, 0xb2, 0xdc, 0x0d, 0xc6, 0x17, 0x79, 0xa8, 0xa5, 0x74, 0x1a, 0xcb },
+ },
+ {
+ { 0x00, 0xe1, 0xdf, 0x3e, 0xa3, 0x42, 0x7c, 0x9d, 0x5b, 0xba, 0x84, 0x65, 0xf8, 0x19, 0x27, 0xc6 },
+ { 0x00, 0xb6, 0x71, 0xc7, 0xe2, 0x54, 0x93, 0x25, 0xd9, 0x6f, 0xa8, 0x1e, 0x3b, 0x8d, 0x4a, 0xfc },
+ },
+ {
+ { 0x00, 0xf7, 0xf3, 0x04, 0xfb, 0x0c, 0x08, 0xff, 0xeb, 0x1c, 0x18, 0xef, 0x10, 0xe7, 0xe3, 0x14 },
+ { 0x00, 0xcb, 0x8b, 0x40, 0x0b, 0xc0, 0x80, 0x4b, 0x16, 0xdd, 0x9d, 0x56, 0x1d, 0xd6, 0x96, 0x5d },
+ },
+ {
+ { 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, 0xa5, 0x92, 0xcb, 0xfc, 0x79, 0x4e, 0x17, 0x20 },
+ { 0x00, 0x57, 0xae, 0xf9, 0x41, 0x16, 0xef, 0xb8, 0x82, 0xd5, 0x2c, 0x7b, 0xc3, 0x94, 0x6d, 0x3a },
+ },
+ },
+ {
+ {
+ { 0x00, 0xbe, 0x61, 0xdf, 0xc2, 0x7c, 0xa3, 0x1d, 0x99, 0x27, 0xf8, 0x46, 0x5b, 0xe5, 0x3a, 0x84 },
+ { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, 0x65, 0x4a, 0x3b, 0x14, 0xd9, 0xf6, 0x87, 0xa8 },
+ },
+ {
+ { 0x00, 0xb9, 0x6f, 0xd6, 0xde, 0x67, 0xb1, 0x08, 0xa1, 0x18, 0xce, 0x77, 0x7f, 0xc6, 0x10, 0xa9 },
+ { 0x00, 0x5f, 0xbe, 0xe1, 0x61, 0x3e, 0xdf, 0x80, 0xc2, 0x9d, 0x7c, 0x23, 0xa3, 0xfc, 0x1d, 0x42 },
+ },
+ {
+ { 0x00, 0x50, 0xa0, 0xf0, 0x5d, 0x0d, 0xfd, 0xad, 0xba, 0xea, 0x1a, 0x4a, 0xe7, 0xb7, 0x47, 0x17 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ },
+ {
+ { 0x00, 0xb4, 0x75, 0xc1, 0xea, 0x5e, 0x9f, 0x2b, 0xc9, 0x7d, 0xbc, 0x08, 0x23, 0x97, 0x56, 0xe2 },
+ { 0x00, 0x8f, 0x03, 0x8c, 0x06, 0x89, 0x05, 0x8a, 0x0c, 0x83, 0x0f, 0x80, 0x0a, 0x85, 0x09, 0x86 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, 0x35, 0x10, 0x7f, 0x5a, 0xa1, 0x84, 0xeb, 0xce },
+ { 0x00, 0x6a, 0xd4, 0xbe, 0xb5, 0xdf, 0x61, 0x0b, 0x77, 0x1d, 0xa3, 0xc9, 0xc2, 0xa8, 0x16, 0x7c },
+ },
+ {
+ { 0x00, 0x9e, 0x21, 0xbf, 0x42, 0xdc, 0x63, 0xfd, 0x84, 0x1a, 0xa5, 0x3b, 0xc6, 0x58, 0xe7, 0x79 },
+ { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 },
+ },
+ {
+ { 0x00, 0xa8, 0x4d, 0xe5, 0x9a, 0x32, 0xd7, 0x7f, 0x29, 0x81, 0x64, 0xcc, 0xb3, 0x1b, 0xfe, 0x56 },
+ { 0x00, 0x52, 0xa4, 0xf6, 0x55, 0x07, 0xf1, 0xa3, 0xaa, 0xf8, 0x0e, 0x5c, 0xff, 0xad, 0x5b, 0x09 },
+ },
+ {
+ { 0x00, 0xe1, 0xdf, 0x3e, 0xa3, 0x42, 0x7c, 0x9d, 0x5b, 0xba, 0x84, 0x65, 0xf8, 0x19, 0x27, 0xc6 },
+ { 0x00, 0xb6, 0x71, 0xc7, 0xe2, 0x54, 0x93, 0x25, 0xd9, 0x6f, 0xa8, 0x1e, 0x3b, 0x8d, 0x4a, 0xfc },
+ },
+ },
+ {
+ {
+ { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 },
+ { 0x00, 0xed, 0xc7, 0x2a, 0x93, 0x7e, 0x54, 0xb9, 0x3b, 0xd6, 0xfc, 0x11, 0xa8, 0x45, 0x6f, 0x82 },
+ },
+ {
+ { 0x00, 0x90, 0x3d, 0xad, 0x7a, 0xea, 0x47, 0xd7, 0xf4, 0x64, 0xc9, 0x59, 0x8e, 0x1e, 0xb3, 0x23 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ },
+ {
+ { 0x00, 0xf2, 0xf9, 0x0b, 0xef, 0x1d, 0x16, 0xe4, 0xc3, 0x31, 0x3a, 0xc8, 0x2c, 0xde, 0xd5, 0x27 },
+ { 0x00, 0x9b, 0x2b, 0xb0, 0x56, 0xcd, 0x7d, 0xe6, 0xac, 0x37, 0x87, 0x1c, 0xfa, 0x61, 0xd1, 0x4a },
+ },
+ {
+ { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 },
+ { 0x00, 0xbd, 0x67, 0xda, 0xce, 0x73, 0xa9, 0x14, 0x81, 0x3c, 0xe6, 0x5b, 0x4f, 0xf2, 0x28, 0x95 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xe0, 0xdd, 0x3d, 0xa7, 0x47, 0x7a, 0x9a, 0x53, 0xb3, 0x8e, 0x6e, 0xf4, 0x14, 0x29, 0xc9 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ },
+ {
+ { 0x00, 0xd4, 0xb5, 0x61, 0x77, 0xa3, 0xc2, 0x16, 0xee, 0x3a, 0x5b, 0x8f, 0x99, 0x4d, 0x2c, 0xf8 },
+ { 0x00, 0xc1, 0x9f, 0x5e, 0x23, 0xe2, 0xbc, 0x7d, 0x46, 0x87, 0xd9, 0x18, 0x65, 0xa4, 0xfa, 0x3b },
+ },
+ {
+ { 0x00, 0x7a, 0xf4, 0x8e, 0xf5, 0x8f, 0x01, 0x7b, 0xf7, 0x8d, 0x03, 0x79, 0x02, 0x78, 0xf6, 0x8c },
+ { 0x00, 0xf3, 0xfb, 0x08, 0xeb, 0x18, 0x10, 0xe3, 0xcb, 0x38, 0x30, 0xc3, 0x20, 0xd3, 0xdb, 0x28 },
+ },
+ {
+ { 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, 0x3d, 0x19, 0x75, 0x51, 0xad, 0x89, 0xe5, 0xc1 },
+ { 0x00, 0x7a, 0xf4, 0x8e, 0xf5, 0x8f, 0x01, 0x7b, 0xf7, 0x8d, 0x03, 0x79, 0x02, 0x78, 0xf6, 0x8c },
+ },
+ },
+ {
+ {
+ { 0x00, 0x98, 0x2d, 0xb5, 0x5a, 0xc2, 0x77, 0xef, 0xb4, 0x2c, 0x99, 0x01, 0xee, 0x76, 0xc3, 0x5b },
+ { 0x00, 0x75, 0xea, 0x9f, 0xc9, 0xbc, 0x23, 0x56, 0x8f, 0xfa, 0x65, 0x10, 0x46, 0x33, 0xac, 0xd9 },
+ },
+ {
+ { 0x00, 0xba, 0x69, 0xd3, 0xd2, 0x68, 0xbb, 0x01, 0xb9, 0x03, 0xd0, 0x6a, 0x6b, 0xd1, 0x02, 0xb8 },
+ { 0x00, 0x6f, 0xde, 0xb1, 0xa1, 0xce, 0x7f, 0x10, 0x5f, 0x30, 0x81, 0xee, 0xfe, 0x91, 0x20, 0x4f },
+ },
+ {
+ { 0x00, 0x71, 0xe2, 0x93, 0xd9, 0xa8, 0x3b, 0x4a, 0xaf, 0xde, 0x4d, 0x3c, 0x76, 0x07, 0x94, 0xe5 },
+ { 0x00, 0x43, 0x86, 0xc5, 0x11, 0x52, 0x97, 0xd4, 0x22, 0x61, 0xa4, 0xe7, 0x33, 0x70, 0xb5, 0xf6 },
+ },
+ {
+ { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x9d, 0xad, 0xfd, 0xcd, 0x5d, 0x6d, 0x3d, 0x0d },
+ },
+ },
+ {
+ {
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ { 0x00, 0x4a, 0x94, 0xde, 0x35, 0x7f, 0xa1, 0xeb, 0x6a, 0x20, 0xfe, 0xb4, 0x5f, 0x15, 0xcb, 0x81 },
+ },
+ {
+ { 0x00, 0xdb, 0xab, 0x70, 0x4b, 0x90, 0xe0, 0x3b, 0x96, 0x4d, 0x3d, 0xe6, 0xdd, 0x06, 0x76, 0xad },
+ { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, 0x95, 0xa4, 0xf7, 0xc6, 0x51, 0x60, 0x33, 0x02 },
+ },
+ {
+ { 0x00, 0xa4, 0x55, 0xf1, 0xaa, 0x0e, 0xff, 0x5b, 0x49, 0xed, 0x1c, 0xb8, 0xe3, 0x47, 0xb6, 0x12 },
+ { 0x00, 0x92, 0x39, 0xab, 0x72, 0xe0, 0x4b, 0xd9, 0xe4, 0x76, 0xdd, 0x4f, 0x96, 0x04, 0xaf, 0x3d },
+ },
+ {
+ { 0x00, 0xae, 0x41, 0xef, 0x82, 0x2c, 0xc3, 0x6d, 0x19, 0xb7, 0x58, 0xf6, 0x9b, 0x35, 0xda, 0x74 },
+ { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, 0x8d, 0xbf, 0xe9, 0xdb, 0x45, 0x77, 0x21, 0x13 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x92, 0x39, 0xab, 0x72, 0xe0, 0x4b, 0xd9, 0xe4, 0x76, 0xdd, 0x4f, 0x96, 0x04, 0xaf, 0x3d },
+ { 0x00, 0xd5, 0xb7, 0x62, 0x73, 0xa6, 0xc4, 0x11, 0xe6, 0x33, 0x51, 0x84, 0x95, 0x40, 0x22, 0xf7 },
+ },
+ {
+ { 0x00, 0xf7, 0xf3, 0x04, 0xfb, 0x0c, 0x08, 0xff, 0xeb, 0x1c, 0x18, 0xef, 0x10, 0xe7, 0xe3, 0x14 },
+ { 0x00, 0xcb, 0x8b, 0x40, 0x0b, 0xc0, 0x80, 0x4b, 0x16, 0xdd, 0x9d, 0x56, 0x1d, 0xd6, 0x96, 0x5d },
+ },
+ {
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ { 0x00, 0xa1, 0x5f, 0xfe, 0xbe, 0x1f, 0xe1, 0x40, 0x61, 0xc0, 0x3e, 0x9f, 0xdf, 0x7e, 0x80, 0x21 },
+ },
+ {
+ { 0x00, 0x94, 0x35, 0xa1, 0x6a, 0xfe, 0x5f, 0xcb, 0xd4, 0x40, 0xe1, 0x75, 0xbe, 0x2a, 0x8b, 0x1f },
+ { 0x00, 0xb5, 0x77, 0xc2, 0xee, 0x5b, 0x99, 0x2c, 0xc1, 0x74, 0xb6, 0x03, 0x2f, 0x9a, 0x58, 0xed },
+ },
+ },
+ {
+ {
+ { 0x00, 0x51, 0xa2, 0xf3, 0x59, 0x08, 0xfb, 0xaa, 0xb2, 0xe3, 0x10, 0x41, 0xeb, 0xba, 0x49, 0x18 },
+ { 0x00, 0x79, 0xf2, 0x8b, 0xf9, 0x80, 0x0b, 0x72, 0xef, 0x96, 0x1d, 0x64, 0x16, 0x6f, 0xe4, 0x9d },
+ },
+ {
+ { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, 0x4d, 0x67, 0x19, 0x33, 0xe5, 0xcf, 0xb1, 0x9b },
+ { 0x00, 0x9a, 0x29, 0xb3, 0x52, 0xc8, 0x7b, 0xe1, 0xa4, 0x3e, 0x8d, 0x17, 0xf6, 0x6c, 0xdf, 0x45 },
+ },
+ {
+ { 0x00, 0xbf, 0x63, 0xdc, 0xc6, 0x79, 0xa5, 0x1a, 0x91, 0x2e, 0xf2, 0x4d, 0x57, 0xe8, 0x34, 0x8b },
+ { 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, 0xe5, 0xda, 0x9b, 0xa4, 0x19, 0x26, 0x67, 0x58 },
+ },
+ {
+ { 0x00, 0xba, 0x69, 0xd3, 0xd2, 0x68, 0xbb, 0x01, 0xb9, 0x03, 0xd0, 0x6a, 0x6b, 0xd1, 0x02, 0xb8 },
+ { 0x00, 0x6f, 0xde, 0xb1, 0xa1, 0xce, 0x7f, 0x10, 0x5f, 0x30, 0x81, 0xee, 0xfe, 0x91, 0x20, 0x4f },
+ },
+ },
+ {
+ {
+ { 0x00, 0xc7, 0x93, 0x54, 0x3b, 0xfc, 0xa8, 0x6f, 0x76, 0xb1, 0xe5, 0x22, 0x4d, 0x8a, 0xde, 0x19 },
+ { 0x00, 0xec, 0xc5, 0x29, 0x97, 0x7b, 0x52, 0xbe, 0x33, 0xdf, 0xf6, 0x1a, 0xa4, 0x48, 0x61, 0x8d },
+ },
+ {
+ { 0x00, 0xe9, 0xcf, 0x26, 0x83, 0x6a, 0x4c, 0xa5, 0x1b, 0xf2, 0xd4, 0x3d, 0x98, 0x71, 0x57, 0xbe },
+ { 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, 0xad, 0x9b, 0xc1, 0xf7, 0x75, 0x43, 0x19, 0x2f },
+ },
+ {
+ { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, 0x95, 0xa4, 0xf7, 0xc6, 0x51, 0x60, 0x33, 0x02 },
+ { 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, 0xa5, 0x92, 0xcb, 0xfc, 0x79, 0x4e, 0x17, 0x20 },
+ },
+ {
+ { 0x00, 0x88, 0x0d, 0x85, 0x1a, 0x92, 0x17, 0x9f, 0x34, 0xbc, 0x39, 0xb1, 0x2e, 0xa6, 0x23, 0xab },
+ { 0x00, 0x68, 0xd0, 0xb8, 0xbd, 0xd5, 0x6d, 0x05, 0x67, 0x0f, 0xb7, 0xdf, 0xda, 0xb2, 0x0a, 0x62 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x45, 0x8a, 0xcf, 0x09, 0x4c, 0x83, 0xc6, 0x12, 0x57, 0x98, 0xdd, 0x1b, 0x5e, 0x91, 0xd4 },
+ { 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, 0x3d, 0x19, 0x75, 0x51, 0xad, 0x89, 0xe5, 0xc1 },
+ },
+ {
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ { 0x00, 0x87, 0x13, 0x94, 0x26, 0xa1, 0x35, 0xb2, 0x4c, 0xcb, 0x5f, 0xd8, 0x6a, 0xed, 0x79, 0xfe },
+ },
+ {
+ { 0x00, 0x90, 0x3d, 0xad, 0x7a, 0xea, 0x47, 0xd7, 0xf4, 0x64, 0xc9, 0x59, 0x8e, 0x1e, 0xb3, 0x23 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ },
+ {
+ { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, 0x65, 0x4a, 0x3b, 0x14, 0xd9, 0xf6, 0x87, 0xa8 },
+ { 0x00, 0xca, 0x89, 0x43, 0x0f, 0xc5, 0x86, 0x4c, 0x1e, 0xd4, 0x97, 0x5d, 0x11, 0xdb, 0x98, 0x52 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, 0x7d, 0x51, 0x25, 0x09, 0xcd, 0xe1, 0x95, 0xb9 },
+ { 0x00, 0xfa, 0xe9, 0x13, 0xcf, 0x35, 0x26, 0xdc, 0x83, 0x79, 0x6a, 0x90, 0x4c, 0xb6, 0xa5, 0x5f },
+ },
+ {
+ { 0x00, 0xa0, 0x5d, 0xfd, 0xba, 0x1a, 0xe7, 0x47, 0x69, 0xc9, 0x34, 0x94, 0xd3, 0x73, 0x8e, 0x2e },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ },
+ {
+ { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 },
+ { 0x00, 0x8d, 0x07, 0x8a, 0x0e, 0x83, 0x09, 0x84, 0x1c, 0x91, 0x1b, 0x96, 0x12, 0x9f, 0x15, 0x98 },
+ },
+ {
+ { 0x00, 0xea, 0xc9, 0x23, 0x8f, 0x65, 0x46, 0xac, 0x03, 0xe9, 0xca, 0x20, 0x8c, 0x66, 0x45, 0xaf },
+ { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12, 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xc0, 0x9d, 0x5d, 0x27, 0xe7, 0xba, 0x7a, 0x4e, 0x8e, 0xd3, 0x13, 0x69, 0xa9, 0xf4, 0x34 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ },
+ {
+ { 0x00, 0x75, 0xea, 0x9f, 0xc9, 0xbc, 0x23, 0x56, 0x8f, 0xfa, 0x65, 0x10, 0x46, 0x33, 0xac, 0xd9 },
+ { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 },
+ },
+ {
+ { 0x00, 0x9a, 0x29, 0xb3, 0x52, 0xc8, 0x7b, 0xe1, 0xa4, 0x3e, 0x8d, 0x17, 0xf6, 0x6c, 0xdf, 0x45 },
+ { 0x00, 0x55, 0xaa, 0xff, 0x49, 0x1c, 0xe3, 0xb6, 0x92, 0xc7, 0x38, 0x6d, 0xdb, 0x8e, 0x71, 0x24 },
+ },
+ {
+ { 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, 0x6d, 0x43, 0x31, 0x1f, 0xd5, 0xfb, 0x89, 0xa7 },
+ { 0x00, 0xda, 0xa9, 0x73, 0x4f, 0x95, 0xe6, 0x3c, 0x9e, 0x44, 0x37, 0xed, 0xd1, 0x0b, 0x78, 0xa2 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xad, 0x47, 0xea, 0x8e, 0x23, 0xc9, 0x64, 0x01, 0xac, 0x46, 0xeb, 0x8f, 0x22, 0xc8, 0x65 },
+ { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e },
+ },
+ {
+ { 0x00, 0x7a, 0xf4, 0x8e, 0xf5, 0x8f, 0x01, 0x7b, 0xf7, 0x8d, 0x03, 0x79, 0x02, 0x78, 0xf6, 0x8c },
+ { 0x00, 0xf3, 0xfb, 0x08, 0xeb, 0x18, 0x10, 0xe3, 0xcb, 0x38, 0x30, 0xc3, 0x20, 0xd3, 0xdb, 0x28 },
+ },
+ {
+ { 0x00, 0x8e, 0x01, 0x8f, 0x02, 0x8c, 0x03, 0x8d, 0x04, 0x8a, 0x05, 0x8b, 0x06, 0x88, 0x07, 0x89 },
+ { 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 },
+ },
+ {
+ { 0x00, 0x8c, 0x05, 0x89, 0x0a, 0x86, 0x0f, 0x83, 0x14, 0x98, 0x11, 0x9d, 0x1e, 0x92, 0x1b, 0x97 },
+ { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, 0x5d, 0x75, 0x0d, 0x25, 0xfd, 0xd5, 0xad, 0x85 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xa7, 0x53, 0xf4, 0xa6, 0x01, 0xf5, 0x52, 0x51, 0xf6, 0x02, 0xa5, 0xf7, 0x50, 0xa4, 0x03 },
+ { 0x00, 0xa2, 0x59, 0xfb, 0xb2, 0x10, 0xeb, 0x49, 0x79, 0xdb, 0x20, 0x82, 0xcb, 0x69, 0x92, 0x30 },
+ },
+ {
+ { 0x00, 0xd3, 0xbb, 0x68, 0x6b, 0xb8, 0xd0, 0x03, 0xd6, 0x05, 0x6d, 0xbe, 0xbd, 0x6e, 0x06, 0xd5 },
+ { 0x00, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x30, 0xe1, 0x50, 0x9e, 0x2f, 0x1f, 0xae, 0x60, 0xd1 },
+ },
+ {
+ { 0x00, 0xf6, 0xf1, 0x07, 0xff, 0x09, 0x0e, 0xf8, 0xe3, 0x15, 0x12, 0xe4, 0x1c, 0xea, 0xed, 0x1b },
+ { 0x00, 0xdb, 0xab, 0x70, 0x4b, 0x90, 0xe0, 0x3b, 0x96, 0x4d, 0x3d, 0xe6, 0xdd, 0x06, 0x76, 0xad },
+ },
+ {
+ { 0x00, 0x5b, 0xb6, 0xed, 0x71, 0x2a, 0xc7, 0x9c, 0xe2, 0xb9, 0x54, 0x0f, 0x93, 0xc8, 0x25, 0x7e },
+ { 0x00, 0xd9, 0xaf, 0x76, 0x43, 0x9a, 0xec, 0x35, 0x86, 0x5f, 0x29, 0xf0, 0xc5, 0x1c, 0x6a, 0xb3 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ { 0x00, 0xde, 0xa1, 0x7f, 0x5f, 0x81, 0xfe, 0x20, 0xbe, 0x60, 0x1f, 0xc1, 0xe1, 0x3f, 0x40, 0x9e },
+ },
+ {
+ { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x1d, 0x3d, 0x5d, 0x7d, 0x9d, 0xbd, 0xdd, 0xfd },
+ },
+ {
+ { 0x00, 0xc3, 0x9b, 0x58, 0x2b, 0xe8, 0xb0, 0x73, 0x56, 0x95, 0xcd, 0x0e, 0x7d, 0xbe, 0xe6, 0x25 },
+ { 0x00, 0xac, 0x45, 0xe9, 0x8a, 0x26, 0xcf, 0x63, 0x09, 0xa5, 0x4c, 0xe0, 0x83, 0x2f, 0xc6, 0x6a },
+ },
+ {
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ { 0x00, 0x7f, 0xfe, 0x81, 0xe1, 0x9e, 0x1f, 0x60, 0xdf, 0xa0, 0x21, 0x5e, 0x3e, 0x41, 0xc0, 0xbf },
+ },
+ },
+ {
+ {
+ { 0x00, 0xf7, 0xf3, 0x04, 0xfb, 0x0c, 0x08, 0xff, 0xeb, 0x1c, 0x18, 0xef, 0x10, 0xe7, 0xe3, 0x14 },
+ { 0x00, 0xcb, 0x8b, 0x40, 0x0b, 0xc0, 0x80, 0x4b, 0x16, 0xdd, 0x9d, 0x56, 0x1d, 0xd6, 0x96, 0x5d },
+ },
+ {
+ { 0x00, 0xee, 0xc1, 0x2f, 0x9f, 0x71, 0x5e, 0xb0, 0x23, 0xcd, 0xe2, 0x0c, 0xbc, 0x52, 0x7d, 0x93 },
+ { 0x00, 0x46, 0x8c, 0xca, 0x05, 0x43, 0x89, 0xcf, 0x0a, 0x4c, 0x86, 0xc0, 0x0f, 0x49, 0x83, 0xc5 },
+ },
+ {
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ { 0x00, 0xb2, 0x79, 0xcb, 0xf2, 0x40, 0x8b, 0x39, 0xf9, 0x4b, 0x80, 0x32, 0x0b, 0xb9, 0x72, 0xc0 },
+ },
+ {
+ { 0x00, 0x79, 0xf2, 0x8b, 0xf9, 0x80, 0x0b, 0x72, 0xef, 0x96, 0x1d, 0x64, 0x16, 0x6f, 0xe4, 0x9d },
+ { 0x00, 0xc3, 0x9b, 0x58, 0x2b, 0xe8, 0xb0, 0x73, 0x56, 0x95, 0xcd, 0x0e, 0x7d, 0xbe, 0xe6, 0x25 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xb4, 0x75, 0xc1, 0xea, 0x5e, 0x9f, 0x2b, 0xc9, 0x7d, 0xbc, 0x08, 0x23, 0x97, 0x56, 0xe2 },
+ { 0x00, 0x8f, 0x03, 0x8c, 0x06, 0x89, 0x05, 0x8a, 0x0c, 0x83, 0x0f, 0x80, 0x0a, 0x85, 0x09, 0x86 },
+ },
+ {
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ { 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, 0xb5, 0x80, 0xdf, 0xea, 0x61, 0x54, 0x0b, 0x3e },
+ },
+ {
+ { 0x00, 0xe7, 0xd3, 0x34, 0xbb, 0x5c, 0x68, 0x8f, 0x6b, 0x8c, 0xb8, 0x5f, 0xd0, 0x37, 0x03, 0xe4 },
+ { 0x00, 0xd6, 0xb1, 0x67, 0x7f, 0xa9, 0xce, 0x18, 0xfe, 0x28, 0x4f, 0x99, 0x81, 0x57, 0x30, 0xe6 },
+ },
+ {
+ { 0x00, 0xd1, 0xbf, 0x6e, 0x63, 0xb2, 0xdc, 0x0d, 0xc6, 0x17, 0x79, 0xa8, 0xa5, 0x74, 0x1a, 0xcb },
+ { 0x00, 0x91, 0x3f, 0xae, 0x7e, 0xef, 0x41, 0xd0, 0xfc, 0x6d, 0xc3, 0x52, 0x82, 0x13, 0xbd, 0x2c },
+ },
+ },
+ {
+ {
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, 0x2d, 0x0b, 0x61, 0x47, 0xb5, 0x93, 0xf9, 0xdf },
+ },
+ {
+ { 0x00, 0xc6, 0x91, 0x57, 0x3f, 0xf9, 0xae, 0x68, 0x7e, 0xb8, 0xef, 0x29, 0x41, 0x87, 0xd0, 0x16 },
+ { 0x00, 0xfc, 0xe5, 0x19, 0xd7, 0x2b, 0x32, 0xce, 0xb3, 0x4f, 0x56, 0xaa, 0x64, 0x98, 0x81, 0x7d },
+ },
+ {
+ { 0x00, 0x60, 0xc0, 0xa0, 0x9d, 0xfd, 0x5d, 0x3d, 0x27, 0x47, 0xe7, 0x87, 0xba, 0xda, 0x7a, 0x1a },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ },
+ {
+ { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff },
+ { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b },
+ },
+ },
+ {
+ {
+ { 0x00, 0x84, 0x15, 0x91, 0x2a, 0xae, 0x3f, 0xbb, 0x54, 0xd0, 0x41, 0xc5, 0x7e, 0xfa, 0x6b, 0xef },
+ { 0x00, 0xa8, 0x4d, 0xe5, 0x9a, 0x32, 0xd7, 0x7f, 0x29, 0x81, 0x64, 0xcc, 0xb3, 0x1b, 0xfe, 0x56 },
+ },
+ {
+ { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 },
+ { 0x00, 0xed, 0xc7, 0x2a, 0x93, 0x7e, 0x54, 0xb9, 0x3b, 0xd6, 0xfc, 0x11, 0xa8, 0x45, 0x6f, 0x82 },
+ },
+ {
+ { 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee },
+ { 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, 0xf5, 0xc8, 0x8f, 0xb2, 0x01, 0x3c, 0x7b, 0x46 },
+ },
+ {
+ { 0x00, 0xff, 0xe3, 0x1c, 0xdb, 0x24, 0x38, 0xc7, 0xab, 0x54, 0x48, 0xb7, 0x70, 0x8f, 0x93, 0x6c },
+ { 0x00, 0x4b, 0x96, 0xdd, 0x31, 0x7a, 0xa7, 0xec, 0x62, 0x29, 0xf4, 0xbf, 0x53, 0x18, 0xc5, 0x8e },
+ },
+ },
+ {
+ {
+ { 0x00, 0xe1, 0xdf, 0x3e, 0xa3, 0x42, 0x7c, 0x9d, 0x5b, 0xba, 0x84, 0x65, 0xf8, 0x19, 0x27, 0xc6 },
+ { 0x00, 0xb6, 0x71, 0xc7, 0xe2, 0x54, 0x93, 0x25, 0xd9, 0x6f, 0xa8, 0x1e, 0x3b, 0x8d, 0x4a, 0xfc },
+ },
+ {
+ { 0x00, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc },
+ { 0x00, 0x5d, 0xba, 0xe7, 0x69, 0x34, 0xd3, 0x8e, 0xd2, 0x8f, 0x68, 0x35, 0xbb, 0xe6, 0x01, 0x5c },
+ },
+ {
+ { 0x00, 0xee, 0xc1, 0x2f, 0x9f, 0x71, 0x5e, 0xb0, 0x23, 0xcd, 0xe2, 0x0c, 0xbc, 0x52, 0x7d, 0x93 },
+ { 0x00, 0x46, 0x8c, 0xca, 0x05, 0x43, 0x89, 0xcf, 0x0a, 0x4c, 0x86, 0xc0, 0x0f, 0x49, 0x83, 0xc5 },
+ },
+ {
+ { 0x00, 0xa4, 0x55, 0xf1, 0xaa, 0x0e, 0xff, 0x5b, 0x49, 0xed, 0x1c, 0xb8, 0xe3, 0x47, 0xb6, 0x12 },
+ { 0x00, 0x92, 0x39, 0xab, 0x72, 0xe0, 0x4b, 0xd9, 0xe4, 0x76, 0xdd, 0x4f, 0x96, 0x04, 0xaf, 0x3d },
+ },
+ },
+ {
+ {
+ { 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88 },
+ { 0x00, 0x9d, 0x27, 0xba, 0x4e, 0xd3, 0x69, 0xf4, 0x9c, 0x01, 0xbb, 0x26, 0xd2, 0x4f, 0xf5, 0x68 },
+ },
+ {
+ { 0x00, 0xcc, 0x85, 0x49, 0x17, 0xdb, 0x92, 0x5e, 0x2e, 0xe2, 0xab, 0x67, 0x39, 0xf5, 0xbc, 0x70 },
+ { 0x00, 0x5c, 0xb8, 0xe4, 0x6d, 0x31, 0xd5, 0x89, 0xda, 0x86, 0x62, 0x3e, 0xb7, 0xeb, 0x0f, 0x53 },
+ },
+ {
+ { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, 0x75, 0x58, 0x2f, 0x02, 0xc1, 0xec, 0x9b, 0xb6 },
+ { 0x00, 0xea, 0xc9, 0x23, 0x8f, 0x65, 0x46, 0xac, 0x03, 0xe9, 0xca, 0x20, 0x8c, 0x66, 0x45, 0xaf },
+ },
+ {
+ { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 },
+ { 0x00, 0x8d, 0x07, 0x8a, 0x0e, 0x83, 0x09, 0x84, 0x1c, 0x91, 0x1b, 0x96, 0x12, 0x9f, 0x15, 0x98 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x88, 0x0d, 0x85, 0x1a, 0x92, 0x17, 0x9f, 0x34, 0xbc, 0x39, 0xb1, 0x2e, 0xa6, 0x23, 0xab },
+ { 0x00, 0x68, 0xd0, 0xb8, 0xbd, 0xd5, 0x6d, 0x05, 0x67, 0x0f, 0xb7, 0xdf, 0xda, 0xb2, 0x0a, 0x62 },
+ },
+ {
+ { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0x0d, 0x2f, 0x49, 0x6b, 0x85, 0xa7, 0xc1, 0xe3 },
+ { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 },
+ },
+ {
+ { 0x00, 0xde, 0xa1, 0x7f, 0x5f, 0x81, 0xfe, 0x20, 0xbe, 0x60, 0x1f, 0xc1, 0xe1, 0x3f, 0x40, 0x9e },
+ { 0x00, 0x61, 0xc2, 0xa3, 0x99, 0xf8, 0x5b, 0x3a, 0x2f, 0x4e, 0xed, 0x8c, 0xb6, 0xd7, 0x74, 0x15 },
+ },
+ {
+ { 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, 0xfd, 0xc1, 0x85, 0xb9, 0x0d, 0x31, 0x75, 0x49 },
+ { 0x00, 0xe7, 0xd3, 0x34, 0xbb, 0x5c, 0x68, 0x8f, 0x6b, 0x8c, 0xb8, 0x5f, 0xd0, 0x37, 0x03, 0xe4 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, 0xfd, 0xc1, 0x85, 0xb9, 0x0d, 0x31, 0x75, 0x49 },
+ { 0x00, 0xe7, 0xd3, 0x34, 0xbb, 0x5c, 0x68, 0x8f, 0x6b, 0x8c, 0xb8, 0x5f, 0xd0, 0x37, 0x03, 0xe4 },
+ },
+ {
+ { 0x00, 0x4d, 0x9a, 0xd7, 0x29, 0x64, 0xb3, 0xfe, 0x52, 0x1f, 0xc8, 0x85, 0x7b, 0x36, 0xe1, 0xac },
+ { 0x00, 0xa4, 0x55, 0xf1, 0xaa, 0x0e, 0xff, 0x5b, 0x49, 0xed, 0x1c, 0xb8, 0xe3, 0x47, 0xb6, 0x12 },
+ },
+ {
+ { 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, 0xdd, 0xe5, 0xad, 0x95, 0x3d, 0x05, 0x4d, 0x75 },
+ { 0x00, 0xa7, 0x53, 0xf4, 0xa6, 0x01, 0xf5, 0x52, 0x51, 0xf6, 0x02, 0xa5, 0xf7, 0x50, 0xa4, 0x03 },
+ },
+ {
+ { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, 0x4d, 0x67, 0x19, 0x33, 0xe5, 0xcf, 0xb1, 0x9b },
+ { 0x00, 0x9a, 0x29, 0xb3, 0x52, 0xc8, 0x7b, 0xe1, 0xa4, 0x3e, 0x8d, 0x17, 0xf6, 0x6c, 0xdf, 0x45 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x76, 0xec, 0x9a, 0xc5, 0xb3, 0x29, 0x5f, 0x97, 0xe1, 0x7b, 0x0d, 0x52, 0x24, 0xbe, 0xc8 },
+ { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, 0x85, 0xb6, 0xe3, 0xd0, 0x49, 0x7a, 0x2f, 0x1c },
+ },
+ {
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x9d, 0xad, 0xfd, 0xcd, 0x5d, 0x6d, 0x3d, 0x0d },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ },
+ {
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, 0x2d, 0x0b, 0x61, 0x47, 0xb5, 0x93, 0xf9, 0xdf },
+ },
+ {
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ { 0x00, 0x94, 0x35, 0xa1, 0x6a, 0xfe, 0x5f, 0xcb, 0xd4, 0x40, 0xe1, 0x75, 0xbe, 0x2a, 0x8b, 0x1f },
+ },
+ },
+ {
+ {
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x1d, 0x3d, 0x5d, 0x7d, 0x9d, 0xbd, 0xdd, 0xfd },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ },
+ {
+ { 0x00, 0x71, 0xe2, 0x93, 0xd9, 0xa8, 0x3b, 0x4a, 0xaf, 0xde, 0x4d, 0x3c, 0x76, 0x07, 0x94, 0xe5 },
+ { 0x00, 0x43, 0x86, 0xc5, 0x11, 0x52, 0x97, 0xd4, 0x22, 0x61, 0xa4, 0xe7, 0x33, 0x70, 0xb5, 0xf6 },
+ },
+ {
+ { 0x00, 0xb7, 0x73, 0xc4, 0xe6, 0x51, 0x95, 0x22, 0xd1, 0x66, 0xa2, 0x15, 0x37, 0x80, 0x44, 0xf3 },
+ { 0x00, 0xbf, 0x63, 0xdc, 0xc6, 0x79, 0xa5, 0x1a, 0x91, 0x2e, 0xf2, 0x4d, 0x57, 0xe8, 0x34, 0x8b },
+ },
+ {
+ { 0x00, 0x52, 0xa4, 0xf6, 0x55, 0x07, 0xf1, 0xa3, 0xaa, 0xf8, 0x0e, 0x5c, 0xff, 0xad, 0x5b, 0x09 },
+ { 0x00, 0x49, 0x92, 0xdb, 0x39, 0x70, 0xab, 0xe2, 0x72, 0x3b, 0xe0, 0xa9, 0x4b, 0x02, 0xd9, 0x90 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x5e, 0xbc, 0xe2, 0x65, 0x3b, 0xd9, 0x87, 0xca, 0x94, 0x76, 0x28, 0xaf, 0xf1, 0x13, 0x4d },
+ { 0x00, 0x89, 0x0f, 0x86, 0x1e, 0x97, 0x11, 0x98, 0x3c, 0xb5, 0x33, 0xba, 0x22, 0xab, 0x2d, 0xa4 },
+ },
+ {
+ { 0x00, 0x58, 0xb0, 0xe8, 0x7d, 0x25, 0xcd, 0x95, 0xfa, 0xa2, 0x4a, 0x12, 0x87, 0xdf, 0x37, 0x6f },
+ { 0x00, 0xe9, 0xcf, 0x26, 0x83, 0x6a, 0x4c, 0xa5, 0x1b, 0xf2, 0xd4, 0x3d, 0x98, 0x71, 0x57, 0xbe },
+ },
+ {
+ { 0x00, 0x98, 0x2d, 0xb5, 0x5a, 0xc2, 0x77, 0xef, 0xb4, 0x2c, 0x99, 0x01, 0xee, 0x76, 0xc3, 0x5b },
+ { 0x00, 0x75, 0xea, 0x9f, 0xc9, 0xbc, 0x23, 0x56, 0x8f, 0xfa, 0x65, 0x10, 0x46, 0x33, 0xac, 0xd9 },
+ },
+ {
+ { 0x00, 0xe3, 0xdb, 0x38, 0xab, 0x48, 0x70, 0x93, 0x4b, 0xa8, 0x90, 0x73, 0xe0, 0x03, 0x3b, 0xd8 },
+ { 0x00, 0x96, 0x31, 0xa7, 0x62, 0xf4, 0x53, 0xc5, 0xc4, 0x52, 0xf5, 0x63, 0xa6, 0x30, 0x97, 0x01 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x9b, 0x2b, 0xb0, 0x56, 0xcd, 0x7d, 0xe6, 0xac, 0x37, 0x87, 0x1c, 0xfa, 0x61, 0xd1, 0x4a },
+ { 0x00, 0x45, 0x8a, 0xcf, 0x09, 0x4c, 0x83, 0xc6, 0x12, 0x57, 0x98, 0xdd, 0x1b, 0x5e, 0x91, 0xd4 },
+ },
+ {
+ { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff },
+ { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b },
+ },
+ {
+ { 0x00, 0xc1, 0x9f, 0x5e, 0x23, 0xe2, 0xbc, 0x7d, 0x46, 0x87, 0xd9, 0x18, 0x65, 0xa4, 0xfa, 0x3b },
+ { 0x00, 0x8c, 0x05, 0x89, 0x0a, 0x86, 0x0f, 0x83, 0x14, 0x98, 0x11, 0x9d, 0x1e, 0x92, 0x1b, 0x97 },
+ },
+ {
+ { 0x00, 0x95, 0x37, 0xa2, 0x6e, 0xfb, 0x59, 0xcc, 0xdc, 0x49, 0xeb, 0x7e, 0xb2, 0x27, 0x85, 0x10 },
+ { 0x00, 0xa5, 0x57, 0xf2, 0xae, 0x0b, 0xf9, 0x5c, 0x41, 0xe4, 0x16, 0xb3, 0xef, 0x4a, 0xb8, 0x1d },
+ },
+ },
+ {
+ {
+ { 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa },
+ { 0x00, 0xfd, 0xe7, 0x1a, 0xd3, 0x2e, 0x34, 0xc9, 0xbb, 0x46, 0x5c, 0xa1, 0x68, 0x95, 0x8f, 0x72 },
+ },
+ {
+ { 0x00, 0x85, 0x17, 0x92, 0x2e, 0xab, 0x39, 0xbc, 0x5c, 0xd9, 0x4b, 0xce, 0x72, 0xf7, 0x65, 0xe0 },
+ { 0x00, 0xb8, 0x6d, 0xd5, 0xda, 0x62, 0xb7, 0x0f, 0xa9, 0x11, 0xc4, 0x7c, 0x73, 0xcb, 0x1e, 0xa6 },
+ },
+ {
+ { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, 0x5d, 0x75, 0x0d, 0x25, 0xfd, 0xd5, 0xad, 0x85 },
+ { 0x00, 0xba, 0x69, 0xd3, 0xd2, 0x68, 0xbb, 0x01, 0xb9, 0x03, 0xd0, 0x6a, 0x6b, 0xd1, 0x02, 0xb8 },
+ },
+ {
+ { 0x00, 0xbd, 0x67, 0xda, 0xce, 0x73, 0xa9, 0x14, 0x81, 0x3c, 0xe6, 0x5b, 0x4f, 0xf2, 0x28, 0x95 },
+ { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b },
+ { 0x00, 0xd0, 0xbd, 0x6d, 0x67, 0xb7, 0xda, 0x0a, 0xce, 0x1e, 0x73, 0xa3, 0xa9, 0x79, 0x14, 0xc4 },
+ },
+ {
+ { 0x00, 0x4f, 0x9e, 0xd1, 0x21, 0x6e, 0xbf, 0xf0, 0x42, 0x0d, 0xdc, 0x93, 0x63, 0x2c, 0xfd, 0xb2 },
+ { 0x00, 0x84, 0x15, 0x91, 0x2a, 0xae, 0x3f, 0xbb, 0x54, 0xd0, 0x41, 0xc5, 0x7e, 0xfa, 0x6b, 0xef },
+ },
+ {
+ { 0x00, 0xf3, 0xfb, 0x08, 0xeb, 0x18, 0x10, 0xe3, 0xcb, 0x38, 0x30, 0xc3, 0x20, 0xd3, 0xdb, 0x28 },
+ { 0x00, 0x8b, 0x0b, 0x80, 0x16, 0x9d, 0x1d, 0x96, 0x2c, 0xa7, 0x27, 0xac, 0x3a, 0xb1, 0x31, 0xba },
+ },
+ {
+ { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, 0x95, 0xa4, 0xf7, 0xc6, 0x51, 0x60, 0x33, 0x02 },
+ { 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, 0xa5, 0x92, 0xcb, 0xfc, 0x79, 0x4e, 0x17, 0x20 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ { 0x00, 0xf8, 0xed, 0x15, 0xc7, 0x3f, 0x2a, 0xd2, 0x93, 0x6b, 0x7e, 0x86, 0x54, 0xac, 0xb9, 0x41 },
+ },
+ {
+ { 0x00, 0x6f, 0xde, 0xb1, 0xa1, 0xce, 0x7f, 0x10, 0x5f, 0x30, 0x81, 0xee, 0xfe, 0x91, 0x20, 0x4f },
+ { 0x00, 0xbe, 0x61, 0xdf, 0xc2, 0x7c, 0xa3, 0x1d, 0x99, 0x27, 0xf8, 0x46, 0x5b, 0xe5, 0x3a, 0x84 },
+ },
+ {
+ { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b, 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x5d, 0x0d, 0xfd, 0xad, 0xba, 0xea, 0x1a, 0x4a, 0xe7, 0xb7, 0x47, 0x17 },
+ },
+ {
+ { 0x00, 0x5d, 0xba, 0xe7, 0x69, 0x34, 0xd3, 0x8e, 0xd2, 0x8f, 0x68, 0x35, 0xbb, 0xe6, 0x01, 0x5c },
+ { 0x00, 0xb9, 0x6f, 0xd6, 0xde, 0x67, 0xb1, 0x08, 0xa1, 0x18, 0xce, 0x77, 0x7f, 0xc6, 0x10, 0xa9 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x4a, 0x94, 0xde, 0x35, 0x7f, 0xa1, 0xeb, 0x6a, 0x20, 0xfe, 0xb4, 0x5f, 0x15, 0xcb, 0x81 },
+ { 0x00, 0xd4, 0xb5, 0x61, 0x77, 0xa3, 0xc2, 0x16, 0xee, 0x3a, 0x5b, 0x8f, 0x99, 0x4d, 0x2c, 0xf8 },
+ },
+ {
+ { 0x00, 0x6d, 0xda, 0xb7, 0xa9, 0xc4, 0x73, 0x1e, 0x4f, 0x22, 0x95, 0xf8, 0xe6, 0x8b, 0x3c, 0x51 },
+ { 0x00, 0x9e, 0x21, 0xbf, 0x42, 0xdc, 0x63, 0xfd, 0x84, 0x1a, 0xa5, 0x3b, 0xc6, 0x58, 0xe7, 0x79 },
+ },
+ {
+ { 0x00, 0x96, 0x31, 0xa7, 0x62, 0xf4, 0x53, 0xc5, 0xc4, 0x52, 0xf5, 0x63, 0xa6, 0x30, 0x97, 0x01 },
+ { 0x00, 0x95, 0x37, 0xa2, 0x6e, 0xfb, 0x59, 0xcc, 0xdc, 0x49, 0xeb, 0x7e, 0xb2, 0x27, 0x85, 0x10 },
+ },
+ {
+ { 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, 0xb5, 0x80, 0xdf, 0xea, 0x61, 0x54, 0x0b, 0x3e },
+ { 0x00, 0x77, 0xee, 0x99, 0xc1, 0xb6, 0x2f, 0x58, 0x9f, 0xe8, 0x71, 0x06, 0x5e, 0x29, 0xb0, 0xc7 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xbd, 0x67, 0xda, 0xce, 0x73, 0xa9, 0x14, 0x81, 0x3c, 0xe6, 0x5b, 0x4f, 0xf2, 0x28, 0x95 },
+ { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 },
+ },
+ {
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0xcd, 0x87, 0x4a, 0x13, 0xde, 0x94, 0x59, 0x26, 0xeb, 0xa1, 0x6c, 0x35, 0xf8, 0xb2, 0x7f },
+ },
+ {
+ { 0x00, 0x63, 0xc6, 0xa5, 0x91, 0xf2, 0x57, 0x34, 0x3f, 0x5c, 0xf9, 0x9a, 0xae, 0xcd, 0x68, 0x0b },
+ { 0x00, 0x7e, 0xfc, 0x82, 0xe5, 0x9b, 0x19, 0x67, 0xd7, 0xa9, 0x2b, 0x55, 0x32, 0x4c, 0xce, 0xb0 },
+ },
+ {
+ { 0x00, 0x4a, 0x94, 0xde, 0x35, 0x7f, 0xa1, 0xeb, 0x6a, 0x20, 0xfe, 0xb4, 0x5f, 0x15, 0xcb, 0x81 },
+ { 0x00, 0xd4, 0xb5, 0x61, 0x77, 0xa3, 0xc2, 0x16, 0xee, 0x3a, 0x5b, 0x8f, 0x99, 0x4d, 0x2c, 0xf8 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 },
+ { 0x00, 0x7d, 0xfa, 0x87, 0xe9, 0x94, 0x13, 0x6e, 0xcf, 0xb2, 0x35, 0x48, 0x26, 0x5b, 0xdc, 0xa1 },
+ },
+ {
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ { 0x00, 0x6c, 0xd8, 0xb4, 0xad, 0xc1, 0x75, 0x19, 0x47, 0x2b, 0x9f, 0xf3, 0xea, 0x86, 0x32, 0x5e },
+ },
+ {
+ { 0x00, 0xd1, 0xbf, 0x6e, 0x63, 0xb2, 0xdc, 0x0d, 0xc6, 0x17, 0x79, 0xa8, 0xa5, 0x74, 0x1a, 0xcb },
+ { 0x00, 0x91, 0x3f, 0xae, 0x7e, 0xef, 0x41, 0xd0, 0xfc, 0x6d, 0xc3, 0x52, 0x82, 0x13, 0xbd, 0x2c },
+ },
+ {
+ { 0x00, 0x41, 0x82, 0xc3, 0x19, 0x58, 0x9b, 0xda, 0x32, 0x73, 0xb0, 0xf1, 0x2b, 0x6a, 0xa9, 0xe8 },
+ { 0x00, 0x64, 0xc8, 0xac, 0x8d, 0xe9, 0x45, 0x21, 0x07, 0x63, 0xcf, 0xab, 0x8a, 0xee, 0x42, 0x26 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x8a, 0x09, 0x83, 0x12, 0x98, 0x1b, 0x91, 0x24, 0xae, 0x2d, 0xa7, 0x36, 0xbc, 0x3f, 0xb5 },
+ { 0x00, 0x48, 0x90, 0xd8, 0x3d, 0x75, 0xad, 0xe5, 0x7a, 0x32, 0xea, 0xa2, 0x47, 0x0f, 0xd7, 0x9f },
+ },
+ {
+ { 0x00, 0xfa, 0xe9, 0x13, 0xcf, 0x35, 0x26, 0xdc, 0x83, 0x79, 0x6a, 0x90, 0x4c, 0xb6, 0xa5, 0x5f },
+ { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 },
+ },
+ {
+ { 0x00, 0xb9, 0x6f, 0xd6, 0xde, 0x67, 0xb1, 0x08, 0xa1, 0x18, 0xce, 0x77, 0x7f, 0xc6, 0x10, 0xa9 },
+ { 0x00, 0x5f, 0xbe, 0xe1, 0x61, 0x3e, 0xdf, 0x80, 0xc2, 0x9d, 0x7c, 0x23, 0xa3, 0xfc, 0x1d, 0x42 },
+ },
+ {
+ { 0x00, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0x66, 0x6e, 0xaa, 0xfb, 0x3f, 0x59, 0x9d, 0xcc, 0x08 },
+ { 0x00, 0xdc, 0xa5, 0x79, 0x57, 0x8b, 0xf2, 0x2e, 0xae, 0x72, 0x0b, 0xd7, 0xf9, 0x25, 0x5c, 0x80 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xac, 0x45, 0xe9, 0x8a, 0x26, 0xcf, 0x63, 0x09, 0xa5, 0x4c, 0xe0, 0x83, 0x2f, 0xc6, 0x6a },
+ { 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee },
+ },
+ {
+ { 0x00, 0x54, 0xa8, 0xfc, 0x4d, 0x19, 0xe5, 0xb1, 0x9a, 0xce, 0x32, 0x66, 0xd7, 0x83, 0x7f, 0x2b },
+ { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, 0x55, 0x7c, 0x07, 0x2e, 0xf1, 0xd8, 0xa3, 0x8a },
+ },
+ {
+ { 0x00, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc },
+ { 0x00, 0x5d, 0xba, 0xe7, 0x69, 0x34, 0xd3, 0x8e, 0xd2, 0x8f, 0x68, 0x35, 0xbb, 0xe6, 0x01, 0x5c },
+ },
+ {
+ { 0x00, 0xdb, 0xab, 0x70, 0x4b, 0x90, 0xe0, 0x3b, 0x96, 0x4d, 0x3d, 0xe6, 0xdd, 0x06, 0x76, 0xad },
+ { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, 0x95, 0xa4, 0xf7, 0xc6, 0x51, 0x60, 0x33, 0x02 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x93, 0x3b, 0xa8, 0x76, 0xe5, 0x4d, 0xde, 0xec, 0x7f, 0xd7, 0x44, 0x9a, 0x09, 0xa1, 0x32 },
+ { 0x00, 0xc5, 0x97, 0x52, 0x33, 0xf6, 0xa4, 0x61, 0x66, 0xa3, 0xf1, 0x34, 0x55, 0x90, 0xc2, 0x07 },
+ },
+ {
+ { 0x00, 0xa9, 0x4f, 0xe6, 0x9e, 0x37, 0xd1, 0x78, 0x21, 0x88, 0x6e, 0xc7, 0xbf, 0x16, 0xf0, 0x59 },
+ { 0x00, 0x42, 0x84, 0xc6, 0x15, 0x57, 0x91, 0xd3, 0x2a, 0x68, 0xae, 0xec, 0x3f, 0x7d, 0xbb, 0xf9 },
+ },
+ {
+ { 0x00, 0x9f, 0x23, 0xbc, 0x46, 0xd9, 0x65, 0xfa, 0x8c, 0x13, 0xaf, 0x30, 0xca, 0x55, 0xe9, 0x76 },
+ { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b, 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 },
+ },
+ {
+ { 0x00, 0x42, 0x84, 0xc6, 0x15, 0x57, 0x91, 0xd3, 0x2a, 0x68, 0xae, 0xec, 0x3f, 0x7d, 0xbb, 0xf9 },
+ { 0x00, 0x54, 0xa8, 0xfc, 0x4d, 0x19, 0xe5, 0xb1, 0x9a, 0xce, 0x32, 0x66, 0xd7, 0x83, 0x7f, 0x2b },
+ },
+ },
+ {
+ {
+ { 0x00, 0xce, 0x81, 0x4f, 0x1f, 0xd1, 0x9e, 0x50, 0x3e, 0xf0, 0xbf, 0x71, 0x21, 0xef, 0xa0, 0x6e },
+ { 0x00, 0x7c, 0xf8, 0x84, 0xed, 0x91, 0x15, 0x69, 0xc7, 0xbb, 0x3f, 0x43, 0x2a, 0x56, 0xd2, 0xae },
+ },
+ {
+ { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd },
+ { 0x00, 0x6d, 0xda, 0xb7, 0xa9, 0xc4, 0x73, 0x1e, 0x4f, 0x22, 0x95, 0xf8, 0xe6, 0x8b, 0x3c, 0x51 },
+ },
+ {
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0xcd, 0x87, 0x4a, 0x13, 0xde, 0x94, 0x59, 0x26, 0xeb, 0xa1, 0x6c, 0x35, 0xf8, 0xb2, 0x7f },
+ },
+ {
+ { 0x00, 0xc5, 0x97, 0x52, 0x33, 0xf6, 0xa4, 0x61, 0x66, 0xa3, 0xf1, 0x34, 0x55, 0x90, 0xc2, 0x07 },
+ { 0x00, 0xcc, 0x85, 0x49, 0x17, 0xdb, 0x92, 0x5e, 0x2e, 0xe2, 0xab, 0x67, 0x39, 0xf5, 0xbc, 0x70 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 },
+ { 0x00, 0xbd, 0x67, 0xda, 0xce, 0x73, 0xa9, 0x14, 0x81, 0x3c, 0xe6, 0x5b, 0x4f, 0xf2, 0x28, 0x95 },
+ },
+ {
+ { 0x00, 0xa3, 0x5b, 0xf8, 0xb6, 0x15, 0xed, 0x4e, 0x71, 0xd2, 0x2a, 0x89, 0xc7, 0x64, 0x9c, 0x3f },
+ { 0x00, 0xe2, 0xd9, 0x3b, 0xaf, 0x4d, 0x76, 0x94, 0x43, 0xa1, 0x9a, 0x78, 0xec, 0x0e, 0x35, 0xd7 },
+ },
+ {
+ { 0x00, 0x83, 0x1b, 0x98, 0x36, 0xb5, 0x2d, 0xae, 0x6c, 0xef, 0x77, 0xf4, 0x5a, 0xd9, 0x41, 0xc2 },
+ { 0x00, 0xd8, 0xad, 0x75, 0x47, 0x9f, 0xea, 0x32, 0x8e, 0x56, 0x23, 0xfb, 0xc9, 0x11, 0x64, 0xbc },
+ },
+ {
+ { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 },
+ { 0x00, 0xb0, 0x7d, 0xcd, 0xfa, 0x4a, 0x87, 0x37, 0xe9, 0x59, 0x94, 0x24, 0x13, 0xa3, 0x6e, 0xde },
+ },
+ },
+ {
+ {
+ { 0x00, 0xc2, 0x99, 0x5b, 0x2f, 0xed, 0xb6, 0x74, 0x5e, 0x9c, 0xc7, 0x05, 0x71, 0xb3, 0xe8, 0x2a },
+ { 0x00, 0xbc, 0x65, 0xd9, 0xca, 0x76, 0xaf, 0x13, 0x89, 0x35, 0xec, 0x50, 0x43, 0xff, 0x26, 0x9a },
+ },
+ {
+ { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 },
+ { 0x00, 0xf0, 0xfd, 0x0d, 0xe7, 0x17, 0x1a, 0xea, 0xd3, 0x23, 0x2e, 0xde, 0x34, 0xc4, 0xc9, 0x39 },
+ },
+ {
+ { 0x00, 0x68, 0xd0, 0xb8, 0xbd, 0xd5, 0x6d, 0x05, 0x67, 0x0f, 0xb7, 0xdf, 0xda, 0xb2, 0x0a, 0x62 },
+ { 0x00, 0xce, 0x81, 0x4f, 0x1f, 0xd1, 0x9e, 0x50, 0x3e, 0xf0, 0xbf, 0x71, 0x21, 0xef, 0xa0, 0x6e },
+ },
+ {
+ { 0x00, 0x49, 0x92, 0xdb, 0x39, 0x70, 0xab, 0xe2, 0x72, 0x3b, 0xe0, 0xa9, 0x4b, 0x02, 0xd9, 0x90 },
+ { 0x00, 0xe4, 0xd5, 0x31, 0xb7, 0x53, 0x62, 0x86, 0x73, 0x97, 0xa6, 0x42, 0xc4, 0x20, 0x11, 0xf5 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 },
+ { 0x00, 0xa0, 0x5d, 0xfd, 0xba, 0x1a, 0xe7, 0x47, 0x69, 0xc9, 0x34, 0x94, 0xd3, 0x73, 0x8e, 0x2e },
+ },
+ {
+ { 0x00, 0xae, 0x41, 0xef, 0x82, 0x2c, 0xc3, 0x6d, 0x19, 0xb7, 0x58, 0xf6, 0x9b, 0x35, 0xda, 0x74 },
+ { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, 0x8d, 0xbf, 0xe9, 0xdb, 0x45, 0x77, 0x21, 0x13 },
+ },
+ {
+ { 0x00, 0x75, 0xea, 0x9f, 0xc9, 0xbc, 0x23, 0x56, 0x8f, 0xfa, 0x65, 0x10, 0x46, 0x33, 0xac, 0xd9 },
+ { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 },
+ },
+ {
+ { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 },
+ { 0x00, 0xad, 0x47, 0xea, 0x8e, 0x23, 0xc9, 0x64, 0x01, 0xac, 0x46, 0xeb, 0x8f, 0x22, 0xc8, 0x65 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, 0xe5, 0xda, 0x9b, 0xa4, 0x19, 0x26, 0x67, 0x58 },
+ { 0x00, 0xd7, 0xb3, 0x64, 0x7b, 0xac, 0xc8, 0x1f, 0xf6, 0x21, 0x45, 0x92, 0x8d, 0x5a, 0x3e, 0xe9 },
+ },
+ {
+ { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b },
+ { 0x00, 0xd0, 0xbd, 0x6d, 0x67, 0xb7, 0xda, 0x0a, 0xce, 0x1e, 0x73, 0xa3, 0xa9, 0x79, 0x14, 0xc4 },
+ },
+ {
+ { 0x00, 0xed, 0xc7, 0x2a, 0x93, 0x7e, 0x54, 0xb9, 0x3b, 0xd6, 0xfc, 0x11, 0xa8, 0x45, 0x6f, 0x82 },
+ { 0x00, 0x76, 0xec, 0x9a, 0xc5, 0xb3, 0x29, 0x5f, 0x97, 0xe1, 0x7b, 0x0d, 0x52, 0x24, 0xbe, 0xc8 },
+ },
+ {
+ { 0x00, 0x7c, 0xf8, 0x84, 0xed, 0x91, 0x15, 0x69, 0xc7, 0xbb, 0x3f, 0x43, 0x2a, 0x56, 0xd2, 0xae },
+ { 0x00, 0x93, 0x3b, 0xa8, 0x76, 0xe5, 0x4d, 0xde, 0xec, 0x7f, 0xd7, 0x44, 0x9a, 0x09, 0xa1, 0x32 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xfd, 0xe7, 0x1a, 0xd3, 0x2e, 0x34, 0xc9, 0xbb, 0x46, 0x5c, 0xa1, 0x68, 0x95, 0x8f, 0x72 },
+ { 0x00, 0x6b, 0xd6, 0xbd, 0xb1, 0xda, 0x67, 0x0c, 0x7f, 0x14, 0xa9, 0xc2, 0xce, 0xa5, 0x18, 0x73 },
+ },
+ {
+ { 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4 },
+ { 0x00, 0xdd, 0xa7, 0x7a, 0x53, 0x8e, 0xf4, 0x29, 0xa6, 0x7b, 0x01, 0xdc, 0xf5, 0x28, 0x52, 0x8f },
+ },
+ {
+ { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 },
+ { 0x00, 0x7d, 0xfa, 0x87, 0xe9, 0x94, 0x13, 0x6e, 0xcf, 0xb2, 0x35, 0x48, 0x26, 0x5b, 0xdc, 0xa1 },
+ },
+ {
+ { 0x00, 0xe4, 0xd5, 0x31, 0xb7, 0x53, 0x62, 0x86, 0x73, 0x97, 0xa6, 0x42, 0xc4, 0x20, 0x11, 0xf5 },
+ { 0x00, 0xe6, 0xd1, 0x37, 0xbf, 0x59, 0x6e, 0x88, 0x63, 0x85, 0xb2, 0x54, 0xdc, 0x3a, 0x0d, 0xeb },
+ },
+ },
+ {
+ {
+ { 0x00, 0xe3, 0xdb, 0x38, 0xab, 0x48, 0x70, 0x93, 0x4b, 0xa8, 0x90, 0x73, 0xe0, 0x03, 0x3b, 0xd8 },
+ { 0x00, 0x96, 0x31, 0xa7, 0x62, 0xf4, 0x53, 0xc5, 0xc4, 0x52, 0xf5, 0x63, 0xa6, 0x30, 0x97, 0x01 },
+ },
+ {
+ { 0x00, 0xc2, 0x99, 0x5b, 0x2f, 0xed, 0xb6, 0x74, 0x5e, 0x9c, 0xc7, 0x05, 0x71, 0xb3, 0xe8, 0x2a },
+ { 0x00, 0xbc, 0x65, 0xd9, 0xca, 0x76, 0xaf, 0x13, 0x89, 0x35, 0xec, 0x50, 0x43, 0xff, 0x26, 0x9a },
+ },
+ {
+ { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x9d, 0xad, 0xfd, 0xcd, 0x5d, 0x6d, 0x3d, 0x0d },
+ },
+ {
+ { 0x00, 0xb0, 0x7d, 0xcd, 0xfa, 0x4a, 0x87, 0x37, 0xe9, 0x59, 0x94, 0x24, 0x13, 0xa3, 0x6e, 0xde },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x77, 0xee, 0x99, 0xc1, 0xb6, 0x2f, 0x58, 0x9f, 0xe8, 0x71, 0x06, 0x5e, 0x29, 0xb0, 0xc7 },
+ { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, 0x05, 0x26, 0x43, 0x60, 0x89, 0xaa, 0xcf, 0xec },
+ },
+ {
+ { 0x00, 0xd0, 0xbd, 0x6d, 0x67, 0xb7, 0xda, 0x0a, 0xce, 0x1e, 0x73, 0xa3, 0xa9, 0x79, 0x14, 0xc4 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ },
+ {
+ { 0x00, 0xce, 0x81, 0x4f, 0x1f, 0xd1, 0x9e, 0x50, 0x3e, 0xf0, 0xbf, 0x71, 0x21, 0xef, 0xa0, 0x6e },
+ { 0x00, 0x7c, 0xf8, 0x84, 0xed, 0x91, 0x15, 0x69, 0xc7, 0xbb, 0x3f, 0x43, 0x2a, 0x56, 0xd2, 0xae },
+ },
+ {
+ { 0x00, 0x9d, 0x27, 0xba, 0x4e, 0xd3, 0x69, 0xf4, 0x9c, 0x01, 0xbb, 0x26, 0xd2, 0x4f, 0xf5, 0x68 },
+ { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, 0x35, 0x10, 0x7f, 0x5a, 0xa1, 0x84, 0xeb, 0xce },
+ },
+ },
+ {
+ {
+ { 0x00, 0x6b, 0xd6, 0xbd, 0xb1, 0xda, 0x67, 0x0c, 0x7f, 0x14, 0xa9, 0xc2, 0xce, 0xa5, 0x18, 0x73 },
+ { 0x00, 0xfe, 0xe1, 0x1f, 0xdf, 0x21, 0x3e, 0xc0, 0xa3, 0x5d, 0x42, 0xbc, 0x7c, 0x82, 0x9d, 0x63 },
+ },
+ {
+ { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, 0x8d, 0xbf, 0xe9, 0xdb, 0x45, 0x77, 0x21, 0x13 },
+ { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15, 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d },
+ },
+ {
+ { 0x00, 0xe4, 0xd5, 0x31, 0xb7, 0x53, 0x62, 0x86, 0x73, 0x97, 0xa6, 0x42, 0xc4, 0x20, 0x11, 0xf5 },
+ { 0x00, 0xe6, 0xd1, 0x37, 0xbf, 0x59, 0x6e, 0x88, 0x63, 0x85, 0xb2, 0x54, 0xdc, 0x3a, 0x0d, 0xeb },
+ },
+ {
+ { 0x00, 0x45, 0x8a, 0xcf, 0x09, 0x4c, 0x83, 0xc6, 0x12, 0x57, 0x98, 0xdd, 0x1b, 0x5e, 0x91, 0xd4 },
+ { 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, 0x3d, 0x19, 0x75, 0x51, 0xad, 0x89, 0xe5, 0xc1 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xd7, 0xb3, 0x64, 0x7b, 0xac, 0xc8, 0x1f, 0xf6, 0x21, 0x45, 0x92, 0x8d, 0x5a, 0x3e, 0xe9 },
+ { 0x00, 0xf1, 0xff, 0x0e, 0xe3, 0x12, 0x1c, 0xed, 0xdb, 0x2a, 0x24, 0xd5, 0x38, 0xc9, 0xc7, 0x36 },
+ },
+ {
+ { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 },
+ { 0x00, 0x7d, 0xfa, 0x87, 0xe9, 0x94, 0x13, 0x6e, 0xcf, 0xb2, 0x35, 0x48, 0x26, 0x5b, 0xdc, 0xa1 },
+ },
+ {
+ { 0x00, 0xdf, 0xa3, 0x7c, 0x5b, 0x84, 0xf8, 0x27, 0xb6, 0x69, 0x15, 0xca, 0xed, 0x32, 0x4e, 0x91 },
+ { 0x00, 0x71, 0xe2, 0x93, 0xd9, 0xa8, 0x3b, 0x4a, 0xaf, 0xde, 0x4d, 0x3c, 0x76, 0x07, 0x94, 0xe5 },
+ },
+ {
+ { 0x00, 0xf0, 0xfd, 0x0d, 0xe7, 0x17, 0x1a, 0xea, 0xd3, 0x23, 0x2e, 0xde, 0x34, 0xc4, 0xc9, 0x39 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xef, 0xc3, 0x2c, 0x9b, 0x74, 0x58, 0xb7, 0x2b, 0xc4, 0xe8, 0x07, 0xb0, 0x5f, 0x73, 0x9c },
+ { 0x00, 0x56, 0xac, 0xfa, 0x45, 0x13, 0xe9, 0xbf, 0x8a, 0xdc, 0x26, 0x70, 0xcf, 0x99, 0x63, 0x35 },
+ },
+ {
+ { 0x00, 0xf6, 0xf1, 0x07, 0xff, 0x09, 0x0e, 0xf8, 0xe3, 0x15, 0x12, 0xe4, 0x1c, 0xea, 0xed, 0x1b },
+ { 0x00, 0xdb, 0xab, 0x70, 0x4b, 0x90, 0xe0, 0x3b, 0x96, 0x4d, 0x3d, 0xe6, 0xdd, 0x06, 0x76, 0xad },
+ },
+ {
+ { 0x00, 0xe0, 0xdd, 0x3d, 0xa7, 0x47, 0x7a, 0x9a, 0x53, 0xb3, 0x8e, 0x6e, 0xf4, 0x14, 0x29, 0xc9 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ },
+ {
+ { 0x00, 0xa9, 0x4f, 0xe6, 0x9e, 0x37, 0xd1, 0x78, 0x21, 0x88, 0x6e, 0xc7, 0xbf, 0x16, 0xf0, 0x59 },
+ { 0x00, 0x42, 0x84, 0xc6, 0x15, 0x57, 0x91, 0xd3, 0x2a, 0x68, 0xae, 0xec, 0x3f, 0x7d, 0xbb, 0xf9 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xa4, 0x55, 0xf1, 0xaa, 0x0e, 0xff, 0x5b, 0x49, 0xed, 0x1c, 0xb8, 0xe3, 0x47, 0xb6, 0x12 },
+ { 0x00, 0x92, 0x39, 0xab, 0x72, 0xe0, 0x4b, 0xd9, 0xe4, 0x76, 0xdd, 0x4f, 0x96, 0x04, 0xaf, 0x3d },
+ },
+ {
+ { 0x00, 0xc0, 0x9d, 0x5d, 0x27, 0xe7, 0xba, 0x7a, 0x4e, 0x8e, 0xd3, 0x13, 0x69, 0xa9, 0xf4, 0x34 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ },
+ {
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ },
+ {
+ { 0x00, 0x61, 0xc2, 0xa3, 0x99, 0xf8, 0x5b, 0x3a, 0x2f, 0x4e, 0xed, 0x8c, 0xb6, 0xd7, 0x74, 0x15 },
+ { 0x00, 0x5e, 0xbc, 0xe2, 0x65, 0x3b, 0xd9, 0x87, 0xca, 0x94, 0x76, 0x28, 0xaf, 0xf1, 0x13, 0x4d },
+ },
+ },
+ {
+ {
+ { 0x00, 0x80, 0x1d, 0x9d, 0x3a, 0xba, 0x27, 0xa7, 0x74, 0xf4, 0x69, 0xe9, 0x4e, 0xce, 0x53, 0xd3 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ },
+ {
+ { 0x00, 0x7f, 0xfe, 0x81, 0xe1, 0x9e, 0x1f, 0x60, 0xdf, 0xa0, 0x21, 0x5e, 0x3e, 0x41, 0xc0, 0xbf },
+ { 0x00, 0xa3, 0x5b, 0xf8, 0xb6, 0x15, 0xed, 0x4e, 0x71, 0xd2, 0x2a, 0x89, 0xc7, 0x64, 0x9c, 0x3f },
+ },
+ {
+ { 0x00, 0xae, 0x41, 0xef, 0x82, 0x2c, 0xc3, 0x6d, 0x19, 0xb7, 0x58, 0xf6, 0x9b, 0x35, 0xda, 0x74 },
+ { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, 0x8d, 0xbf, 0xe9, 0xdb, 0x45, 0x77, 0x21, 0x13 },
+ },
+ {
+ { 0x00, 0x57, 0xae, 0xf9, 0x41, 0x16, 0xef, 0xb8, 0x82, 0xd5, 0x2c, 0x7b, 0xc3, 0x94, 0x6d, 0x3a },
+ { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xa1, 0x5f, 0xfe, 0xbe, 0x1f, 0xe1, 0x40, 0x61, 0xc0, 0x3e, 0x9f, 0xdf, 0x7e, 0x80, 0x21 },
+ { 0x00, 0xc2, 0x99, 0x5b, 0x2f, 0xed, 0xb6, 0x74, 0x5e, 0x9c, 0xc7, 0x05, 0x71, 0xb3, 0xe8, 0x2a },
+ },
+ {
+ { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, 0x75, 0x58, 0x2f, 0x02, 0xc1, 0xec, 0x9b, 0xb6 },
+ { 0x00, 0xea, 0xc9, 0x23, 0x8f, 0x65, 0x46, 0xac, 0x03, 0xe9, 0xca, 0x20, 0x8c, 0x66, 0x45, 0xaf },
+ },
+ {
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ { 0x00, 0xde, 0xa1, 0x7f, 0x5f, 0x81, 0xfe, 0x20, 0xbe, 0x60, 0x1f, 0xc1, 0xe1, 0x3f, 0x40, 0x9e },
+ },
+ {
+ { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12, 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x9d, 0xfd, 0x5d, 0x3d, 0x27, 0x47, 0xe7, 0x87, 0xba, 0xda, 0x7a, 0x1a },
+ },
+ },
+ {
+ {
+ { 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, 0xad, 0x9b, 0xc1, 0xf7, 0x75, 0x43, 0x19, 0x2f },
+ { 0x00, 0x47, 0x8e, 0xc9, 0x01, 0x46, 0x8f, 0xc8, 0x02, 0x45, 0x8c, 0xcb, 0x03, 0x44, 0x8d, 0xca },
+ },
+ {
+ { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 },
+ { 0x00, 0x4d, 0x9a, 0xd7, 0x29, 0x64, 0xb3, 0xfe, 0x52, 0x1f, 0xc8, 0x85, 0x7b, 0x36, 0xe1, 0xac },
+ },
+ {
+ { 0x00, 0x55, 0xaa, 0xff, 0x49, 0x1c, 0xe3, 0xb6, 0x92, 0xc7, 0x38, 0x6d, 0xdb, 0x8e, 0x71, 0x24 },
+ { 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, 0xd5, 0xec, 0xa7, 0x9e, 0x31, 0x08, 0x43, 0x7a },
+ },
+ {
+ { 0x00, 0xd4, 0xb5, 0x61, 0x77, 0xa3, 0xc2, 0x16, 0xee, 0x3a, 0x5b, 0x8f, 0x99, 0x4d, 0x2c, 0xf8 },
+ { 0x00, 0xc1, 0x9f, 0x5e, 0x23, 0xe2, 0xbc, 0x7d, 0x46, 0x87, 0xd9, 0x18, 0x65, 0xa4, 0xfa, 0x3b },
+ },
+ },
+ {
+ {
+ { 0x00, 0xed, 0xc7, 0x2a, 0x93, 0x7e, 0x54, 0xb9, 0x3b, 0xd6, 0xfc, 0x11, 0xa8, 0x45, 0x6f, 0x82 },
+ { 0x00, 0x76, 0xec, 0x9a, 0xc5, 0xb3, 0x29, 0x5f, 0x97, 0xe1, 0x7b, 0x0d, 0x52, 0x24, 0xbe, 0xc8 },
+ },
+ {
+ { 0x00, 0xf3, 0xfb, 0x08, 0xeb, 0x18, 0x10, 0xe3, 0xcb, 0x38, 0x30, 0xc3, 0x20, 0xd3, 0xdb, 0x28 },
+ { 0x00, 0x8b, 0x0b, 0x80, 0x16, 0x9d, 0x1d, 0x96, 0x2c, 0xa7, 0x27, 0xac, 0x3a, 0xb1, 0x31, 0xba },
+ },
+ {
+ { 0x00, 0x91, 0x3f, 0xae, 0x7e, 0xef, 0x41, 0xd0, 0xfc, 0x6d, 0xc3, 0x52, 0x82, 0x13, 0xbd, 0x2c },
+ { 0x00, 0xe5, 0xd7, 0x32, 0xb3, 0x56, 0x64, 0x81, 0x7b, 0x9e, 0xac, 0x49, 0xc8, 0x2d, 0x1f, 0xfa },
+ },
+ {
+ { 0x00, 0x40, 0x80, 0xc0, 0x1d, 0x5d, 0x9d, 0xdd, 0x3a, 0x7a, 0xba, 0xfa, 0x27, 0x67, 0xa7, 0xe7 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xa2, 0x59, 0xfb, 0xb2, 0x10, 0xeb, 0x49, 0x79, 0xdb, 0x20, 0x82, 0xcb, 0x69, 0x92, 0x30 },
+ { 0x00, 0xf2, 0xf9, 0x0b, 0xef, 0x1d, 0x16, 0xe4, 0xc3, 0x31, 0x3a, 0xc8, 0x2c, 0xde, 0xd5, 0x27 },
+ },
+ {
+ { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 },
+ { 0x00, 0xad, 0x47, 0xea, 0x8e, 0x23, 0xc9, 0x64, 0x01, 0xac, 0x46, 0xeb, 0x8f, 0x22, 0xc8, 0x65 },
+ },
+ {
+ { 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, 0x6d, 0x43, 0x31, 0x1f, 0xd5, 0xfb, 0x89, 0xa7 },
+ { 0x00, 0xda, 0xa9, 0x73, 0x4f, 0x95, 0xe6, 0x3c, 0x9e, 0x44, 0x37, 0xed, 0xd1, 0x0b, 0x78, 0xa2 },
+ },
+ {
+ { 0x00, 0x91, 0x3f, 0xae, 0x7e, 0xef, 0x41, 0xd0, 0xfc, 0x6d, 0xc3, 0x52, 0x82, 0x13, 0xbd, 0x2c },
+ { 0x00, 0xe5, 0xd7, 0x32, 0xb3, 0x56, 0x64, 0x81, 0x7b, 0x9e, 0xac, 0x49, 0xc8, 0x2d, 0x1f, 0xfa },
+ },
+ },
+ {
+ {
+ { 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee },
+ { 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, 0xf5, 0xc8, 0x8f, 0xb2, 0x01, 0x3c, 0x7b, 0x46 },
+ },
+ {
+ { 0x00, 0xf2, 0xf9, 0x0b, 0xef, 0x1d, 0x16, 0xe4, 0xc3, 0x31, 0x3a, 0xc8, 0x2c, 0xde, 0xd5, 0x27 },
+ { 0x00, 0x9b, 0x2b, 0xb0, 0x56, 0xcd, 0x7d, 0xe6, 0xac, 0x37, 0x87, 0x1c, 0xfa, 0x61, 0xd1, 0x4a },
+ },
+ {
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ { 0x00, 0x94, 0x35, 0xa1, 0x6a, 0xfe, 0x5f, 0xcb, 0xd4, 0x40, 0xe1, 0x75, 0xbe, 0x2a, 0x8b, 0x1f },
+ },
+ {
+ { 0x00, 0x56, 0xac, 0xfa, 0x45, 0x13, 0xe9, 0xbf, 0x8a, 0xdc, 0x26, 0x70, 0xcf, 0x99, 0x63, 0x35 },
+ { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x57, 0xae, 0xf9, 0x41, 0x16, 0xef, 0xb8, 0x82, 0xd5, 0x2c, 0x7b, 0xc3, 0x94, 0x6d, 0x3a },
+ { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 },
+ },
+ {
+ { 0x00, 0xed, 0xc7, 0x2a, 0x93, 0x7e, 0x54, 0xb9, 0x3b, 0xd6, 0xfc, 0x11, 0xa8, 0x45, 0x6f, 0x82 },
+ { 0x00, 0x76, 0xec, 0x9a, 0xc5, 0xb3, 0x29, 0x5f, 0x97, 0xe1, 0x7b, 0x0d, 0x52, 0x24, 0xbe, 0xc8 },
+ },
+ {
+ { 0x00, 0xfe, 0xe1, 0x1f, 0xdf, 0x21, 0x3e, 0xc0, 0xa3, 0x5d, 0x42, 0xbc, 0x7c, 0x82, 0x9d, 0x63 },
+ { 0x00, 0x5b, 0xb6, 0xed, 0x71, 0x2a, 0xc7, 0x9c, 0xe2, 0xb9, 0x54, 0x0f, 0x93, 0xc8, 0x25, 0x7e },
+ },
+ {
+ { 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 },
+ { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, 0x75, 0x58, 0x2f, 0x02, 0xc1, 0xec, 0x9b, 0xb6 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xb6, 0x71, 0xc7, 0xe2, 0x54, 0x93, 0x25, 0xd9, 0x6f, 0xa8, 0x1e, 0x3b, 0x8d, 0x4a, 0xfc },
+ { 0x00, 0xaf, 0x43, 0xec, 0x86, 0x29, 0xc5, 0x6a, 0x11, 0xbe, 0x52, 0xfd, 0x97, 0x38, 0xd4, 0x7b },
+ },
+ {
+ { 0x00, 0xb3, 0x7b, 0xc8, 0xf6, 0x45, 0x8d, 0x3e, 0xf1, 0x42, 0x8a, 0x39, 0x07, 0xb4, 0x7c, 0xcf },
+ { 0x00, 0xff, 0xe3, 0x1c, 0xdb, 0x24, 0x38, 0xc7, 0xab, 0x54, 0x48, 0xb7, 0x70, 0x8f, 0x93, 0x6c },
+ },
+ {
+ { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, 0x15, 0x34, 0x57, 0x76, 0x91, 0xb0, 0xd3, 0xf2 },
+ { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, 0x4d, 0x67, 0x19, 0x33, 0xe5, 0xcf, 0xb1, 0x9b },
+ },
+ {
+ { 0x00, 0xfa, 0xe9, 0x13, 0xcf, 0x35, 0x26, 0xdc, 0x83, 0x79, 0x6a, 0x90, 0x4c, 0xb6, 0xa5, 0x5f },
+ { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, 0x55, 0x7c, 0x07, 0x2e, 0xf1, 0xd8, 0xa3, 0x8a },
+ { 0x00, 0xaa, 0x49, 0xe3, 0x92, 0x38, 0xdb, 0x71, 0x39, 0x93, 0x70, 0xda, 0xab, 0x01, 0xe2, 0x48 },
+ },
+ {
+ { 0x00, 0x45, 0x8a, 0xcf, 0x09, 0x4c, 0x83, 0xc6, 0x12, 0x57, 0x98, 0xdd, 0x1b, 0x5e, 0x91, 0xd4 },
+ { 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, 0x3d, 0x19, 0x75, 0x51, 0xad, 0x89, 0xe5, 0xc1 },
+ },
+ {
+ { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 },
+ { 0x00, 0xed, 0xc7, 0x2a, 0x93, 0x7e, 0x54, 0xb9, 0x3b, 0xd6, 0xfc, 0x11, 0xa8, 0x45, 0x6f, 0x82 },
+ },
+ {
+ { 0x00, 0x87, 0x13, 0x94, 0x26, 0xa1, 0x35, 0xb2, 0x4c, 0xcb, 0x5f, 0xd8, 0x6a, 0xed, 0x79, 0xfe },
+ { 0x00, 0x98, 0x2d, 0xb5, 0x5a, 0xc2, 0x77, 0xef, 0xb4, 0x2c, 0x99, 0x01, 0xee, 0x76, 0xc3, 0x5b },
+ },
+ },
+ {
+ {
+ { 0x00, 0x8d, 0x07, 0x8a, 0x0e, 0x83, 0x09, 0x84, 0x1c, 0x91, 0x1b, 0x96, 0x12, 0x9f, 0x15, 0x98 },
+ { 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, 0xdd, 0xe5, 0xad, 0x95, 0x3d, 0x05, 0x4d, 0x75 },
+ },
+ {
+ { 0x00, 0xc8, 0x8d, 0x45, 0x07, 0xcf, 0x8a, 0x42, 0x0e, 0xc6, 0x83, 0x4b, 0x09, 0xc1, 0x84, 0x4c },
+ { 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4 },
+ },
+ {
+ { 0x00, 0x9e, 0x21, 0xbf, 0x42, 0xdc, 0x63, 0xfd, 0x84, 0x1a, 0xa5, 0x3b, 0xc6, 0x58, 0xe7, 0x79 },
+ { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 },
+ },
+ {
+ { 0x00, 0xac, 0x45, 0xe9, 0x8a, 0x26, 0xcf, 0x63, 0x09, 0xa5, 0x4c, 0xe0, 0x83, 0x2f, 0xc6, 0x6a },
+ { 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee },
+ },
+ },
+ {
+ {
+ { 0x00, 0x7b, 0xf6, 0x8d, 0xf1, 0x8a, 0x07, 0x7c, 0xff, 0x84, 0x09, 0x72, 0x0e, 0x75, 0xf8, 0x83 },
+ { 0x00, 0xe3, 0xdb, 0x38, 0xab, 0x48, 0x70, 0x93, 0x4b, 0xa8, 0x90, 0x73, 0xe0, 0x03, 0x3b, 0xd8 },
+ },
+ {
+ { 0x00, 0xac, 0x45, 0xe9, 0x8a, 0x26, 0xcf, 0x63, 0x09, 0xa5, 0x4c, 0xe0, 0x83, 0x2f, 0xc6, 0x6a },
+ { 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee },
+ },
+ {
+ { 0x00, 0xe1, 0xdf, 0x3e, 0xa3, 0x42, 0x7c, 0x9d, 0x5b, 0xba, 0x84, 0x65, 0xf8, 0x19, 0x27, 0xc6 },
+ { 0x00, 0xb6, 0x71, 0xc7, 0xe2, 0x54, 0x93, 0x25, 0xd9, 0x6f, 0xa8, 0x1e, 0x3b, 0x8d, 0x4a, 0xfc },
+ },
+ {
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ { 0x00, 0x4a, 0x94, 0xde, 0x35, 0x7f, 0xa1, 0xeb, 0x6a, 0x20, 0xfe, 0xb4, 0x5f, 0x15, 0xcb, 0x81 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xc8, 0x8d, 0x45, 0x07, 0xcf, 0x8a, 0x42, 0x0e, 0xc6, 0x83, 0x4b, 0x09, 0xc1, 0x84, 0x4c },
+ { 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4 },
+ },
+ {
+ { 0x00, 0x7b, 0xf6, 0x8d, 0xf1, 0x8a, 0x07, 0x7c, 0xff, 0x84, 0x09, 0x72, 0x0e, 0x75, 0xf8, 0x83 },
+ { 0x00, 0xe3, 0xdb, 0x38, 0xab, 0x48, 0x70, 0x93, 0x4b, 0xa8, 0x90, 0x73, 0xe0, 0x03, 0x3b, 0xd8 },
+ },
+ {
+ { 0x00, 0xd5, 0xb7, 0x62, 0x73, 0xa6, 0xc4, 0x11, 0xe6, 0x33, 0x51, 0x84, 0x95, 0x40, 0x22, 0xf7 },
+ { 0x00, 0xd1, 0xbf, 0x6e, 0x63, 0xb2, 0xdc, 0x0d, 0xc6, 0x17, 0x79, 0xa8, 0xa5, 0x74, 0x1a, 0xcb },
+ },
+ {
+ { 0x00, 0x54, 0xa8, 0xfc, 0x4d, 0x19, 0xe5, 0xb1, 0x9a, 0xce, 0x32, 0x66, 0xd7, 0x83, 0x7f, 0x2b },
+ { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, 0x55, 0x7c, 0x07, 0x2e, 0xf1, 0xd8, 0xa3, 0x8a },
+ },
+ },
+ {
+ {
+ { 0x00, 0x52, 0xa4, 0xf6, 0x55, 0x07, 0xf1, 0xa3, 0xaa, 0xf8, 0x0e, 0x5c, 0xff, 0xad, 0x5b, 0x09 },
+ { 0x00, 0x49, 0x92, 0xdb, 0x39, 0x70, 0xab, 0xe2, 0x72, 0x3b, 0xe0, 0xa9, 0x4b, 0x02, 0xd9, 0x90 },
+ },
+ {
+ { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, 0x7d, 0x51, 0x25, 0x09, 0xcd, 0xe1, 0x95, 0xb9 },
+ { 0x00, 0xfa, 0xe9, 0x13, 0xcf, 0x35, 0x26, 0xdc, 0x83, 0x79, 0x6a, 0x90, 0x4c, 0xb6, 0xa5, 0x5f },
+ },
+ {
+ { 0x00, 0xcd, 0x87, 0x4a, 0x13, 0xde, 0x94, 0x59, 0x26, 0xeb, 0xa1, 0x6c, 0x35, 0xf8, 0xb2, 0x7f },
+ { 0x00, 0x4c, 0x98, 0xd4, 0x2d, 0x61, 0xb5, 0xf9, 0x5a, 0x16, 0xc2, 0x8e, 0x77, 0x3b, 0xef, 0xa3 },
+ },
+ {
+ { 0x00, 0xdd, 0xa7, 0x7a, 0x53, 0x8e, 0xf4, 0x29, 0xa6, 0x7b, 0x01, 0xdc, 0xf5, 0x28, 0x52, 0x8f },
+ { 0x00, 0x51, 0xa2, 0xf3, 0x59, 0x08, 0xfb, 0xaa, 0xb2, 0xe3, 0x10, 0x41, 0xeb, 0xba, 0x49, 0x18 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xc3, 0x9b, 0x58, 0x2b, 0xe8, 0xb0, 0x73, 0x56, 0x95, 0xcd, 0x0e, 0x7d, 0xbe, 0xe6, 0x25 },
+ { 0x00, 0xac, 0x45, 0xe9, 0x8a, 0x26, 0xcf, 0x63, 0x09, 0xa5, 0x4c, 0xe0, 0x83, 0x2f, 0xc6, 0x6a },
+ },
+ {
+ { 0x00, 0xe2, 0xd9, 0x3b, 0xaf, 0x4d, 0x76, 0x94, 0x43, 0xa1, 0x9a, 0x78, 0xec, 0x0e, 0x35, 0xd7 },
+ { 0x00, 0x86, 0x11, 0x97, 0x22, 0xa4, 0x33, 0xb5, 0x44, 0xc2, 0x55, 0xd3, 0x66, 0xe0, 0x77, 0xf1 },
+ },
+ {
+ { 0x00, 0xca, 0x89, 0x43, 0x0f, 0xc5, 0x86, 0x4c, 0x1e, 0xd4, 0x97, 0x5d, 0x11, 0xdb, 0x98, 0x52 },
+ { 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, 0xfd, 0xc1, 0x85, 0xb9, 0x0d, 0x31, 0x75, 0x49 },
+ },
+ {
+ { 0x00, 0x59, 0xb2, 0xeb, 0x79, 0x20, 0xcb, 0x92, 0xf2, 0xab, 0x40, 0x19, 0x8b, 0xd2, 0x39, 0x60 },
+ { 0x00, 0xf9, 0xef, 0x16, 0xc3, 0x3a, 0x2c, 0xd5, 0x9b, 0x62, 0x74, 0x8d, 0x58, 0xa1, 0xb7, 0x4e },
+ },
+ },
+ {
+ {
+ { 0x00, 0xbc, 0x65, 0xd9, 0xca, 0x76, 0xaf, 0x13, 0x89, 0x35, 0xec, 0x50, 0x43, 0xff, 0x26, 0x9a },
+ { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 },
+ },
+ {
+ { 0x00, 0xe4, 0xd5, 0x31, 0xb7, 0x53, 0x62, 0x86, 0x73, 0x97, 0xa6, 0x42, 0xc4, 0x20, 0x11, 0xf5 },
+ { 0x00, 0xe6, 0xd1, 0x37, 0xbf, 0x59, 0x6e, 0x88, 0x63, 0x85, 0xb2, 0x54, 0xdc, 0x3a, 0x0d, 0xeb },
+ },
+ {
+ { 0x00, 0xf0, 0xfd, 0x0d, 0xe7, 0x17, 0x1a, 0xea, 0xd3, 0x23, 0x2e, 0xde, 0x34, 0xc4, 0xc9, 0x39 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ },
+ {
+ { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 },
+ { 0x00, 0xed, 0xc7, 0x2a, 0x93, 0x7e, 0x54, 0xb9, 0x3b, 0xd6, 0xfc, 0x11, 0xa8, 0x45, 0x6f, 0x82 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xb8, 0x6d, 0xd5, 0xda, 0x62, 0xb7, 0x0f, 0xa9, 0x11, 0xc4, 0x7c, 0x73, 0xcb, 0x1e, 0xa6 },
+ { 0x00, 0x4f, 0x9e, 0xd1, 0x21, 0x6e, 0xbf, 0xf0, 0x42, 0x0d, 0xdc, 0x93, 0x63, 0x2c, 0xfd, 0xb2 },
+ },
+ {
+ { 0x00, 0xbf, 0x63, 0xdc, 0xc6, 0x79, 0xa5, 0x1a, 0x91, 0x2e, 0xf2, 0x4d, 0x57, 0xe8, 0x34, 0x8b },
+ { 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, 0xe5, 0xda, 0x9b, 0xa4, 0x19, 0x26, 0x67, 0x58 },
+ },
+ {
+ { 0x00, 0x8b, 0x0b, 0x80, 0x16, 0x9d, 0x1d, 0x96, 0x2c, 0xa7, 0x27, 0xac, 0x3a, 0xb1, 0x31, 0xba },
+ { 0x00, 0x58, 0xb0, 0xe8, 0x7d, 0x25, 0xcd, 0x95, 0xfa, 0xa2, 0x4a, 0x12, 0x87, 0xdf, 0x37, 0x6f },
+ },
+ {
+ { 0x00, 0x71, 0xe2, 0x93, 0xd9, 0xa8, 0x3b, 0x4a, 0xaf, 0xde, 0x4d, 0x3c, 0x76, 0x07, 0x94, 0xe5 },
+ { 0x00, 0x43, 0x86, 0xc5, 0x11, 0x52, 0x97, 0xd4, 0x22, 0x61, 0xa4, 0xe7, 0x33, 0x70, 0xb5, 0xf6 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, 0x15, 0x34, 0x57, 0x76, 0x91, 0xb0, 0xd3, 0xf2 },
+ { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, 0x4d, 0x67, 0x19, 0x33, 0xe5, 0xcf, 0xb1, 0x9b },
+ },
+ {
+ { 0x00, 0xbe, 0x61, 0xdf, 0xc2, 0x7c, 0xa3, 0x1d, 0x99, 0x27, 0xf8, 0x46, 0x5b, 0xe5, 0x3a, 0x84 },
+ { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, 0x65, 0x4a, 0x3b, 0x14, 0xd9, 0xf6, 0x87, 0xa8 },
+ },
+ {
+ { 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, 0x45, 0x6e, 0x13, 0x38, 0xe9, 0xc2, 0xbf, 0x94 },
+ { 0x00, 0x8a, 0x09, 0x83, 0x12, 0x98, 0x1b, 0x91, 0x24, 0xae, 0x2d, 0xa7, 0x36, 0xbc, 0x3f, 0xb5 },
+ },
+ {
+ { 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, 0xd5, 0xec, 0xa7, 0x9e, 0x31, 0x08, 0x43, 0x7a },
+ { 0x00, 0xb7, 0x73, 0xc4, 0xe6, 0x51, 0x95, 0x22, 0xd1, 0x66, 0xa2, 0x15, 0x37, 0x80, 0x44, 0xf3 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xd4, 0xb5, 0x61, 0x77, 0xa3, 0xc2, 0x16, 0xee, 0x3a, 0x5b, 0x8f, 0x99, 0x4d, 0x2c, 0xf8 },
+ { 0x00, 0xc1, 0x9f, 0x5e, 0x23, 0xe2, 0xbc, 0x7d, 0x46, 0x87, 0xd9, 0x18, 0x65, 0xa4, 0xfa, 0x3b },
+ },
+ {
+ { 0x00, 0x9f, 0x23, 0xbc, 0x46, 0xd9, 0x65, 0xfa, 0x8c, 0x13, 0xaf, 0x30, 0xca, 0x55, 0xe9, 0x76 },
+ { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b, 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 },
+ },
+ {
+ { 0x00, 0xc9, 0x8f, 0x46, 0x03, 0xca, 0x8c, 0x45, 0x06, 0xcf, 0x89, 0x40, 0x05, 0xcc, 0x8a, 0x43 },
+ { 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44 },
+ },
+ {
+ { 0x00, 0x43, 0x86, 0xc5, 0x11, 0x52, 0x97, 0xd4, 0x22, 0x61, 0xa4, 0xe7, 0x33, 0x70, 0xb5, 0xf6 },
+ { 0x00, 0x44, 0x88, 0xcc, 0x0d, 0x49, 0x85, 0xc1, 0x1a, 0x5e, 0x92, 0xd6, 0x17, 0x53, 0x9f, 0xdb },
+ },
+ },
+ {
+ {
+ { 0x00, 0xea, 0xc9, 0x23, 0x8f, 0x65, 0x46, 0xac, 0x03, 0xe9, 0xca, 0x20, 0x8c, 0x66, 0x45, 0xaf },
+ { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12, 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 },
+ },
+ {
+ { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, 0xbd, 0x89, 0xd5, 0xe1, 0x6d, 0x59, 0x05, 0x31 },
+ { 0x00, 0x67, 0xce, 0xa9, 0x81, 0xe6, 0x4f, 0x28, 0x1f, 0x78, 0xd1, 0xb6, 0x9e, 0xf9, 0x50, 0x37 },
+ },
+ {
+ { 0x00, 0x8a, 0x09, 0x83, 0x12, 0x98, 0x1b, 0x91, 0x24, 0xae, 0x2d, 0xa7, 0x36, 0xbc, 0x3f, 0xb5 },
+ { 0x00, 0x48, 0x90, 0xd8, 0x3d, 0x75, 0xad, 0xe5, 0x7a, 0x32, 0xea, 0xa2, 0x47, 0x0f, 0xd7, 0x9f },
+ },
+ {
+ { 0x00, 0x6c, 0xd8, 0xb4, 0xad, 0xc1, 0x75, 0x19, 0x47, 0x2b, 0x9f, 0xf3, 0xea, 0x86, 0x32, 0x5e },
+ { 0x00, 0x8e, 0x01, 0x8f, 0x02, 0x8c, 0x03, 0x8d, 0x04, 0x8a, 0x05, 0x8b, 0x06, 0x88, 0x07, 0x89 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xd3, 0xbb, 0x68, 0x6b, 0xb8, 0xd0, 0x03, 0xd6, 0x05, 0x6d, 0xbe, 0xbd, 0x6e, 0x06, 0xd5 },
+ { 0x00, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x30, 0xe1, 0x50, 0x9e, 0x2f, 0x1f, 0xae, 0x60, 0xd1 },
+ },
+ {
+ { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b, 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x5d, 0x0d, 0xfd, 0xad, 0xba, 0xea, 0x1a, 0x4a, 0xe7, 0xb7, 0x47, 0x17 },
+ },
+ {
+ { 0x00, 0x93, 0x3b, 0xa8, 0x76, 0xe5, 0x4d, 0xde, 0xec, 0x7f, 0xd7, 0x44, 0x9a, 0x09, 0xa1, 0x32 },
+ { 0x00, 0xc5, 0x97, 0x52, 0x33, 0xf6, 0xa4, 0x61, 0x66, 0xa3, 0xf1, 0x34, 0x55, 0x90, 0xc2, 0x07 },
+ },
+ {
+ { 0x00, 0xf9, 0xef, 0x16, 0xc3, 0x3a, 0x2c, 0xd5, 0x9b, 0x62, 0x74, 0x8d, 0x58, 0xa1, 0xb7, 0x4e },
+ { 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, 0x45, 0x6e, 0x13, 0x38, 0xe9, 0xc2, 0xbf, 0x94 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12, 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x9d, 0xfd, 0x5d, 0x3d, 0x27, 0x47, 0xe7, 0x87, 0xba, 0xda, 0x7a, 0x1a },
+ },
+ {
+ { 0x00, 0x70, 0xe0, 0x90, 0xdd, 0xad, 0x3d, 0x4d, 0xa7, 0xd7, 0x47, 0x37, 0x7a, 0x0a, 0x9a, 0xea },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ },
+ {
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ { 0x00, 0x7f, 0xfe, 0x81, 0xe1, 0x9e, 0x1f, 0x60, 0xdf, 0xa0, 0x21, 0x5e, 0x3e, 0x41, 0xc0, 0xbf },
+ },
+ {
+ { 0x00, 0xbe, 0x61, 0xdf, 0xc2, 0x7c, 0xa3, 0x1d, 0x99, 0x27, 0xf8, 0x46, 0x5b, 0xe5, 0x3a, 0x84 },
+ { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, 0x65, 0x4a, 0x3b, 0x14, 0xd9, 0xf6, 0x87, 0xa8 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xab, 0x4b, 0xe0, 0x96, 0x3d, 0xdd, 0x76, 0x31, 0x9a, 0x7a, 0xd1, 0xa7, 0x0c, 0xec, 0x47 },
+ { 0x00, 0x62, 0xc4, 0xa6, 0x95, 0xf7, 0x51, 0x33, 0x37, 0x55, 0xf3, 0x91, 0xa2, 0xc0, 0x66, 0x04 },
+ },
+ {
+ { 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, 0xe5, 0xda, 0x9b, 0xa4, 0x19, 0x26, 0x67, 0x58 },
+ { 0x00, 0xd7, 0xb3, 0x64, 0x7b, 0xac, 0xc8, 0x1f, 0xf6, 0x21, 0x45, 0x92, 0x8d, 0x5a, 0x3e, 0xe9 },
+ },
+ {
+ { 0x00, 0x57, 0xae, 0xf9, 0x41, 0x16, 0xef, 0xb8, 0x82, 0xd5, 0x2c, 0x7b, 0xc3, 0x94, 0x6d, 0x3a },
+ { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 },
+ },
+ {
+ { 0x00, 0x4f, 0x9e, 0xd1, 0x21, 0x6e, 0xbf, 0xf0, 0x42, 0x0d, 0xdc, 0x93, 0x63, 0x2c, 0xfd, 0xb2 },
+ { 0x00, 0x84, 0x15, 0x91, 0x2a, 0xae, 0x3f, 0xbb, 0x54, 0xd0, 0x41, 0xc5, 0x7e, 0xfa, 0x6b, 0xef },
+ },
+ },
+ {
+ {
+ { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, 0x4d, 0x67, 0x19, 0x33, 0xe5, 0xcf, 0xb1, 0x9b },
+ { 0x00, 0x9a, 0x29, 0xb3, 0x52, 0xc8, 0x7b, 0xe1, 0xa4, 0x3e, 0x8d, 0x17, 0xf6, 0x6c, 0xdf, 0x45 },
+ },
+ {
+ { 0x00, 0x98, 0x2d, 0xb5, 0x5a, 0xc2, 0x77, 0xef, 0xb4, 0x2c, 0x99, 0x01, 0xee, 0x76, 0xc3, 0x5b },
+ { 0x00, 0x75, 0xea, 0x9f, 0xc9, 0xbc, 0x23, 0x56, 0x8f, 0xfa, 0x65, 0x10, 0x46, 0x33, 0xac, 0xd9 },
+ },
+ {
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x1d, 0x3d, 0x5d, 0x7d, 0x9d, 0xbd, 0xdd, 0xfd },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ },
+ {
+ { 0x00, 0xf4, 0xf5, 0x01, 0xf7, 0x03, 0x02, 0xf6, 0xf3, 0x07, 0x06, 0xf2, 0x04, 0xf0, 0xf1, 0x05 },
+ { 0x00, 0xfb, 0xeb, 0x10, 0xcb, 0x30, 0x20, 0xdb, 0x8b, 0x70, 0x60, 0x9b, 0x40, 0xbb, 0xab, 0x50 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 },
+ { 0x00, 0xad, 0x47, 0xea, 0x8e, 0x23, 0xc9, 0x64, 0x01, 0xac, 0x46, 0xeb, 0x8f, 0x22, 0xc8, 0x65 },
+ },
+ {
+ { 0x00, 0xfe, 0xe1, 0x1f, 0xdf, 0x21, 0x3e, 0xc0, 0xa3, 0x5d, 0x42, 0xbc, 0x7c, 0x82, 0x9d, 0x63 },
+ { 0x00, 0x5b, 0xb6, 0xed, 0x71, 0x2a, 0xc7, 0x9c, 0xe2, 0xb9, 0x54, 0x0f, 0x93, 0xc8, 0x25, 0x7e },
+ },
+ {
+ { 0x00, 0x86, 0x11, 0x97, 0x22, 0xa4, 0x33, 0xb5, 0x44, 0xc2, 0x55, 0xd3, 0x66, 0xe0, 0x77, 0xf1 },
+ { 0x00, 0x88, 0x0d, 0x85, 0x1a, 0x92, 0x17, 0x9f, 0x34, 0xbc, 0x39, 0xb1, 0x2e, 0xa6, 0x23, 0xab },
+ },
+ {
+ { 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa },
+ { 0x00, 0xfd, 0xe7, 0x1a, 0xd3, 0x2e, 0x34, 0xc9, 0xbb, 0x46, 0x5c, 0xa1, 0x68, 0x95, 0x8f, 0x72 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x5f, 0xbe, 0xe1, 0x61, 0x3e, 0xdf, 0x80, 0xc2, 0x9d, 0x7c, 0x23, 0xa3, 0xfc, 0x1d, 0x42 },
+ { 0x00, 0x99, 0x2f, 0xb6, 0x5e, 0xc7, 0x71, 0xe8, 0xbc, 0x25, 0x93, 0x0a, 0xe2, 0x7b, 0xcd, 0x54 },
+ },
+ {
+ { 0x00, 0x62, 0xc4, 0xa6, 0x95, 0xf7, 0x51, 0x33, 0x37, 0x55, 0xf3, 0x91, 0xa2, 0xc0, 0x66, 0x04 },
+ { 0x00, 0x6e, 0xdc, 0xb2, 0xa5, 0xcb, 0x79, 0x17, 0x57, 0x39, 0x8b, 0xe5, 0xf2, 0x9c, 0x2e, 0x40 },
+ },
+ {
+ { 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4 },
+ { 0x00, 0xdd, 0xa7, 0x7a, 0x53, 0x8e, 0xf4, 0x29, 0xa6, 0x7b, 0x01, 0xdc, 0xf5, 0x28, 0x52, 0x8f },
+ },
+ {
+ { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, 0x8d, 0xbf, 0xe9, 0xdb, 0x45, 0x77, 0x21, 0x13 },
+ { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15, 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d },
+ },
+ },
+ {
+ {
+ { 0x00, 0xb7, 0x73, 0xc4, 0xe6, 0x51, 0x95, 0x22, 0xd1, 0x66, 0xa2, 0x15, 0x37, 0x80, 0x44, 0xf3 },
+ { 0x00, 0xbf, 0x63, 0xdc, 0xc6, 0x79, 0xa5, 0x1a, 0x91, 0x2e, 0xf2, 0x4d, 0x57, 0xe8, 0x34, 0x8b },
+ },
+ {
+ { 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88 },
+ { 0x00, 0x9d, 0x27, 0xba, 0x4e, 0xd3, 0x69, 0xf4, 0x9c, 0x01, 0xbb, 0x26, 0xd2, 0x4f, 0xf5, 0x68 },
+ },
+ {
+ { 0x00, 0xa1, 0x5f, 0xfe, 0xbe, 0x1f, 0xe1, 0x40, 0x61, 0xc0, 0x3e, 0x9f, 0xdf, 0x7e, 0x80, 0x21 },
+ { 0x00, 0xc2, 0x99, 0x5b, 0x2f, 0xed, 0xb6, 0x74, 0x5e, 0x9c, 0xc7, 0x05, 0x71, 0xb3, 0xe8, 0x2a },
+ },
+ {
+ { 0x00, 0xcd, 0x87, 0x4a, 0x13, 0xde, 0x94, 0x59, 0x26, 0xeb, 0xa1, 0x6c, 0x35, 0xf8, 0xb2, 0x7f },
+ { 0x00, 0x4c, 0x98, 0xd4, 0x2d, 0x61, 0xb5, 0xf9, 0x5a, 0x16, 0xc2, 0x8e, 0x77, 0x3b, 0xef, 0xa3 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ },
+ {
+ { 0x00, 0x9a, 0x29, 0xb3, 0x52, 0xc8, 0x7b, 0xe1, 0xa4, 0x3e, 0x8d, 0x17, 0xf6, 0x6c, 0xdf, 0x45 },
+ { 0x00, 0x55, 0xaa, 0xff, 0x49, 0x1c, 0xe3, 0xb6, 0x92, 0xc7, 0x38, 0x6d, 0xdb, 0x8e, 0x71, 0x24 },
+ },
+ {
+ { 0x00, 0x4f, 0x9e, 0xd1, 0x21, 0x6e, 0xbf, 0xf0, 0x42, 0x0d, 0xdc, 0x93, 0x63, 0x2c, 0xfd, 0xb2 },
+ { 0x00, 0x84, 0x15, 0x91, 0x2a, 0xae, 0x3f, 0xbb, 0x54, 0xd0, 0x41, 0xc5, 0x7e, 0xfa, 0x6b, 0xef },
+ },
+ {
+ { 0x00, 0xe9, 0xcf, 0x26, 0x83, 0x6a, 0x4c, 0xa5, 0x1b, 0xf2, 0xd4, 0x3d, 0x98, 0x71, 0x57, 0xbe },
+ { 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, 0xad, 0x9b, 0xc1, 0xf7, 0x75, 0x43, 0x19, 0x2f },
+ },
+ },
+ {
+ {
+ { 0x00, 0xec, 0xc5, 0x29, 0x97, 0x7b, 0x52, 0xbe, 0x33, 0xdf, 0xf6, 0x1a, 0xa4, 0x48, 0x61, 0x8d },
+ { 0x00, 0x66, 0xcc, 0xaa, 0x85, 0xe3, 0x49, 0x2f, 0x17, 0x71, 0xdb, 0xbd, 0x92, 0xf4, 0x5e, 0x38 },
+ },
+ {
+ { 0x00, 0x56, 0xac, 0xfa, 0x45, 0x13, 0xe9, 0xbf, 0x8a, 0xdc, 0x26, 0x70, 0xcf, 0x99, 0x63, 0x35 },
+ { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 },
+ },
+ {
+ { 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0xed, 0xd3, 0x91, 0xaf, 0x15, 0x2b, 0x69, 0x57 },
+ { 0x00, 0xc7, 0x93, 0x54, 0x3b, 0xfc, 0xa8, 0x6f, 0x76, 0xb1, 0xe5, 0x22, 0x4d, 0x8a, 0xde, 0x19 },
+ },
+ {
+ { 0x00, 0x7e, 0xfc, 0x82, 0xe5, 0x9b, 0x19, 0x67, 0xd7, 0xa9, 0x2b, 0x55, 0x32, 0x4c, 0xce, 0xb0 },
+ { 0x00, 0xb3, 0x7b, 0xc8, 0xf6, 0x45, 0x8d, 0x3e, 0xf1, 0x42, 0x8a, 0x39, 0x07, 0xb4, 0x7c, 0xcf },
+ },
+ },
+ {
+ {
+ { 0x00, 0x64, 0xc8, 0xac, 0x8d, 0xe9, 0x45, 0x21, 0x07, 0x63, 0xcf, 0xab, 0x8a, 0xee, 0x42, 0x26 },
+ { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a },
+ },
+ {
+ { 0x00, 0x8d, 0x07, 0x8a, 0x0e, 0x83, 0x09, 0x84, 0x1c, 0x91, 0x1b, 0x96, 0x12, 0x9f, 0x15, 0x98 },
+ { 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, 0xdd, 0xe5, 0xad, 0x95, 0x3d, 0x05, 0x4d, 0x75 },
+ },
+ {
+ { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, 0x35, 0x10, 0x7f, 0x5a, 0xa1, 0x84, 0xeb, 0xce },
+ { 0x00, 0x6a, 0xd4, 0xbe, 0xb5, 0xdf, 0x61, 0x0b, 0x77, 0x1d, 0xa3, 0xc9, 0xc2, 0xa8, 0x16, 0x7c },
+ },
+ {
+ { 0x00, 0x7b, 0xf6, 0x8d, 0xf1, 0x8a, 0x07, 0x7c, 0xff, 0x84, 0x09, 0x72, 0x0e, 0x75, 0xf8, 0x83 },
+ { 0x00, 0xe3, 0xdb, 0x38, 0xab, 0x48, 0x70, 0x93, 0x4b, 0xa8, 0x90, 0x73, 0xe0, 0x03, 0x3b, 0xd8 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xf6, 0xf1, 0x07, 0xff, 0x09, 0x0e, 0xf8, 0xe3, 0x15, 0x12, 0xe4, 0x1c, 0xea, 0xed, 0x1b },
+ { 0x00, 0xdb, 0xab, 0x70, 0x4b, 0x90, 0xe0, 0x3b, 0x96, 0x4d, 0x3d, 0xe6, 0xdd, 0x06, 0x76, 0xad },
+ },
+ {
+ { 0x00, 0x93, 0x3b, 0xa8, 0x76, 0xe5, 0x4d, 0xde, 0xec, 0x7f, 0xd7, 0x44, 0x9a, 0x09, 0xa1, 0x32 },
+ { 0x00, 0xc5, 0x97, 0x52, 0x33, 0xf6, 0xa4, 0x61, 0x66, 0xa3, 0xf1, 0x34, 0x55, 0x90, 0xc2, 0x07 },
+ },
+ {
+ { 0x00, 0xd4, 0xb5, 0x61, 0x77, 0xa3, 0xc2, 0x16, 0xee, 0x3a, 0x5b, 0x8f, 0x99, 0x4d, 0x2c, 0xf8 },
+ { 0x00, 0xc1, 0x9f, 0x5e, 0x23, 0xe2, 0xbc, 0x7d, 0x46, 0x87, 0xd9, 0x18, 0x65, 0xa4, 0xfa, 0x3b },
+ },
+ {
+ { 0x00, 0x66, 0xcc, 0xaa, 0x85, 0xe3, 0x49, 0x2f, 0x17, 0x71, 0xdb, 0xbd, 0x92, 0xf4, 0x5e, 0x38 },
+ { 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, 0x6d, 0x43, 0x31, 0x1f, 0xd5, 0xfb, 0x89, 0xa7 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xe2, 0xd9, 0x3b, 0xaf, 0x4d, 0x76, 0x94, 0x43, 0xa1, 0x9a, 0x78, 0xec, 0x0e, 0x35, 0xd7 },
+ { 0x00, 0x86, 0x11, 0x97, 0x22, 0xa4, 0x33, 0xb5, 0x44, 0xc2, 0x55, 0xd3, 0x66, 0xe0, 0x77, 0xf1 },
+ },
+ {
+ { 0x00, 0x97, 0x33, 0xa4, 0x66, 0xf1, 0x55, 0xc2, 0xcc, 0x5b, 0xff, 0x68, 0xaa, 0x3d, 0x99, 0x0e },
+ { 0x00, 0x85, 0x17, 0x92, 0x2e, 0xab, 0x39, 0xbc, 0x5c, 0xd9, 0x4b, 0xce, 0x72, 0xf7, 0x65, 0xe0 },
+ },
+ {
+ { 0x00, 0x6c, 0xd8, 0xb4, 0xad, 0xc1, 0x75, 0x19, 0x47, 0x2b, 0x9f, 0xf3, 0xea, 0x86, 0x32, 0x5e },
+ { 0x00, 0x8e, 0x01, 0x8f, 0x02, 0x8c, 0x03, 0x8d, 0x04, 0x8a, 0x05, 0x8b, 0x06, 0x88, 0x07, 0x89 },
+ },
+ {
+ { 0x00, 0x5f, 0xbe, 0xe1, 0x61, 0x3e, 0xdf, 0x80, 0xc2, 0x9d, 0x7c, 0x23, 0xa3, 0xfc, 0x1d, 0x42 },
+ { 0x00, 0x99, 0x2f, 0xb6, 0x5e, 0xc7, 0x71, 0xe8, 0xbc, 0x25, 0x93, 0x0a, 0xe2, 0x7b, 0xcd, 0x54 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff },
+ { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b },
+ },
+ {
+ { 0x00, 0x78, 0xf0, 0x88, 0xfd, 0x85, 0x0d, 0x75, 0xe7, 0x9f, 0x17, 0x6f, 0x1a, 0x62, 0xea, 0x92 },
+ { 0x00, 0xd3, 0xbb, 0x68, 0x6b, 0xb8, 0xd0, 0x03, 0xd6, 0x05, 0x6d, 0xbe, 0xbd, 0x6e, 0x06, 0xd5 },
+ },
+ {
+ { 0x00, 0xa5, 0x57, 0xf2, 0xae, 0x0b, 0xf9, 0x5c, 0x41, 0xe4, 0x16, 0xb3, 0xef, 0x4a, 0xb8, 0x1d },
+ { 0x00, 0x82, 0x19, 0x9b, 0x32, 0xb0, 0x2b, 0xa9, 0x64, 0xe6, 0x7d, 0xff, 0x56, 0xd4, 0x4f, 0xcd },
+ },
+ {
+ { 0x00, 0xef, 0xc3, 0x2c, 0x9b, 0x74, 0x58, 0xb7, 0x2b, 0xc4, 0xe8, 0x07, 0xb0, 0x5f, 0x73, 0x9c },
+ { 0x00, 0x56, 0xac, 0xfa, 0x45, 0x13, 0xe9, 0xbf, 0x8a, 0xdc, 0x26, 0x70, 0xcf, 0x99, 0x63, 0x35 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x50, 0xa0, 0xf0, 0x5d, 0x0d, 0xfd, 0xad, 0xba, 0xea, 0x1a, 0x4a, 0xe7, 0xb7, 0x47, 0x17 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ },
+ {
+ { 0x00, 0x4c, 0x98, 0xd4, 0x2d, 0x61, 0xb5, 0xf9, 0x5a, 0x16, 0xc2, 0x8e, 0x77, 0x3b, 0xef, 0xa3 },
+ { 0x00, 0xb4, 0x75, 0xc1, 0xea, 0x5e, 0x9f, 0x2b, 0xc9, 0x7d, 0xbc, 0x08, 0x23, 0x97, 0x56, 0xe2 },
+ },
+ {
+ { 0x00, 0x6b, 0xd6, 0xbd, 0xb1, 0xda, 0x67, 0x0c, 0x7f, 0x14, 0xa9, 0xc2, 0xce, 0xa5, 0x18, 0x73 },
+ { 0x00, 0xfe, 0xe1, 0x1f, 0xdf, 0x21, 0x3e, 0xc0, 0xa3, 0x5d, 0x42, 0xbc, 0x7c, 0x82, 0x9d, 0x63 },
+ },
+ {
+ { 0x00, 0xe7, 0xd3, 0x34, 0xbb, 0x5c, 0x68, 0x8f, 0x6b, 0x8c, 0xb8, 0x5f, 0xd0, 0x37, 0x03, 0xe4 },
+ { 0x00, 0xd6, 0xb1, 0x67, 0x7f, 0xa9, 0xce, 0x18, 0xfe, 0x28, 0x4f, 0x99, 0x81, 0x57, 0x30, 0xe6 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x83, 0x1b, 0x98, 0x36, 0xb5, 0x2d, 0xae, 0x6c, 0xef, 0x77, 0xf4, 0x5a, 0xd9, 0x41, 0xc2 },
+ { 0x00, 0xd8, 0xad, 0x75, 0x47, 0x9f, 0xea, 0x32, 0x8e, 0x56, 0x23, 0xfb, 0xc9, 0x11, 0x64, 0xbc },
+ },
+ {
+ { 0x00, 0x7e, 0xfc, 0x82, 0xe5, 0x9b, 0x19, 0x67, 0xd7, 0xa9, 0x2b, 0x55, 0x32, 0x4c, 0xce, 0xb0 },
+ { 0x00, 0xb3, 0x7b, 0xc8, 0xf6, 0x45, 0x8d, 0x3e, 0xf1, 0x42, 0x8a, 0x39, 0x07, 0xb4, 0x7c, 0xcf },
+ },
+ {
+ { 0x00, 0xa7, 0x53, 0xf4, 0xa6, 0x01, 0xf5, 0x52, 0x51, 0xf6, 0x02, 0xa5, 0xf7, 0x50, 0xa4, 0x03 },
+ { 0x00, 0xa2, 0x59, 0xfb, 0xb2, 0x10, 0xeb, 0x49, 0x79, 0xdb, 0x20, 0x82, 0xcb, 0x69, 0x92, 0x30 },
+ },
+ {
+ { 0x00, 0x6f, 0xde, 0xb1, 0xa1, 0xce, 0x7f, 0x10, 0x5f, 0x30, 0x81, 0xee, 0xfe, 0x91, 0x20, 0x4f },
+ { 0x00, 0xbe, 0x61, 0xdf, 0xc2, 0x7c, 0xa3, 0x1d, 0x99, 0x27, 0xf8, 0x46, 0x5b, 0xe5, 0x3a, 0x84 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x54, 0xa8, 0xfc, 0x4d, 0x19, 0xe5, 0xb1, 0x9a, 0xce, 0x32, 0x66, 0xd7, 0x83, 0x7f, 0x2b },
+ { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, 0x55, 0x7c, 0x07, 0x2e, 0xf1, 0xd8, 0xa3, 0x8a },
+ },
+ {
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ { 0x00, 0x4a, 0x94, 0xde, 0x35, 0x7f, 0xa1, 0xeb, 0x6a, 0x20, 0xfe, 0xb4, 0x5f, 0x15, 0xcb, 0x81 },
+ },
+ {
+ { 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, 0xa5, 0x92, 0xcb, 0xfc, 0x79, 0x4e, 0x17, 0x20 },
+ { 0x00, 0x57, 0xae, 0xf9, 0x41, 0x16, 0xef, 0xb8, 0x82, 0xd5, 0x2c, 0x7b, 0xc3, 0x94, 0x6d, 0x3a },
+ },
+ {
+ { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 },
+ { 0x00, 0xa0, 0x5d, 0xfd, 0xba, 0x1a, 0xe7, 0x47, 0x69, 0xc9, 0x34, 0x94, 0xd3, 0x73, 0x8e, 0x2e },
+ },
+ },
+ {
+ {
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ { 0x00, 0x87, 0x13, 0x94, 0x26, 0xa1, 0x35, 0xb2, 0x4c, 0xcb, 0x5f, 0xd8, 0x6a, 0xed, 0x79, 0xfe },
+ },
+ {
+ { 0x00, 0x87, 0x13, 0x94, 0x26, 0xa1, 0x35, 0xb2, 0x4c, 0xcb, 0x5f, 0xd8, 0x6a, 0xed, 0x79, 0xfe },
+ { 0x00, 0x98, 0x2d, 0xb5, 0x5a, 0xc2, 0x77, 0xef, 0xb4, 0x2c, 0x99, 0x01, 0xee, 0x76, 0xc3, 0x5b },
+ },
+ {
+ { 0x00, 0xff, 0xe3, 0x1c, 0xdb, 0x24, 0x38, 0xc7, 0xab, 0x54, 0x48, 0xb7, 0x70, 0x8f, 0x93, 0x6c },
+ { 0x00, 0x4b, 0x96, 0xdd, 0x31, 0x7a, 0xa7, 0xec, 0x62, 0x29, 0xf4, 0xbf, 0x53, 0x18, 0xc5, 0x8e },
+ },
+ {
+ { 0x00, 0x60, 0xc0, 0xa0, 0x9d, 0xfd, 0x5d, 0x3d, 0x27, 0x47, 0xe7, 0x87, 0xba, 0xda, 0x7a, 0x1a },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ },
+ },
+ {
+ {
+ { 0x00, 0xfa, 0xe9, 0x13, 0xcf, 0x35, 0x26, 0xdc, 0x83, 0x79, 0x6a, 0x90, 0x4c, 0xb6, 0xa5, 0x5f },
+ { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 },
+ },
+ {
+ { 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 },
+ { 0x00, 0x80, 0x1d, 0x9d, 0x3a, 0xba, 0x27, 0xa7, 0x74, 0xf4, 0x69, 0xe9, 0x4e, 0xce, 0x53, 0xd3 },
+ },
+ {
+ { 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, 0xd5, 0xec, 0xa7, 0x9e, 0x31, 0x08, 0x43, 0x7a },
+ { 0x00, 0xb7, 0x73, 0xc4, 0xe6, 0x51, 0x95, 0x22, 0xd1, 0x66, 0xa2, 0x15, 0x37, 0x80, 0x44, 0xf3 },
+ },
+ {
+ { 0x00, 0xd7, 0xb3, 0x64, 0x7b, 0xac, 0xc8, 0x1f, 0xf6, 0x21, 0x45, 0x92, 0x8d, 0x5a, 0x3e, 0xe9 },
+ { 0x00, 0xf1, 0xff, 0x0e, 0xe3, 0x12, 0x1c, 0xed, 0xdb, 0x2a, 0x24, 0xd5, 0x38, 0xc9, 0xc7, 0x36 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xfb, 0xeb, 0x10, 0xcb, 0x30, 0x20, 0xdb, 0x8b, 0x70, 0x60, 0x9b, 0x40, 0xbb, 0xab, 0x50 },
+ { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 },
+ },
+ {
+ { 0x00, 0x8b, 0x0b, 0x80, 0x16, 0x9d, 0x1d, 0x96, 0x2c, 0xa7, 0x27, 0xac, 0x3a, 0xb1, 0x31, 0xba },
+ { 0x00, 0x58, 0xb0, 0xe8, 0x7d, 0x25, 0xcd, 0x95, 0xfa, 0xa2, 0x4a, 0x12, 0x87, 0xdf, 0x37, 0x6f },
+ },
+ {
+ { 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, 0xb5, 0x80, 0xdf, 0xea, 0x61, 0x54, 0x0b, 0x3e },
+ { 0x00, 0x77, 0xee, 0x99, 0xc1, 0xb6, 0x2f, 0x58, 0x9f, 0xe8, 0x71, 0x06, 0x5e, 0x29, 0xb0, 0xc7 },
+ },
+ {
+ { 0x00, 0xb7, 0x73, 0xc4, 0xe6, 0x51, 0x95, 0x22, 0xd1, 0x66, 0xa2, 0x15, 0x37, 0x80, 0x44, 0xf3 },
+ { 0x00, 0xbf, 0x63, 0xdc, 0xc6, 0x79, 0xa5, 0x1a, 0x91, 0x2e, 0xf2, 0x4d, 0x57, 0xe8, 0x34, 0x8b },
+ },
+ },
+ {
+ {
+ { 0x00, 0xf2, 0xf9, 0x0b, 0xef, 0x1d, 0x16, 0xe4, 0xc3, 0x31, 0x3a, 0xc8, 0x2c, 0xde, 0xd5, 0x27 },
+ { 0x00, 0x9b, 0x2b, 0xb0, 0x56, 0xcd, 0x7d, 0xe6, 0xac, 0x37, 0x87, 0x1c, 0xfa, 0x61, 0xd1, 0x4a },
+ },
+ {
+ { 0x00, 0xec, 0xc5, 0x29, 0x97, 0x7b, 0x52, 0xbe, 0x33, 0xdf, 0xf6, 0x1a, 0xa4, 0x48, 0x61, 0x8d },
+ { 0x00, 0x66, 0xcc, 0xaa, 0x85, 0xe3, 0x49, 0x2f, 0x17, 0x71, 0xdb, 0xbd, 0x92, 0xf4, 0x5e, 0x38 },
+ },
+ {
+ { 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44 },
+ { 0x00, 0xc0, 0x9d, 0x5d, 0x27, 0xe7, 0xba, 0x7a, 0x4e, 0x8e, 0xd3, 0x13, 0x69, 0xa9, 0xf4, 0x34 },
+ },
+ {
+ { 0x00, 0x83, 0x1b, 0x98, 0x36, 0xb5, 0x2d, 0xae, 0x6c, 0xef, 0x77, 0xf4, 0x5a, 0xd9, 0x41, 0xc2 },
+ { 0x00, 0xd8, 0xad, 0x75, 0x47, 0x9f, 0xea, 0x32, 0x8e, 0x56, 0x23, 0xfb, 0xc9, 0x11, 0x64, 0xbc },
+ },
+ },
+ {
+ {
+ { 0x00, 0x43, 0x86, 0xc5, 0x11, 0x52, 0x97, 0xd4, 0x22, 0x61, 0xa4, 0xe7, 0x33, 0x70, 0xb5, 0xf6 },
+ { 0x00, 0x44, 0x88, 0xcc, 0x0d, 0x49, 0x85, 0xc1, 0x1a, 0x5e, 0x92, 0xd6, 0x17, 0x53, 0x9f, 0xdb },
+ },
+ {
+ { 0x00, 0x67, 0xce, 0xa9, 0x81, 0xe6, 0x4f, 0x28, 0x1f, 0x78, 0xd1, 0xb6, 0x9e, 0xf9, 0x50, 0x37 },
+ { 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0xed, 0xd3, 0x91, 0xaf, 0x15, 0x2b, 0x69, 0x57 },
+ },
+ {
+ { 0x00, 0xf8, 0xed, 0x15, 0xc7, 0x3f, 0x2a, 0xd2, 0x93, 0x6b, 0x7e, 0x86, 0x54, 0xac, 0xb9, 0x41 },
+ { 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, 0xc5, 0xfe, 0xb3, 0x88, 0x29, 0x12, 0x5f, 0x64 },
+ },
+ {
+ { 0x00, 0x92, 0x39, 0xab, 0x72, 0xe0, 0x4b, 0xd9, 0xe4, 0x76, 0xdd, 0x4f, 0x96, 0x04, 0xaf, 0x3d },
+ { 0x00, 0xd5, 0xb7, 0x62, 0x73, 0xa6, 0xc4, 0x11, 0xe6, 0x33, 0x51, 0x84, 0x95, 0x40, 0x22, 0xf7 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x30, 0xe1, 0x50, 0x9e, 0x2f, 0x1f, 0xae, 0x60, 0xd1 },
+ { 0x00, 0xdf, 0xa3, 0x7c, 0x5b, 0x84, 0xf8, 0x27, 0xb6, 0x69, 0x15, 0xca, 0xed, 0x32, 0x4e, 0x91 },
+ },
+ {
+ { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a },
+ { 0x00, 0xe0, 0xdd, 0x3d, 0xa7, 0x47, 0x7a, 0x9a, 0x53, 0xb3, 0x8e, 0x6e, 0xf4, 0x14, 0x29, 0xc9 },
+ },
+ {
+ { 0x00, 0xea, 0xc9, 0x23, 0x8f, 0x65, 0x46, 0xac, 0x03, 0xe9, 0xca, 0x20, 0x8c, 0x66, 0x45, 0xaf },
+ { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12, 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 },
+ },
+ {
+ { 0x00, 0xe2, 0xd9, 0x3b, 0xaf, 0x4d, 0x76, 0x94, 0x43, 0xa1, 0x9a, 0x78, 0xec, 0x0e, 0x35, 0xd7 },
+ { 0x00, 0x86, 0x11, 0x97, 0x22, 0xa4, 0x33, 0xb5, 0x44, 0xc2, 0x55, 0xd3, 0x66, 0xe0, 0x77, 0xf1 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xff, 0xe3, 0x1c, 0xdb, 0x24, 0x38, 0xc7, 0xab, 0x54, 0x48, 0xb7, 0x70, 0x8f, 0x93, 0x6c },
+ { 0x00, 0x4b, 0x96, 0xdd, 0x31, 0x7a, 0xa7, 0xec, 0x62, 0x29, 0xf4, 0xbf, 0x53, 0x18, 0xc5, 0x8e },
+ },
+ {
+ { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 },
+ { 0x00, 0xbd, 0x67, 0xda, 0xce, 0x73, 0xa9, 0x14, 0x81, 0x3c, 0xe6, 0x5b, 0x4f, 0xf2, 0x28, 0x95 },
+ },
+ {
+ { 0x00, 0x56, 0xac, 0xfa, 0x45, 0x13, 0xe9, 0xbf, 0x8a, 0xdc, 0x26, 0x70, 0xcf, 0x99, 0x63, 0x35 },
+ { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 },
+ },
+ {
+ { 0x00, 0xaf, 0x43, 0xec, 0x86, 0x29, 0xc5, 0x6a, 0x11, 0xbe, 0x52, 0xfd, 0x97, 0x38, 0xd4, 0x7b },
+ { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0x0d, 0x2f, 0x49, 0x6b, 0x85, 0xa7, 0xc1, 0xe3 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xe7, 0xd3, 0x34, 0xbb, 0x5c, 0x68, 0x8f, 0x6b, 0x8c, 0xb8, 0x5f, 0xd0, 0x37, 0x03, 0xe4 },
+ { 0x00, 0xd6, 0xb1, 0x67, 0x7f, 0xa9, 0xce, 0x18, 0xfe, 0x28, 0x4f, 0x99, 0x81, 0x57, 0x30, 0xe6 },
+ },
+ {
+ { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, 0x05, 0x26, 0x43, 0x60, 0x89, 0xaa, 0xcf, 0xec },
+ { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 },
+ },
+ {
+ { 0x00, 0x45, 0x8a, 0xcf, 0x09, 0x4c, 0x83, 0xc6, 0x12, 0x57, 0x98, 0xdd, 0x1b, 0x5e, 0x91, 0xd4 },
+ { 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, 0x3d, 0x19, 0x75, 0x51, 0xad, 0x89, 0xe5, 0xc1 },
+ },
+ {
+ { 0x00, 0x72, 0xe4, 0x96, 0xd5, 0xa7, 0x31, 0x43, 0xb7, 0xc5, 0x53, 0x21, 0x62, 0x10, 0x86, 0xf4 },
+ { 0x00, 0x73, 0xe6, 0x95, 0xd1, 0xa2, 0x37, 0x44, 0xbf, 0xcc, 0x59, 0x2a, 0x6e, 0x1d, 0x88, 0xfb },
+ },
+ },
+ {
+ {
+ { 0x00, 0xc9, 0x8f, 0x46, 0x03, 0xca, 0x8c, 0x45, 0x06, 0xcf, 0x89, 0x40, 0x05, 0xcc, 0x8a, 0x43 },
+ { 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44 },
+ },
+ {
+ { 0x00, 0x8c, 0x05, 0x89, 0x0a, 0x86, 0x0f, 0x83, 0x14, 0x98, 0x11, 0x9d, 0x1e, 0x92, 0x1b, 0x97 },
+ { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, 0x5d, 0x75, 0x0d, 0x25, 0xfd, 0xd5, 0xad, 0x85 },
+ },
+ {
+ { 0x00, 0x8f, 0x03, 0x8c, 0x06, 0x89, 0x05, 0x8a, 0x0c, 0x83, 0x0f, 0x80, 0x0a, 0x85, 0x09, 0x86 },
+ { 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88 },
+ },
+ {
+ { 0x00, 0xf8, 0xed, 0x15, 0xc7, 0x3f, 0x2a, 0xd2, 0x93, 0x6b, 0x7e, 0x86, 0x54, 0xac, 0xb9, 0x41 },
+ { 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, 0xc5, 0xfe, 0xb3, 0x88, 0x29, 0x12, 0x5f, 0x64 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x9d, 0xad, 0xfd, 0xcd, 0x5d, 0x6d, 0x3d, 0x0d },
+ },
+ {
+ { 0x00, 0x68, 0xd0, 0xb8, 0xbd, 0xd5, 0x6d, 0x05, 0x67, 0x0f, 0xb7, 0xdf, 0xda, 0xb2, 0x0a, 0x62 },
+ { 0x00, 0xce, 0x81, 0x4f, 0x1f, 0xd1, 0x9e, 0x50, 0x3e, 0xf0, 0xbf, 0x71, 0x21, 0xef, 0xa0, 0x6e },
+ },
+ {
+ { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, 0x7d, 0x51, 0x25, 0x09, 0xcd, 0xe1, 0x95, 0xb9 },
+ { 0x00, 0xfa, 0xe9, 0x13, 0xcf, 0x35, 0x26, 0xdc, 0x83, 0x79, 0x6a, 0x90, 0x4c, 0xb6, 0xa5, 0x5f },
+ },
+ {
+ { 0x00, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x30, 0xe1, 0x50, 0x9e, 0x2f, 0x1f, 0xae, 0x60, 0xd1 },
+ { 0x00, 0xdf, 0xa3, 0x7c, 0x5b, 0x84, 0xf8, 0x27, 0xb6, 0x69, 0x15, 0xca, 0xed, 0x32, 0x4e, 0x91 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ { 0x00, 0x7f, 0xfe, 0x81, 0xe1, 0x9e, 0x1f, 0x60, 0xdf, 0xa0, 0x21, 0x5e, 0x3e, 0x41, 0xc0, 0xbf },
+ },
+ {
+ { 0x00, 0x99, 0x2f, 0xb6, 0x5e, 0xc7, 0x71, 0xe8, 0xbc, 0x25, 0x93, 0x0a, 0xe2, 0x7b, 0xcd, 0x54 },
+ { 0x00, 0x65, 0xca, 0xaf, 0x89, 0xec, 0x43, 0x26, 0x0f, 0x6a, 0xc5, 0xa0, 0x86, 0xe3, 0x4c, 0x29 },
+ },
+ {
+ { 0x00, 0x59, 0xb2, 0xeb, 0x79, 0x20, 0xcb, 0x92, 0xf2, 0xab, 0x40, 0x19, 0x8b, 0xd2, 0x39, 0x60 },
+ { 0x00, 0xf9, 0xef, 0x16, 0xc3, 0x3a, 0x2c, 0xd5, 0x9b, 0x62, 0x74, 0x8d, 0x58, 0xa1, 0xb7, 0x4e },
+ },
+ {
+ { 0x00, 0x50, 0xa0, 0xf0, 0x5d, 0x0d, 0xfd, 0xad, 0xba, 0xea, 0x1a, 0x4a, 0xe7, 0xb7, 0x47, 0x17 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ },
+ },
+ {
+ {
+ { 0x00, 0xee, 0xc1, 0x2f, 0x9f, 0x71, 0x5e, 0xb0, 0x23, 0xcd, 0xe2, 0x0c, 0xbc, 0x52, 0x7d, 0x93 },
+ { 0x00, 0x46, 0x8c, 0xca, 0x05, 0x43, 0x89, 0xcf, 0x0a, 0x4c, 0x86, 0xc0, 0x0f, 0x49, 0x83, 0xc5 },
+ },
+ {
+ { 0x00, 0x94, 0x35, 0xa1, 0x6a, 0xfe, 0x5f, 0xcb, 0xd4, 0x40, 0xe1, 0x75, 0xbe, 0x2a, 0x8b, 0x1f },
+ { 0x00, 0xb5, 0x77, 0xc2, 0xee, 0x5b, 0x99, 0x2c, 0xc1, 0x74, 0xb6, 0x03, 0x2f, 0x9a, 0x58, 0xed },
+ },
+ {
+ { 0x00, 0xab, 0x4b, 0xe0, 0x96, 0x3d, 0xdd, 0x76, 0x31, 0x9a, 0x7a, 0xd1, 0xa7, 0x0c, 0xec, 0x47 },
+ { 0x00, 0x62, 0xc4, 0xa6, 0x95, 0xf7, 0x51, 0x33, 0x37, 0x55, 0xf3, 0x91, 0xa2, 0xc0, 0x66, 0x04 },
+ },
+ {
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ },
+ },
+ {
+ {
+ { 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 },
+ { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, 0x75, 0x58, 0x2f, 0x02, 0xc1, 0xec, 0x9b, 0xb6 },
+ },
+ {
+ { 0x00, 0x40, 0x80, 0xc0, 0x1d, 0x5d, 0x9d, 0xdd, 0x3a, 0x7a, 0xba, 0xfa, 0x27, 0x67, 0xa7, 0xe7 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ },
+ {
+ { 0x00, 0x85, 0x17, 0x92, 0x2e, 0xab, 0x39, 0xbc, 0x5c, 0xd9, 0x4b, 0xce, 0x72, 0xf7, 0x65, 0xe0 },
+ { 0x00, 0xb8, 0x6d, 0xd5, 0xda, 0x62, 0xb7, 0x0f, 0xa9, 0x11, 0xc4, 0x7c, 0x73, 0xcb, 0x1e, 0xa6 },
+ },
+ {
+ { 0x00, 0xce, 0x81, 0x4f, 0x1f, 0xd1, 0x9e, 0x50, 0x3e, 0xf0, 0xbf, 0x71, 0x21, 0xef, 0xa0, 0x6e },
+ { 0x00, 0x7c, 0xf8, 0x84, 0xed, 0x91, 0x15, 0x69, 0xc7, 0xbb, 0x3f, 0x43, 0x2a, 0x56, 0xd2, 0xae },
+ },
+ },
+ {
+ {
+ { 0x00, 0x8b, 0x0b, 0x80, 0x16, 0x9d, 0x1d, 0x96, 0x2c, 0xa7, 0x27, 0xac, 0x3a, 0xb1, 0x31, 0xba },
+ { 0x00, 0x58, 0xb0, 0xe8, 0x7d, 0x25, 0xcd, 0x95, 0xfa, 0xa2, 0x4a, 0x12, 0x87, 0xdf, 0x37, 0x6f },
+ },
+ {
+ { 0x00, 0xb2, 0x79, 0xcb, 0xf2, 0x40, 0x8b, 0x39, 0xf9, 0x4b, 0x80, 0x32, 0x0b, 0xb9, 0x72, 0xc0 },
+ { 0x00, 0xef, 0xc3, 0x2c, 0x9b, 0x74, 0x58, 0xb7, 0x2b, 0xc4, 0xe8, 0x07, 0xb0, 0x5f, 0x73, 0x9c },
+ },
+ {
+ { 0x00, 0xeb, 0xcb, 0x20, 0x8b, 0x60, 0x40, 0xab, 0x0b, 0xe0, 0xc0, 0x2b, 0x80, 0x6b, 0x4b, 0xa0 },
+ { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 },
+ },
+ {
+ { 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88 },
+ { 0x00, 0x9d, 0x27, 0xba, 0x4e, 0xd3, 0x69, 0xf4, 0x9c, 0x01, 0xbb, 0x26, 0xd2, 0x4f, 0xf5, 0x68 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xdc, 0xa5, 0x79, 0x57, 0x8b, 0xf2, 0x2e, 0xae, 0x72, 0x0b, 0xd7, 0xf9, 0x25, 0x5c, 0x80 },
+ { 0x00, 0x41, 0x82, 0xc3, 0x19, 0x58, 0x9b, 0xda, 0x32, 0x73, 0xb0, 0xf1, 0x2b, 0x6a, 0xa9, 0xe8 },
+ },
+ {
+ { 0x00, 0xa1, 0x5f, 0xfe, 0xbe, 0x1f, 0xe1, 0x40, 0x61, 0xc0, 0x3e, 0x9f, 0xdf, 0x7e, 0x80, 0x21 },
+ { 0x00, 0xc2, 0x99, 0x5b, 0x2f, 0xed, 0xb6, 0x74, 0x5e, 0x9c, 0xc7, 0x05, 0x71, 0xb3, 0xe8, 0x2a },
+ },
+ {
+ { 0x00, 0x49, 0x92, 0xdb, 0x39, 0x70, 0xab, 0xe2, 0x72, 0x3b, 0xe0, 0xa9, 0x4b, 0x02, 0xd9, 0x90 },
+ { 0x00, 0xe4, 0xd5, 0x31, 0xb7, 0x53, 0x62, 0x86, 0x73, 0x97, 0xa6, 0x42, 0xc4, 0x20, 0x11, 0xf5 },
+ },
+ {
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ { 0x00, 0x59, 0xb2, 0xeb, 0x79, 0x20, 0xcb, 0x92, 0xf2, 0xab, 0x40, 0x19, 0x8b, 0xd2, 0x39, 0x60 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, 0xb5, 0x80, 0xdf, 0xea, 0x61, 0x54, 0x0b, 0x3e },
+ { 0x00, 0x77, 0xee, 0x99, 0xc1, 0xb6, 0x2f, 0x58, 0x9f, 0xe8, 0x71, 0x06, 0x5e, 0x29, 0xb0, 0xc7 },
+ },
+ {
+ { 0x00, 0xeb, 0xcb, 0x20, 0x8b, 0x60, 0x40, 0xab, 0x0b, 0xe0, 0xc0, 0x2b, 0x80, 0x6b, 0x4b, 0xa0 },
+ { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 },
+ },
+ {
+ { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 },
+ { 0x00, 0xf0, 0xfd, 0x0d, 0xe7, 0x17, 0x1a, 0xea, 0xd3, 0x23, 0x2e, 0xde, 0x34, 0xc4, 0xc9, 0x39 },
+ },
+ {
+ { 0x00, 0xa1, 0x5f, 0xfe, 0xbe, 0x1f, 0xe1, 0x40, 0x61, 0xc0, 0x3e, 0x9f, 0xdf, 0x7e, 0x80, 0x21 },
+ { 0x00, 0xc2, 0x99, 0x5b, 0x2f, 0xed, 0xb6, 0x74, 0x5e, 0x9c, 0xc7, 0x05, 0x71, 0xb3, 0xe8, 0x2a },
+ },
+ },
+ {
+ {
+ { 0x00, 0xb2, 0x79, 0xcb, 0xf2, 0x40, 0x8b, 0x39, 0xf9, 0x4b, 0x80, 0x32, 0x0b, 0xb9, 0x72, 0xc0 },
+ { 0x00, 0xef, 0xc3, 0x2c, 0x9b, 0x74, 0x58, 0xb7, 0x2b, 0xc4, 0xe8, 0x07, 0xb0, 0x5f, 0x73, 0x9c },
+ },
+ {
+ { 0x00, 0xb7, 0x73, 0xc4, 0xe6, 0x51, 0x95, 0x22, 0xd1, 0x66, 0xa2, 0x15, 0x37, 0x80, 0x44, 0xf3 },
+ { 0x00, 0xbf, 0x63, 0xdc, 0xc6, 0x79, 0xa5, 0x1a, 0x91, 0x2e, 0xf2, 0x4d, 0x57, 0xe8, 0x34, 0x8b },
+ },
+ {
+ { 0x00, 0xdc, 0xa5, 0x79, 0x57, 0x8b, 0xf2, 0x2e, 0xae, 0x72, 0x0b, 0xd7, 0xf9, 0x25, 0x5c, 0x80 },
+ { 0x00, 0x41, 0x82, 0xc3, 0x19, 0x58, 0x9b, 0xda, 0x32, 0x73, 0xb0, 0xf1, 0x2b, 0x6a, 0xa9, 0xe8 },
+ },
+ {
+ { 0x00, 0xcc, 0x85, 0x49, 0x17, 0xdb, 0x92, 0x5e, 0x2e, 0xe2, 0xab, 0x67, 0x39, 0xf5, 0xbc, 0x70 },
+ { 0x00, 0x5c, 0xb8, 0xe4, 0x6d, 0x31, 0xd5, 0x89, 0xda, 0x86, 0x62, 0x3e, 0xb7, 0xeb, 0x0f, 0x53 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xda, 0xa9, 0x73, 0x4f, 0x95, 0xe6, 0x3c, 0x9e, 0x44, 0x37, 0xed, 0xd1, 0x0b, 0x78, 0xa2 },
+ { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, 0x15, 0x34, 0x57, 0x76, 0x91, 0xb0, 0xd3, 0xf2 },
+ },
+ {
+ { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, 0x2d, 0x0b, 0x61, 0x47, 0xb5, 0x93, 0xf9, 0xdf },
+ { 0x00, 0x5a, 0xb4, 0xee, 0x75, 0x2f, 0xc1, 0x9b, 0xea, 0xb0, 0x5e, 0x04, 0x9f, 0xc5, 0x2b, 0x71 },
+ },
+ {
+ { 0x00, 0xd8, 0xad, 0x75, 0x47, 0x9f, 0xea, 0x32, 0x8e, 0x56, 0x23, 0xfb, 0xc9, 0x11, 0x64, 0xbc },
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+ },
+ {
+ { 0x00, 0xad, 0x47, 0xea, 0x8e, 0x23, 0xc9, 0x64, 0x01, 0xac, 0x46, 0xeb, 0x8f, 0x22, 0xc8, 0x65 },
+ { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e },
+ },
+ },
+ {
+ {
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ { 0x00, 0x6c, 0xd8, 0xb4, 0xad, 0xc1, 0x75, 0x19, 0x47, 0x2b, 0x9f, 0xf3, 0xea, 0x86, 0x32, 0x5e },
+ },
+ {
+ { 0x00, 0xf0, 0xfd, 0x0d, 0xe7, 0x17, 0x1a, 0xea, 0xd3, 0x23, 0x2e, 0xde, 0x34, 0xc4, 0xc9, 0x39 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ },
+ {
+ { 0x00, 0x76, 0xec, 0x9a, 0xc5, 0xb3, 0x29, 0x5f, 0x97, 0xe1, 0x7b, 0x0d, 0x52, 0x24, 0xbe, 0xc8 },
+ { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, 0x85, 0xb6, 0xe3, 0xd0, 0x49, 0x7a, 0x2f, 0x1c },
+ },
+ {
+ { 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee },
+ { 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, 0xf5, 0xc8, 0x8f, 0xb2, 0x01, 0x3c, 0x7b, 0x46 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xa0, 0x5d, 0xfd, 0xba, 0x1a, 0xe7, 0x47, 0x69, 0xc9, 0x34, 0x94, 0xd3, 0x73, 0x8e, 0x2e },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ },
+ {
+ { 0x00, 0xdd, 0xa7, 0x7a, 0x53, 0x8e, 0xf4, 0x29, 0xa6, 0x7b, 0x01, 0xdc, 0xf5, 0x28, 0x52, 0x8f },
+ { 0x00, 0x51, 0xa2, 0xf3, 0x59, 0x08, 0xfb, 0xaa, 0xb2, 0xe3, 0x10, 0x41, 0xeb, 0xba, 0x49, 0x18 },
+ },
+ {
+ { 0x00, 0xb6, 0x71, 0xc7, 0xe2, 0x54, 0x93, 0x25, 0xd9, 0x6f, 0xa8, 0x1e, 0x3b, 0x8d, 0x4a, 0xfc },
+ { 0x00, 0xaf, 0x43, 0xec, 0x86, 0x29, 0xc5, 0x6a, 0x11, 0xbe, 0x52, 0xfd, 0x97, 0x38, 0xd4, 0x7b },
+ },
+ {
+ { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, 0xbd, 0x89, 0xd5, 0xe1, 0x6d, 0x59, 0x05, 0x31 },
+ { 0x00, 0x67, 0xce, 0xa9, 0x81, 0xe6, 0x4f, 0x28, 0x1f, 0x78, 0xd1, 0xb6, 0x9e, 0xf9, 0x50, 0x37 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x96, 0x31, 0xa7, 0x62, 0xf4, 0x53, 0xc5, 0xc4, 0x52, 0xf5, 0x63, 0xa6, 0x30, 0x97, 0x01 },
+ { 0x00, 0x95, 0x37, 0xa2, 0x6e, 0xfb, 0x59, 0xcc, 0xdc, 0x49, 0xeb, 0x7e, 0xb2, 0x27, 0x85, 0x10 },
+ },
+ {
+ { 0x00, 0xe3, 0xdb, 0x38, 0xab, 0x48, 0x70, 0x93, 0x4b, 0xa8, 0x90, 0x73, 0xe0, 0x03, 0x3b, 0xd8 },
+ { 0x00, 0x96, 0x31, 0xa7, 0x62, 0xf4, 0x53, 0xc5, 0xc4, 0x52, 0xf5, 0x63, 0xa6, 0x30, 0x97, 0x01 },
+ },
+ {
+ { 0x00, 0xf4, 0xf5, 0x01, 0xf7, 0x03, 0x02, 0xf6, 0xf3, 0x07, 0x06, 0xf2, 0x04, 0xf0, 0xf1, 0x05 },
+ { 0x00, 0xfb, 0xeb, 0x10, 0xcb, 0x30, 0x20, 0xdb, 0x8b, 0x70, 0x60, 0x9b, 0x40, 0xbb, 0xab, 0x50 },
+ },
+ {
+ { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 },
+ { 0x00, 0xf0, 0xfd, 0x0d, 0xe7, 0x17, 0x1a, 0xea, 0xd3, 0x23, 0x2e, 0xde, 0x34, 0xc4, 0xc9, 0x39 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x49, 0x92, 0xdb, 0x39, 0x70, 0xab, 0xe2, 0x72, 0x3b, 0xe0, 0xa9, 0x4b, 0x02, 0xd9, 0x90 },
+ { 0x00, 0xe4, 0xd5, 0x31, 0xb7, 0x53, 0x62, 0x86, 0x73, 0x97, 0xa6, 0x42, 0xc4, 0x20, 0x11, 0xf5 },
+ },
+ {
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ { 0x00, 0xde, 0xa1, 0x7f, 0x5f, 0x81, 0xfe, 0x20, 0xbe, 0x60, 0x1f, 0xc1, 0xe1, 0x3f, 0x40, 0x9e },
+ },
+ {
+ { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a },
+ { 0x00, 0xe0, 0xdd, 0x3d, 0xa7, 0x47, 0x7a, 0x9a, 0x53, 0xb3, 0x8e, 0x6e, 0xf4, 0x14, 0x29, 0xc9 },
+ },
+ {
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ { 0x00, 0xeb, 0xcb, 0x20, 0x8b, 0x60, 0x40, 0xab, 0x0b, 0xe0, 0xc0, 0x2b, 0x80, 0x6b, 0x4b, 0xa0 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ { 0x00, 0x94, 0x35, 0xa1, 0x6a, 0xfe, 0x5f, 0xcb, 0xd4, 0x40, 0xe1, 0x75, 0xbe, 0x2a, 0x8b, 0x1f },
+ },
+ {
+ { 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44 },
+ { 0x00, 0xc0, 0x9d, 0x5d, 0x27, 0xe7, 0xba, 0x7a, 0x4e, 0x8e, 0xd3, 0x13, 0x69, 0xa9, 0xf4, 0x34 },
+ },
+ {
+ { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff },
+ { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b },
+ },
+ {
+ { 0x00, 0xaa, 0x49, 0xe3, 0x92, 0x38, 0xdb, 0x71, 0x39, 0x93, 0x70, 0xda, 0xab, 0x01, 0xe2, 0x48 },
+ { 0x00, 0x72, 0xe4, 0x96, 0xd5, 0xa7, 0x31, 0x43, 0xb7, 0xc5, 0x53, 0x21, 0x62, 0x10, 0x86, 0xf4 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 },
+ { 0x00, 0x80, 0x1d, 0x9d, 0x3a, 0xba, 0x27, 0xa7, 0x74, 0xf4, 0x69, 0xe9, 0x4e, 0xce, 0x53, 0xd3 },
+ },
+ {
+ { 0x00, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0x66, 0x6e, 0xaa, 0xfb, 0x3f, 0x59, 0x9d, 0xcc, 0x08 },
+ { 0x00, 0xdc, 0xa5, 0x79, 0x57, 0x8b, 0xf2, 0x2e, 0xae, 0x72, 0x0b, 0xd7, 0xf9, 0x25, 0x5c, 0x80 },
+ },
+ {
+ { 0x00, 0xb4, 0x75, 0xc1, 0xea, 0x5e, 0x9f, 0x2b, 0xc9, 0x7d, 0xbc, 0x08, 0x23, 0x97, 0x56, 0xe2 },
+ { 0x00, 0x8f, 0x03, 0x8c, 0x06, 0x89, 0x05, 0x8a, 0x0c, 0x83, 0x0f, 0x80, 0x0a, 0x85, 0x09, 0x86 },
+ },
+ {
+ { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 },
+ { 0x00, 0x7d, 0xfa, 0x87, 0xe9, 0x94, 0x13, 0x6e, 0xcf, 0xb2, 0x35, 0x48, 0x26, 0x5b, 0xdc, 0xa1 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x73, 0xe6, 0x95, 0xd1, 0xa2, 0x37, 0x44, 0xbf, 0xcc, 0x59, 0x2a, 0x6e, 0x1d, 0x88, 0xfb },
+ { 0x00, 0x63, 0xc6, 0xa5, 0x91, 0xf2, 0x57, 0x34, 0x3f, 0x5c, 0xf9, 0x9a, 0xae, 0xcd, 0x68, 0x0b },
+ },
+ {
+ { 0x00, 0x88, 0x0d, 0x85, 0x1a, 0x92, 0x17, 0x9f, 0x34, 0xbc, 0x39, 0xb1, 0x2e, 0xa6, 0x23, 0xab },
+ { 0x00, 0x68, 0xd0, 0xb8, 0xbd, 0xd5, 0x6d, 0x05, 0x67, 0x0f, 0xb7, 0xdf, 0xda, 0xb2, 0x0a, 0x62 },
+ },
+ {
+ { 0x00, 0xd0, 0xbd, 0x6d, 0x67, 0xb7, 0xda, 0x0a, 0xce, 0x1e, 0x73, 0xa3, 0xa9, 0x79, 0x14, 0xc4 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ },
+ {
+ { 0x00, 0xe6, 0xd1, 0x37, 0xbf, 0x59, 0x6e, 0x88, 0x63, 0x85, 0xb2, 0x54, 0xdc, 0x3a, 0x0d, 0xeb },
+ { 0x00, 0xc6, 0x91, 0x57, 0x3f, 0xf9, 0xae, 0x68, 0x7e, 0xb8, 0xef, 0x29, 0x41, 0x87, 0xd0, 0x16 },
+ },
+ },
+ {
+ {
+ { 0x00, 0xf0, 0xfd, 0x0d, 0xe7, 0x17, 0x1a, 0xea, 0xd3, 0x23, 0x2e, 0xde, 0x34, 0xc4, 0xc9, 0x39 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ },
+ {
+ { 0x00, 0x41, 0x82, 0xc3, 0x19, 0x58, 0x9b, 0xda, 0x32, 0x73, 0xb0, 0xf1, 0x2b, 0x6a, 0xa9, 0xe8 },
+ { 0x00, 0x64, 0xc8, 0xac, 0x8d, 0xe9, 0x45, 0x21, 0x07, 0x63, 0xcf, 0xab, 0x8a, 0xee, 0x42, 0x26 },
+ },
+ {
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x9d, 0xad, 0xfd, 0xcd, 0x5d, 0x6d, 0x3d, 0x0d },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ },
+ {
+ { 0x00, 0xf2, 0xf9, 0x0b, 0xef, 0x1d, 0x16, 0xe4, 0xc3, 0x31, 0x3a, 0xc8, 0x2c, 0xde, 0xd5, 0x27 },
+ { 0x00, 0x9b, 0x2b, 0xb0, 0x56, 0xcd, 0x7d, 0xe6, 0xac, 0x37, 0x87, 0x1c, 0xfa, 0x61, 0xd1, 0x4a },
+ },
+ },
+ {
+ {
+ { 0x00, 0x7e, 0xfc, 0x82, 0xe5, 0x9b, 0x19, 0x67, 0xd7, 0xa9, 0x2b, 0x55, 0x32, 0x4c, 0xce, 0xb0 },
+ { 0x00, 0xb3, 0x7b, 0xc8, 0xf6, 0x45, 0x8d, 0x3e, 0xf1, 0x42, 0x8a, 0x39, 0x07, 0xb4, 0x7c, 0xcf },
+ },
+ {
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ { 0x00, 0xf8, 0xed, 0x15, 0xc7, 0x3f, 0x2a, 0xd2, 0x93, 0x6b, 0x7e, 0x86, 0x54, 0xac, 0xb9, 0x41 },
+ },
+ {
+ { 0x00, 0xd3, 0xbb, 0x68, 0x6b, 0xb8, 0xd0, 0x03, 0xd6, 0x05, 0x6d, 0xbe, 0xbd, 0x6e, 0x06, 0xd5 },
+ { 0x00, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x30, 0xe1, 0x50, 0x9e, 0x2f, 0x1f, 0xae, 0x60, 0xd1 },
+ },
+ {
+ { 0x00, 0xd8, 0xad, 0x75, 0x47, 0x9f, 0xea, 0x32, 0x8e, 0x56, 0x23, 0xfb, 0xc9, 0x11, 0x64, 0xbc },
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+ },
+ },
+ {
+ {
+ { 0x00, 0xfe, 0xe1, 0x1f, 0xdf, 0x21, 0x3e, 0xc0, 0xa3, 0x5d, 0x42, 0xbc, 0x7c, 0x82, 0x9d, 0x63 },
+ { 0x00, 0x5b, 0xb6, 0xed, 0x71, 0x2a, 0xc7, 0x9c, 0xe2, 0xb9, 0x54, 0x0f, 0x93, 0xc8, 0x25, 0x7e },
+ },
+ {
+ { 0x00, 0x91, 0x3f, 0xae, 0x7e, 0xef, 0x41, 0xd0, 0xfc, 0x6d, 0xc3, 0x52, 0x82, 0x13, 0xbd, 0x2c },
+ { 0x00, 0xe5, 0xd7, 0x32, 0xb3, 0x56, 0x64, 0x81, 0x7b, 0x9e, 0xac, 0x49, 0xc8, 0x2d, 0x1f, 0xfa },
+ },
+ {
+ { 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, 0xc5, 0xfe, 0xb3, 0x88, 0x29, 0x12, 0x5f, 0x64 },
+ { 0x00, 0x97, 0x33, 0xa4, 0x66, 0xf1, 0x55, 0xc2, 0xcc, 0x5b, 0xff, 0x68, 0xaa, 0x3d, 0x99, 0x0e },
+ },
+ {
+ { 0x00, 0x85, 0x17, 0x92, 0x2e, 0xab, 0x39, 0xbc, 0x5c, 0xd9, 0x4b, 0xce, 0x72, 0xf7, 0x65, 0xe0 },
+ { 0x00, 0xb8, 0x6d, 0xd5, 0xda, 0x62, 0xb7, 0x0f, 0xa9, 0x11, 0xc4, 0x7c, 0x73, 0xcb, 0x1e, 0xa6 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 },
+ { 0x00, 0x4d, 0x9a, 0xd7, 0x29, 0x64, 0xb3, 0xfe, 0x52, 0x1f, 0xc8, 0x85, 0x7b, 0x36, 0xe1, 0xac },
+ },
+ {
+ { 0x00, 0xe0, 0xdd, 0x3d, 0xa7, 0x47, 0x7a, 0x9a, 0x53, 0xb3, 0x8e, 0x6e, 0xf4, 0x14, 0x29, 0xc9 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ },
+ {
+ { 0x00, 0xad, 0x47, 0xea, 0x8e, 0x23, 0xc9, 0x64, 0x01, 0xac, 0x46, 0xeb, 0x8f, 0x22, 0xc8, 0x65 },
+ { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e },
+ },
+ {
+ { 0x00, 0x9f, 0x23, 0xbc, 0x46, 0xd9, 0x65, 0xfa, 0x8c, 0x13, 0xaf, 0x30, 0xca, 0x55, 0xe9, 0x76 },
+ { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b, 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 },
+ },
+ },
+ {
+ {
+ { 0x00, 0x4b, 0x96, 0xdd, 0x31, 0x7a, 0xa7, 0xec, 0x62, 0x29, 0xf4, 0xbf, 0x53, 0x18, 0xc5, 0x8e },
+ { 0x00, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0x66, 0x6e, 0xaa, 0xfb, 0x3f, 0x59, 0x9d, 0xcc, 0x08 },
+ },
+ {
+ { 0x00, 0xfd, 0xe7, 0x1a, 0xd3, 0x2e, 0x34, 0xc9, 0xbb, 0x46, 0x5c, 0xa1, 0x68, 0x95, 0x8f, 0x72 },
+ { 0x00, 0x6b, 0xd6, 0xbd, 0xb1, 0xda, 0x67, 0x0c, 0x7f, 0x14, 0xa9, 0xc2, 0xce, 0xa5, 0x18, 0x73 },
+ },
+ {
+ { 0x00, 0xd7, 0xb3, 0x64, 0x7b, 0xac, 0xc8, 0x1f, 0xf6, 0x21, 0x45, 0x92, 0x8d, 0x5a, 0x3e, 0xe9 },
+ { 0x00, 0xf1, 0xff, 0x0e, 0xe3, 0x12, 0x1c, 0xed, 0xdb, 0x2a, 0x24, 0xd5, 0x38, 0xc9, 0xc7, 0x36 },
+ },
+ {
+ { 0x00, 0xbc, 0x65, 0xd9, 0xca, 0x76, 0xaf, 0x13, 0x89, 0x35, 0xec, 0x50, 0x43, 0xff, 0x26, 0x9a },
+ { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 },
+ },
+ },
+};
+#endif
+
+#ifdef CONFIG_X86
+/**
+ * PSHUFB tables for generic multiplication.
+ *
+ * Indexes are [MULTIPLIER][LH],
+ * where MULTIPLIER is from 0 to 255 and LH from 0 to 1.
+ */
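[Editor's note: a minimal sketch of how a [2][16] PSHUFB table of this shape is conventionally consumed, not the project's actual kernels. Each data byte is split into its low and high nibble, each nibble indexes its own 16-entry table (LH = 0 for the low nibble, 1 for the high nibble), and the two partial products are XOR-ed; the SSSE3 variant does the same for 16 bytes at once with _mm_shuffle_epi8. The helper names gfmul_pshufb_scalar and gfmul16_pshufb are illustrative only.]

#include <stdint.h>
#include <tmmintrin.h>          /* SSSE3: _mm_shuffle_epi8 */

/* Illustrative sketch: multiply one byte v by the constant whose two
 * nibble tables are tab[2][16], over GF(2^8) mod 0x11d. */
static inline uint8_t gfmul_pshufb_scalar(const uint8_t tab[2][16], uint8_t v)
{
	return tab[0][v & 0x0f] ^ tab[1][v >> 4];
}

/* Same operation on 16 bytes at once, the way PSHUFB is normally used. */
static inline __m128i gfmul16_pshufb(const uint8_t tab[2][16], __m128i v)
{
	__m128i lo_tab = _mm_loadu_si128((const __m128i *)tab[0]);
	__m128i hi_tab = _mm_loadu_si128((const __m128i *)tab[1]);
	__m128i mask   = _mm_set1_epi8(0x0f);
	__m128i lo     = _mm_and_si128(v, mask);                    /* low nibbles  */
	__m128i hi     = _mm_and_si128(_mm_srli_epi64(v, 4), mask); /* high nibbles */

	return _mm_xor_si128(_mm_shuffle_epi8(lo_tab, lo),
			     _mm_shuffle_epi8(hi_tab, hi));
}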
+const uint8_t __aligned(256) raid_gfmulpshufb[256][2][16] =
+{
+ {
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ },
+ {
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ },
+ {
+ { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x1d, 0x3d, 0x5d, 0x7d, 0x9d, 0xbd, 0xdd, 0xfd },
+ },
+ {
+ { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x9d, 0xad, 0xfd, 0xcd, 0x5d, 0x6d, 0x3d, 0x0d },
+ },
+ {
+ { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c, 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c },
+ { 0x00, 0x40, 0x80, 0xc0, 0x1d, 0x5d, 0x9d, 0xdd, 0x3a, 0x7a, 0xba, 0xfa, 0x27, 0x67, 0xa7, 0xe7 },
+ },
+ {
+ { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b, 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x5d, 0x0d, 0xfd, 0xad, 0xba, 0xea, 0x1a, 0x4a, 0xe7, 0xb7, 0x47, 0x17 },
+ },
+ {
+ { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12, 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x9d, 0xfd, 0x5d, 0x3d, 0x27, 0x47, 0xe7, 0x87, 0xba, 0xda, 0x7a, 0x1a },
+ },
+ {
+ { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15, 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d },
+ { 0x00, 0x70, 0xe0, 0x90, 0xdd, 0xad, 0x3d, 0x4d, 0xa7, 0xd7, 0x47, 0x37, 0x7a, 0x0a, 0x9a, 0xea },
+ },
+ {
+ { 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 },
+ { 0x00, 0x80, 0x1d, 0x9d, 0x3a, 0xba, 0x27, 0xa7, 0x74, 0xf4, 0x69, 0xe9, 0x4e, 0xce, 0x53, 0xd3 },
+ },
+ {
+ { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 },
+ { 0x00, 0x90, 0x3d, 0xad, 0x7a, 0xea, 0x47, 0xd7, 0xf4, 0x64, 0xc9, 0x59, 0x8e, 0x1e, 0xb3, 0x23 },
+ },
+ {
+ { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 },
+ { 0x00, 0xa0, 0x5d, 0xfd, 0xba, 0x1a, 0xe7, 0x47, 0x69, 0xc9, 0x34, 0x94, 0xd3, 0x73, 0x8e, 0x2e },
+ },
+ {
+ { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 },
+ { 0x00, 0xb0, 0x7d, 0xcd, 0xfa, 0x4a, 0x87, 0x37, 0xe9, 0x59, 0x94, 0x24, 0x13, 0xa3, 0x6e, 0xde },
+ },
+ {
+ { 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44 },
+ { 0x00, 0xc0, 0x9d, 0x5d, 0x27, 0xe7, 0xba, 0x7a, 0x4e, 0x8e, 0xd3, 0x13, 0x69, 0xa9, 0xf4, 0x34 },
+ },
+ {
+ { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b },
+ { 0x00, 0xd0, 0xbd, 0x6d, 0x67, 0xb7, 0xda, 0x0a, 0xce, 0x1e, 0x73, 0xa3, 0xa9, 0x79, 0x14, 0xc4 },
+ },
+ {
+ { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a },
+ { 0x00, 0xe0, 0xdd, 0x3d, 0xa7, 0x47, 0x7a, 0x9a, 0x53, 0xb3, 0x8e, 0x6e, 0xf4, 0x14, 0x29, 0xc9 },
+ },
+ {
+ { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 },
+ { 0x00, 0xf0, 0xfd, 0x0d, 0xe7, 0x17, 0x1a, 0xea, 0xd3, 0x23, 0x2e, 0xde, 0x34, 0xc4, 0xc9, 0x39 },
+ },
+ {
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ },
+ {
+ { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff },
+ { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b },
+ },
+ {
+ { 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee },
+ { 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, 0xf5, 0xc8, 0x8f, 0xb2, 0x01, 0x3c, 0x7b, 0x46 },
+ },
+ {
+ { 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 },
+ { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, 0x75, 0x58, 0x2f, 0x02, 0xc1, 0xec, 0x9b, 0xb6 },
+ },
+ {
+ { 0x00, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc },
+ { 0x00, 0x5d, 0xba, 0xe7, 0x69, 0x34, 0xd3, 0x8e, 0xd2, 0x8f, 0x68, 0x35, 0xbb, 0xe6, 0x01, 0x5c },
+ },
+ {
+ { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 },
+ { 0x00, 0x4d, 0x9a, 0xd7, 0x29, 0x64, 0xb3, 0xfe, 0x52, 0x1f, 0xc8, 0x85, 0x7b, 0x36, 0xe1, 0xac },
+ },
+ {
+ { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 },
+ { 0x00, 0x7d, 0xfa, 0x87, 0xe9, 0x94, 0x13, 0x6e, 0xcf, 0xb2, 0x35, 0x48, 0x26, 0x5b, 0xdc, 0xa1 },
+ },
+ {
+ { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd },
+ { 0x00, 0x6d, 0xda, 0xb7, 0xa9, 0xc4, 0x73, 0x1e, 0x4f, 0x22, 0x95, 0xf8, 0xe6, 0x8b, 0x3c, 0x51 },
+ },
+ {
+ { 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88 },
+ { 0x00, 0x9d, 0x27, 0xba, 0x4e, 0xd3, 0x69, 0xf4, 0x9c, 0x01, 0xbb, 0x26, 0xd2, 0x4f, 0xf5, 0x68 },
+ },
+ {
+ { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 },
+ { 0x00, 0x8d, 0x07, 0x8a, 0x0e, 0x83, 0x09, 0x84, 0x1c, 0x91, 0x1b, 0x96, 0x12, 0x9f, 0x15, 0x98 },
+ },
+ {
+ { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 },
+ { 0x00, 0xbd, 0x67, 0xda, 0xce, 0x73, 0xa9, 0x14, 0x81, 0x3c, 0xe6, 0x5b, 0x4f, 0xf2, 0x28, 0x95 },
+ },
+ {
+ { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 },
+ { 0x00, 0xad, 0x47, 0xea, 0x8e, 0x23, 0xc9, 0x64, 0x01, 0xac, 0x46, 0xeb, 0x8f, 0x22, 0xc8, 0x65 },
+ },
+ {
+ { 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4 },
+ { 0x00, 0xdd, 0xa7, 0x7a, 0x53, 0x8e, 0xf4, 0x29, 0xa6, 0x7b, 0x01, 0xdc, 0xf5, 0x28, 0x52, 0x8f },
+ },
+ {
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0xcd, 0x87, 0x4a, 0x13, 0xde, 0x94, 0x59, 0x26, 0xeb, 0xa1, 0x6c, 0x35, 0xf8, 0xb2, 0x7f },
+ },
+ {
+ { 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa },
+ { 0x00, 0xfd, 0xe7, 0x1a, 0xd3, 0x2e, 0x34, 0xc9, 0xbb, 0x46, 0x5c, 0xa1, 0x68, 0x95, 0x8f, 0x72 },
+ },
+ {
+ { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 },
+ { 0x00, 0xed, 0xc7, 0x2a, 0x93, 0x7e, 0x54, 0xb9, 0x3b, 0xd6, 0xfc, 0x11, 0xa8, 0x45, 0x6f, 0x82 },
+ },
+ {
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x1d, 0x3d, 0x5d, 0x7d, 0x9d, 0xbd, 0xdd, 0xfd },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ },
+ {
+ { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, 0x15, 0x34, 0x57, 0x76, 0x91, 0xb0, 0xd3, 0xf2 },
+ { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, 0x4d, 0x67, 0x19, 0x33, 0xe5, 0xcf, 0xb1, 0x9b },
+ },
+ {
+ { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0x0d, 0x2f, 0x49, 0x6b, 0x85, 0xa7, 0xc1, 0xe3 },
+ { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 },
+ },
+ {
+ { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, 0x05, 0x26, 0x43, 0x60, 0x89, 0xaa, 0xcf, 0xec },
+ { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 },
+ },
+ {
+ { 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, 0x3d, 0x19, 0x75, 0x51, 0xad, 0x89, 0xe5, 0xc1 },
+ { 0x00, 0x7a, 0xf4, 0x8e, 0xf5, 0x8f, 0x01, 0x7b, 0xf7, 0x8d, 0x03, 0x79, 0x02, 0x78, 0xf6, 0x8c },
+ },
+ {
+ { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, 0x35, 0x10, 0x7f, 0x5a, 0xa1, 0x84, 0xeb, 0xce },
+ { 0x00, 0x6a, 0xd4, 0xbe, 0xb5, 0xdf, 0x61, 0x0b, 0x77, 0x1d, 0xa3, 0xc9, 0xc2, 0xa8, 0x16, 0x7c },
+ },
+ {
+ { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, 0x2d, 0x0b, 0x61, 0x47, 0xb5, 0x93, 0xf9, 0xdf },
+ { 0x00, 0x5a, 0xb4, 0xee, 0x75, 0x2f, 0xc1, 0x9b, 0xea, 0xb0, 0x5e, 0x04, 0x9f, 0xc5, 0x2b, 0x71 },
+ },
+ {
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ { 0x00, 0x4a, 0x94, 0xde, 0x35, 0x7f, 0xa1, 0xeb, 0x6a, 0x20, 0xfe, 0xb4, 0x5f, 0x15, 0xcb, 0x81 },
+ },
+ {
+ { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, 0x5d, 0x75, 0x0d, 0x25, 0xfd, 0xd5, 0xad, 0x85 },
+ { 0x00, 0xba, 0x69, 0xd3, 0xd2, 0x68, 0xbb, 0x01, 0xb9, 0x03, 0xd0, 0x6a, 0x6b, 0xd1, 0x02, 0xb8 },
+ },
+ {
+ { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, 0x55, 0x7c, 0x07, 0x2e, 0xf1, 0xd8, 0xa3, 0x8a },
+ { 0x00, 0xaa, 0x49, 0xe3, 0x92, 0x38, 0xdb, 0x71, 0x39, 0x93, 0x70, 0xda, 0xab, 0x01, 0xe2, 0x48 },
+ },
+ {
+ { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, 0x4d, 0x67, 0x19, 0x33, 0xe5, 0xcf, 0xb1, 0x9b },
+ { 0x00, 0x9a, 0x29, 0xb3, 0x52, 0xc8, 0x7b, 0xe1, 0xa4, 0x3e, 0x8d, 0x17, 0xf6, 0x6c, 0xdf, 0x45 },
+ },
+ {
+ { 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, 0x45, 0x6e, 0x13, 0x38, 0xe9, 0xc2, 0xbf, 0x94 },
+ { 0x00, 0x8a, 0x09, 0x83, 0x12, 0x98, 0x1b, 0x91, 0x24, 0xae, 0x2d, 0xa7, 0x36, 0xbc, 0x3f, 0xb5 },
+ },
+ {
+ { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, 0x7d, 0x51, 0x25, 0x09, 0xcd, 0xe1, 0x95, 0xb9 },
+ { 0x00, 0xfa, 0xe9, 0x13, 0xcf, 0x35, 0x26, 0xdc, 0x83, 0x79, 0x6a, 0x90, 0x4c, 0xb6, 0xa5, 0x5f },
+ },
+ {
+ { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, 0x75, 0x58, 0x2f, 0x02, 0xc1, 0xec, 0x9b, 0xb6 },
+ { 0x00, 0xea, 0xc9, 0x23, 0x8f, 0x65, 0x46, 0xac, 0x03, 0xe9, 0xca, 0x20, 0x8c, 0x66, 0x45, 0xaf },
+ },
+ {
+ { 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, 0x6d, 0x43, 0x31, 0x1f, 0xd5, 0xfb, 0x89, 0xa7 },
+ { 0x00, 0xda, 0xa9, 0x73, 0x4f, 0x95, 0xe6, 0x3c, 0x9e, 0x44, 0x37, 0xed, 0xd1, 0x0b, 0x78, 0xa2 },
+ },
+ {
+ { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, 0x65, 0x4a, 0x3b, 0x14, 0xd9, 0xf6, 0x87, 0xa8 },
+ { 0x00, 0xca, 0x89, 0x43, 0x0f, 0xc5, 0x86, 0x4c, 0x1e, 0xd4, 0x97, 0x5d, 0x11, 0xdb, 0x98, 0x52 },
+ },
+ {
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x9d, 0xad, 0xfd, 0xcd, 0x5d, 0x6d, 0x3d, 0x0d },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ },
+ {
+ { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, 0x95, 0xa4, 0xf7, 0xc6, 0x51, 0x60, 0x33, 0x02 },
+ { 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, 0xa5, 0x92, 0xcb, 0xfc, 0x79, 0x4e, 0x17, 0x20 },
+ },
+ {
+ { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, 0x8d, 0xbf, 0xe9, 0xdb, 0x45, 0x77, 0x21, 0x13 },
+ { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15, 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d },
+ },
+ {
+ { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, 0x85, 0xb6, 0xe3, 0xd0, 0x49, 0x7a, 0x2f, 0x1c },
+ { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd },
+ },
+ {
+ { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, 0xbd, 0x89, 0xd5, 0xe1, 0x6d, 0x59, 0x05, 0x31 },
+ { 0x00, 0x67, 0xce, 0xa9, 0x81, 0xe6, 0x4f, 0x28, 0x1f, 0x78, 0xd1, 0xb6, 0x9e, 0xf9, 0x50, 0x37 },
+ },
+ {
+ { 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, 0xb5, 0x80, 0xdf, 0xea, 0x61, 0x54, 0x0b, 0x3e },
+ { 0x00, 0x77, 0xee, 0x99, 0xc1, 0xb6, 0x2f, 0x58, 0x9f, 0xe8, 0x71, 0x06, 0x5e, 0x29, 0xb0, 0xc7 },
+ },
+ {
+ { 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, 0xad, 0x9b, 0xc1, 0xf7, 0x75, 0x43, 0x19, 0x2f },
+ { 0x00, 0x47, 0x8e, 0xc9, 0x01, 0x46, 0x8f, 0xc8, 0x02, 0x45, 0x8c, 0xcb, 0x03, 0x44, 0x8d, 0xca },
+ },
+ {
+ { 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, 0xa5, 0x92, 0xcb, 0xfc, 0x79, 0x4e, 0x17, 0x20 },
+ { 0x00, 0x57, 0xae, 0xf9, 0x41, 0x16, 0xef, 0xb8, 0x82, 0xd5, 0x2c, 0x7b, 0xc3, 0x94, 0x6d, 0x3a },
+ },
+ {
+ { 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, 0xdd, 0xe5, 0xad, 0x95, 0x3d, 0x05, 0x4d, 0x75 },
+ { 0x00, 0xa7, 0x53, 0xf4, 0xa6, 0x01, 0xf5, 0x52, 0x51, 0xf6, 0x02, 0xa5, 0xf7, 0x50, 0xa4, 0x03 },
+ },
+ {
+ { 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, 0xd5, 0xec, 0xa7, 0x9e, 0x31, 0x08, 0x43, 0x7a },
+ { 0x00, 0xb7, 0x73, 0xc4, 0xe6, 0x51, 0x95, 0x22, 0xd1, 0x66, 0xa2, 0x15, 0x37, 0x80, 0x44, 0xf3 },
+ },
+ {
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ { 0x00, 0x87, 0x13, 0x94, 0x26, 0xa1, 0x35, 0xb2, 0x4c, 0xcb, 0x5f, 0xd8, 0x6a, 0xed, 0x79, 0xfe },
+ },
+ {
+ { 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, 0xc5, 0xfe, 0xb3, 0x88, 0x29, 0x12, 0x5f, 0x64 },
+ { 0x00, 0x97, 0x33, 0xa4, 0x66, 0xf1, 0x55, 0xc2, 0xcc, 0x5b, 0xff, 0x68, 0xaa, 0x3d, 0x99, 0x0e },
+ },
+ {
+ { 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, 0xfd, 0xc1, 0x85, 0xb9, 0x0d, 0x31, 0x75, 0x49 },
+ { 0x00, 0xe7, 0xd3, 0x34, 0xbb, 0x5c, 0x68, 0x8f, 0x6b, 0x8c, 0xb8, 0x5f, 0xd0, 0x37, 0x03, 0xe4 },
+ },
+ {
+ { 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, 0xf5, 0xc8, 0x8f, 0xb2, 0x01, 0x3c, 0x7b, 0x46 },
+ { 0x00, 0xf7, 0xf3, 0x04, 0xfb, 0x0c, 0x08, 0xff, 0xeb, 0x1c, 0x18, 0xef, 0x10, 0xe7, 0xe3, 0x14 },
+ },
+ {
+ { 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0xed, 0xd3, 0x91, 0xaf, 0x15, 0x2b, 0x69, 0x57 },
+ { 0x00, 0xc7, 0x93, 0x54, 0x3b, 0xfc, 0xa8, 0x6f, 0x76, 0xb1, 0xe5, 0x22, 0x4d, 0x8a, 0xde, 0x19 },
+ },
+ {
+ { 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, 0xe5, 0xda, 0x9b, 0xa4, 0x19, 0x26, 0x67, 0x58 },
+ { 0x00, 0xd7, 0xb3, 0x64, 0x7b, 0xac, 0xc8, 0x1f, 0xf6, 0x21, 0x45, 0x92, 0x8d, 0x5a, 0x3e, 0xe9 },
+ },
+ {
+ { 0x00, 0x40, 0x80, 0xc0, 0x1d, 0x5d, 0x9d, 0xdd, 0x3a, 0x7a, 0xba, 0xfa, 0x27, 0x67, 0xa7, 0xe7 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ },
+ {
+ { 0x00, 0x41, 0x82, 0xc3, 0x19, 0x58, 0x9b, 0xda, 0x32, 0x73, 0xb0, 0xf1, 0x2b, 0x6a, 0xa9, 0xe8 },
+ { 0x00, 0x64, 0xc8, 0xac, 0x8d, 0xe9, 0x45, 0x21, 0x07, 0x63, 0xcf, 0xab, 0x8a, 0xee, 0x42, 0x26 },
+ },
+ {
+ { 0x00, 0x42, 0x84, 0xc6, 0x15, 0x57, 0x91, 0xd3, 0x2a, 0x68, 0xae, 0xec, 0x3f, 0x7d, 0xbb, 0xf9 },
+ { 0x00, 0x54, 0xa8, 0xfc, 0x4d, 0x19, 0xe5, 0xb1, 0x9a, 0xce, 0x32, 0x66, 0xd7, 0x83, 0x7f, 0x2b },
+ },
+ {
+ { 0x00, 0x43, 0x86, 0xc5, 0x11, 0x52, 0x97, 0xd4, 0x22, 0x61, 0xa4, 0xe7, 0x33, 0x70, 0xb5, 0xf6 },
+ { 0x00, 0x44, 0x88, 0xcc, 0x0d, 0x49, 0x85, 0xc1, 0x1a, 0x5e, 0x92, 0xd6, 0x17, 0x53, 0x9f, 0xdb },
+ },
+ {
+ { 0x00, 0x44, 0x88, 0xcc, 0x0d, 0x49, 0x85, 0xc1, 0x1a, 0x5e, 0x92, 0xd6, 0x17, 0x53, 0x9f, 0xdb },
+ { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, 0xbd, 0x89, 0xd5, 0xe1, 0x6d, 0x59, 0x05, 0x31 },
+ },
+ {
+ { 0x00, 0x45, 0x8a, 0xcf, 0x09, 0x4c, 0x83, 0xc6, 0x12, 0x57, 0x98, 0xdd, 0x1b, 0x5e, 0x91, 0xd4 },
+ { 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, 0x3d, 0x19, 0x75, 0x51, 0xad, 0x89, 0xe5, 0xc1 },
+ },
+ {
+ { 0x00, 0x46, 0x8c, 0xca, 0x05, 0x43, 0x89, 0xcf, 0x0a, 0x4c, 0x86, 0xc0, 0x0f, 0x49, 0x83, 0xc5 },
+ { 0x00, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc },
+ },
+ {
+ { 0x00, 0x47, 0x8e, 0xc9, 0x01, 0x46, 0x8f, 0xc8, 0x02, 0x45, 0x8c, 0xcb, 0x03, 0x44, 0x8d, 0xca },
+ { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c, 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c },
+ },
+ {
+ { 0x00, 0x48, 0x90, 0xd8, 0x3d, 0x75, 0xad, 0xe5, 0x7a, 0x32, 0xea, 0xa2, 0x47, 0x0f, 0xd7, 0x9f },
+ { 0x00, 0xf4, 0xf5, 0x01, 0xf7, 0x03, 0x02, 0xf6, 0xf3, 0x07, 0x06, 0xf2, 0x04, 0xf0, 0xf1, 0x05 },
+ },
+ {
+ { 0x00, 0x49, 0x92, 0xdb, 0x39, 0x70, 0xab, 0xe2, 0x72, 0x3b, 0xe0, 0xa9, 0x4b, 0x02, 0xd9, 0x90 },
+ { 0x00, 0xe4, 0xd5, 0x31, 0xb7, 0x53, 0x62, 0x86, 0x73, 0x97, 0xa6, 0x42, 0xc4, 0x20, 0x11, 0xf5 },
+ },
+ {
+ { 0x00, 0x4a, 0x94, 0xde, 0x35, 0x7f, 0xa1, 0xeb, 0x6a, 0x20, 0xfe, 0xb4, 0x5f, 0x15, 0xcb, 0x81 },
+ { 0x00, 0xd4, 0xb5, 0x61, 0x77, 0xa3, 0xc2, 0x16, 0xee, 0x3a, 0x5b, 0x8f, 0x99, 0x4d, 0x2c, 0xf8 },
+ },
+ {
+ { 0x00, 0x4b, 0x96, 0xdd, 0x31, 0x7a, 0xa7, 0xec, 0x62, 0x29, 0xf4, 0xbf, 0x53, 0x18, 0xc5, 0x8e },
+ { 0x00, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0x66, 0x6e, 0xaa, 0xfb, 0x3f, 0x59, 0x9d, 0xcc, 0x08 },
+ },
+ {
+ { 0x00, 0x4c, 0x98, 0xd4, 0x2d, 0x61, 0xb5, 0xf9, 0x5a, 0x16, 0xc2, 0x8e, 0x77, 0x3b, 0xef, 0xa3 },
+ { 0x00, 0xb4, 0x75, 0xc1, 0xea, 0x5e, 0x9f, 0x2b, 0xc9, 0x7d, 0xbc, 0x08, 0x23, 0x97, 0x56, 0xe2 },
+ },
+ {
+ { 0x00, 0x4d, 0x9a, 0xd7, 0x29, 0x64, 0xb3, 0xfe, 0x52, 0x1f, 0xc8, 0x85, 0x7b, 0x36, 0xe1, 0xac },
+ { 0x00, 0xa4, 0x55, 0xf1, 0xaa, 0x0e, 0xff, 0x5b, 0x49, 0xed, 0x1c, 0xb8, 0xe3, 0x47, 0xb6, 0x12 },
+ },
+ {
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ { 0x00, 0x94, 0x35, 0xa1, 0x6a, 0xfe, 0x5f, 0xcb, 0xd4, 0x40, 0xe1, 0x75, 0xbe, 0x2a, 0x8b, 0x1f },
+ },
+ {
+ { 0x00, 0x4f, 0x9e, 0xd1, 0x21, 0x6e, 0xbf, 0xf0, 0x42, 0x0d, 0xdc, 0x93, 0x63, 0x2c, 0xfd, 0xb2 },
+ { 0x00, 0x84, 0x15, 0x91, 0x2a, 0xae, 0x3f, 0xbb, 0x54, 0xd0, 0x41, 0xc5, 0x7e, 0xfa, 0x6b, 0xef },
+ },
+ {
+ { 0x00, 0x50, 0xa0, 0xf0, 0x5d, 0x0d, 0xfd, 0xad, 0xba, 0xea, 0x1a, 0x4a, 0xe7, 0xb7, 0x47, 0x17 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ },
+ {
+ { 0x00, 0x51, 0xa2, 0xf3, 0x59, 0x08, 0xfb, 0xaa, 0xb2, 0xe3, 0x10, 0x41, 0xeb, 0xba, 0x49, 0x18 },
+ { 0x00, 0x79, 0xf2, 0x8b, 0xf9, 0x80, 0x0b, 0x72, 0xef, 0x96, 0x1d, 0x64, 0x16, 0x6f, 0xe4, 0x9d },
+ },
+ {
+ { 0x00, 0x52, 0xa4, 0xf6, 0x55, 0x07, 0xf1, 0xa3, 0xaa, 0xf8, 0x0e, 0x5c, 0xff, 0xad, 0x5b, 0x09 },
+ { 0x00, 0x49, 0x92, 0xdb, 0x39, 0x70, 0xab, 0xe2, 0x72, 0x3b, 0xe0, 0xa9, 0x4b, 0x02, 0xd9, 0x90 },
+ },
+ {
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ { 0x00, 0x59, 0xb2, 0xeb, 0x79, 0x20, 0xcb, 0x92, 0xf2, 0xab, 0x40, 0x19, 0x8b, 0xd2, 0x39, 0x60 },
+ },
+ {
+ { 0x00, 0x54, 0xa8, 0xfc, 0x4d, 0x19, 0xe5, 0xb1, 0x9a, 0xce, 0x32, 0x66, 0xd7, 0x83, 0x7f, 0x2b },
+ { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, 0x55, 0x7c, 0x07, 0x2e, 0xf1, 0xd8, 0xa3, 0x8a },
+ },
+ {
+ { 0x00, 0x55, 0xaa, 0xff, 0x49, 0x1c, 0xe3, 0xb6, 0x92, 0xc7, 0x38, 0x6d, 0xdb, 0x8e, 0x71, 0x24 },
+ { 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, 0xd5, 0xec, 0xa7, 0x9e, 0x31, 0x08, 0x43, 0x7a },
+ },
+ {
+ { 0x00, 0x56, 0xac, 0xfa, 0x45, 0x13, 0xe9, 0xbf, 0x8a, 0xdc, 0x26, 0x70, 0xcf, 0x99, 0x63, 0x35 },
+ { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 },
+ },
+ {
+ { 0x00, 0x57, 0xae, 0xf9, 0x41, 0x16, 0xef, 0xb8, 0x82, 0xd5, 0x2c, 0x7b, 0xc3, 0x94, 0x6d, 0x3a },
+ { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 },
+ },
+ {
+ { 0x00, 0x58, 0xb0, 0xe8, 0x7d, 0x25, 0xcd, 0x95, 0xfa, 0xa2, 0x4a, 0x12, 0x87, 0xdf, 0x37, 0x6f },
+ { 0x00, 0xe9, 0xcf, 0x26, 0x83, 0x6a, 0x4c, 0xa5, 0x1b, 0xf2, 0xd4, 0x3d, 0x98, 0x71, 0x57, 0xbe },
+ },
+ {
+ { 0x00, 0x59, 0xb2, 0xeb, 0x79, 0x20, 0xcb, 0x92, 0xf2, 0xab, 0x40, 0x19, 0x8b, 0xd2, 0x39, 0x60 },
+ { 0x00, 0xf9, 0xef, 0x16, 0xc3, 0x3a, 0x2c, 0xd5, 0x9b, 0x62, 0x74, 0x8d, 0x58, 0xa1, 0xb7, 0x4e },
+ },
+ {
+ { 0x00, 0x5a, 0xb4, 0xee, 0x75, 0x2f, 0xc1, 0x9b, 0xea, 0xb0, 0x5e, 0x04, 0x9f, 0xc5, 0x2b, 0x71 },
+ { 0x00, 0xc9, 0x8f, 0x46, 0x03, 0xca, 0x8c, 0x45, 0x06, 0xcf, 0x89, 0x40, 0x05, 0xcc, 0x8a, 0x43 },
+ },
+ {
+ { 0x00, 0x5b, 0xb6, 0xed, 0x71, 0x2a, 0xc7, 0x9c, 0xe2, 0xb9, 0x54, 0x0f, 0x93, 0xc8, 0x25, 0x7e },
+ { 0x00, 0xd9, 0xaf, 0x76, 0x43, 0x9a, 0xec, 0x35, 0x86, 0x5f, 0x29, 0xf0, 0xc5, 0x1c, 0x6a, 0xb3 },
+ },
+ {
+ { 0x00, 0x5c, 0xb8, 0xe4, 0x6d, 0x31, 0xd5, 0x89, 0xda, 0x86, 0x62, 0x3e, 0xb7, 0xeb, 0x0f, 0x53 },
+ { 0x00, 0xa9, 0x4f, 0xe6, 0x9e, 0x37, 0xd1, 0x78, 0x21, 0x88, 0x6e, 0xc7, 0xbf, 0x16, 0xf0, 0x59 },
+ },
+ {
+ { 0x00, 0x5d, 0xba, 0xe7, 0x69, 0x34, 0xd3, 0x8e, 0xd2, 0x8f, 0x68, 0x35, 0xbb, 0xe6, 0x01, 0x5c },
+ { 0x00, 0xb9, 0x6f, 0xd6, 0xde, 0x67, 0xb1, 0x08, 0xa1, 0x18, 0xce, 0x77, 0x7f, 0xc6, 0x10, 0xa9 },
+ },
+ {
+ { 0x00, 0x5e, 0xbc, 0xe2, 0x65, 0x3b, 0xd9, 0x87, 0xca, 0x94, 0x76, 0x28, 0xaf, 0xf1, 0x13, 0x4d },
+ { 0x00, 0x89, 0x0f, 0x86, 0x1e, 0x97, 0x11, 0x98, 0x3c, 0xb5, 0x33, 0xba, 0x22, 0xab, 0x2d, 0xa4 },
+ },
+ {
+ { 0x00, 0x5f, 0xbe, 0xe1, 0x61, 0x3e, 0xdf, 0x80, 0xc2, 0x9d, 0x7c, 0x23, 0xa3, 0xfc, 0x1d, 0x42 },
+ { 0x00, 0x99, 0x2f, 0xb6, 0x5e, 0xc7, 0x71, 0xe8, 0xbc, 0x25, 0x93, 0x0a, 0xe2, 0x7b, 0xcd, 0x54 },
+ },
+ {
+ { 0x00, 0x60, 0xc0, 0xa0, 0x9d, 0xfd, 0x5d, 0x3d, 0x27, 0x47, 0xe7, 0x87, 0xba, 0xda, 0x7a, 0x1a },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ },
+ {
+ { 0x00, 0x61, 0xc2, 0xa3, 0x99, 0xf8, 0x5b, 0x3a, 0x2f, 0x4e, 0xed, 0x8c, 0xb6, 0xd7, 0x74, 0x15 },
+ { 0x00, 0x5e, 0xbc, 0xe2, 0x65, 0x3b, 0xd9, 0x87, 0xca, 0x94, 0x76, 0x28, 0xaf, 0xf1, 0x13, 0x4d },
+ },
+ {
+ { 0x00, 0x62, 0xc4, 0xa6, 0x95, 0xf7, 0x51, 0x33, 0x37, 0x55, 0xf3, 0x91, 0xa2, 0xc0, 0x66, 0x04 },
+ { 0x00, 0x6e, 0xdc, 0xb2, 0xa5, 0xcb, 0x79, 0x17, 0x57, 0x39, 0x8b, 0xe5, 0xf2, 0x9c, 0x2e, 0x40 },
+ },
+ {
+ { 0x00, 0x63, 0xc6, 0xa5, 0x91, 0xf2, 0x57, 0x34, 0x3f, 0x5c, 0xf9, 0x9a, 0xae, 0xcd, 0x68, 0x0b },
+ { 0x00, 0x7e, 0xfc, 0x82, 0xe5, 0x9b, 0x19, 0x67, 0xd7, 0xa9, 0x2b, 0x55, 0x32, 0x4c, 0xce, 0xb0 },
+ },
+ {
+ { 0x00, 0x64, 0xc8, 0xac, 0x8d, 0xe9, 0x45, 0x21, 0x07, 0x63, 0xcf, 0xab, 0x8a, 0xee, 0x42, 0x26 },
+ { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a },
+ },
+ {
+ { 0x00, 0x65, 0xca, 0xaf, 0x89, 0xec, 0x43, 0x26, 0x0f, 0x6a, 0xc5, 0xa0, 0x86, 0xe3, 0x4c, 0x29 },
+ { 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa },
+ },
+ {
+ { 0x00, 0x66, 0xcc, 0xaa, 0x85, 0xe3, 0x49, 0x2f, 0x17, 0x71, 0xdb, 0xbd, 0x92, 0xf4, 0x5e, 0x38 },
+ { 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, 0x6d, 0x43, 0x31, 0x1f, 0xd5, 0xfb, 0x89, 0xa7 },
+ },
+ {
+ { 0x00, 0x67, 0xce, 0xa9, 0x81, 0xe6, 0x4f, 0x28, 0x1f, 0x78, 0xd1, 0xb6, 0x9e, 0xf9, 0x50, 0x37 },
+ { 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0xed, 0xd3, 0x91, 0xaf, 0x15, 0x2b, 0x69, 0x57 },
+ },
+ {
+ { 0x00, 0x68, 0xd0, 0xb8, 0xbd, 0xd5, 0x6d, 0x05, 0x67, 0x0f, 0xb7, 0xdf, 0xda, 0xb2, 0x0a, 0x62 },
+ { 0x00, 0xce, 0x81, 0x4f, 0x1f, 0xd1, 0x9e, 0x50, 0x3e, 0xf0, 0xbf, 0x71, 0x21, 0xef, 0xa0, 0x6e },
+ },
+ {
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ { 0x00, 0xde, 0xa1, 0x7f, 0x5f, 0x81, 0xfe, 0x20, 0xbe, 0x60, 0x1f, 0xc1, 0xe1, 0x3f, 0x40, 0x9e },
+ },
+ {
+ { 0x00, 0x6a, 0xd4, 0xbe, 0xb5, 0xdf, 0x61, 0x0b, 0x77, 0x1d, 0xa3, 0xc9, 0xc2, 0xa8, 0x16, 0x7c },
+ { 0x00, 0xee, 0xc1, 0x2f, 0x9f, 0x71, 0x5e, 0xb0, 0x23, 0xcd, 0xe2, 0x0c, 0xbc, 0x52, 0x7d, 0x93 },
+ },
+ {
+ { 0x00, 0x6b, 0xd6, 0xbd, 0xb1, 0xda, 0x67, 0x0c, 0x7f, 0x14, 0xa9, 0xc2, 0xce, 0xa5, 0x18, 0x73 },
+ { 0x00, 0xfe, 0xe1, 0x1f, 0xdf, 0x21, 0x3e, 0xc0, 0xa3, 0x5d, 0x42, 0xbc, 0x7c, 0x82, 0x9d, 0x63 },
+ },
+ {
+ { 0x00, 0x6c, 0xd8, 0xb4, 0xad, 0xc1, 0x75, 0x19, 0x47, 0x2b, 0x9f, 0xf3, 0xea, 0x86, 0x32, 0x5e },
+ { 0x00, 0x8e, 0x01, 0x8f, 0x02, 0x8c, 0x03, 0x8d, 0x04, 0x8a, 0x05, 0x8b, 0x06, 0x88, 0x07, 0x89 },
+ },
+ {
+ { 0x00, 0x6d, 0xda, 0xb7, 0xa9, 0xc4, 0x73, 0x1e, 0x4f, 0x22, 0x95, 0xf8, 0xe6, 0x8b, 0x3c, 0x51 },
+ { 0x00, 0x9e, 0x21, 0xbf, 0x42, 0xdc, 0x63, 0xfd, 0x84, 0x1a, 0xa5, 0x3b, 0xc6, 0x58, 0xe7, 0x79 },
+ },
+ {
+ { 0x00, 0x6e, 0xdc, 0xb2, 0xa5, 0xcb, 0x79, 0x17, 0x57, 0x39, 0x8b, 0xe5, 0xf2, 0x9c, 0x2e, 0x40 },
+ { 0x00, 0xae, 0x41, 0xef, 0x82, 0x2c, 0xc3, 0x6d, 0x19, 0xb7, 0x58, 0xf6, 0x9b, 0x35, 0xda, 0x74 },
+ },
+ {
+ { 0x00, 0x6f, 0xde, 0xb1, 0xa1, 0xce, 0x7f, 0x10, 0x5f, 0x30, 0x81, 0xee, 0xfe, 0x91, 0x20, 0x4f },
+ { 0x00, 0xbe, 0x61, 0xdf, 0xc2, 0x7c, 0xa3, 0x1d, 0x99, 0x27, 0xf8, 0x46, 0x5b, 0xe5, 0x3a, 0x84 },
+ },
+ {
+ { 0x00, 0x70, 0xe0, 0x90, 0xdd, 0xad, 0x3d, 0x4d, 0xa7, 0xd7, 0x47, 0x37, 0x7a, 0x0a, 0x9a, 0xea },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ },
+ {
+ { 0x00, 0x71, 0xe2, 0x93, 0xd9, 0xa8, 0x3b, 0x4a, 0xaf, 0xde, 0x4d, 0x3c, 0x76, 0x07, 0x94, 0xe5 },
+ { 0x00, 0x43, 0x86, 0xc5, 0x11, 0x52, 0x97, 0xd4, 0x22, 0x61, 0xa4, 0xe7, 0x33, 0x70, 0xb5, 0xf6 },
+ },
+ {
+ { 0x00, 0x72, 0xe4, 0x96, 0xd5, 0xa7, 0x31, 0x43, 0xb7, 0xc5, 0x53, 0x21, 0x62, 0x10, 0x86, 0xf4 },
+ { 0x00, 0x73, 0xe6, 0x95, 0xd1, 0xa2, 0x37, 0x44, 0xbf, 0xcc, 0x59, 0x2a, 0x6e, 0x1d, 0x88, 0xfb },
+ },
+ {
+ { 0x00, 0x73, 0xe6, 0x95, 0xd1, 0xa2, 0x37, 0x44, 0xbf, 0xcc, 0x59, 0x2a, 0x6e, 0x1d, 0x88, 0xfb },
+ { 0x00, 0x63, 0xc6, 0xa5, 0x91, 0xf2, 0x57, 0x34, 0x3f, 0x5c, 0xf9, 0x9a, 0xae, 0xcd, 0x68, 0x0b },
+ },
+ {
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ { 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 },
+ },
+ {
+ { 0x00, 0x75, 0xea, 0x9f, 0xc9, 0xbc, 0x23, 0x56, 0x8f, 0xfa, 0x65, 0x10, 0x46, 0x33, 0xac, 0xd9 },
+ { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 },
+ },
+ {
+ { 0x00, 0x76, 0xec, 0x9a, 0xc5, 0xb3, 0x29, 0x5f, 0x97, 0xe1, 0x7b, 0x0d, 0x52, 0x24, 0xbe, 0xc8 },
+ { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, 0x85, 0xb6, 0xe3, 0xd0, 0x49, 0x7a, 0x2f, 0x1c },
+ },
+ {
+ { 0x00, 0x77, 0xee, 0x99, 0xc1, 0xb6, 0x2f, 0x58, 0x9f, 0xe8, 0x71, 0x06, 0x5e, 0x29, 0xb0, 0xc7 },
+ { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, 0x05, 0x26, 0x43, 0x60, 0x89, 0xaa, 0xcf, 0xec },
+ },
+ {
+ { 0x00, 0x78, 0xf0, 0x88, 0xfd, 0x85, 0x0d, 0x75, 0xe7, 0x9f, 0x17, 0x6f, 0x1a, 0x62, 0xea, 0x92 },
+ { 0x00, 0xd3, 0xbb, 0x68, 0x6b, 0xb8, 0xd0, 0x03, 0xd6, 0x05, 0x6d, 0xbe, 0xbd, 0x6e, 0x06, 0xd5 },
+ },
+ {
+ { 0x00, 0x79, 0xf2, 0x8b, 0xf9, 0x80, 0x0b, 0x72, 0xef, 0x96, 0x1d, 0x64, 0x16, 0x6f, 0xe4, 0x9d },
+ { 0x00, 0xc3, 0x9b, 0x58, 0x2b, 0xe8, 0xb0, 0x73, 0x56, 0x95, 0xcd, 0x0e, 0x7d, 0xbe, 0xe6, 0x25 },
+ },
+ {
+ { 0x00, 0x7a, 0xf4, 0x8e, 0xf5, 0x8f, 0x01, 0x7b, 0xf7, 0x8d, 0x03, 0x79, 0x02, 0x78, 0xf6, 0x8c },
+ { 0x00, 0xf3, 0xfb, 0x08, 0xeb, 0x18, 0x10, 0xe3, 0xcb, 0x38, 0x30, 0xc3, 0x20, 0xd3, 0xdb, 0x28 },
+ },
+ {
+ { 0x00, 0x7b, 0xf6, 0x8d, 0xf1, 0x8a, 0x07, 0x7c, 0xff, 0x84, 0x09, 0x72, 0x0e, 0x75, 0xf8, 0x83 },
+ { 0x00, 0xe3, 0xdb, 0x38, 0xab, 0x48, 0x70, 0x93, 0x4b, 0xa8, 0x90, 0x73, 0xe0, 0x03, 0x3b, 0xd8 },
+ },
+ {
+ { 0x00, 0x7c, 0xf8, 0x84, 0xed, 0x91, 0x15, 0x69, 0xc7, 0xbb, 0x3f, 0x43, 0x2a, 0x56, 0xd2, 0xae },
+ { 0x00, 0x93, 0x3b, 0xa8, 0x76, 0xe5, 0x4d, 0xde, 0xec, 0x7f, 0xd7, 0x44, 0x9a, 0x09, 0xa1, 0x32 },
+ },
+ {
+ { 0x00, 0x7d, 0xfa, 0x87, 0xe9, 0x94, 0x13, 0x6e, 0xcf, 0xb2, 0x35, 0x48, 0x26, 0x5b, 0xdc, 0xa1 },
+ { 0x00, 0x83, 0x1b, 0x98, 0x36, 0xb5, 0x2d, 0xae, 0x6c, 0xef, 0x77, 0xf4, 0x5a, 0xd9, 0x41, 0xc2 },
+ },
+ {
+ { 0x00, 0x7e, 0xfc, 0x82, 0xe5, 0x9b, 0x19, 0x67, 0xd7, 0xa9, 0x2b, 0x55, 0x32, 0x4c, 0xce, 0xb0 },
+ { 0x00, 0xb3, 0x7b, 0xc8, 0xf6, 0x45, 0x8d, 0x3e, 0xf1, 0x42, 0x8a, 0x39, 0x07, 0xb4, 0x7c, 0xcf },
+ },
+ {
+ { 0x00, 0x7f, 0xfe, 0x81, 0xe1, 0x9e, 0x1f, 0x60, 0xdf, 0xa0, 0x21, 0x5e, 0x3e, 0x41, 0xc0, 0xbf },
+ { 0x00, 0xa3, 0x5b, 0xf8, 0xb6, 0x15, 0xed, 0x4e, 0x71, 0xd2, 0x2a, 0x89, 0xc7, 0x64, 0x9c, 0x3f },
+ },
+ {
+ { 0x00, 0x80, 0x1d, 0x9d, 0x3a, 0xba, 0x27, 0xa7, 0x74, 0xf4, 0x69, 0xe9, 0x4e, 0xce, 0x53, 0xd3 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ },
+ {
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ { 0x00, 0xf8, 0xed, 0x15, 0xc7, 0x3f, 0x2a, 0xd2, 0x93, 0x6b, 0x7e, 0x86, 0x54, 0xac, 0xb9, 0x41 },
+ },
+ {
+ { 0x00, 0x82, 0x19, 0x9b, 0x32, 0xb0, 0x2b, 0xa9, 0x64, 0xe6, 0x7d, 0xff, 0x56, 0xd4, 0x4f, 0xcd },
+ { 0x00, 0xc8, 0x8d, 0x45, 0x07, 0xcf, 0x8a, 0x42, 0x0e, 0xc6, 0x83, 0x4b, 0x09, 0xc1, 0x84, 0x4c },
+ },
+ {
+ { 0x00, 0x83, 0x1b, 0x98, 0x36, 0xb5, 0x2d, 0xae, 0x6c, 0xef, 0x77, 0xf4, 0x5a, 0xd9, 0x41, 0xc2 },
+ { 0x00, 0xd8, 0xad, 0x75, 0x47, 0x9f, 0xea, 0x32, 0x8e, 0x56, 0x23, 0xfb, 0xc9, 0x11, 0x64, 0xbc },
+ },
+ {
+ { 0x00, 0x84, 0x15, 0x91, 0x2a, 0xae, 0x3f, 0xbb, 0x54, 0xd0, 0x41, 0xc5, 0x7e, 0xfa, 0x6b, 0xef },
+ { 0x00, 0xa8, 0x4d, 0xe5, 0x9a, 0x32, 0xd7, 0x7f, 0x29, 0x81, 0x64, 0xcc, 0xb3, 0x1b, 0xfe, 0x56 },
+ },
+ {
+ { 0x00, 0x85, 0x17, 0x92, 0x2e, 0xab, 0x39, 0xbc, 0x5c, 0xd9, 0x4b, 0xce, 0x72, 0xf7, 0x65, 0xe0 },
+ { 0x00, 0xb8, 0x6d, 0xd5, 0xda, 0x62, 0xb7, 0x0f, 0xa9, 0x11, 0xc4, 0x7c, 0x73, 0xcb, 0x1e, 0xa6 },
+ },
+ {
+ { 0x00, 0x86, 0x11, 0x97, 0x22, 0xa4, 0x33, 0xb5, 0x44, 0xc2, 0x55, 0xd3, 0x66, 0xe0, 0x77, 0xf1 },
+ { 0x00, 0x88, 0x0d, 0x85, 0x1a, 0x92, 0x17, 0x9f, 0x34, 0xbc, 0x39, 0xb1, 0x2e, 0xa6, 0x23, 0xab },
+ },
+ {
+ { 0x00, 0x87, 0x13, 0x94, 0x26, 0xa1, 0x35, 0xb2, 0x4c, 0xcb, 0x5f, 0xd8, 0x6a, 0xed, 0x79, 0xfe },
+ { 0x00, 0x98, 0x2d, 0xb5, 0x5a, 0xc2, 0x77, 0xef, 0xb4, 0x2c, 0x99, 0x01, 0xee, 0x76, 0xc3, 0x5b },
+ },
+ {
+ { 0x00, 0x88, 0x0d, 0x85, 0x1a, 0x92, 0x17, 0x9f, 0x34, 0xbc, 0x39, 0xb1, 0x2e, 0xa6, 0x23, 0xab },
+ { 0x00, 0x68, 0xd0, 0xb8, 0xbd, 0xd5, 0x6d, 0x05, 0x67, 0x0f, 0xb7, 0xdf, 0xda, 0xb2, 0x0a, 0x62 },
+ },
+ {
+ { 0x00, 0x89, 0x0f, 0x86, 0x1e, 0x97, 0x11, 0x98, 0x3c, 0xb5, 0x33, 0xba, 0x22, 0xab, 0x2d, 0xa4 },
+ { 0x00, 0x78, 0xf0, 0x88, 0xfd, 0x85, 0x0d, 0x75, 0xe7, 0x9f, 0x17, 0x6f, 0x1a, 0x62, 0xea, 0x92 },
+ },
+ {
+ { 0x00, 0x8a, 0x09, 0x83, 0x12, 0x98, 0x1b, 0x91, 0x24, 0xae, 0x2d, 0xa7, 0x36, 0xbc, 0x3f, 0xb5 },
+ { 0x00, 0x48, 0x90, 0xd8, 0x3d, 0x75, 0xad, 0xe5, 0x7a, 0x32, 0xea, 0xa2, 0x47, 0x0f, 0xd7, 0x9f },
+ },
+ {
+ { 0x00, 0x8b, 0x0b, 0x80, 0x16, 0x9d, 0x1d, 0x96, 0x2c, 0xa7, 0x27, 0xac, 0x3a, 0xb1, 0x31, 0xba },
+ { 0x00, 0x58, 0xb0, 0xe8, 0x7d, 0x25, 0xcd, 0x95, 0xfa, 0xa2, 0x4a, 0x12, 0x87, 0xdf, 0x37, 0x6f },
+ },
+ {
+ { 0x00, 0x8c, 0x05, 0x89, 0x0a, 0x86, 0x0f, 0x83, 0x14, 0x98, 0x11, 0x9d, 0x1e, 0x92, 0x1b, 0x97 },
+ { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, 0x5d, 0x75, 0x0d, 0x25, 0xfd, 0xd5, 0xad, 0x85 },
+ },
+ {
+ { 0x00, 0x8d, 0x07, 0x8a, 0x0e, 0x83, 0x09, 0x84, 0x1c, 0x91, 0x1b, 0x96, 0x12, 0x9f, 0x15, 0x98 },
+ { 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, 0xdd, 0xe5, 0xad, 0x95, 0x3d, 0x05, 0x4d, 0x75 },
+ },
+ {
+ { 0x00, 0x8e, 0x01, 0x8f, 0x02, 0x8c, 0x03, 0x8d, 0x04, 0x8a, 0x05, 0x8b, 0x06, 0x88, 0x07, 0x89 },
+ { 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 },
+ },
+ {
+ { 0x00, 0x8f, 0x03, 0x8c, 0x06, 0x89, 0x05, 0x8a, 0x0c, 0x83, 0x0f, 0x80, 0x0a, 0x85, 0x09, 0x86 },
+ { 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88 },
+ },
+ {
+ { 0x00, 0x90, 0x3d, 0xad, 0x7a, 0xea, 0x47, 0xd7, 0xf4, 0x64, 0xc9, 0x59, 0x8e, 0x1e, 0xb3, 0x23 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ },
+ {
+ { 0x00, 0x91, 0x3f, 0xae, 0x7e, 0xef, 0x41, 0xd0, 0xfc, 0x6d, 0xc3, 0x52, 0x82, 0x13, 0xbd, 0x2c },
+ { 0x00, 0xe5, 0xd7, 0x32, 0xb3, 0x56, 0x64, 0x81, 0x7b, 0x9e, 0xac, 0x49, 0xc8, 0x2d, 0x1f, 0xfa },
+ },
+ {
+ { 0x00, 0x92, 0x39, 0xab, 0x72, 0xe0, 0x4b, 0xd9, 0xe4, 0x76, 0xdd, 0x4f, 0x96, 0x04, 0xaf, 0x3d },
+ { 0x00, 0xd5, 0xb7, 0x62, 0x73, 0xa6, 0xc4, 0x11, 0xe6, 0x33, 0x51, 0x84, 0x95, 0x40, 0x22, 0xf7 },
+ },
+ {
+ { 0x00, 0x93, 0x3b, 0xa8, 0x76, 0xe5, 0x4d, 0xde, 0xec, 0x7f, 0xd7, 0x44, 0x9a, 0x09, 0xa1, 0x32 },
+ { 0x00, 0xc5, 0x97, 0x52, 0x33, 0xf6, 0xa4, 0x61, 0x66, 0xa3, 0xf1, 0x34, 0x55, 0x90, 0xc2, 0x07 },
+ },
+ {
+ { 0x00, 0x94, 0x35, 0xa1, 0x6a, 0xfe, 0x5f, 0xcb, 0xd4, 0x40, 0xe1, 0x75, 0xbe, 0x2a, 0x8b, 0x1f },
+ { 0x00, 0xb5, 0x77, 0xc2, 0xee, 0x5b, 0x99, 0x2c, 0xc1, 0x74, 0xb6, 0x03, 0x2f, 0x9a, 0x58, 0xed },
+ },
+ {
+ { 0x00, 0x95, 0x37, 0xa2, 0x6e, 0xfb, 0x59, 0xcc, 0xdc, 0x49, 0xeb, 0x7e, 0xb2, 0x27, 0x85, 0x10 },
+ { 0x00, 0xa5, 0x57, 0xf2, 0xae, 0x0b, 0xf9, 0x5c, 0x41, 0xe4, 0x16, 0xb3, 0xef, 0x4a, 0xb8, 0x1d },
+ },
+ {
+ { 0x00, 0x96, 0x31, 0xa7, 0x62, 0xf4, 0x53, 0xc5, 0xc4, 0x52, 0xf5, 0x63, 0xa6, 0x30, 0x97, 0x01 },
+ { 0x00, 0x95, 0x37, 0xa2, 0x6e, 0xfb, 0x59, 0xcc, 0xdc, 0x49, 0xeb, 0x7e, 0xb2, 0x27, 0x85, 0x10 },
+ },
+ {
+ { 0x00, 0x97, 0x33, 0xa4, 0x66, 0xf1, 0x55, 0xc2, 0xcc, 0x5b, 0xff, 0x68, 0xaa, 0x3d, 0x99, 0x0e },
+ { 0x00, 0x85, 0x17, 0x92, 0x2e, 0xab, 0x39, 0xbc, 0x5c, 0xd9, 0x4b, 0xce, 0x72, 0xf7, 0x65, 0xe0 },
+ },
+ {
+ { 0x00, 0x98, 0x2d, 0xb5, 0x5a, 0xc2, 0x77, 0xef, 0xb4, 0x2c, 0x99, 0x01, 0xee, 0x76, 0xc3, 0x5b },
+ { 0x00, 0x75, 0xea, 0x9f, 0xc9, 0xbc, 0x23, 0x56, 0x8f, 0xfa, 0x65, 0x10, 0x46, 0x33, 0xac, 0xd9 },
+ },
+ {
+ { 0x00, 0x99, 0x2f, 0xb6, 0x5e, 0xc7, 0x71, 0xe8, 0xbc, 0x25, 0x93, 0x0a, 0xe2, 0x7b, 0xcd, 0x54 },
+ { 0x00, 0x65, 0xca, 0xaf, 0x89, 0xec, 0x43, 0x26, 0x0f, 0x6a, 0xc5, 0xa0, 0x86, 0xe3, 0x4c, 0x29 },
+ },
+ {
+ { 0x00, 0x9a, 0x29, 0xb3, 0x52, 0xc8, 0x7b, 0xe1, 0xa4, 0x3e, 0x8d, 0x17, 0xf6, 0x6c, 0xdf, 0x45 },
+ { 0x00, 0x55, 0xaa, 0xff, 0x49, 0x1c, 0xe3, 0xb6, 0x92, 0xc7, 0x38, 0x6d, 0xdb, 0x8e, 0x71, 0x24 },
+ },
+ {
+ { 0x00, 0x9b, 0x2b, 0xb0, 0x56, 0xcd, 0x7d, 0xe6, 0xac, 0x37, 0x87, 0x1c, 0xfa, 0x61, 0xd1, 0x4a },
+ { 0x00, 0x45, 0x8a, 0xcf, 0x09, 0x4c, 0x83, 0xc6, 0x12, 0x57, 0x98, 0xdd, 0x1b, 0x5e, 0x91, 0xd4 },
+ },
+ {
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ { 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, 0xb5, 0x80, 0xdf, 0xea, 0x61, 0x54, 0x0b, 0x3e },
+ },
+ {
+ { 0x00, 0x9d, 0x27, 0xba, 0x4e, 0xd3, 0x69, 0xf4, 0x9c, 0x01, 0xbb, 0x26, 0xd2, 0x4f, 0xf5, 0x68 },
+ { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, 0x35, 0x10, 0x7f, 0x5a, 0xa1, 0x84, 0xeb, 0xce },
+ },
+ {
+ { 0x00, 0x9e, 0x21, 0xbf, 0x42, 0xdc, 0x63, 0xfd, 0x84, 0x1a, 0xa5, 0x3b, 0xc6, 0x58, 0xe7, 0x79 },
+ { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 },
+ },
+ {
+ { 0x00, 0x9f, 0x23, 0xbc, 0x46, 0xd9, 0x65, 0xfa, 0x8c, 0x13, 0xaf, 0x30, 0xca, 0x55, 0xe9, 0x76 },
+ { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b, 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 },
+ },
+ {
+ { 0x00, 0xa0, 0x5d, 0xfd, 0xba, 0x1a, 0xe7, 0x47, 0x69, 0xc9, 0x34, 0x94, 0xd3, 0x73, 0x8e, 0x2e },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ },
+ {
+ { 0x00, 0xa1, 0x5f, 0xfe, 0xbe, 0x1f, 0xe1, 0x40, 0x61, 0xc0, 0x3e, 0x9f, 0xdf, 0x7e, 0x80, 0x21 },
+ { 0x00, 0xc2, 0x99, 0x5b, 0x2f, 0xed, 0xb6, 0x74, 0x5e, 0x9c, 0xc7, 0x05, 0x71, 0xb3, 0xe8, 0x2a },
+ },
+ {
+ { 0x00, 0xa2, 0x59, 0xfb, 0xb2, 0x10, 0xeb, 0x49, 0x79, 0xdb, 0x20, 0x82, 0xcb, 0x69, 0x92, 0x30 },
+ { 0x00, 0xf2, 0xf9, 0x0b, 0xef, 0x1d, 0x16, 0xe4, 0xc3, 0x31, 0x3a, 0xc8, 0x2c, 0xde, 0xd5, 0x27 },
+ },
+ {
+ { 0x00, 0xa3, 0x5b, 0xf8, 0xb6, 0x15, 0xed, 0x4e, 0x71, 0xd2, 0x2a, 0x89, 0xc7, 0x64, 0x9c, 0x3f },
+ { 0x00, 0xe2, 0xd9, 0x3b, 0xaf, 0x4d, 0x76, 0x94, 0x43, 0xa1, 0x9a, 0x78, 0xec, 0x0e, 0x35, 0xd7 },
+ },
+ {
+ { 0x00, 0xa4, 0x55, 0xf1, 0xaa, 0x0e, 0xff, 0x5b, 0x49, 0xed, 0x1c, 0xb8, 0xe3, 0x47, 0xb6, 0x12 },
+ { 0x00, 0x92, 0x39, 0xab, 0x72, 0xe0, 0x4b, 0xd9, 0xe4, 0x76, 0xdd, 0x4f, 0x96, 0x04, 0xaf, 0x3d },
+ },
+ {
+ { 0x00, 0xa5, 0x57, 0xf2, 0xae, 0x0b, 0xf9, 0x5c, 0x41, 0xe4, 0x16, 0xb3, 0xef, 0x4a, 0xb8, 0x1d },
+ { 0x00, 0x82, 0x19, 0x9b, 0x32, 0xb0, 0x2b, 0xa9, 0x64, 0xe6, 0x7d, 0xff, 0x56, 0xd4, 0x4f, 0xcd },
+ },
+ {
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ { 0x00, 0xb2, 0x79, 0xcb, 0xf2, 0x40, 0x8b, 0x39, 0xf9, 0x4b, 0x80, 0x32, 0x0b, 0xb9, 0x72, 0xc0 },
+ },
+ {
+ { 0x00, 0xa7, 0x53, 0xf4, 0xa6, 0x01, 0xf5, 0x52, 0x51, 0xf6, 0x02, 0xa5, 0xf7, 0x50, 0xa4, 0x03 },
+ { 0x00, 0xa2, 0x59, 0xfb, 0xb2, 0x10, 0xeb, 0x49, 0x79, 0xdb, 0x20, 0x82, 0xcb, 0x69, 0x92, 0x30 },
+ },
+ {
+ { 0x00, 0xa8, 0x4d, 0xe5, 0x9a, 0x32, 0xd7, 0x7f, 0x29, 0x81, 0x64, 0xcc, 0xb3, 0x1b, 0xfe, 0x56 },
+ { 0x00, 0x52, 0xa4, 0xf6, 0x55, 0x07, 0xf1, 0xa3, 0xaa, 0xf8, 0x0e, 0x5c, 0xff, 0xad, 0x5b, 0x09 },
+ },
+ {
+ { 0x00, 0xa9, 0x4f, 0xe6, 0x9e, 0x37, 0xd1, 0x78, 0x21, 0x88, 0x6e, 0xc7, 0xbf, 0x16, 0xf0, 0x59 },
+ { 0x00, 0x42, 0x84, 0xc6, 0x15, 0x57, 0x91, 0xd3, 0x2a, 0x68, 0xae, 0xec, 0x3f, 0x7d, 0xbb, 0xf9 },
+ },
+ {
+ { 0x00, 0xaa, 0x49, 0xe3, 0x92, 0x38, 0xdb, 0x71, 0x39, 0x93, 0x70, 0xda, 0xab, 0x01, 0xe2, 0x48 },
+ { 0x00, 0x72, 0xe4, 0x96, 0xd5, 0xa7, 0x31, 0x43, 0xb7, 0xc5, 0x53, 0x21, 0x62, 0x10, 0x86, 0xf4 },
+ },
+ {
+ { 0x00, 0xab, 0x4b, 0xe0, 0x96, 0x3d, 0xdd, 0x76, 0x31, 0x9a, 0x7a, 0xd1, 0xa7, 0x0c, 0xec, 0x47 },
+ { 0x00, 0x62, 0xc4, 0xa6, 0x95, 0xf7, 0x51, 0x33, 0x37, 0x55, 0xf3, 0x91, 0xa2, 0xc0, 0x66, 0x04 },
+ },
+ {
+ { 0x00, 0xac, 0x45, 0xe9, 0x8a, 0x26, 0xcf, 0x63, 0x09, 0xa5, 0x4c, 0xe0, 0x83, 0x2f, 0xc6, 0x6a },
+ { 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee },
+ },
+ {
+ { 0x00, 0xad, 0x47, 0xea, 0x8e, 0x23, 0xc9, 0x64, 0x01, 0xac, 0x46, 0xeb, 0x8f, 0x22, 0xc8, 0x65 },
+ { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e },
+ },
+ {
+ { 0x00, 0xae, 0x41, 0xef, 0x82, 0x2c, 0xc3, 0x6d, 0x19, 0xb7, 0x58, 0xf6, 0x9b, 0x35, 0xda, 0x74 },
+ { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, 0x8d, 0xbf, 0xe9, 0xdb, 0x45, 0x77, 0x21, 0x13 },
+ },
+ {
+ { 0x00, 0xaf, 0x43, 0xec, 0x86, 0x29, 0xc5, 0x6a, 0x11, 0xbe, 0x52, 0xfd, 0x97, 0x38, 0xd4, 0x7b },
+ { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0x0d, 0x2f, 0x49, 0x6b, 0x85, 0xa7, 0xc1, 0xe3 },
+ },
+ {
+ { 0x00, 0xb0, 0x7d, 0xcd, 0xfa, 0x4a, 0x87, 0x37, 0xe9, 0x59, 0x94, 0x24, 0x13, 0xa3, 0x6e, 0xde },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ },
+ {
+ { 0x00, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x30, 0xe1, 0x50, 0x9e, 0x2f, 0x1f, 0xae, 0x60, 0xd1 },
+ { 0x00, 0xdf, 0xa3, 0x7c, 0x5b, 0x84, 0xf8, 0x27, 0xb6, 0x69, 0x15, 0xca, 0xed, 0x32, 0x4e, 0x91 },
+ },
+ {
+ { 0x00, 0xb2, 0x79, 0xcb, 0xf2, 0x40, 0x8b, 0x39, 0xf9, 0x4b, 0x80, 0x32, 0x0b, 0xb9, 0x72, 0xc0 },
+ { 0x00, 0xef, 0xc3, 0x2c, 0x9b, 0x74, 0x58, 0xb7, 0x2b, 0xc4, 0xe8, 0x07, 0xb0, 0x5f, 0x73, 0x9c },
+ },
+ {
+ { 0x00, 0xb3, 0x7b, 0xc8, 0xf6, 0x45, 0x8d, 0x3e, 0xf1, 0x42, 0x8a, 0x39, 0x07, 0xb4, 0x7c, 0xcf },
+ { 0x00, 0xff, 0xe3, 0x1c, 0xdb, 0x24, 0x38, 0xc7, 0xab, 0x54, 0x48, 0xb7, 0x70, 0x8f, 0x93, 0x6c },
+ },
+ {
+ { 0x00, 0xb4, 0x75, 0xc1, 0xea, 0x5e, 0x9f, 0x2b, 0xc9, 0x7d, 0xbc, 0x08, 0x23, 0x97, 0x56, 0xe2 },
+ { 0x00, 0x8f, 0x03, 0x8c, 0x06, 0x89, 0x05, 0x8a, 0x0c, 0x83, 0x0f, 0x80, 0x0a, 0x85, 0x09, 0x86 },
+ },
+ {
+ { 0x00, 0xb5, 0x77, 0xc2, 0xee, 0x5b, 0x99, 0x2c, 0xc1, 0x74, 0xb6, 0x03, 0x2f, 0x9a, 0x58, 0xed },
+ { 0x00, 0x9f, 0x23, 0xbc, 0x46, 0xd9, 0x65, 0xfa, 0x8c, 0x13, 0xaf, 0x30, 0xca, 0x55, 0xe9, 0x76 },
+ },
+ {
+ { 0x00, 0xb6, 0x71, 0xc7, 0xe2, 0x54, 0x93, 0x25, 0xd9, 0x6f, 0xa8, 0x1e, 0x3b, 0x8d, 0x4a, 0xfc },
+ { 0x00, 0xaf, 0x43, 0xec, 0x86, 0x29, 0xc5, 0x6a, 0x11, 0xbe, 0x52, 0xfd, 0x97, 0x38, 0xd4, 0x7b },
+ },
+ {
+ { 0x00, 0xb7, 0x73, 0xc4, 0xe6, 0x51, 0x95, 0x22, 0xd1, 0x66, 0xa2, 0x15, 0x37, 0x80, 0x44, 0xf3 },
+ { 0x00, 0xbf, 0x63, 0xdc, 0xc6, 0x79, 0xa5, 0x1a, 0x91, 0x2e, 0xf2, 0x4d, 0x57, 0xe8, 0x34, 0x8b },
+ },
+ {
+ { 0x00, 0xb8, 0x6d, 0xd5, 0xda, 0x62, 0xb7, 0x0f, 0xa9, 0x11, 0xc4, 0x7c, 0x73, 0xcb, 0x1e, 0xa6 },
+ { 0x00, 0x4f, 0x9e, 0xd1, 0x21, 0x6e, 0xbf, 0xf0, 0x42, 0x0d, 0xdc, 0x93, 0x63, 0x2c, 0xfd, 0xb2 },
+ },
+ {
+ { 0x00, 0xb9, 0x6f, 0xd6, 0xde, 0x67, 0xb1, 0x08, 0xa1, 0x18, 0xce, 0x77, 0x7f, 0xc6, 0x10, 0xa9 },
+ { 0x00, 0x5f, 0xbe, 0xe1, 0x61, 0x3e, 0xdf, 0x80, 0xc2, 0x9d, 0x7c, 0x23, 0xa3, 0xfc, 0x1d, 0x42 },
+ },
+ {
+ { 0x00, 0xba, 0x69, 0xd3, 0xd2, 0x68, 0xbb, 0x01, 0xb9, 0x03, 0xd0, 0x6a, 0x6b, 0xd1, 0x02, 0xb8 },
+ { 0x00, 0x6f, 0xde, 0xb1, 0xa1, 0xce, 0x7f, 0x10, 0x5f, 0x30, 0x81, 0xee, 0xfe, 0x91, 0x20, 0x4f },
+ },
+ {
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ { 0x00, 0x7f, 0xfe, 0x81, 0xe1, 0x9e, 0x1f, 0x60, 0xdf, 0xa0, 0x21, 0x5e, 0x3e, 0x41, 0xc0, 0xbf },
+ },
+ {
+ { 0x00, 0xbc, 0x65, 0xd9, 0xca, 0x76, 0xaf, 0x13, 0x89, 0x35, 0xec, 0x50, 0x43, 0xff, 0x26, 0x9a },
+ { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 },
+ },
+ {
+ { 0x00, 0xbd, 0x67, 0xda, 0xce, 0x73, 0xa9, 0x14, 0x81, 0x3c, 0xe6, 0x5b, 0x4f, 0xf2, 0x28, 0x95 },
+ { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 },
+ },
+ {
+ { 0x00, 0xbe, 0x61, 0xdf, 0xc2, 0x7c, 0xa3, 0x1d, 0x99, 0x27, 0xf8, 0x46, 0x5b, 0xe5, 0x3a, 0x84 },
+ { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, 0x65, 0x4a, 0x3b, 0x14, 0xd9, 0xf6, 0x87, 0xa8 },
+ },
+ {
+ { 0x00, 0xbf, 0x63, 0xdc, 0xc6, 0x79, 0xa5, 0x1a, 0x91, 0x2e, 0xf2, 0x4d, 0x57, 0xe8, 0x34, 0x8b },
+ { 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, 0xe5, 0xda, 0x9b, 0xa4, 0x19, 0x26, 0x67, 0x58 },
+ },
+ {
+ { 0x00, 0xc0, 0x9d, 0x5d, 0x27, 0xe7, 0xba, 0x7a, 0x4e, 0x8e, 0xd3, 0x13, 0x69, 0xa9, 0xf4, 0x34 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ },
+ {
+ { 0x00, 0xc1, 0x9f, 0x5e, 0x23, 0xe2, 0xbc, 0x7d, 0x46, 0x87, 0xd9, 0x18, 0x65, 0xa4, 0xfa, 0x3b },
+ { 0x00, 0x8c, 0x05, 0x89, 0x0a, 0x86, 0x0f, 0x83, 0x14, 0x98, 0x11, 0x9d, 0x1e, 0x92, 0x1b, 0x97 },
+ },
+ {
+ { 0x00, 0xc2, 0x99, 0x5b, 0x2f, 0xed, 0xb6, 0x74, 0x5e, 0x9c, 0xc7, 0x05, 0x71, 0xb3, 0xe8, 0x2a },
+ { 0x00, 0xbc, 0x65, 0xd9, 0xca, 0x76, 0xaf, 0x13, 0x89, 0x35, 0xec, 0x50, 0x43, 0xff, 0x26, 0x9a },
+ },
+ {
+ { 0x00, 0xc3, 0x9b, 0x58, 0x2b, 0xe8, 0xb0, 0x73, 0x56, 0x95, 0xcd, 0x0e, 0x7d, 0xbe, 0xe6, 0x25 },
+ { 0x00, 0xac, 0x45, 0xe9, 0x8a, 0x26, 0xcf, 0x63, 0x09, 0xa5, 0x4c, 0xe0, 0x83, 0x2f, 0xc6, 0x6a },
+ },
+ {
+ { 0x00, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0x66, 0x6e, 0xaa, 0xfb, 0x3f, 0x59, 0x9d, 0xcc, 0x08 },
+ { 0x00, 0xdc, 0xa5, 0x79, 0x57, 0x8b, 0xf2, 0x2e, 0xae, 0x72, 0x0b, 0xd7, 0xf9, 0x25, 0x5c, 0x80 },
+ },
+ {
+ { 0x00, 0xc5, 0x97, 0x52, 0x33, 0xf6, 0xa4, 0x61, 0x66, 0xa3, 0xf1, 0x34, 0x55, 0x90, 0xc2, 0x07 },
+ { 0x00, 0xcc, 0x85, 0x49, 0x17, 0xdb, 0x92, 0x5e, 0x2e, 0xe2, 0xab, 0x67, 0x39, 0xf5, 0xbc, 0x70 },
+ },
+ {
+ { 0x00, 0xc6, 0x91, 0x57, 0x3f, 0xf9, 0xae, 0x68, 0x7e, 0xb8, 0xef, 0x29, 0x41, 0x87, 0xd0, 0x16 },
+ { 0x00, 0xfc, 0xe5, 0x19, 0xd7, 0x2b, 0x32, 0xce, 0xb3, 0x4f, 0x56, 0xaa, 0x64, 0x98, 0x81, 0x7d },
+ },
+ {
+ { 0x00, 0xc7, 0x93, 0x54, 0x3b, 0xfc, 0xa8, 0x6f, 0x76, 0xb1, 0xe5, 0x22, 0x4d, 0x8a, 0xde, 0x19 },
+ { 0x00, 0xec, 0xc5, 0x29, 0x97, 0x7b, 0x52, 0xbe, 0x33, 0xdf, 0xf6, 0x1a, 0xa4, 0x48, 0x61, 0x8d },
+ },
+ {
+ { 0x00, 0xc8, 0x8d, 0x45, 0x07, 0xcf, 0x8a, 0x42, 0x0e, 0xc6, 0x83, 0x4b, 0x09, 0xc1, 0x84, 0x4c },
+ { 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4 },
+ },
+ {
+ { 0x00, 0xc9, 0x8f, 0x46, 0x03, 0xca, 0x8c, 0x45, 0x06, 0xcf, 0x89, 0x40, 0x05, 0xcc, 0x8a, 0x43 },
+ { 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44 },
+ },
+ {
+ { 0x00, 0xca, 0x89, 0x43, 0x0f, 0xc5, 0x86, 0x4c, 0x1e, 0xd4, 0x97, 0x5d, 0x11, 0xdb, 0x98, 0x52 },
+ { 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, 0xfd, 0xc1, 0x85, 0xb9, 0x0d, 0x31, 0x75, 0x49 },
+ },
+ {
+ { 0x00, 0xcb, 0x8b, 0x40, 0x0b, 0xc0, 0x80, 0x4b, 0x16, 0xdd, 0x9d, 0x56, 0x1d, 0xd6, 0x96, 0x5d },
+ { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, 0x7d, 0x51, 0x25, 0x09, 0xcd, 0xe1, 0x95, 0xb9 },
+ },
+ {
+ { 0x00, 0xcc, 0x85, 0x49, 0x17, 0xdb, 0x92, 0x5e, 0x2e, 0xe2, 0xab, 0x67, 0x39, 0xf5, 0xbc, 0x70 },
+ { 0x00, 0x5c, 0xb8, 0xe4, 0x6d, 0x31, 0xd5, 0x89, 0xda, 0x86, 0x62, 0x3e, 0xb7, 0xeb, 0x0f, 0x53 },
+ },
+ {
+ { 0x00, 0xcd, 0x87, 0x4a, 0x13, 0xde, 0x94, 0x59, 0x26, 0xeb, 0xa1, 0x6c, 0x35, 0xf8, 0xb2, 0x7f },
+ { 0x00, 0x4c, 0x98, 0xd4, 0x2d, 0x61, 0xb5, 0xf9, 0x5a, 0x16, 0xc2, 0x8e, 0x77, 0x3b, 0xef, 0xa3 },
+ },
+ {
+ { 0x00, 0xce, 0x81, 0x4f, 0x1f, 0xd1, 0x9e, 0x50, 0x3e, 0xf0, 0xbf, 0x71, 0x21, 0xef, 0xa0, 0x6e },
+ { 0x00, 0x7c, 0xf8, 0x84, 0xed, 0x91, 0x15, 0x69, 0xc7, 0xbb, 0x3f, 0x43, 0x2a, 0x56, 0xd2, 0xae },
+ },
+ {
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ { 0x00, 0x6c, 0xd8, 0xb4, 0xad, 0xc1, 0x75, 0x19, 0x47, 0x2b, 0x9f, 0xf3, 0xea, 0x86, 0x32, 0x5e },
+ },
+ {
+ { 0x00, 0xd0, 0xbd, 0x6d, 0x67, 0xb7, 0xda, 0x0a, 0xce, 0x1e, 0x73, 0xa3, 0xa9, 0x79, 0x14, 0xc4 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ },
+ {
+ { 0x00, 0xd1, 0xbf, 0x6e, 0x63, 0xb2, 0xdc, 0x0d, 0xc6, 0x17, 0x79, 0xa8, 0xa5, 0x74, 0x1a, 0xcb },
+ { 0x00, 0x91, 0x3f, 0xae, 0x7e, 0xef, 0x41, 0xd0, 0xfc, 0x6d, 0xc3, 0x52, 0x82, 0x13, 0xbd, 0x2c },
+ },
+ {
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ { 0x00, 0xa1, 0x5f, 0xfe, 0xbe, 0x1f, 0xe1, 0x40, 0x61, 0xc0, 0x3e, 0x9f, 0xdf, 0x7e, 0x80, 0x21 },
+ },
+ {
+ { 0x00, 0xd3, 0xbb, 0x68, 0x6b, 0xb8, 0xd0, 0x03, 0xd6, 0x05, 0x6d, 0xbe, 0xbd, 0x6e, 0x06, 0xd5 },
+ { 0x00, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x30, 0xe1, 0x50, 0x9e, 0x2f, 0x1f, 0xae, 0x60, 0xd1 },
+ },
+ {
+ { 0x00, 0xd4, 0xb5, 0x61, 0x77, 0xa3, 0xc2, 0x16, 0xee, 0x3a, 0x5b, 0x8f, 0x99, 0x4d, 0x2c, 0xf8 },
+ { 0x00, 0xc1, 0x9f, 0x5e, 0x23, 0xe2, 0xbc, 0x7d, 0x46, 0x87, 0xd9, 0x18, 0x65, 0xa4, 0xfa, 0x3b },
+ },
+ {
+ { 0x00, 0xd5, 0xb7, 0x62, 0x73, 0xa6, 0xc4, 0x11, 0xe6, 0x33, 0x51, 0x84, 0x95, 0x40, 0x22, 0xf7 },
+ { 0x00, 0xd1, 0xbf, 0x6e, 0x63, 0xb2, 0xdc, 0x0d, 0xc6, 0x17, 0x79, 0xa8, 0xa5, 0x74, 0x1a, 0xcb },
+ },
+ {
+ { 0x00, 0xd6, 0xb1, 0x67, 0x7f, 0xa9, 0xce, 0x18, 0xfe, 0x28, 0x4f, 0x99, 0x81, 0x57, 0x30, 0xe6 },
+ { 0x00, 0xe1, 0xdf, 0x3e, 0xa3, 0x42, 0x7c, 0x9d, 0x5b, 0xba, 0x84, 0x65, 0xf8, 0x19, 0x27, 0xc6 },
+ },
+ {
+ { 0x00, 0xd7, 0xb3, 0x64, 0x7b, 0xac, 0xc8, 0x1f, 0xf6, 0x21, 0x45, 0x92, 0x8d, 0x5a, 0x3e, 0xe9 },
+ { 0x00, 0xf1, 0xff, 0x0e, 0xe3, 0x12, 0x1c, 0xed, 0xdb, 0x2a, 0x24, 0xd5, 0x38, 0xc9, 0xc7, 0x36 },
+ },
+ {
+ { 0x00, 0xd8, 0xad, 0x75, 0x47, 0x9f, 0xea, 0x32, 0x8e, 0x56, 0x23, 0xfb, 0xc9, 0x11, 0x64, 0xbc },
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+ },
+ {
+ { 0x00, 0xd9, 0xaf, 0x76, 0x43, 0x9a, 0xec, 0x35, 0x86, 0x5f, 0x29, 0xf0, 0xc5, 0x1c, 0x6a, 0xb3 },
+ { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff },
+ },
+ {
+ { 0x00, 0xda, 0xa9, 0x73, 0x4f, 0x95, 0xe6, 0x3c, 0x9e, 0x44, 0x37, 0xed, 0xd1, 0x0b, 0x78, 0xa2 },
+ { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, 0x15, 0x34, 0x57, 0x76, 0x91, 0xb0, 0xd3, 0xf2 },
+ },
+ {
+ { 0x00, 0xdb, 0xab, 0x70, 0x4b, 0x90, 0xe0, 0x3b, 0x96, 0x4d, 0x3d, 0xe6, 0xdd, 0x06, 0x76, 0xad },
+ { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, 0x95, 0xa4, 0xf7, 0xc6, 0x51, 0x60, 0x33, 0x02 },
+ },
+ {
+ { 0x00, 0xdc, 0xa5, 0x79, 0x57, 0x8b, 0xf2, 0x2e, 0xae, 0x72, 0x0b, 0xd7, 0xf9, 0x25, 0x5c, 0x80 },
+ { 0x00, 0x41, 0x82, 0xc3, 0x19, 0x58, 0x9b, 0xda, 0x32, 0x73, 0xb0, 0xf1, 0x2b, 0x6a, 0xa9, 0xe8 },
+ },
+ {
+ { 0x00, 0xdd, 0xa7, 0x7a, 0x53, 0x8e, 0xf4, 0x29, 0xa6, 0x7b, 0x01, 0xdc, 0xf5, 0x28, 0x52, 0x8f },
+ { 0x00, 0x51, 0xa2, 0xf3, 0x59, 0x08, 0xfb, 0xaa, 0xb2, 0xe3, 0x10, 0x41, 0xeb, 0xba, 0x49, 0x18 },
+ },
+ {
+ { 0x00, 0xde, 0xa1, 0x7f, 0x5f, 0x81, 0xfe, 0x20, 0xbe, 0x60, 0x1f, 0xc1, 0xe1, 0x3f, 0x40, 0x9e },
+ { 0x00, 0x61, 0xc2, 0xa3, 0x99, 0xf8, 0x5b, 0x3a, 0x2f, 0x4e, 0xed, 0x8c, 0xb6, 0xd7, 0x74, 0x15 },
+ },
+ {
+ { 0x00, 0xdf, 0xa3, 0x7c, 0x5b, 0x84, 0xf8, 0x27, 0xb6, 0x69, 0x15, 0xca, 0xed, 0x32, 0x4e, 0x91 },
+ { 0x00, 0x71, 0xe2, 0x93, 0xd9, 0xa8, 0x3b, 0x4a, 0xaf, 0xde, 0x4d, 0x3c, 0x76, 0x07, 0x94, 0xe5 },
+ },
+ {
+ { 0x00, 0xe0, 0xdd, 0x3d, 0xa7, 0x47, 0x7a, 0x9a, 0x53, 0xb3, 0x8e, 0x6e, 0xf4, 0x14, 0x29, 0xc9 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ },
+ {
+ { 0x00, 0xe1, 0xdf, 0x3e, 0xa3, 0x42, 0x7c, 0x9d, 0x5b, 0xba, 0x84, 0x65, 0xf8, 0x19, 0x27, 0xc6 },
+ { 0x00, 0xb6, 0x71, 0xc7, 0xe2, 0x54, 0x93, 0x25, 0xd9, 0x6f, 0xa8, 0x1e, 0x3b, 0x8d, 0x4a, 0xfc },
+ },
+ {
+ { 0x00, 0xe2, 0xd9, 0x3b, 0xaf, 0x4d, 0x76, 0x94, 0x43, 0xa1, 0x9a, 0x78, 0xec, 0x0e, 0x35, 0xd7 },
+ { 0x00, 0x86, 0x11, 0x97, 0x22, 0xa4, 0x33, 0xb5, 0x44, 0xc2, 0x55, 0xd3, 0x66, 0xe0, 0x77, 0xf1 },
+ },
+ {
+ { 0x00, 0xe3, 0xdb, 0x38, 0xab, 0x48, 0x70, 0x93, 0x4b, 0xa8, 0x90, 0x73, 0xe0, 0x03, 0x3b, 0xd8 },
+ { 0x00, 0x96, 0x31, 0xa7, 0x62, 0xf4, 0x53, 0xc5, 0xc4, 0x52, 0xf5, 0x63, 0xa6, 0x30, 0x97, 0x01 },
+ },
+ {
+ { 0x00, 0xe4, 0xd5, 0x31, 0xb7, 0x53, 0x62, 0x86, 0x73, 0x97, 0xa6, 0x42, 0xc4, 0x20, 0x11, 0xf5 },
+ { 0x00, 0xe6, 0xd1, 0x37, 0xbf, 0x59, 0x6e, 0x88, 0x63, 0x85, 0xb2, 0x54, 0xdc, 0x3a, 0x0d, 0xeb },
+ },
+ {
+ { 0x00, 0xe5, 0xd7, 0x32, 0xb3, 0x56, 0x64, 0x81, 0x7b, 0x9e, 0xac, 0x49, 0xc8, 0x2d, 0x1f, 0xfa },
+ { 0x00, 0xf6, 0xf1, 0x07, 0xff, 0x09, 0x0e, 0xf8, 0xe3, 0x15, 0x12, 0xe4, 0x1c, 0xea, 0xed, 0x1b },
+ },
+ {
+ { 0x00, 0xe6, 0xd1, 0x37, 0xbf, 0x59, 0x6e, 0x88, 0x63, 0x85, 0xb2, 0x54, 0xdc, 0x3a, 0x0d, 0xeb },
+ { 0x00, 0xc6, 0x91, 0x57, 0x3f, 0xf9, 0xae, 0x68, 0x7e, 0xb8, 0xef, 0x29, 0x41, 0x87, 0xd0, 0x16 },
+ },
+ {
+ { 0x00, 0xe7, 0xd3, 0x34, 0xbb, 0x5c, 0x68, 0x8f, 0x6b, 0x8c, 0xb8, 0x5f, 0xd0, 0x37, 0x03, 0xe4 },
+ { 0x00, 0xd6, 0xb1, 0x67, 0x7f, 0xa9, 0xce, 0x18, 0xfe, 0x28, 0x4f, 0x99, 0x81, 0x57, 0x30, 0xe6 },
+ },
+ {
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, 0x2d, 0x0b, 0x61, 0x47, 0xb5, 0x93, 0xf9, 0xdf },
+ },
+ {
+ { 0x00, 0xe9, 0xcf, 0x26, 0x83, 0x6a, 0x4c, 0xa5, 0x1b, 0xf2, 0xd4, 0x3d, 0x98, 0x71, 0x57, 0xbe },
+ { 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, 0xad, 0x9b, 0xc1, 0xf7, 0x75, 0x43, 0x19, 0x2f },
+ },
+ {
+ { 0x00, 0xea, 0xc9, 0x23, 0x8f, 0x65, 0x46, 0xac, 0x03, 0xe9, 0xca, 0x20, 0x8c, 0x66, 0x45, 0xaf },
+ { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12, 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 },
+ },
+ {
+ { 0x00, 0xeb, 0xcb, 0x20, 0x8b, 0x60, 0x40, 0xab, 0x0b, 0xe0, 0xc0, 0x2b, 0x80, 0x6b, 0x4b, 0xa0 },
+ { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 },
+ },
+ {
+ { 0x00, 0xec, 0xc5, 0x29, 0x97, 0x7b, 0x52, 0xbe, 0x33, 0xdf, 0xf6, 0x1a, 0xa4, 0x48, 0x61, 0x8d },
+ { 0x00, 0x66, 0xcc, 0xaa, 0x85, 0xe3, 0x49, 0x2f, 0x17, 0x71, 0xdb, 0xbd, 0x92, 0xf4, 0x5e, 0x38 },
+ },
+ {
+ { 0x00, 0xed, 0xc7, 0x2a, 0x93, 0x7e, 0x54, 0xb9, 0x3b, 0xd6, 0xfc, 0x11, 0xa8, 0x45, 0x6f, 0x82 },
+ { 0x00, 0x76, 0xec, 0x9a, 0xc5, 0xb3, 0x29, 0x5f, 0x97, 0xe1, 0x7b, 0x0d, 0x52, 0x24, 0xbe, 0xc8 },
+ },
+ {
+ { 0x00, 0xee, 0xc1, 0x2f, 0x9f, 0x71, 0x5e, 0xb0, 0x23, 0xcd, 0xe2, 0x0c, 0xbc, 0x52, 0x7d, 0x93 },
+ { 0x00, 0x46, 0x8c, 0xca, 0x05, 0x43, 0x89, 0xcf, 0x0a, 0x4c, 0x86, 0xc0, 0x0f, 0x49, 0x83, 0xc5 },
+ },
+ {
+ { 0x00, 0xef, 0xc3, 0x2c, 0x9b, 0x74, 0x58, 0xb7, 0x2b, 0xc4, 0xe8, 0x07, 0xb0, 0x5f, 0x73, 0x9c },
+ { 0x00, 0x56, 0xac, 0xfa, 0x45, 0x13, 0xe9, 0xbf, 0x8a, 0xdc, 0x26, 0x70, 0xcf, 0x99, 0x63, 0x35 },
+ },
+ {
+ { 0x00, 0xf0, 0xfd, 0x0d, 0xe7, 0x17, 0x1a, 0xea, 0xd3, 0x23, 0x2e, 0xde, 0x34, 0xc4, 0xc9, 0x39 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ },
+ {
+ { 0x00, 0xf1, 0xff, 0x0e, 0xe3, 0x12, 0x1c, 0xed, 0xdb, 0x2a, 0x24, 0xd5, 0x38, 0xc9, 0xc7, 0x36 },
+ { 0x00, 0xab, 0x4b, 0xe0, 0x96, 0x3d, 0xdd, 0x76, 0x31, 0x9a, 0x7a, 0xd1, 0xa7, 0x0c, 0xec, 0x47 },
+ },
+ {
+ { 0x00, 0xf2, 0xf9, 0x0b, 0xef, 0x1d, 0x16, 0xe4, 0xc3, 0x31, 0x3a, 0xc8, 0x2c, 0xde, 0xd5, 0x27 },
+ { 0x00, 0x9b, 0x2b, 0xb0, 0x56, 0xcd, 0x7d, 0xe6, 0xac, 0x37, 0x87, 0x1c, 0xfa, 0x61, 0xd1, 0x4a },
+ },
+ {
+ { 0x00, 0xf3, 0xfb, 0x08, 0xeb, 0x18, 0x10, 0xe3, 0xcb, 0x38, 0x30, 0xc3, 0x20, 0xd3, 0xdb, 0x28 },
+ { 0x00, 0x8b, 0x0b, 0x80, 0x16, 0x9d, 0x1d, 0x96, 0x2c, 0xa7, 0x27, 0xac, 0x3a, 0xb1, 0x31, 0xba },
+ },
+ {
+ { 0x00, 0xf4, 0xf5, 0x01, 0xf7, 0x03, 0x02, 0xf6, 0xf3, 0x07, 0x06, 0xf2, 0x04, 0xf0, 0xf1, 0x05 },
+ { 0x00, 0xfb, 0xeb, 0x10, 0xcb, 0x30, 0x20, 0xdb, 0x8b, 0x70, 0x60, 0x9b, 0x40, 0xbb, 0xab, 0x50 },
+ },
+ {
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ { 0x00, 0xeb, 0xcb, 0x20, 0x8b, 0x60, 0x40, 0xab, 0x0b, 0xe0, 0xc0, 0x2b, 0x80, 0x6b, 0x4b, 0xa0 },
+ },
+ {
+ { 0x00, 0xf6, 0xf1, 0x07, 0xff, 0x09, 0x0e, 0xf8, 0xe3, 0x15, 0x12, 0xe4, 0x1c, 0xea, 0xed, 0x1b },
+ { 0x00, 0xdb, 0xab, 0x70, 0x4b, 0x90, 0xe0, 0x3b, 0x96, 0x4d, 0x3d, 0xe6, 0xdd, 0x06, 0x76, 0xad },
+ },
+ {
+ { 0x00, 0xf7, 0xf3, 0x04, 0xfb, 0x0c, 0x08, 0xff, 0xeb, 0x1c, 0x18, 0xef, 0x10, 0xe7, 0xe3, 0x14 },
+ { 0x00, 0xcb, 0x8b, 0x40, 0x0b, 0xc0, 0x80, 0x4b, 0x16, 0xdd, 0x9d, 0x56, 0x1d, 0xd6, 0x96, 0x5d },
+ },
+ {
+ { 0x00, 0xf8, 0xed, 0x15, 0xc7, 0x3f, 0x2a, 0xd2, 0x93, 0x6b, 0x7e, 0x86, 0x54, 0xac, 0xb9, 0x41 },
+ { 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, 0xc5, 0xfe, 0xb3, 0x88, 0x29, 0x12, 0x5f, 0x64 },
+ },
+ {
+ { 0x00, 0xf9, 0xef, 0x16, 0xc3, 0x3a, 0x2c, 0xd5, 0x9b, 0x62, 0x74, 0x8d, 0x58, 0xa1, 0xb7, 0x4e },
+ { 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, 0x45, 0x6e, 0x13, 0x38, 0xe9, 0xc2, 0xbf, 0x94 },
+ },
+ {
+ { 0x00, 0xfa, 0xe9, 0x13, 0xcf, 0x35, 0x26, 0xdc, 0x83, 0x79, 0x6a, 0x90, 0x4c, 0xb6, 0xa5, 0x5f },
+ { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 },
+ },
+ {
+ { 0x00, 0xfb, 0xeb, 0x10, 0xcb, 0x30, 0x20, 0xdb, 0x8b, 0x70, 0x60, 0x9b, 0x40, 0xbb, 0xab, 0x50 },
+ { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 },
+ },
+ {
+ { 0x00, 0xfc, 0xe5, 0x19, 0xd7, 0x2b, 0x32, 0xce, 0xb3, 0x4f, 0x56, 0xaa, 0x64, 0x98, 0x81, 0x7d },
+ { 0x00, 0x7b, 0xf6, 0x8d, 0xf1, 0x8a, 0x07, 0x7c, 0xff, 0x84, 0x09, 0x72, 0x0e, 0x75, 0xf8, 0x83 },
+ },
+ {
+ { 0x00, 0xfd, 0xe7, 0x1a, 0xd3, 0x2e, 0x34, 0xc9, 0xbb, 0x46, 0x5c, 0xa1, 0x68, 0x95, 0x8f, 0x72 },
+ { 0x00, 0x6b, 0xd6, 0xbd, 0xb1, 0xda, 0x67, 0x0c, 0x7f, 0x14, 0xa9, 0xc2, 0xce, 0xa5, 0x18, 0x73 },
+ },
+ {
+ { 0x00, 0xfe, 0xe1, 0x1f, 0xdf, 0x21, 0x3e, 0xc0, 0xa3, 0x5d, 0x42, 0xbc, 0x7c, 0x82, 0x9d, 0x63 },
+ { 0x00, 0x5b, 0xb6, 0xed, 0x71, 0x2a, 0xc7, 0x9c, 0xe2, 0xb9, 0x54, 0x0f, 0x93, 0xc8, 0x25, 0x7e },
+ },
+ {
+ { 0x00, 0xff, 0xe3, 0x1c, 0xdb, 0x24, 0x38, 0xc7, 0xab, 0x54, 0x48, 0xb7, 0x70, 0x8f, 0x93, 0x6c },
+ { 0x00, 0x4b, 0x96, 0xdd, 0x31, 0x7a, 0xa7, 0xec, 0x62, 0x29, 0xf4, 0xbf, 0x53, 0x18, 0xc5, 0x8e },
+ },
+};
+#endif
+
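The 16-byte rows above come in pairs per coefficient. If, as the pshufb kernels in
raid/x86.c further down suggest, each pair holds the products of one fixed GF(2^8)
coefficient with every low nibble and every high nibble, then a full byte multiply is
just two table lookups and an xor. A minimal scalar sketch of that technique follows,
with illustrative names only (nothing in it is part of the library itself):

#include <stdint.h>

/*
 * Illustrative sketch: multiply 'a' by the fixed coefficient whose low/high
 * nibble product tables are lo[16] and hi[16], in GF(2^8) with the 0x11d
 * polynomial (reduction constant 0x1d) used throughout this library.
 */
static uint8_t gf_mul_nibble_tables(const uint8_t lo[16], const uint8_t hi[16],
				    uint8_t a)
{
	/* c * a  =  c * (a & 0x0f)  ^  c * (a & 0xf0) */
	return lo[a & 0x0f] ^ hi[a >> 4];
}

/*
 * The RAID6 "powers of 2" kernels below need no tables at all: multiplying by
 * 2 is a left shift plus a conditional xor with 0x1d, which the SSE2 code
 * performs on 16 bytes at once with pcmpgtb/paddb/pand/pxor.
 */
static uint8_t gf_double(uint8_t a)
{
	return (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0x00));
}
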
diff --git a/raid/tag.c b/raid/tag.c
new file mode 100644
index 00000000..bfeefaad
--- /dev/null
+++ b/raid/tag.c
@@ -0,0 +1,145 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "internal.h"
+
+static struct raid_func {
+ const char *name;
+ void (*p)();
+} RAID_FUNC[] = {
+ { "int8", raid_gen3_int8 },
+ { "int8", raid_gen4_int8 },
+ { "int8", raid_gen5_int8 },
+ { "int8", raid_gen6_int8 },
+ { "int32", raid_gen1_int32 },
+ { "int64", raid_gen1_int64 },
+ { "int32", raid_gen2_int32 },
+ { "int64", raid_gen2_int64 },
+ { "int32", raid_genz_int32 },
+ { "int64", raid_genz_int64 },
+ { "int8", raid_rec1_int8 },
+ { "int8", raid_rec2_int8 },
+ { "int8", raid_recX_int8 },
+
+#ifdef CONFIG_X86
+#ifdef CONFIG_SSE2
+ { "sse2", raid_gen1_sse2 },
+ { "sse2", raid_gen2_sse2 },
+ { "sse2", raid_genz_sse2 },
+#endif
+#ifdef CONFIG_SSSE3
+ { "ssse3", raid_gen3_ssse3 },
+ { "ssse3", raid_gen4_ssse3 },
+ { "ssse3", raid_gen5_ssse3 },
+ { "ssse3", raid_gen6_ssse3 },
+ { "ssse3", raid_rec1_ssse3 },
+ { "ssse3", raid_rec2_ssse3 },
+ { "ssse3", raid_recX_ssse3 },
+#endif
+#ifdef CONFIG_AVX2
+ { "avx2", raid_gen1_avx2 },
+ { "avx2", raid_gen2_avx2 },
+ { "avx2", raid_rec1_avx2 },
+ { "avx2", raid_rec2_avx2 },
+ { "avx2", raid_recX_avx2 },
+#endif
+#endif
+
+#ifdef CONFIG_X86_64
+#ifdef CONFIG_SSE2
+ { "sse2e", raid_gen2_sse2ext },
+ { "sse2e", raid_genz_sse2ext },
+#endif
+#ifdef CONFIG_SSSE3
+ { "ssse3e", raid_gen3_ssse3ext },
+ { "ssse3e", raid_gen4_ssse3ext },
+ { "ssse3e", raid_gen5_ssse3ext },
+ { "ssse3e", raid_gen6_ssse3ext },
+#endif
+#ifdef CONFIG_AVX2
+ { "avx2e", raid_gen3_avx2ext },
+ { "avx2e", raid_genz_avx2ext },
+ { "avx2e", raid_gen4_avx2ext },
+ { "avx2e", raid_gen5_avx2ext },
+ { "avx2e", raid_gen6_avx2ext },
+#endif
+#endif
+ { 0, 0 }
+};
+
+static const char *raid_tag(void (*func)())
+{
+ struct raid_func *i = RAID_FUNC;
+
+ while (i->name != 0) {
+ if (i->p == func)
+ return i->name;
+ ++i;
+ }
+
+ /* LCOV_EXCL_START */
+ return "unknown";
+ /* LCOV_EXCL_STOP */
+}
+
+const char *raid_gen1_tag(void)
+{
+ return raid_tag(raid_gen_ptr[0]);
+}
+
+const char *raid_gen2_tag(void)
+{
+ return raid_tag(raid_gen_ptr[1]);
+}
+
+const char *raid_genz_tag(void)
+{
+ return raid_tag(raid_genz_ptr);
+}
+
+const char *raid_gen3_tag(void)
+{
+ return raid_tag(raid_gen_ptr[2]);
+}
+
+const char *raid_gen4_tag(void)
+{
+ return raid_tag(raid_gen_ptr[3]);
+}
+
+const char *raid_gen5_tag(void)
+{
+ return raid_tag(raid_gen_ptr[4]);
+}
+
+const char *raid_gen6_tag(void)
+{
+ return raid_tag(raid_gen_ptr[5]);
+}
+
+const char *raid_rec1_tag(void)
+{
+ return raid_tag(raid_rec_ptr[0]);
+}
+
+const char *raid_rec2_tag(void)
+{
+ return raid_tag(raid_rec_ptr[1]);
+}
+
+const char *raid_recX_tag(void)
+{
+ return raid_tag(raid_rec_ptr[2]);
+}
+
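The *_tag() helpers above let a caller report which runtime-selected kernel sits
behind each generation and recovery entry point. A minimal usage sketch; the header
path and the initialization step are assumptions based on the upstream library and
are not part of this patch:

#include <stdio.h>
#include "raid/raid.h"	/* assumed location of the *_tag() prototypes */

/*
 * Print the implementation chosen for each entry point. Assumes the library's
 * CPU detection has already run (raid_init() in the upstream sources).
 */
static void raid_report_kernels(void)
{
	printf("gen1: %s  gen2: %s  genz: %s\n",
	       raid_gen1_tag(), raid_gen2_tag(), raid_genz_tag());
	printf("rec1: %s  rec2: %s  recX: %s\n",
	       raid_rec1_tag(), raid_rec2_tag(), raid_recX_tag());
}
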
diff --git a/raid/test.c b/raid/test.c
new file mode 100644
index 00000000..feb8a415
--- /dev/null
+++ b/raid/test.c
@@ -0,0 +1,452 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "internal.h"
+#include "cpu.h"
+#include "combo.h"
+#include "memory.h"
+
+/**
+ * Binomial coefficient of n over r.
+ */
+static int ibc(int n, int r)
+{
+ if (r == 0 || n == r)
+ return 1;
+ else
+ return ibc(n - 1, r - 1) + ibc(n - 1, r);
+}
+
+/**
+ * Power n ^ r.
+ */
+static int ipow(int n, int r)
+{
+ int v = 1;
+
+ while (r) {
+ v *= n;
+ --r;
+ }
+ return v;
+}
+
+int raid_test_combo(void)
+{
+ int r;
+ int count;
+ int p[RAID_PARITY_MAX];
+
+ for (r = 1; r <= RAID_PARITY_MAX; ++r) {
+ /* count combination (r of RAID_PARITY_MAX) elements */
+ count = 0;
+ combination_first(r, RAID_PARITY_MAX, p);
+
+ do {
+ ++count;
+ } while (combination_next(r, RAID_PARITY_MAX, p));
+
+ if (count != ibc(RAID_PARITY_MAX, r)) {
+ /* LCOV_EXCL_START */
+ return -1;
+ /* LCOV_EXCL_STOP */
+ }
+ }
+
+ for (r = 1; r <= RAID_PARITY_MAX; ++r) {
+ /* count permutation (r of RAID_PARITY_MAX) elements */
+ count = 0;
+ permutation_first(r, RAID_PARITY_MAX, p);
+
+ do {
+ ++count;
+ } while (permutation_next(r, RAID_PARITY_MAX, p));
+
+ if (count != ipow(RAID_PARITY_MAX, r)) {
+ /* LCOV_EXCL_START */
+ return -1;
+ /* LCOV_EXCL_STOP */
+ }
+ }
+
+ return 0;
+}
+
+int raid_test_insert(void)
+{
+ int p[RAID_PARITY_MAX];
+ int r;
+
+ for (r = 1; r <= RAID_PARITY_MAX; ++r) {
+ permutation_first(r, RAID_PARITY_MAX, p);
+ do {
+ int i[RAID_PARITY_MAX];
+ int j;
+
+ /* insert in order */
+ for (j = 0; j < r; ++j)
+ raid_insert(j, i, p[j]);
+
+ /* check order */
+ for (j = 1; j < r; ++j) {
+ if (i[j - 1] > i[j]) {
+ /* LCOV_EXCL_START */
+ return -1;
+ /* LCOV_EXCL_STOP */
+ }
+ }
+ } while (permutation_next(r, RAID_PARITY_MAX, p));
+ }
+
+ return 0;
+}
+
+int raid_test_sort(void)
+{
+ int p[RAID_PARITY_MAX];
+ int r;
+
+ for (r = 1; r <= RAID_PARITY_MAX; ++r) {
+ permutation_first(r, RAID_PARITY_MAX, p);
+ do {
+ int i[RAID_PARITY_MAX];
+ int j;
+
+ /* make a copy */
+ for (j = 0; j < r; ++j)
+ i[j] = p[j];
+
+ raid_sort(r, i);
+
+ /* check order */
+ for (j = 1; j < r; ++j) {
+ if (i[j - 1] > i[j]) {
+ /* LCOV_EXCL_START */
+ return -1;
+ /* LCOV_EXCL_STOP */
+ }
+ }
+ } while (permutation_next(r, RAID_PARITY_MAX, p));
+ }
+
+ return 0;
+}
+
+int raid_test_rec(int mode, int nd, size_t size)
+{
+ void (*f[RAID_PARITY_MAX][4])(
+ int nr, int *id, int *ip, int nd, size_t size, void **vbuf);
+ void *v_alloc;
+ void **v;
+ void **data;
+ void **parity;
+ void **test;
+ void *data_save[RAID_PARITY_MAX];
+ void *parity_save[RAID_PARITY_MAX];
+ void *waste;
+ int nv;
+ int id[RAID_PARITY_MAX];
+ int ip[RAID_PARITY_MAX];
+ int i;
+ int j;
+ int nr;
+ int nf[RAID_PARITY_MAX];
+ int np;
+
+ raid_mode(mode);
+ if (mode == RAID_MODE_CAUCHY)
+ np = RAID_PARITY_MAX;
+ else
+ np = 3;
+
+ nv = nd + np * 2 + 2;
+
+ v = raid_malloc_vector(nd, nv, size, &v_alloc);
+ if (!v) {
+ /* LCOV_EXCL_START */
+ return -1;
+ /* LCOV_EXCL_STOP */
+ }
+
+ data = v;
+ parity = v + nd;
+ test = v + nd + np;
+
+ for (i = 0; i < np; ++i)
+ parity_save[i] = parity[i];
+
+ memset(v[nv - 2], 0, size);
+ raid_zero(v[nv - 2]);
+
+ waste = v[nv - 1];
+
+ /* fill with pseudo-random data, using the arbitrary seed "1" */
+ raid_mrand_vector(1, nd, size, v);
+
+ /* setup recov functions */
+ for (i = 0; i < np; ++i) {
+ nf[i] = 0;
+ if (i == 0) {
+ f[i][nf[i]++] = raid_rec1_int8;
+#ifdef CONFIG_X86
+#ifdef CONFIG_SSSE3
+ if (raid_cpu_has_ssse3())
+ f[i][nf[i]++] = raid_rec1_ssse3;
+#endif
+#ifdef CONFIG_AVX2
+ if (raid_cpu_has_avx2())
+ f[i][nf[i]++] = raid_rec1_avx2;
+#endif
+#endif
+ } else if (i == 1) {
+ f[i][nf[i]++] = raid_rec2_int8;
+#ifdef CONFIG_X86
+#ifdef CONFIG_SSSE3
+ if (raid_cpu_has_ssse3())
+ f[i][nf[i]++] = raid_rec2_ssse3;
+#endif
+#ifdef CONFIG_AVX2
+ if (raid_cpu_has_avx2())
+ f[i][nf[i]++] = raid_rec2_avx2;
+#endif
+#endif
+ } else {
+ f[i][nf[i]++] = raid_recX_int8;
+#ifdef CONFIG_X86
+#ifdef CONFIG_SSSE3
+ if (raid_cpu_has_ssse3())
+ f[i][nf[i]++] = raid_recX_ssse3;
+#endif
+#ifdef CONFIG_AVX2
+ if (raid_cpu_has_avx2())
+ f[i][nf[i]++] = raid_recX_avx2;
+#endif
+#endif
+ }
+ }
+
+ /* compute the parity */
+ raid_gen_ref(nd, np, size, v);
+
+ /* set all the parity to the waste v */
+ for (i = 0; i < np; ++i)
+ parity[i] = waste;
+
+ /* all parity levels */
+ for (nr = 1; nr <= np; ++nr) {
+ /* all combinations (nr of nd) disks */
+ combination_first(nr, nd, id);
+ do {
+ /* all combinations (nr of np) parities */
+ combination_first(nr, np, ip);
+ do {
+ /* for each recover function */
+ for (j = 0; j < nf[nr - 1]; ++j) {
+ /* set */
+ for (i = 0; i < nr; ++i) {
+ /* remove the missing data */
+ data_save[i] = data[id[i]];
+ data[id[i]] = test[i];
+ /* set the parity to use */
+ parity[ip[i]] = parity_save[ip[i]];
+ }
+
+ /* recover */
+ f[nr - 1][j](nr, id, ip, nd, size, v);
+
+ /* check */
+ for (i = 0; i < nr; ++i) {
+ if (memcmp(test[i], data_save[i], size) != 0) {
+ /* LCOV_EXCL_START */
+ goto bail;
+ /* LCOV_EXCL_STOP */
+ }
+ }
+
+ /* restore */
+ for (i = 0; i < nr; ++i) {
+ /* restore the data */
+ data[id[i]] = data_save[i];
+ /* restore the parity */
+ parity[ip[i]] = waste;
+ }
+ }
+ } while (combination_next(nr, np, ip));
+ } while (combination_next(nr, nd, id));
+ }
+
+ free(v_alloc);
+ free(v);
+ return 0;
+
+bail:
+ /* LCOV_EXCL_START */
+ free(v_alloc);
+ free(v);
+ return -1;
+ /* LCOV_EXCL_STOP */
+}
+
+int raid_test_par(int mode, int nd, size_t size)
+{
+ void (*f[64])(int nd, size_t size, void **vbuf);
+ void *v_alloc;
+ void **v;
+ int nv;
+ int i, j;
+ int nf;
+ int np;
+
+ raid_mode(mode);
+ if (mode == RAID_MODE_CAUCHY)
+ np = RAID_PARITY_MAX;
+ else
+ np = 3;
+
+ nv = nd + np * 2;
+
+ v = raid_malloc_vector(nd, nv, size, &v_alloc);
+ if (!v) {
+ /* LCOV_EXCL_START */
+ return -1;
+ /* LCOV_EXCL_STOP */
+ }
+
+ /* check memory */
+ if (raid_mtest_vector(nv, size, v) != 0) {
+ /* LCOV_EXCL_START */
+ goto bail;
+ /* LCOV_EXCL_STOP */
+ }
+
+ /* fill with pseudo-random data, using the arbitrary seed "2" */
+ raid_mrand_vector(2, nv, size, v);
+
+ /* compute the parity */
+ raid_gen_ref(nd, np, size, v);
+
+ /* copy in back buffers */
+ for (i = 0; i < np; ++i)
+ memcpy(v[nd + np + i], v[nd + i], size);
+
+ /* load all the available functions */
+ nf = 0;
+
+ f[nf++] = raid_gen1_int32;
+ f[nf++] = raid_gen1_int64;
+ f[nf++] = raid_gen2_int32;
+ f[nf++] = raid_gen2_int64;
+
+#ifdef CONFIG_X86
+#ifdef CONFIG_SSE2
+ if (raid_cpu_has_sse2()) {
+ f[nf++] = raid_gen1_sse2;
+ f[nf++] = raid_gen2_sse2;
+#ifdef CONFIG_X86_64
+ f[nf++] = raid_gen2_sse2ext;
+#endif
+ }
+#endif
+
+#ifdef CONFIG_AVX2
+ if (raid_cpu_has_avx2()) {
+ f[nf++] = raid_gen1_avx2;
+ f[nf++] = raid_gen2_avx2;
+ }
+#endif
+#endif /* CONFIG_X86 */
+
+ if (mode == RAID_MODE_CAUCHY) {
+ f[nf++] = raid_gen3_int8;
+ f[nf++] = raid_gen4_int8;
+ f[nf++] = raid_gen5_int8;
+ f[nf++] = raid_gen6_int8;
+
+#ifdef CONFIG_X86
+#ifdef CONFIG_SSSE3
+ if (raid_cpu_has_ssse3()) {
+ f[nf++] = raid_gen3_ssse3;
+ f[nf++] = raid_gen4_ssse3;
+ f[nf++] = raid_gen5_ssse3;
+ f[nf++] = raid_gen6_ssse3;
+#ifdef CONFIG_X86_64
+ f[nf++] = raid_gen3_ssse3ext;
+ f[nf++] = raid_gen4_ssse3ext;
+ f[nf++] = raid_gen5_ssse3ext;
+ f[nf++] = raid_gen6_ssse3ext;
+#endif
+ }
+#endif
+
+#ifdef CONFIG_AVX2
+#ifdef CONFIG_X86_64
+ if (raid_cpu_has_avx2()) {
+ f[nf++] = raid_gen3_avx2ext;
+ f[nf++] = raid_gen4_avx2ext;
+ f[nf++] = raid_gen5_avx2ext;
+ f[nf++] = raid_gen6_avx2ext;
+ }
+#endif
+#endif
+#endif /* CONFIG_X86 */
+ } else {
+ f[nf++] = raid_genz_int32;
+ f[nf++] = raid_genz_int64;
+
+#ifdef CONFIG_X86
+#ifdef CONFIG_SSE2
+ if (raid_cpu_has_sse2()) {
+ f[nf++] = raid_genz_sse2;
+#ifdef CONFIG_X86_64
+ f[nf++] = raid_genz_sse2ext;
+#endif
+ }
+#endif
+
+#ifdef CONFIG_AVX2
+#ifdef CONFIG_X86_64
+ if (raid_cpu_has_avx2())
+ f[nf++] = raid_genz_avx2ext;
+#endif
+#endif
+#endif /* CONFIG_X86 */
+ }
+
+ /* check all the functions */
+ for (j = 0; j < nf; ++j) {
+ /* compute parity */
+ f[j](nd, size, v);
+
+ /* check it */
+ for (i = 0; i < np; ++i) {
+ if (memcmp(v[nd + np + i], v[nd + i], size) != 0) {
+ /* LCOV_EXCL_START */
+ goto bail;
+ /* LCOV_EXCL_STOP */
+ }
+ }
+ }
+
+ free(v_alloc);
+ free(v);
+ return 0;
+
+bail:
+ /* LCOV_EXCL_START */
+ free(v_alloc);
+ free(v);
+ return -1;
+ /* LCOV_EXCL_STOP */
+}
+
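raid_test_combo() above validates the combination and permutation enumerators by
brute-force counting and comparing against ibc() and ipow(). Assuming RAID_PARITY_MAX
is 6, as the gen3..gen6 kernels suggest, the expected counts are small enough to
check by hand; here is a standalone spot-check of the same arithmetic (the helpers
are copied because the originals are static to test.c):

#include <assert.h>

/* Standalone copies of the two static helpers in raid/test.c, for illustration. */
static int ibc(int n, int r)
{
	return (r == 0 || n == r) ? 1 : ibc(n - 1, r - 1) + ibc(n - 1, r);
}

static int ipow(int n, int r)
{
	int v = 1;

	while (r--)
		v *= n;
	return v;
}

int main(void)
{
	assert(ibc(6, 1) == 6);		/* one failing parity out of six */
	assert(ibc(6, 3) == 20);	/* C(6,3) = 20 combinations */
	assert(ipow(6, 2) == 36);	/* 6^2 = 36 ordered picks with repetition */
	return 0;
}
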
diff --git a/raid/test.h b/raid/test.h
new file mode 100644
index 00000000..6d902c7f
--- /dev/null
+++ b/raid/test.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __RAID_TEST_H
+#define __RAID_TEST_H
+
+/**
+ * Tests the insertion function.
+ *
+ * Tests raid_insert() with all possible combinations of elements to insert.
+ *
+ * Returns 0 on success.
+ */
+int raid_test_insert(void);
+
+/**
+ * Tests the sorting function.
+ *
+ * Tests raid_sort() with all possible combinations of elements to sort.
+ *
+ * Returns 0 on success.
+ */
+int raid_test_sort(void);
+
+/**
+ * Tests combination functions.
+ *
+ * Tests combination_first() and combination_next() for all the parity levels.
+ *
+ * Returns 0 on success.
+ */
+int raid_test_combo(void);
+
+/**
+ * Tests recovering functions.
+ *
+ * All the recovering functions are tested with all the combinations
+ * of failing disks and recovering parities.
+ *
+ * Note that the test time grows exponentially with the number of disks.
+ *
+ * Returns 0 on success.
+ */
+int raid_test_rec(int mode, int nd, size_t size);
+
+/**
+ * Tests parity generation functions.
+ *
+ * All the parity generation functions are tested with the specified
+ * number of disks.
+ *
+ * Returns 0 on success.
+ */
+int raid_test_par(int mode, int nd, size_t size);
+
+#endif
+
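Taken together, the interfaces above are enough for a small self-test driver. The
sketch below is illustrative only: the include paths, the location of the
RAID_MODE_CAUCHY constant and the raid_init() call are assumptions based on the
upstream library rather than anything shown in this patch.

#include <stdio.h>
#include "raid/raid.h"	/* assumed: mode constants and raid_init() */
#include "raid/test.h"

/*
 * Run the exhaustive self-tests on a small configuration: 8 data disks and
 * 256-byte blocks (a multiple of the 64-byte stride used by the SIMD loops).
 */
int run_raid_selftests(void)
{
	raid_init();	/* assumed upstream entry point; picks the CPU-specific kernels */

	if (raid_test_combo() || raid_test_insert() || raid_test_sort())
		return -1;
	if (raid_test_par(RAID_MODE_CAUCHY, 8, 256))
		return -1;
	if (raid_test_rec(RAID_MODE_CAUCHY, 8, 256))
		return -1;

	printf("raid self-tests passed\n");
	return 0;
}
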
diff --git a/raid/x86.c b/raid/x86.c
new file mode 100644
index 00000000..84b12c1c
--- /dev/null
+++ b/raid/x86.c
@@ -0,0 +1,2452 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "internal.h"
+#include "gf.h"
+
+/*
+ * For x86 optimizations you can see:
+ *
+ * Software optimization resources
+ * http://www.agner.org/optimize/
+ *
+ * x86, x64 Instruction Latency, Memory Latency and CPUID dumps
+ * http://users.atw.hu/instlatx64/
+ */
+
+#if defined(CONFIG_X86) && defined(CONFIG_SSE2)
+/*
+ * GEN1 (RAID5 with xor) SSE2 implementation
+ *
+ * Intentionally processes no more than 64 bytes per iteration: 64 bytes is
+ * the typical cache line, and handling 128 bytes at a time does not improve
+ * performance and in some cases even reduces it.
+ */
+void raid_gen1_sse2(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ int d, l;
+ size_t i;
+
+ l = nd - 1;
+ p = v[nd];
+
+ raid_sse_begin();
+
+ for (i = 0; i < size; i += 64) {
+ asm volatile ("movdqa %0,%%xmm0" : : "m" (v[l][i]));
+ asm volatile ("movdqa %0,%%xmm1" : : "m" (v[l][i + 16]));
+ asm volatile ("movdqa %0,%%xmm2" : : "m" (v[l][i + 32]));
+ asm volatile ("movdqa %0,%%xmm3" : : "m" (v[l][i + 48]));
+ for (d = l - 1; d >= 0; --d) {
+ asm volatile ("pxor %0,%%xmm0" : : "m" (v[d][i]));
+ asm volatile ("pxor %0,%%xmm1" : : "m" (v[d][i + 16]));
+ asm volatile ("pxor %0,%%xmm2" : : "m" (v[d][i + 32]));
+ asm volatile ("pxor %0,%%xmm3" : : "m" (v[d][i + 48]));
+ }
+ asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
+ asm volatile ("movntdq %%xmm1,%0" : "=m" (p[i + 16]));
+ asm volatile ("movntdq %%xmm2,%0" : "=m" (p[i + 32]));
+ asm volatile ("movntdq %%xmm3,%0" : "=m" (p[i + 48]));
+ }
+
+ raid_sse_end();
+}
+#endif
+
+#if defined(CONFIG_X86) && defined(CONFIG_AVX2)
+/*
+ * GEN1 (RAID5 with xor) AVX2 implementation
+ *
+ * Intentionally processes no more than 64 bytes per iteration: 64 bytes is
+ * the typical cache line, and handling 128 bytes at a time does not improve
+ * performance and in some cases even reduces it.
+ */
+void raid_gen1_avx2(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ int d, l;
+ size_t i;
+
+ l = nd - 1;
+ p = v[nd];
+
+ raid_avx_begin();
+
+ for (i = 0; i < size; i += 64) {
+ asm volatile ("vmovdqa %0,%%ymm0" : : "m" (v[l][i]));
+ asm volatile ("vmovdqa %0,%%ymm1" : : "m" (v[l][i + 32]));
+ for (d = l - 1; d >= 0; --d) {
+ asm volatile ("vpxor %0,%%ymm0,%%ymm0" : : "m" (v[d][i]));
+ asm volatile ("vpxor %0,%%ymm1,%%ymm1" : : "m" (v[d][i + 32]));
+ }
+ asm volatile ("vmovntdq %%ymm0,%0" : "=m" (p[i]));
+ asm volatile ("vmovntdq %%ymm1,%0" : "=m" (p[i + 32]));
+ }
+
+ raid_avx_end();
+}
+#endif
+
+#if defined(CONFIG_X86) && defined(CONFIG_SSE2)
+static const struct gfconst16 {
+ uint8_t poly[16];
+ uint8_t low4[16];
+} gfconst16 __aligned(32) = {
+ {
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d
+ },
+ {
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f
+ },
+};
+#endif
+
+#if defined(CONFIG_X86) && defined(CONFIG_SSE2)
+/*
+ * GEN2 (RAID6 with powers of 2) SSE2 implementation
+ */
+void raid_gen2_sse2(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *q;
+ int d, l;
+ size_t i;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+
+ raid_sse_begin();
+
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.poly[0]));
+
+ for (i = 0; i < size; i += 32) {
+ asm volatile ("movdqa %0,%%xmm0" : : "m" (v[l][i]));
+ asm volatile ("movdqa %0,%%xmm1" : : "m" (v[l][i + 16]));
+ asm volatile ("movdqa %xmm0,%xmm2");
+ asm volatile ("movdqa %xmm1,%xmm3");
+ for (d = l - 1; d >= 0; --d) {
+ asm volatile ("pxor %xmm4,%xmm4");
+ asm volatile ("pxor %xmm5,%xmm5");
+ asm volatile ("pcmpgtb %xmm2,%xmm4");
+ asm volatile ("pcmpgtb %xmm3,%xmm5");
+ asm volatile ("paddb %xmm2,%xmm2");
+ asm volatile ("paddb %xmm3,%xmm3");
+ asm volatile ("pand %xmm7,%xmm4");
+ asm volatile ("pand %xmm7,%xmm5");
+ asm volatile ("pxor %xmm4,%xmm2");
+ asm volatile ("pxor %xmm5,%xmm3");
+
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));
+ asm volatile ("movdqa %0,%%xmm5" : : "m" (v[d][i + 16]));
+ asm volatile ("pxor %xmm4,%xmm0");
+ asm volatile ("pxor %xmm5,%xmm1");
+ asm volatile ("pxor %xmm4,%xmm2");
+ asm volatile ("pxor %xmm5,%xmm3");
+ }
+ asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
+ asm volatile ("movntdq %%xmm1,%0" : "=m" (p[i + 16]));
+ asm volatile ("movntdq %%xmm2,%0" : "=m" (q[i]));
+ asm volatile ("movntdq %%xmm3,%0" : "=m" (q[i + 16]));
+ }
+
+ raid_sse_end();
+}
+#endif
+
+#if defined(CONFIG_X86) && defined(CONFIG_AVX2)
+/*
+ * GEN2 (RAID6 with powers of 2) AVX2 implementation
+ */
+void raid_gen2_avx2(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *q;
+ int d, l;
+ size_t i;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+
+ raid_avx_begin();
+
+ asm volatile ("vbroadcasti128 %0, %%ymm7" : : "m" (gfconst16.poly[0]));
+ asm volatile ("vpxor %ymm6,%ymm6,%ymm6");
+
+ for (i = 0; i < size; i += 64) {
+ asm volatile ("vmovdqa %0,%%ymm0" : : "m" (v[l][i]));
+ asm volatile ("vmovdqa %0,%%ymm1" : : "m" (v[l][i + 32]));
+ asm volatile ("vmovdqa %ymm0,%ymm2");
+ asm volatile ("vmovdqa %ymm1,%ymm3");
+ for (d = l - 1; d >= 0; --d) {
+ asm volatile ("vpcmpgtb %ymm2,%ymm6,%ymm4");
+ asm volatile ("vpcmpgtb %ymm3,%ymm6,%ymm5");
+ asm volatile ("vpaddb %ymm2,%ymm2,%ymm2");
+ asm volatile ("vpaddb %ymm3,%ymm3,%ymm3");
+ asm volatile ("vpand %ymm7,%ymm4,%ymm4");
+ asm volatile ("vpand %ymm7,%ymm5,%ymm5");
+ asm volatile ("vpxor %ymm4,%ymm2,%ymm2");
+ asm volatile ("vpxor %ymm5,%ymm3,%ymm3");
+
+ asm volatile ("vmovdqa %0,%%ymm4" : : "m" (v[d][i]));
+ asm volatile ("vmovdqa %0,%%ymm5" : : "m" (v[d][i + 32]));
+ asm volatile ("vpxor %ymm4,%ymm0,%ymm0");
+ asm volatile ("vpxor %ymm5,%ymm1,%ymm1");
+ asm volatile ("vpxor %ymm4,%ymm2,%ymm2");
+ asm volatile ("vpxor %ymm5,%ymm3,%ymm3");
+ }
+ asm volatile ("vmovntdq %%ymm0,%0" : "=m" (p[i]));
+ asm volatile ("vmovntdq %%ymm1,%0" : "=m" (p[i + 32]));
+ asm volatile ("vmovntdq %%ymm2,%0" : "=m" (q[i]));
+ asm volatile ("vmovntdq %%ymm3,%0" : "=m" (q[i + 32]));
+ }
+
+ raid_avx_end();
+}
+#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_SSE2)
+/*
+ * GEN2 (RAID6 with powers of 2) SSE2 implementation
+ *
+ * Note that it uses 16 registers, meaning that x64 is required.
+ */
+void raid_gen2_sse2ext(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *q;
+ int d, l;
+ size_t i;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+
+ raid_sse_begin();
+
+ asm volatile ("movdqa %0,%%xmm15" : : "m" (gfconst16.poly[0]));
+
+ for (i = 0; i < size; i += 64) {
+ asm volatile ("movdqa %0,%%xmm0" : : "m" (v[l][i]));
+ asm volatile ("movdqa %0,%%xmm1" : : "m" (v[l][i + 16]));
+ asm volatile ("movdqa %0,%%xmm2" : : "m" (v[l][i + 32]));
+ asm volatile ("movdqa %0,%%xmm3" : : "m" (v[l][i + 48]));
+ asm volatile ("movdqa %xmm0,%xmm4");
+ asm volatile ("movdqa %xmm1,%xmm5");
+ asm volatile ("movdqa %xmm2,%xmm6");
+ asm volatile ("movdqa %xmm3,%xmm7");
+ for (d = l - 1; d >= 0; --d) {
+ asm volatile ("pxor %xmm8,%xmm8");
+ asm volatile ("pxor %xmm9,%xmm9");
+ asm volatile ("pxor %xmm10,%xmm10");
+ asm volatile ("pxor %xmm11,%xmm11");
+ asm volatile ("pcmpgtb %xmm4,%xmm8");
+ asm volatile ("pcmpgtb %xmm5,%xmm9");
+ asm volatile ("pcmpgtb %xmm6,%xmm10");
+ asm volatile ("pcmpgtb %xmm7,%xmm11");
+ asm volatile ("paddb %xmm4,%xmm4");
+ asm volatile ("paddb %xmm5,%xmm5");
+ asm volatile ("paddb %xmm6,%xmm6");
+ asm volatile ("paddb %xmm7,%xmm7");
+ asm volatile ("pand %xmm15,%xmm8");
+ asm volatile ("pand %xmm15,%xmm9");
+ asm volatile ("pand %xmm15,%xmm10");
+ asm volatile ("pand %xmm15,%xmm11");
+ asm volatile ("pxor %xmm8,%xmm4");
+ asm volatile ("pxor %xmm9,%xmm5");
+ asm volatile ("pxor %xmm10,%xmm6");
+ asm volatile ("pxor %xmm11,%xmm7");
+
+ asm volatile ("movdqa %0,%%xmm8" : : "m" (v[d][i]));
+ asm volatile ("movdqa %0,%%xmm9" : : "m" (v[d][i + 16]));
+ asm volatile ("movdqa %0,%%xmm10" : : "m" (v[d][i + 32]));
+ asm volatile ("movdqa %0,%%xmm11" : : "m" (v[d][i + 48]));
+ asm volatile ("pxor %xmm8,%xmm0");
+ asm volatile ("pxor %xmm9,%xmm1");
+ asm volatile ("pxor %xmm10,%xmm2");
+ asm volatile ("pxor %xmm11,%xmm3");
+ asm volatile ("pxor %xmm8,%xmm4");
+ asm volatile ("pxor %xmm9,%xmm5");
+ asm volatile ("pxor %xmm10,%xmm6");
+ asm volatile ("pxor %xmm11,%xmm7");
+ }
+ asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
+ asm volatile ("movntdq %%xmm1,%0" : "=m" (p[i + 16]));
+ asm volatile ("movntdq %%xmm2,%0" : "=m" (p[i + 32]));
+ asm volatile ("movntdq %%xmm3,%0" : "=m" (p[i + 48]));
+ asm volatile ("movntdq %%xmm4,%0" : "=m" (q[i]));
+ asm volatile ("movntdq %%xmm5,%0" : "=m" (q[i + 16]));
+ asm volatile ("movntdq %%xmm6,%0" : "=m" (q[i + 32]));
+ asm volatile ("movntdq %%xmm7,%0" : "=m" (q[i + 48]));
+ }
+
+ raid_sse_end();
+}
+#endif
+
+#if defined(CONFIG_X86) && defined(CONFIG_SSSE3)
+/*
+ * GEN3 (triple parity with Cauchy matrix) SSSE3 implementation
+ */
+void raid_gen3_ssse3(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *r;
+ int d, l;
+ size_t i;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+ r = v[nd + 2];
+
+ /* special case with only one data disk */
+ if (l == 0) {
+ for (i = 0; i < 3; ++i)
+ memcpy(v[1 + i], v[0], size);
+ return;
+ }
+
+ raid_sse_begin();
+
+ /* generic case with at least two data disks */
+ asm volatile ("movdqa %0,%%xmm3" : : "m" (gfconst16.poly[0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.low4[0]));
+
+ for (i = 0; i < size; i += 16) {
+ /* last disk without the by two multiplication */
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (v[l][i]));
+
+ asm volatile ("movdqa %xmm4,%xmm0");
+ asm volatile ("movdqa %xmm4,%xmm1");
+
+ asm volatile ("movdqa %xmm4,%xmm5");
+ asm volatile ("psrlw $4,%xmm5");
+ asm volatile ("pand %xmm7,%xmm4");
+ asm volatile ("pand %xmm7,%xmm5");
+
+ asm volatile ("movdqa %0,%%xmm2" : : "m" (gfgenpshufb[l][0][0][0]));
+ asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[l][0][1][0]));
+ asm volatile ("pshufb %xmm4,%xmm2");
+ asm volatile ("pshufb %xmm5,%xmm6");
+ asm volatile ("pxor %xmm6,%xmm2");
+
+ /* intermediate disks */
+ for (d = l - 1; d > 0; --d) {
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));
+
+ asm volatile ("pxor %xmm5,%xmm5");
+ asm volatile ("pcmpgtb %xmm1,%xmm5");
+ asm volatile ("paddb %xmm1,%xmm1");
+ asm volatile ("pand %xmm3,%xmm5");
+ asm volatile ("pxor %xmm5,%xmm1");
+
+ asm volatile ("pxor %xmm4,%xmm0");
+ asm volatile ("pxor %xmm4,%xmm1");
+
+ asm volatile ("movdqa %xmm4,%xmm5");
+ asm volatile ("psrlw $4,%xmm5");
+ asm volatile ("pand %xmm7,%xmm4");
+ asm volatile ("pand %xmm7,%xmm5");
+
+ asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][0][0][0]));
+ asm volatile ("pshufb %xmm4,%xmm6");
+ asm volatile ("pxor %xmm6,%xmm2");
+ asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][0][1][0]));
+ asm volatile ("pshufb %xmm5,%xmm6");
+ asm volatile ("pxor %xmm6,%xmm2");
+ }
+
+ /* first disk with all coefficients at 1 */
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (v[0][i]));
+
+ asm volatile ("pxor %xmm5,%xmm5");
+ asm volatile ("pcmpgtb %xmm1,%xmm5");
+ asm volatile ("paddb %xmm1,%xmm1");
+ asm volatile ("pand %xmm3,%xmm5");
+ asm volatile ("pxor %xmm5,%xmm1");
+
+ asm volatile ("pxor %xmm4,%xmm0");
+ asm volatile ("pxor %xmm4,%xmm1");
+ asm volatile ("pxor %xmm4,%xmm2");
+
+ asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
+ asm volatile ("movntdq %%xmm1,%0" : "=m" (q[i]));
+ asm volatile ("movntdq %%xmm2,%0" : "=m" (r[i]));
+ }
+
+ raid_sse_end();
+}
+#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_SSSE3)
+/*
+ * GEN3 (triple parity with Cauchy matrix) SSSE3 implementation
+ *
+ * Note that it uses 16 registers, meaning that x64 is required.
+ */
+void raid_gen3_ssse3ext(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *r;
+ int d, l;
+ size_t i;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+ r = v[nd + 2];
+
+ /* special case with only one data disk */
+ if (l == 0) {
+ for (i = 0; i < 3; ++i)
+ memcpy(v[1 + i], v[0], size);
+ return;
+ }
+
+ raid_sse_begin();
+
+ /* generic case with at least two data disks */
+ asm volatile ("movdqa %0,%%xmm3" : : "m" (gfconst16.poly[0]));
+ asm volatile ("movdqa %0,%%xmm11" : : "m" (gfconst16.low4[0]));
+
+ for (i = 0; i < size; i += 32) {
+ /* last disk without the by two multiplication */
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (v[l][i]));
+ asm volatile ("movdqa %0,%%xmm12" : : "m" (v[l][i + 16]));
+
+ asm volatile ("movdqa %xmm4,%xmm0");
+ asm volatile ("movdqa %xmm4,%xmm1");
+ asm volatile ("movdqa %xmm12,%xmm8");
+ asm volatile ("movdqa %xmm12,%xmm9");
+
+ asm volatile ("movdqa %xmm4,%xmm5");
+ asm volatile ("movdqa %xmm12,%xmm13");
+ asm volatile ("psrlw $4,%xmm5");
+ asm volatile ("psrlw $4,%xmm13");
+ asm volatile ("pand %xmm11,%xmm4");
+ asm volatile ("pand %xmm11,%xmm12");
+ asm volatile ("pand %xmm11,%xmm5");
+ asm volatile ("pand %xmm11,%xmm13");
+
+ asm volatile ("movdqa %0,%%xmm2" : : "m" (gfgenpshufb[l][0][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][0][1][0]));
+ asm volatile ("movdqa %xmm2,%xmm10");
+ asm volatile ("movdqa %xmm7,%xmm15");
+ asm volatile ("pshufb %xmm4,%xmm2");
+ asm volatile ("pshufb %xmm12,%xmm10");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pshufb %xmm13,%xmm15");
+ asm volatile ("pxor %xmm7,%xmm2");
+ asm volatile ("pxor %xmm15,%xmm10");
+
+ /* intermediate disks */
+ for (d = l - 1; d > 0; --d) {
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));
+ asm volatile ("movdqa %0,%%xmm12" : : "m" (v[d][i + 16]));
+
+ asm volatile ("pxor %xmm5,%xmm5");
+ asm volatile ("pxor %xmm13,%xmm13");
+ asm volatile ("pcmpgtb %xmm1,%xmm5");
+ asm volatile ("pcmpgtb %xmm9,%xmm13");
+ asm volatile ("paddb %xmm1,%xmm1");
+ asm volatile ("paddb %xmm9,%xmm9");
+ asm volatile ("pand %xmm3,%xmm5");
+ asm volatile ("pand %xmm3,%xmm13");
+ asm volatile ("pxor %xmm5,%xmm1");
+ asm volatile ("pxor %xmm13,%xmm9");
+
+ asm volatile ("pxor %xmm4,%xmm0");
+ asm volatile ("pxor %xmm4,%xmm1");
+ asm volatile ("pxor %xmm12,%xmm8");
+ asm volatile ("pxor %xmm12,%xmm9");
+
+ asm volatile ("movdqa %xmm4,%xmm5");
+ asm volatile ("movdqa %xmm12,%xmm13");
+ asm volatile ("psrlw $4,%xmm5");
+ asm volatile ("psrlw $4,%xmm13");
+ asm volatile ("pand %xmm11,%xmm4");
+ asm volatile ("pand %xmm11,%xmm12");
+ asm volatile ("pand %xmm11,%xmm5");
+ asm volatile ("pand %xmm11,%xmm13");
+
+ asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][0][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][0][1][0]));
+ asm volatile ("movdqa %xmm6,%xmm14");
+ asm volatile ("movdqa %xmm7,%xmm15");
+ asm volatile ("pshufb %xmm4,%xmm6");
+ asm volatile ("pshufb %xmm12,%xmm14");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pshufb %xmm13,%xmm15");
+ asm volatile ("pxor %xmm6,%xmm2");
+ asm volatile ("pxor %xmm14,%xmm10");
+ asm volatile ("pxor %xmm7,%xmm2");
+ asm volatile ("pxor %xmm15,%xmm10");
+ }
+
+ /* first disk with all coefficients at 1 */
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (v[0][i]));
+ asm volatile ("movdqa %0,%%xmm12" : : "m" (v[0][i + 16]));
+
+ asm volatile ("pxor %xmm5,%xmm5");
+ asm volatile ("pxor %xmm13,%xmm13");
+ asm volatile ("pcmpgtb %xmm1,%xmm5");
+ asm volatile ("pcmpgtb %xmm9,%xmm13");
+ asm volatile ("paddb %xmm1,%xmm1");
+ asm volatile ("paddb %xmm9,%xmm9");
+ asm volatile ("pand %xmm3,%xmm5");
+ asm volatile ("pand %xmm3,%xmm13");
+ asm volatile ("pxor %xmm5,%xmm1");
+ asm volatile ("pxor %xmm13,%xmm9");
+
+ asm volatile ("pxor %xmm4,%xmm0");
+ asm volatile ("pxor %xmm4,%xmm1");
+ asm volatile ("pxor %xmm4,%xmm2");
+ asm volatile ("pxor %xmm12,%xmm8");
+ asm volatile ("pxor %xmm12,%xmm9");
+ asm volatile ("pxor %xmm12,%xmm10");
+
+ asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
+ asm volatile ("movntdq %%xmm8,%0" : "=m" (p[i + 16]));
+ asm volatile ("movntdq %%xmm1,%0" : "=m" (q[i]));
+ asm volatile ("movntdq %%xmm9,%0" : "=m" (q[i + 16]));
+ asm volatile ("movntdq %%xmm2,%0" : "=m" (r[i]));
+ asm volatile ("movntdq %%xmm10,%0" : "=m" (r[i + 16]));
+ }
+
+ raid_sse_end();
+}
+#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_AVX2)
+/*
+ * GEN3 (triple parity with Cauchy matrix) AVX2 implementation
+ *
+ * Note that it uses 16 registers, meaning that x64 is required.
+ */
+void raid_gen3_avx2ext(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *r;
+ int d, l;
+ size_t i;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+ r = v[nd + 2];
+
+ /* special case with only one data disk */
+ if (l == 0) {
+ for (i = 0; i < 3; ++i)
+ memcpy(v[1 + i], v[0], size);
+ return;
+ }
+
+ raid_avx_begin();
+
+ /* generic case with at least two data disks */
+ asm volatile ("vbroadcasti128 %0, %%ymm3" : : "m" (gfconst16.poly[0]));
+ asm volatile ("vbroadcasti128 %0, %%ymm11" : : "m" (gfconst16.low4[0]));
+
+ for (i = 0; i < size; i += 64) {
+ /* last disk without the by two multiplication */
+ asm volatile ("vmovdqa %0,%%ymm4" : : "m" (v[l][i]));
+ asm volatile ("vmovdqa %0,%%ymm12" : : "m" (v[l][i + 32]));
+
+ asm volatile ("vmovdqa %ymm4,%ymm0");
+ asm volatile ("vmovdqa %ymm4,%ymm1");
+ asm volatile ("vmovdqa %ymm12,%ymm8");
+ asm volatile ("vmovdqa %ymm12,%ymm9");
+
+ asm volatile ("vpsrlw $4,%ymm4,%ymm5");
+ asm volatile ("vpsrlw $4,%ymm12,%ymm13");
+ asm volatile ("vpand %ymm11,%ymm4,%ymm4");
+ asm volatile ("vpand %ymm11,%ymm12,%ymm12");
+ asm volatile ("vpand %ymm11,%ymm5,%ymm5");
+ asm volatile ("vpand %ymm11,%ymm13,%ymm13");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm10" : : "m" (gfgenpshufb[l][0][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm15" : : "m" (gfgenpshufb[l][0][1][0]));
+ asm volatile ("vpshufb %ymm4,%ymm10,%ymm2");
+ asm volatile ("vpshufb %ymm12,%ymm10,%ymm10");
+ asm volatile ("vpshufb %ymm5,%ymm15,%ymm7");
+ asm volatile ("vpshufb %ymm13,%ymm15,%ymm15");
+ asm volatile ("vpxor %ymm7,%ymm2,%ymm2");
+ asm volatile ("vpxor %ymm15,%ymm10,%ymm10");
+
+ /* intermediate disks */
+ for (d = l - 1; d > 0; --d) {
+ asm volatile ("vmovdqa %0,%%ymm4" : : "m" (v[d][i]));
+ asm volatile ("vmovdqa %0,%%ymm12" : : "m" (v[d][i + 32]));
+
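+ /*
+ * Multiply the Q accumulators (ymm1/ymm9) by two in GF(2^8):
+ * vpcmpgtb against zero flags the bytes with the top bit set,
+ * vpaddb shifts every byte left by one, and the flagged bytes
+ * are reduced by xoring in the field polynomial kept in ymm3.
+ */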
+ asm volatile ("vpxor %ymm5,%ymm5,%ymm5");
+ asm volatile ("vpxor %ymm13,%ymm13,%ymm13");
+ asm volatile ("vpcmpgtb %ymm1,%ymm5,%ymm5");
+ asm volatile ("vpcmpgtb %ymm9,%ymm13,%ymm13");
+ asm volatile ("vpaddb %ymm1,%ymm1,%ymm1");
+ asm volatile ("vpaddb %ymm9,%ymm9,%ymm9");
+ asm volatile ("vpand %ymm3,%ymm5,%ymm5");
+ asm volatile ("vpand %ymm3,%ymm13,%ymm13");
+ asm volatile ("vpxor %ymm5,%ymm1,%ymm1");
+ asm volatile ("vpxor %ymm13,%ymm9,%ymm9");
+
+ asm volatile ("vpxor %ymm4,%ymm0,%ymm0");
+ asm volatile ("vpxor %ymm4,%ymm1,%ymm1");
+ asm volatile ("vpxor %ymm12,%ymm8,%ymm8");
+ asm volatile ("vpxor %ymm12,%ymm9,%ymm9");
+
+ asm volatile ("vpsrlw $4,%ymm4,%ymm5");
+ asm volatile ("vpsrlw $4,%ymm12,%ymm13");
+ asm volatile ("vpand %ymm11,%ymm4,%ymm4");
+ asm volatile ("vpand %ymm11,%ymm12,%ymm12");
+ asm volatile ("vpand %ymm11,%ymm5,%ymm5");
+ asm volatile ("vpand %ymm11,%ymm13,%ymm13");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm14" : : "m" (gfgenpshufb[d][0][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm15" : : "m" (gfgenpshufb[d][0][1][0]));
+ asm volatile ("vpshufb %ymm4,%ymm14,%ymm6");
+ asm volatile ("vpshufb %ymm12,%ymm14,%ymm14");
+ asm volatile ("vpshufb %ymm5,%ymm15,%ymm7");
+ asm volatile ("vpshufb %ymm13,%ymm15,%ymm15");
+ asm volatile ("vpxor %ymm6,%ymm2,%ymm2");
+ asm volatile ("vpxor %ymm14,%ymm10,%ymm10");
+ asm volatile ("vpxor %ymm7,%ymm2,%ymm2");
+ asm volatile ("vpxor %ymm15,%ymm10,%ymm10");
+ }
+
+ /* first disk with all coefficients at 1 */
+ asm volatile ("vmovdqa %0,%%ymm4" : : "m" (v[0][i]));
+ asm volatile ("vmovdqa %0,%%ymm12" : : "m" (v[0][i + 32]));
+
+ asm volatile ("vpxor %ymm5,%ymm5,%ymm5");
+ asm volatile ("vpxor %ymm13,%ymm13,%ymm13");
+ asm volatile ("vpcmpgtb %ymm1,%ymm5,%ymm5");
+ asm volatile ("vpcmpgtb %ymm9,%ymm13,%ymm13");
+ asm volatile ("vpaddb %ymm1,%ymm1,%ymm1");
+ asm volatile ("vpaddb %ymm9,%ymm9,%ymm9");
+ asm volatile ("vpand %ymm3,%ymm5,%ymm5");
+ asm volatile ("vpand %ymm3,%ymm13,%ymm13");
+ asm volatile ("vpxor %ymm5,%ymm1,%ymm1");
+ asm volatile ("vpxor %ymm13,%ymm9,%ymm9");
+
+ asm volatile ("vpxor %ymm4,%ymm0,%ymm0");
+ asm volatile ("vpxor %ymm4,%ymm1,%ymm1");
+ asm volatile ("vpxor %ymm4,%ymm2,%ymm2");
+ asm volatile ("vpxor %ymm12,%ymm8,%ymm8");
+ asm volatile ("vpxor %ymm12,%ymm9,%ymm9");
+ asm volatile ("vpxor %ymm12,%ymm10,%ymm10");
+
+ asm volatile ("vmovntdq %%ymm0,%0" : "=m" (p[i]));
+ asm volatile ("vmovntdq %%ymm8,%0" : "=m" (p[i + 32]));
+ asm volatile ("vmovntdq %%ymm1,%0" : "=m" (q[i]));
+ asm volatile ("vmovntdq %%ymm9,%0" : "=m" (q[i + 32]));
+ asm volatile ("vmovntdq %%ymm2,%0" : "=m" (r[i]));
+ asm volatile ("vmovntdq %%ymm10,%0" : "=m" (r[i + 32]));
+ }
+
+ raid_avx_end();
+}
+#endif
+
+#if defined(CONFIG_X86) && defined(CONFIG_SSSE3)
+/*
+ * GEN4 (quad parity with Cauchy matrix) SSSE3 implementation
+ */
+void raid_gen4_ssse3(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *r;
+ uint8_t *s;
+ int d, l;
+ size_t i;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+ r = v[nd + 2];
+ s = v[nd + 3];
+
+ /* special case with only one data disk */
+ if (l == 0) {
+ for (i = 0; i < 4; ++i)
+ memcpy(v[1 + i], v[0], size);
+ return;
+ }
+
+ raid_sse_begin();
+
+ /* generic case with at least two data disks */
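+ /*
+ * Only xmm0-xmm7 are available in 32 bit mode, so the low nibble
+ * mask and the polynomial are reloaded from memory on every pass
+ * instead of being parked in spare registers.
+ */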
+ for (i = 0; i < size; i += 16) {
+ /* last disk, without the multiplication by two */
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.low4[0]));
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (v[l][i]));
+
+ asm volatile ("movdqa %xmm4,%xmm0");
+ asm volatile ("movdqa %xmm4,%xmm1");
+
+ asm volatile ("movdqa %xmm4,%xmm5");
+ asm volatile ("psrlw $4,%xmm5");
+ asm volatile ("pand %xmm7,%xmm4");
+ asm volatile ("pand %xmm7,%xmm5");
+
+ asm volatile ("movdqa %0,%%xmm2" : : "m" (gfgenpshufb[l][0][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][0][1][0]));
+ asm volatile ("pshufb %xmm4,%xmm2");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pxor %xmm7,%xmm2");
+
+ asm volatile ("movdqa %0,%%xmm3" : : "m" (gfgenpshufb[l][1][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][1][1][0]));
+ asm volatile ("pshufb %xmm4,%xmm3");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pxor %xmm7,%xmm3");
+
+ /* intermediate disks */
+ for (d = l - 1; d > 0; --d) {
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.poly[0]));
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));
+
+ asm volatile ("pxor %xmm5,%xmm5");
+ asm volatile ("pcmpgtb %xmm1,%xmm5");
+ asm volatile ("paddb %xmm1,%xmm1");
+ asm volatile ("pand %xmm7,%xmm5");
+ asm volatile ("pxor %xmm5,%xmm1");
+
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.low4[0]));
+
+ asm volatile ("pxor %xmm4,%xmm0");
+ asm volatile ("pxor %xmm4,%xmm1");
+
+ asm volatile ("movdqa %xmm4,%xmm5");
+ asm volatile ("psrlw $4,%xmm5");
+ asm volatile ("pand %xmm7,%xmm4");
+ asm volatile ("pand %xmm7,%xmm5");
+
+ asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][0][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][0][1][0]));
+ asm volatile ("pshufb %xmm4,%xmm6");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pxor %xmm6,%xmm2");
+ asm volatile ("pxor %xmm7,%xmm2");
+
+ asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][1][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][1][1][0]));
+ asm volatile ("pshufb %xmm4,%xmm6");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pxor %xmm6,%xmm3");
+ asm volatile ("pxor %xmm7,%xmm3");
+ }
+
+ /* first disk with all coefficients at 1 */
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.poly[0]));
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (v[0][i]));
+
+ asm volatile ("pxor %xmm5,%xmm5");
+ asm volatile ("pcmpgtb %xmm1,%xmm5");
+ asm volatile ("paddb %xmm1,%xmm1");
+ asm volatile ("pand %xmm7,%xmm5");
+ asm volatile ("pxor %xmm5,%xmm1");
+
+ asm volatile ("pxor %xmm4,%xmm0");
+ asm volatile ("pxor %xmm4,%xmm1");
+ asm volatile ("pxor %xmm4,%xmm2");
+ asm volatile ("pxor %xmm4,%xmm3");
+
+ asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
+ asm volatile ("movntdq %%xmm1,%0" : "=m" (q[i]));
+ asm volatile ("movntdq %%xmm2,%0" : "=m" (r[i]));
+ asm volatile ("movntdq %%xmm3,%0" : "=m" (s[i]));
+ }
+
+ raid_sse_end();
+}
+#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_SSSE3)
+/*
+ * GEN4 (quad parity with Cauchy matrix) SSSE3 implementation
+ *
+ * Note that it uses 16 registers, meaning that x64 is required.
+ */
+void raid_gen4_ssse3ext(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *r;
+ uint8_t *s;
+ int d, l;
+ size_t i;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+ r = v[nd + 2];
+ s = v[nd + 3];
+
+ /* special case with only one data disk */
+ if (l == 0) {
+ for (i = 0; i < 4; ++i)
+ memcpy(v[1 + i], v[0], size);
+ return;
+ }
+
+ raid_sse_begin();
+
+ /* generic case with at least two data disks */
+ for (i = 0; i < size; i += 32) {
+ /* last disk, without the multiplication by two */
+ asm volatile ("movdqa %0,%%xmm15" : : "m" (gfconst16.low4[0]));
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (v[l][i]));
+ asm volatile ("movdqa %0,%%xmm12" : : "m" (v[l][i + 16]));
+
+ asm volatile ("movdqa %xmm4,%xmm0");
+ asm volatile ("movdqa %xmm4,%xmm1");
+ asm volatile ("movdqa %xmm12,%xmm8");
+ asm volatile ("movdqa %xmm12,%xmm9");
+
+ asm volatile ("movdqa %xmm4,%xmm5");
+ asm volatile ("movdqa %xmm12,%xmm13");
+ asm volatile ("psrlw $4,%xmm5");
+ asm volatile ("psrlw $4,%xmm13");
+ asm volatile ("pand %xmm15,%xmm4");
+ asm volatile ("pand %xmm15,%xmm12");
+ asm volatile ("pand %xmm15,%xmm5");
+ asm volatile ("pand %xmm15,%xmm13");
+
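+ /*
+ * GF(2^8) multiply by this disk's Cauchy coefficient: the low and
+ * high nibbles (xmm4/xmm12 and xmm5/xmm13) each index a 16 entry
+ * pshufb table and the two partial products are xored together,
+ * here forming the R parity term of the last disk.
+ */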
+ asm volatile ("movdqa %0,%%xmm2" : : "m" (gfgenpshufb[l][0][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][0][1][0]));
+ asm volatile ("movdqa %xmm2,%xmm10");
+ asm volatile ("movdqa %xmm7,%xmm15");
+ asm volatile ("pshufb %xmm4,%xmm2");
+ asm volatile ("pshufb %xmm12,%xmm10");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pshufb %xmm13,%xmm15");
+ asm volatile ("pxor %xmm7,%xmm2");
+ asm volatile ("pxor %xmm15,%xmm10");
+
+ asm volatile ("movdqa %0,%%xmm3" : : "m" (gfgenpshufb[l][1][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][1][1][0]));
+ asm volatile ("movdqa %xmm3,%xmm11");
+ asm volatile ("movdqa %xmm7,%xmm15");
+ asm volatile ("pshufb %xmm4,%xmm3");
+ asm volatile ("pshufb %xmm12,%xmm11");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pshufb %xmm13,%xmm15");
+ asm volatile ("pxor %xmm7,%xmm3");
+ asm volatile ("pxor %xmm15,%xmm11");
+
+ /* intermediate disks */
+ for (d = l - 1; d > 0; --d) {
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.poly[0]));
+ asm volatile ("movdqa %0,%%xmm15" : : "m" (gfconst16.low4[0]));
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));
+ asm volatile ("movdqa %0,%%xmm12" : : "m" (v[d][i + 16]));
+
+ asm volatile ("pxor %xmm5,%xmm5");
+ asm volatile ("pxor %xmm13,%xmm13");
+ asm volatile ("pcmpgtb %xmm1,%xmm5");
+ asm volatile ("pcmpgtb %xmm9,%xmm13");
+ asm volatile ("paddb %xmm1,%xmm1");
+ asm volatile ("paddb %xmm9,%xmm9");
+ asm volatile ("pand %xmm7,%xmm5");
+ asm volatile ("pand %xmm7,%xmm13");
+ asm volatile ("pxor %xmm5,%xmm1");
+ asm volatile ("pxor %xmm13,%xmm9");
+
+ asm volatile ("pxor %xmm4,%xmm0");
+ asm volatile ("pxor %xmm4,%xmm1");
+ asm volatile ("pxor %xmm12,%xmm8");
+ asm volatile ("pxor %xmm12,%xmm9");
+
+ asm volatile ("movdqa %xmm4,%xmm5");
+ asm volatile ("movdqa %xmm12,%xmm13");
+ asm volatile ("psrlw $4,%xmm5");
+ asm volatile ("psrlw $4,%xmm13");
+ asm volatile ("pand %xmm15,%xmm4");
+ asm volatile ("pand %xmm15,%xmm12");
+ asm volatile ("pand %xmm15,%xmm5");
+ asm volatile ("pand %xmm15,%xmm13");
+
+ asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][0][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][0][1][0]));
+ asm volatile ("movdqa %xmm6,%xmm14");
+ asm volatile ("movdqa %xmm7,%xmm15");
+ asm volatile ("pshufb %xmm4,%xmm6");
+ asm volatile ("pshufb %xmm12,%xmm14");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pshufb %xmm13,%xmm15");
+ asm volatile ("pxor %xmm6,%xmm2");
+ asm volatile ("pxor %xmm14,%xmm10");
+ asm volatile ("pxor %xmm7,%xmm2");
+ asm volatile ("pxor %xmm15,%xmm10");
+
+ asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][1][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][1][1][0]));
+ asm volatile ("movdqa %xmm6,%xmm14");
+ asm volatile ("movdqa %xmm7,%xmm15");
+ asm volatile ("pshufb %xmm4,%xmm6");
+ asm volatile ("pshufb %xmm12,%xmm14");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pshufb %xmm13,%xmm15");
+ asm volatile ("pxor %xmm6,%xmm3");
+ asm volatile ("pxor %xmm14,%xmm11");
+ asm volatile ("pxor %xmm7,%xmm3");
+ asm volatile ("pxor %xmm15,%xmm11");
+ }
+
+ /* first disk with all coefficients at 1 */
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.poly[0]));
+ asm volatile ("movdqa %0,%%xmm15" : : "m" (gfconst16.low4[0]));
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (v[0][i]));
+ asm volatile ("movdqa %0,%%xmm12" : : "m" (v[0][i + 16]));
+
+ asm volatile ("pxor %xmm5,%xmm5");
+ asm volatile ("pxor %xmm13,%xmm13");
+ asm volatile ("pcmpgtb %xmm1,%xmm5");
+ asm volatile ("pcmpgtb %xmm9,%xmm13");
+ asm volatile ("paddb %xmm1,%xmm1");
+ asm volatile ("paddb %xmm9,%xmm9");
+ asm volatile ("pand %xmm7,%xmm5");
+ asm volatile ("pand %xmm7,%xmm13");
+ asm volatile ("pxor %xmm5,%xmm1");
+ asm volatile ("pxor %xmm13,%xmm9");
+
+ asm volatile ("pxor %xmm4,%xmm0");
+ asm volatile ("pxor %xmm4,%xmm1");
+ asm volatile ("pxor %xmm4,%xmm2");
+ asm volatile ("pxor %xmm4,%xmm3");
+ asm volatile ("pxor %xmm12,%xmm8");
+ asm volatile ("pxor %xmm12,%xmm9");
+ asm volatile ("pxor %xmm12,%xmm10");
+ asm volatile ("pxor %xmm12,%xmm11");
+
+ asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
+ asm volatile ("movntdq %%xmm8,%0" : "=m" (p[i + 16]));
+ asm volatile ("movntdq %%xmm1,%0" : "=m" (q[i]));
+ asm volatile ("movntdq %%xmm9,%0" : "=m" (q[i + 16]));
+ asm volatile ("movntdq %%xmm2,%0" : "=m" (r[i]));
+ asm volatile ("movntdq %%xmm10,%0" : "=m" (r[i + 16]));
+ asm volatile ("movntdq %%xmm3,%0" : "=m" (s[i]));
+ asm volatile ("movntdq %%xmm11,%0" : "=m" (s[i + 16]));
+ }
+
+ raid_sse_end();
+}
+#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_AVX2)
+/*
+ * GEN4 (quad parity with Cauchy matrix) AVX2 implementation
+ *
+ * Note that it uses 16 registers, meaning that x64 is required.
+ */
+void raid_gen4_avx2ext(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *r;
+ uint8_t *s;
+ int d, l;
+ size_t i;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+ r = v[nd + 2];
+ s = v[nd + 3];
+
+ /* special case with only one data disk */
+ if (l == 0) {
+ for (i = 0; i < 4; ++i)
+ memcpy(v[1 + i], v[0], size);
+ return;
+ }
+
+ raid_avx_begin();
+
+ /* generic case with at least two data disks */
+ for (i = 0; i < size; i += 64) {
+ /* last disk, without the multiplication by two */
+ asm volatile ("vbroadcasti128 %0,%%ymm15" : : "m" (gfconst16.low4[0]));
+ asm volatile ("vmovdqa %0,%%ymm4" : : "m" (v[l][i]));
+ asm volatile ("vmovdqa %0,%%ymm12" : : "m" (v[l][i + 32]));
+
+ asm volatile ("vmovdqa %ymm4,%ymm0");
+ asm volatile ("vmovdqa %ymm4,%ymm1");
+ asm volatile ("vmovdqa %ymm12,%ymm8");
+ asm volatile ("vmovdqa %ymm12,%ymm9");
+
+ asm volatile ("vpsrlw $4,%ymm4,%ymm5");
+ asm volatile ("vpsrlw $4,%ymm12,%ymm13");
+ asm volatile ("vpand %ymm15,%ymm4,%ymm4");
+ asm volatile ("vpand %ymm15,%ymm12,%ymm12");
+ asm volatile ("vpand %ymm15,%ymm5,%ymm5");
+ asm volatile ("vpand %ymm15,%ymm13,%ymm13");
+
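+ /*
+ * The pshufb lookup tables are only 16 bytes, so vbroadcasti128
+ * copies them into both 128 bit lanes; vpshufb then performs the
+ * same nibble lookup independently in each lane.
+ */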
+ asm volatile ("vbroadcasti128 %0,%%ymm10" : : "m" (gfgenpshufb[l][0][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm15" : : "m" (gfgenpshufb[l][0][1][0]));
+ asm volatile ("vpshufb %ymm4,%ymm10,%ymm2");
+ asm volatile ("vpshufb %ymm5,%ymm15,%ymm7");
+ asm volatile ("vpshufb %ymm12,%ymm10,%ymm10");
+ asm volatile ("vpshufb %ymm13,%ymm15,%ymm15");
+ asm volatile ("vpxor %ymm7,%ymm2,%ymm2");
+ asm volatile ("vpxor %ymm15,%ymm10,%ymm10");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm11" : : "m" (gfgenpshufb[l][1][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm15" : : "m" (gfgenpshufb[l][1][1][0]));
+ asm volatile ("vpshufb %ymm4,%ymm11,%ymm3");
+ asm volatile ("vpshufb %ymm5,%ymm15,%ymm7");
+ asm volatile ("vpshufb %ymm12,%ymm11,%ymm11");
+ asm volatile ("vpshufb %ymm13,%ymm15,%ymm15");
+ asm volatile ("vpxor %ymm7,%ymm3,%ymm3");
+ asm volatile ("vpxor %ymm15,%ymm11,%ymm11");
+
+ /* intermediate disks */
+ for (d = l - 1; d > 0; --d) {
+ asm volatile ("vbroadcasti128 %0,%%ymm7" : : "m" (gfconst16.poly[0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm15" : : "m" (gfconst16.low4[0]));
+ asm volatile ("vmovdqa %0,%%ymm4" : : "m" (v[d][i]));
+ asm volatile ("vmovdqa %0,%%ymm12" : : "m" (v[d][i + 32]));
+
+ asm volatile ("vpxor %ymm5,%ymm5,%ymm5");
+ asm volatile ("vpxor %ymm13,%ymm13,%ymm13");
+ asm volatile ("vpcmpgtb %ymm1,%ymm5,%ymm5");
+ asm volatile ("vpcmpgtb %ymm9,%ymm13,%ymm13");
+ asm volatile ("vpaddb %ymm1,%ymm1,%ymm1");
+ asm volatile ("vpaddb %ymm9,%ymm9,%ymm9");
+ asm volatile ("vpand %ymm7,%ymm5,%ymm5");
+ asm volatile ("vpand %ymm7,%ymm13,%ymm13");
+ asm volatile ("vpxor %ymm5,%ymm1,%ymm1");
+ asm volatile ("vpxor %ymm13,%ymm9,%ymm9");
+
+ asm volatile ("vpxor %ymm4,%ymm0,%ymm0");
+ asm volatile ("vpxor %ymm4,%ymm1,%ymm1");
+ asm volatile ("vpxor %ymm12,%ymm8,%ymm8");
+ asm volatile ("vpxor %ymm12,%ymm9,%ymm9");
+
+ asm volatile ("vpsrlw $4,%ymm4,%ymm5");
+ asm volatile ("vpsrlw $4,%ymm12,%ymm13");
+ asm volatile ("vpand %ymm15,%ymm4,%ymm4");
+ asm volatile ("vpand %ymm15,%ymm12,%ymm12");
+ asm volatile ("vpand %ymm15,%ymm5,%ymm5");
+ asm volatile ("vpand %ymm15,%ymm13,%ymm13");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm14" : : "m" (gfgenpshufb[d][0][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm15" : : "m" (gfgenpshufb[d][0][1][0]));
+ asm volatile ("vpshufb %ymm4,%ymm14,%ymm6");
+ asm volatile ("vpshufb %ymm5,%ymm15,%ymm7");
+ asm volatile ("vpshufb %ymm12,%ymm14,%ymm14");
+ asm volatile ("vpshufb %ymm13,%ymm15,%ymm15");
+ asm volatile ("vpxor %ymm6,%ymm2,%ymm2");
+ asm volatile ("vpxor %ymm14,%ymm10,%ymm10");
+ asm volatile ("vpxor %ymm7,%ymm2,%ymm2");
+ asm volatile ("vpxor %ymm15,%ymm10,%ymm10");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm14" : : "m" (gfgenpshufb[d][1][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm15" : : "m" (gfgenpshufb[d][1][1][0]));
+ asm volatile ("vpshufb %ymm4,%ymm14,%ymm6");
+ asm volatile ("vpshufb %ymm5,%ymm15,%ymm7");
+ asm volatile ("vpshufb %ymm12,%ymm14,%ymm14");
+ asm volatile ("vpshufb %ymm13,%ymm15,%ymm15");
+ asm volatile ("vpxor %ymm6,%ymm3,%ymm3");
+ asm volatile ("vpxor %ymm14,%ymm11,%ymm11");
+ asm volatile ("vpxor %ymm7,%ymm3,%ymm3");
+ asm volatile ("vpxor %ymm15,%ymm11,%ymm11");
+ }
+
+ /* first disk with all coefficients at 1 */
+ asm volatile ("vbroadcasti128 %0,%%ymm7" : : "m" (gfconst16.poly[0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm15" : : "m" (gfconst16.low4[0]));
+ asm volatile ("vmovdqa %0,%%ymm4" : : "m" (v[0][i]));
+ asm volatile ("vmovdqa %0,%%ymm12" : : "m" (v[0][i + 32]));
+
+ asm volatile ("vpxor %ymm5,%ymm5,%ymm5");
+ asm volatile ("vpxor %ymm13,%ymm13,%ymm13");
+ asm volatile ("vpcmpgtb %ymm1,%ymm5,%ymm5");
+ asm volatile ("vpcmpgtb %ymm9,%ymm13,%ymm13");
+ asm volatile ("vpaddb %ymm1,%ymm1,%ymm1");
+ asm volatile ("vpaddb %ymm9,%ymm9,%ymm9");
+ asm volatile ("vpand %ymm7,%ymm5,%ymm5");
+ asm volatile ("vpand %ymm7,%ymm13,%ymm13");
+ asm volatile ("vpxor %ymm5,%ymm1,%ymm1");
+ asm volatile ("vpxor %ymm13,%ymm9,%ymm9");
+
+ asm volatile ("vpxor %ymm4,%ymm0,%ymm0");
+ asm volatile ("vpxor %ymm4,%ymm1,%ymm1");
+ asm volatile ("vpxor %ymm4,%ymm2,%ymm2");
+ asm volatile ("vpxor %ymm4,%ymm3,%ymm3");
+ asm volatile ("vpxor %ymm12,%ymm8,%ymm8");
+ asm volatile ("vpxor %ymm12,%ymm9,%ymm9");
+ asm volatile ("vpxor %ymm12,%ymm10,%ymm10");
+ asm volatile ("vpxor %ymm12,%ymm11,%ymm11");
+
+ asm volatile ("vmovntdq %%ymm0,%0" : "=m" (p[i]));
+ asm volatile ("vmovntdq %%ymm8,%0" : "=m" (p[i + 32]));
+ asm volatile ("vmovntdq %%ymm1,%0" : "=m" (q[i]));
+ asm volatile ("vmovntdq %%ymm9,%0" : "=m" (q[i + 32]));
+ asm volatile ("vmovntdq %%ymm2,%0" : "=m" (r[i]));
+ asm volatile ("vmovntdq %%ymm10,%0" : "=m" (r[i + 32]));
+ asm volatile ("vmovntdq %%ymm3,%0" : "=m" (s[i]));
+ asm volatile ("vmovntdq %%ymm11,%0" : "=m" (s[i + 32]));
+ }
+
+ raid_avx_end();
+}
+#endif
+
+#if defined(CONFIG_X86) && defined(CONFIG_SSSE3)
+/*
+ * GEN5 (penta parity with Cauchy matrix) SSSE3 implementation
+ */
+void raid_gen5_ssse3(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *r;
+ uint8_t *s;
+ uint8_t *t;
+ int d, l;
+ size_t i;
+ uint8_t buffer[16+16];
+ uint8_t *pd = __align_ptr(buffer, 16);
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+ r = v[nd + 2];
+ s = v[nd + 3];
+ t = v[nd + 4];
+
+ /* special case with only one data disk */
+ if (l == 0) {
+ for (i = 0; i < 5; ++i)
+ memcpy(v[1 + i], v[0], size);
+ return;
+ }
+
+ raid_sse_begin();
+
+ /* generic case with at least two data disks */
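+ /*
+ * With only eight xmm registers the P parity cannot stay resident:
+ * it is accumulated in the aligned stack buffer pd[], while xmm0
+ * carries Q and xmm1-xmm3 the three pshufb based parities.
+ */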
+ for (i = 0; i < size; i += 16) {
+ /* last disk, without the multiplication by two */
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (v[l][i]));
+
+ asm volatile ("movdqa %xmm4,%xmm0");
+ asm volatile ("movdqa %%xmm4,%0" : "=m" (pd[0]));
+
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.low4[0]));
+ asm volatile ("movdqa %xmm4,%xmm5");
+ asm volatile ("psrlw $4,%xmm5");
+ asm volatile ("pand %xmm7,%xmm4");
+ asm volatile ("pand %xmm7,%xmm5");
+
+ asm volatile ("movdqa %0,%%xmm1" : : "m" (gfgenpshufb[l][0][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][0][1][0]));
+ asm volatile ("pshufb %xmm4,%xmm1");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pxor %xmm7,%xmm1");
+
+ asm volatile ("movdqa %0,%%xmm2" : : "m" (gfgenpshufb[l][1][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][1][1][0]));
+ asm volatile ("pshufb %xmm4,%xmm2");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pxor %xmm7,%xmm2");
+
+ asm volatile ("movdqa %0,%%xmm3" : : "m" (gfgenpshufb[l][2][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][2][1][0]));
+ asm volatile ("pshufb %xmm4,%xmm3");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pxor %xmm7,%xmm3");
+
+ /* intermediate disks */
+ for (d = l - 1; d > 0; --d) {
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));
+ asm volatile ("movdqa %0,%%xmm6" : : "m" (pd[0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.poly[0]));
+
+ asm volatile ("pxor %xmm5,%xmm5");
+ asm volatile ("pcmpgtb %xmm0,%xmm5");
+ asm volatile ("paddb %xmm0,%xmm0");
+ asm volatile ("pand %xmm7,%xmm5");
+ asm volatile ("pxor %xmm5,%xmm0");
+
+ asm volatile ("pxor %xmm4,%xmm0");
+ asm volatile ("pxor %xmm4,%xmm6");
+ asm volatile ("movdqa %%xmm6,%0" : "=m" (pd[0]));
+
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.low4[0]));
+ asm volatile ("movdqa %xmm4,%xmm5");
+ asm volatile ("psrlw $4,%xmm5");
+ asm volatile ("pand %xmm7,%xmm4");
+ asm volatile ("pand %xmm7,%xmm5");
+
+ asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][0][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][0][1][0]));
+ asm volatile ("pshufb %xmm4,%xmm6");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pxor %xmm6,%xmm1");
+ asm volatile ("pxor %xmm7,%xmm1");
+
+ asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][1][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][1][1][0]));
+ asm volatile ("pshufb %xmm4,%xmm6");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pxor %xmm6,%xmm2");
+ asm volatile ("pxor %xmm7,%xmm2");
+
+ asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][2][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][2][1][0]));
+ asm volatile ("pshufb %xmm4,%xmm6");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pxor %xmm6,%xmm3");
+ asm volatile ("pxor %xmm7,%xmm3");
+ }
+
+ /* first disk with all coefficients at 1 */
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (v[0][i]));
+ asm volatile ("movdqa %0,%%xmm6" : : "m" (pd[0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.poly[0]));
+
+ asm volatile ("pxor %xmm5,%xmm5");
+ asm volatile ("pcmpgtb %xmm0,%xmm5");
+ asm volatile ("paddb %xmm0,%xmm0");
+ asm volatile ("pand %xmm7,%xmm5");
+ asm volatile ("pxor %xmm5,%xmm0");
+
+ asm volatile ("pxor %xmm4,%xmm0");
+ asm volatile ("pxor %xmm4,%xmm1");
+ asm volatile ("pxor %xmm4,%xmm2");
+ asm volatile ("pxor %xmm4,%xmm3");
+ asm volatile ("pxor %xmm4,%xmm6");
+
+ asm volatile ("movntdq %%xmm6,%0" : "=m" (p[i]));
+ asm volatile ("movntdq %%xmm0,%0" : "=m" (q[i]));
+ asm volatile ("movntdq %%xmm1,%0" : "=m" (r[i]));
+ asm volatile ("movntdq %%xmm2,%0" : "=m" (s[i]));
+ asm volatile ("movntdq %%xmm3,%0" : "=m" (t[i]));
+ }
+
+ raid_sse_end();
+}
+#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_SSSE3)
+/*
+ * GEN5 (penta parity with Cauchy matrix) SSSE3 implementation
+ *
+ * Note that it uses 16 registers, meaning that x64 is required.
+ */
+void raid_gen5_ssse3ext(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *r;
+ uint8_t *s;
+ uint8_t *t;
+ int d, l;
+ size_t i;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+ r = v[nd + 2];
+ s = v[nd + 3];
+ t = v[nd + 4];
+
+ /* special case with only one data disk */
+ if (l == 0) {
+ for (i = 0; i < 5; ++i)
+ memcpy(v[1 + i], v[0], size);
+ return;
+ }
+
+ raid_sse_begin();
+
+ /* generic case with at least two data disks */
+ asm volatile ("movdqa %0,%%xmm14" : : "m" (gfconst16.poly[0]));
+ asm volatile ("movdqa %0,%%xmm15" : : "m" (gfconst16.low4[0]));
+
+ for (i = 0; i < size; i += 16) {
+ /* last disk, without the multiplication by two */
+ asm volatile ("movdqa %0,%%xmm10" : : "m" (v[l][i]));
+
+ asm volatile ("movdqa %xmm10,%xmm0");
+ asm volatile ("movdqa %xmm10,%xmm1");
+
+ asm volatile ("movdqa %xmm10,%xmm11");
+ asm volatile ("psrlw $4,%xmm11");
+ asm volatile ("pand %xmm15,%xmm10");
+ asm volatile ("pand %xmm15,%xmm11");
+
+ asm volatile ("movdqa %0,%%xmm2" : : "m" (gfgenpshufb[l][0][0][0]));
+ asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[l][0][1][0]));
+ asm volatile ("pshufb %xmm10,%xmm2");
+ asm volatile ("pshufb %xmm11,%xmm13");
+ asm volatile ("pxor %xmm13,%xmm2");
+
+ asm volatile ("movdqa %0,%%xmm3" : : "m" (gfgenpshufb[l][1][0][0]));
+ asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[l][1][1][0]));
+ asm volatile ("pshufb %xmm10,%xmm3");
+ asm volatile ("pshufb %xmm11,%xmm13");
+ asm volatile ("pxor %xmm13,%xmm3");
+
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (gfgenpshufb[l][2][0][0]));
+ asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[l][2][1][0]));
+ asm volatile ("pshufb %xmm10,%xmm4");
+ asm volatile ("pshufb %xmm11,%xmm13");
+ asm volatile ("pxor %xmm13,%xmm4");
+
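+ /*
+ * The Q coefficients are the power sequence 2^d, so Q is evaluated
+ * by Horner's rule: the accumulator in xmm1 is doubled in GF(2^8)
+ * before each earlier disk is xored in, while P in xmm0 is a plain
+ * xor of all disks.
+ */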
+ /* intermediate disks */
+ for (d = l - 1; d > 0; --d) {
+ asm volatile ("movdqa %0,%%xmm10" : : "m" (v[d][i]));
+
+ asm volatile ("pxor %xmm11,%xmm11");
+ asm volatile ("pcmpgtb %xmm1,%xmm11");
+ asm volatile ("paddb %xmm1,%xmm1");
+ asm volatile ("pand %xmm14,%xmm11");
+ asm volatile ("pxor %xmm11,%xmm1");
+
+ asm volatile ("pxor %xmm10,%xmm0");
+ asm volatile ("pxor %xmm10,%xmm1");
+
+ asm volatile ("movdqa %xmm10,%xmm11");
+ asm volatile ("psrlw $4,%xmm11");
+ asm volatile ("pand %xmm15,%xmm10");
+ asm volatile ("pand %xmm15,%xmm11");
+
+ asm volatile ("movdqa %0,%%xmm12" : : "m" (gfgenpshufb[d][0][0][0]));
+ asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[d][0][1][0]));
+ asm volatile ("pshufb %xmm10,%xmm12");
+ asm volatile ("pshufb %xmm11,%xmm13");
+ asm volatile ("pxor %xmm12,%xmm2");
+ asm volatile ("pxor %xmm13,%xmm2");
+
+ asm volatile ("movdqa %0,%%xmm12" : : "m" (gfgenpshufb[d][1][0][0]));
+ asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[d][1][1][0]));
+ asm volatile ("pshufb %xmm10,%xmm12");
+ asm volatile ("pshufb %xmm11,%xmm13");
+ asm volatile ("pxor %xmm12,%xmm3");
+ asm volatile ("pxor %xmm13,%xmm3");
+
+ asm volatile ("movdqa %0,%%xmm12" : : "m" (gfgenpshufb[d][2][0][0]));
+ asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[d][2][1][0]));
+ asm volatile ("pshufb %xmm10,%xmm12");
+ asm volatile ("pshufb %xmm11,%xmm13");
+ asm volatile ("pxor %xmm12,%xmm4");
+ asm volatile ("pxor %xmm13,%xmm4");
+ }
+
+ /* first disk with all coefficients at 1 */
+ asm volatile ("movdqa %0,%%xmm10" : : "m" (v[0][i]));
+
+ asm volatile ("pxor %xmm11,%xmm11");
+ asm volatile ("pcmpgtb %xmm1,%xmm11");
+ asm volatile ("paddb %xmm1,%xmm1");
+ asm volatile ("pand %xmm14,%xmm11");
+ asm volatile ("pxor %xmm11,%xmm1");
+
+ asm volatile ("pxor %xmm10,%xmm0");
+ asm volatile ("pxor %xmm10,%xmm1");
+ asm volatile ("pxor %xmm10,%xmm2");
+ asm volatile ("pxor %xmm10,%xmm3");
+ asm volatile ("pxor %xmm10,%xmm4");
+
+ asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
+ asm volatile ("movntdq %%xmm1,%0" : "=m" (q[i]));
+ asm volatile ("movntdq %%xmm2,%0" : "=m" (r[i]));
+ asm volatile ("movntdq %%xmm3,%0" : "=m" (s[i]));
+ asm volatile ("movntdq %%xmm4,%0" : "=m" (t[i]));
+ }
+
+ raid_sse_end();
+}
+#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_AVX2)
+/*
+ * GEN5 (penta parity with Cauchy matrix) AVX2 implementation
+ *
+ * Note that it uses 16 registers, meaning that x64 is required.
+ */
+void raid_gen5_avx2ext(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *r;
+ uint8_t *s;
+ uint8_t *t;
+ int d, l;
+ size_t i;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+ r = v[nd + 2];
+ s = v[nd + 3];
+ t = v[nd + 4];
+
+ /* special case with only one data disk */
+ if (l == 0) {
+ for (i = 0; i < 5; ++i)
+ memcpy(v[1 + i], v[0], size);
+ return;
+ }
+
+ raid_avx_begin();
+
+ /* generic case with at least two data disks */
+ asm volatile ("vpxor %ymm8,%ymm8,%ymm8");
+ asm volatile ("vbroadcasti128 %0,%%ymm14" : : "m" (gfconst16.poly[0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm15" : : "m" (gfconst16.low4[0]));
+
+ for (i = 0; i < size; i += 32) {
+ /* last disk, without the multiplication by two */
+ asm volatile ("vmovdqa %0,%%ymm10" : : "m" (v[l][i]));
+
+ asm volatile ("vmovdqa %ymm10,%ymm0");
+ asm volatile ("vmovdqa %ymm10,%ymm1");
+
+ asm volatile ("vpsrlw $4,%ymm10,%ymm11");
+ asm volatile ("vpand %ymm15,%ymm10,%ymm10");
+ asm volatile ("vpand %ymm15,%ymm11,%ymm11");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm2" : : "m" (gfgenpshufb[l][0][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[l][0][1][0]));
+ asm volatile ("vpshufb %ymm10,%ymm2,%ymm2");
+ asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
+ asm volatile ("vpxor %ymm13,%ymm2,%ymm2");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm3" : : "m" (gfgenpshufb[l][1][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[l][1][1][0]));
+ asm volatile ("vpshufb %ymm10,%ymm3,%ymm3");
+ asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
+ asm volatile ("vpxor %ymm13,%ymm3,%ymm3");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm4" : : "m" (gfgenpshufb[l][2][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[l][2][1][0]));
+ asm volatile ("vpshufb %ymm10,%ymm4,%ymm4");
+ asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
+ asm volatile ("vpxor %ymm13,%ymm4,%ymm4");
+
+ /* intermediate disks */
+ for (d = l - 1; d > 0; --d) {
+ asm volatile ("vmovdqa %0,%%ymm10" : : "m" (v[d][i]));
+
+ asm volatile ("vpcmpgtb %ymm1,%ymm8,%ymm11");
+ asm volatile ("vpaddb %ymm1,%ymm1,%ymm1");
+ asm volatile ("vpand %ymm14,%ymm11,%ymm11");
+ asm volatile ("vpxor %ymm11,%ymm1,%ymm1");
+
+ asm volatile ("vpxor %ymm10,%ymm0,%ymm0");
+ asm volatile ("vpxor %ymm10,%ymm1,%ymm1");
+
+ asm volatile ("vpsrlw $4,%ymm10,%ymm11");
+ asm volatile ("vpand %ymm15,%ymm10,%ymm10");
+ asm volatile ("vpand %ymm15,%ymm11,%ymm11");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm12" : : "m" (gfgenpshufb[d][0][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[d][0][1][0]));
+ asm volatile ("vpshufb %ymm10,%ymm12,%ymm12");
+ asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
+ asm volatile ("vpxor %ymm12,%ymm2,%ymm2");
+ asm volatile ("vpxor %ymm13,%ymm2,%ymm2");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm12" : : "m" (gfgenpshufb[d][1][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[d][1][1][0]));
+ asm volatile ("vpshufb %ymm10,%ymm12,%ymm12");
+ asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
+ asm volatile ("vpxor %ymm12,%ymm3,%ymm3");
+ asm volatile ("vpxor %ymm13,%ymm3,%ymm3");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm12" : : "m" (gfgenpshufb[d][2][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[d][2][1][0]));
+ asm volatile ("vpshufb %ymm10,%ymm12,%ymm12");
+ asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
+ asm volatile ("vpxor %ymm12,%ymm4,%ymm4");
+ asm volatile ("vpxor %ymm13,%ymm4,%ymm4");
+ }
+
+ /* first disk with all coefficients at 1 */
+ asm volatile ("vmovdqa %0,%%ymm10" : : "m" (v[0][i]));
+
+ asm volatile ("vpcmpgtb %ymm1,%ymm8,%ymm11");
+ asm volatile ("vpaddb %ymm1,%ymm1,%ymm1");
+ asm volatile ("vpand %ymm14,%ymm11,%ymm11");
+ asm volatile ("vpxor %ymm11,%ymm1,%ymm1");
+
+ asm volatile ("vpxor %ymm10,%ymm0,%ymm0");
+ asm volatile ("vpxor %ymm10,%ymm1,%ymm1");
+ asm volatile ("vpxor %ymm10,%ymm2,%ymm2");
+ asm volatile ("vpxor %ymm10,%ymm3,%ymm3");
+ asm volatile ("vpxor %ymm10,%ymm4,%ymm4");
+
+ asm volatile ("vmovntdq %%ymm0,%0" : "=m" (p[i]));
+ asm volatile ("vmovntdq %%ymm1,%0" : "=m" (q[i]));
+ asm volatile ("vmovntdq %%ymm2,%0" : "=m" (r[i]));
+ asm volatile ("vmovntdq %%ymm3,%0" : "=m" (s[i]));
+ asm volatile ("vmovntdq %%ymm4,%0" : "=m" (t[i]));
+ }
+
+ raid_avx_end();
+}
+#endif
+
+#if defined(CONFIG_X86) && defined(CONFIG_SSSE3)
+/*
+ * GEN6 (hexa parity with Cauchy matrix) SSSE3 implementation
+ */
+void raid_gen6_ssse3(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *r;
+ uint8_t *s;
+ uint8_t *t;
+ uint8_t *u;
+ int d, l;
+ size_t i;
+ uint8_t buffer[2*16+16];
+ uint8_t *pd = __align_ptr(buffer, 16);
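+ /*
+ * pd[0] and pd[16] hold the P and Q accumulators: in 32 bit mode
+ * they do not fit in registers alongside the four pshufb based
+ * parities kept in xmm0-xmm3.
+ */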
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+ r = v[nd + 2];
+ s = v[nd + 3];
+ t = v[nd + 4];
+ u = v[nd + 5];
+
+ /* special case with only one data disk */
+ if (l == 0) {
+ for (i = 0; i < 6; ++i)
+ memcpy(v[1 + i], v[0], size);
+ return;
+ }
+
+ raid_sse_begin();
+
+ /* generic case with at least two data disks */
+ for (i = 0; i < size; i += 16) {
+ /* last disk, without the multiplication by two */
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (v[l][i]));
+
+ asm volatile ("movdqa %%xmm4,%0" : "=m" (pd[0]));
+ asm volatile ("movdqa %%xmm4,%0" : "=m" (pd[16]));
+
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.low4[0]));
+ asm volatile ("movdqa %xmm4,%xmm5");
+ asm volatile ("psrlw $4,%xmm5");
+ asm volatile ("pand %xmm7,%xmm4");
+ asm volatile ("pand %xmm7,%xmm5");
+
+ asm volatile ("movdqa %0,%%xmm0" : : "m" (gfgenpshufb[l][0][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][0][1][0]));
+ asm volatile ("pshufb %xmm4,%xmm0");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pxor %xmm7,%xmm0");
+
+ asm volatile ("movdqa %0,%%xmm1" : : "m" (gfgenpshufb[l][1][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][1][1][0]));
+ asm volatile ("pshufb %xmm4,%xmm1");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pxor %xmm7,%xmm1");
+
+ asm volatile ("movdqa %0,%%xmm2" : : "m" (gfgenpshufb[l][2][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][2][1][0]));
+ asm volatile ("pshufb %xmm4,%xmm2");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pxor %xmm7,%xmm2");
+
+ asm volatile ("movdqa %0,%%xmm3" : : "m" (gfgenpshufb[l][3][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[l][3][1][0]));
+ asm volatile ("pshufb %xmm4,%xmm3");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pxor %xmm7,%xmm3");
+
+ /* intermediate disks */
+ for (d = l - 1; d > 0; --d) {
+ asm volatile ("movdqa %0,%%xmm5" : : "m" (pd[0]));
+ asm volatile ("movdqa %0,%%xmm6" : : "m" (pd[16]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.poly[0]));
+
+ asm volatile ("pxor %xmm4,%xmm4");
+ asm volatile ("pcmpgtb %xmm6,%xmm4");
+ asm volatile ("paddb %xmm6,%xmm6");
+ asm volatile ("pand %xmm7,%xmm4");
+ asm volatile ("pxor %xmm4,%xmm6");
+
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));
+
+ asm volatile ("pxor %xmm4,%xmm5");
+ asm volatile ("pxor %xmm4,%xmm6");
+ asm volatile ("movdqa %%xmm5,%0" : "=m" (pd[0]));
+ asm volatile ("movdqa %%xmm6,%0" : "=m" (pd[16]));
+
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.low4[0]));
+ asm volatile ("movdqa %xmm4,%xmm5");
+ asm volatile ("psrlw $4,%xmm5");
+ asm volatile ("pand %xmm7,%xmm4");
+ asm volatile ("pand %xmm7,%xmm5");
+
+ asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][0][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][0][1][0]));
+ asm volatile ("pshufb %xmm4,%xmm6");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pxor %xmm6,%xmm0");
+ asm volatile ("pxor %xmm7,%xmm0");
+
+ asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][1][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][1][1][0]));
+ asm volatile ("pshufb %xmm4,%xmm6");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pxor %xmm6,%xmm1");
+ asm volatile ("pxor %xmm7,%xmm1");
+
+ asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][2][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][2][1][0]));
+ asm volatile ("pshufb %xmm4,%xmm6");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pxor %xmm6,%xmm2");
+ asm volatile ("pxor %xmm7,%xmm2");
+
+ asm volatile ("movdqa %0,%%xmm6" : : "m" (gfgenpshufb[d][3][0][0]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfgenpshufb[d][3][1][0]));
+ asm volatile ("pshufb %xmm4,%xmm6");
+ asm volatile ("pshufb %xmm5,%xmm7");
+ asm volatile ("pxor %xmm6,%xmm3");
+ asm volatile ("pxor %xmm7,%xmm3");
+ }
+
+ /* first disk with all coefficients at 1 */
+ asm volatile ("movdqa %0,%%xmm5" : : "m" (pd[0]));
+ asm volatile ("movdqa %0,%%xmm6" : : "m" (pd[16]));
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.poly[0]));
+
+ asm volatile ("pxor %xmm4,%xmm4");
+ asm volatile ("pcmpgtb %xmm6,%xmm4");
+ asm volatile ("paddb %xmm6,%xmm6");
+ asm volatile ("pand %xmm7,%xmm4");
+ asm volatile ("pxor %xmm4,%xmm6");
+
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (v[0][i]));
+ asm volatile ("pxor %xmm4,%xmm0");
+ asm volatile ("pxor %xmm4,%xmm1");
+ asm volatile ("pxor %xmm4,%xmm2");
+ asm volatile ("pxor %xmm4,%xmm3");
+ asm volatile ("pxor %xmm4,%xmm5");
+ asm volatile ("pxor %xmm4,%xmm6");
+
+ asm volatile ("movntdq %%xmm5,%0" : "=m" (p[i]));
+ asm volatile ("movntdq %%xmm6,%0" : "=m" (q[i]));
+ asm volatile ("movntdq %%xmm0,%0" : "=m" (r[i]));
+ asm volatile ("movntdq %%xmm1,%0" : "=m" (s[i]));
+ asm volatile ("movntdq %%xmm2,%0" : "=m" (t[i]));
+ asm volatile ("movntdq %%xmm3,%0" : "=m" (u[i]));
+ }
+
+ raid_sse_end();
+}
+#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_SSSE3)
+/*
+ * GEN6 (hexa parity with Cauchy matrix) SSSE3 implementation
+ *
+ * Note that it uses 16 registers, meaning that x64 is required.
+ */
+void raid_gen6_ssse3ext(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *r;
+ uint8_t *s;
+ uint8_t *t;
+ uint8_t *u;
+ int d, l;
+ size_t i;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+ r = v[nd + 2];
+ s = v[nd + 3];
+ t = v[nd + 4];
+ u = v[nd + 5];
+
+ /* special case with only one data disk */
+ if (l == 0) {
+ for (i = 0; i < 6; ++i)
+ memcpy(v[1 + i], v[0], size);
+ return;
+ }
+
+ raid_sse_begin();
+
+ /* generic case with at least two data disks */
+ asm volatile ("movdqa %0,%%xmm14" : : "m" (gfconst16.poly[0]));
+ asm volatile ("movdqa %0,%%xmm15" : : "m" (gfconst16.low4[0]));
+
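+ /*
+ * Register allocation: xmm0/xmm1 accumulate P and Q, xmm2-xmm5 the
+ * four Cauchy parities, xmm14/xmm15 keep the polynomial and the
+ * low nibble mask, and xmm10-xmm13 are scratch for data and tables.
+ */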
+ for (i = 0; i < size; i += 16) {
+ /* last disk, without the multiplication by two */
+ asm volatile ("movdqa %0,%%xmm10" : : "m" (v[l][i]));
+
+ asm volatile ("movdqa %xmm10,%xmm0");
+ asm volatile ("movdqa %xmm10,%xmm1");
+
+ asm volatile ("movdqa %xmm10,%xmm11");
+ asm volatile ("psrlw $4,%xmm11");
+ asm volatile ("pand %xmm15,%xmm10");
+ asm volatile ("pand %xmm15,%xmm11");
+
+ asm volatile ("movdqa %0,%%xmm2" : : "m" (gfgenpshufb[l][0][0][0]));
+ asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[l][0][1][0]));
+ asm volatile ("pshufb %xmm10,%xmm2");
+ asm volatile ("pshufb %xmm11,%xmm13");
+ asm volatile ("pxor %xmm13,%xmm2");
+
+ asm volatile ("movdqa %0,%%xmm3" : : "m" (gfgenpshufb[l][1][0][0]));
+ asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[l][1][1][0]));
+ asm volatile ("pshufb %xmm10,%xmm3");
+ asm volatile ("pshufb %xmm11,%xmm13");
+ asm volatile ("pxor %xmm13,%xmm3");
+
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (gfgenpshufb[l][2][0][0]));
+ asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[l][2][1][0]));
+ asm volatile ("pshufb %xmm10,%xmm4");
+ asm volatile ("pshufb %xmm11,%xmm13");
+ asm volatile ("pxor %xmm13,%xmm4");
+
+ asm volatile ("movdqa %0,%%xmm5" : : "m" (gfgenpshufb[l][3][0][0]));
+ asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[l][3][1][0]));
+ asm volatile ("pshufb %xmm10,%xmm5");
+ asm volatile ("pshufb %xmm11,%xmm13");
+ asm volatile ("pxor %xmm13,%xmm5");
+
+ /* intermediate disks */
+ for (d = l - 1; d > 0; --d) {
+ asm volatile ("movdqa %0,%%xmm10" : : "m" (v[d][i]));
+
+ asm volatile ("pxor %xmm11,%xmm11");
+ asm volatile ("pcmpgtb %xmm1,%xmm11");
+ asm volatile ("paddb %xmm1,%xmm1");
+ asm volatile ("pand %xmm14,%xmm11");
+ asm volatile ("pxor %xmm11,%xmm1");
+
+ asm volatile ("pxor %xmm10,%xmm0");
+ asm volatile ("pxor %xmm10,%xmm1");
+
+ asm volatile ("movdqa %xmm10,%xmm11");
+ asm volatile ("psrlw $4,%xmm11");
+ asm volatile ("pand %xmm15,%xmm10");
+ asm volatile ("pand %xmm15,%xmm11");
+
+ asm volatile ("movdqa %0,%%xmm12" : : "m" (gfgenpshufb[d][0][0][0]));
+ asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[d][0][1][0]));
+ asm volatile ("pshufb %xmm10,%xmm12");
+ asm volatile ("pshufb %xmm11,%xmm13");
+ asm volatile ("pxor %xmm12,%xmm2");
+ asm volatile ("pxor %xmm13,%xmm2");
+
+ asm volatile ("movdqa %0,%%xmm12" : : "m" (gfgenpshufb[d][1][0][0]));
+ asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[d][1][1][0]));
+ asm volatile ("pshufb %xmm10,%xmm12");
+ asm volatile ("pshufb %xmm11,%xmm13");
+ asm volatile ("pxor %xmm12,%xmm3");
+ asm volatile ("pxor %xmm13,%xmm3");
+
+ asm volatile ("movdqa %0,%%xmm12" : : "m" (gfgenpshufb[d][2][0][0]));
+ asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[d][2][1][0]));
+ asm volatile ("pshufb %xmm10,%xmm12");
+ asm volatile ("pshufb %xmm11,%xmm13");
+ asm volatile ("pxor %xmm12,%xmm4");
+ asm volatile ("pxor %xmm13,%xmm4");
+
+ asm volatile ("movdqa %0,%%xmm12" : : "m" (gfgenpshufb[d][3][0][0]));
+ asm volatile ("movdqa %0,%%xmm13" : : "m" (gfgenpshufb[d][3][1][0]));
+ asm volatile ("pshufb %xmm10,%xmm12");
+ asm volatile ("pshufb %xmm11,%xmm13");
+ asm volatile ("pxor %xmm12,%xmm5");
+ asm volatile ("pxor %xmm13,%xmm5");
+ }
+
+ /* first disk with all coefficients at 1 */
+ asm volatile ("movdqa %0,%%xmm10" : : "m" (v[0][i]));
+
+ asm volatile ("pxor %xmm11,%xmm11");
+ asm volatile ("pcmpgtb %xmm1,%xmm11");
+ asm volatile ("paddb %xmm1,%xmm1");
+ asm volatile ("pand %xmm14,%xmm11");
+ asm volatile ("pxor %xmm11,%xmm1");
+
+ asm volatile ("pxor %xmm10,%xmm0");
+ asm volatile ("pxor %xmm10,%xmm1");
+ asm volatile ("pxor %xmm10,%xmm2");
+ asm volatile ("pxor %xmm10,%xmm3");
+ asm volatile ("pxor %xmm10,%xmm4");
+ asm volatile ("pxor %xmm10,%xmm5");
+
+ asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
+ asm volatile ("movntdq %%xmm1,%0" : "=m" (q[i]));
+ asm volatile ("movntdq %%xmm2,%0" : "=m" (r[i]));
+ asm volatile ("movntdq %%xmm3,%0" : "=m" (s[i]));
+ asm volatile ("movntdq %%xmm4,%0" : "=m" (t[i]));
+ asm volatile ("movntdq %%xmm5,%0" : "=m" (u[i]));
+ }
+
+ raid_sse_end();
+}
+#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_AVX2)
+/*
+ * GEN6 (hexa parity with Cauchy matrix) AVX2 implementation
+ *
+ * Note that it uses 16 registers, meaning that x64 is required.
+ */
+void raid_gen6_avx2ext(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *r;
+ uint8_t *s;
+ uint8_t *t;
+ uint8_t *u;
+ int d, l;
+ size_t i;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+ r = v[nd + 2];
+ s = v[nd + 3];
+ t = v[nd + 4];
+ u = v[nd + 5];
+
+ /* special case with only one data disk */
+ if (l == 0) {
+ for (i = 0; i < 6; ++i)
+ memcpy(v[1 + i], v[0], size);
+ return;
+ }
+
+ raid_avx_begin();
+
+ /* generic case with at least two data disks */
+ asm volatile ("vpxor %ymm8,%ymm8,%ymm8");
+ asm volatile ("vbroadcasti128 %0,%%ymm14" : : "m" (gfconst16.poly[0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm15" : : "m" (gfconst16.low4[0]));
+
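+ /*
+ * With six parity accumulators plus scratch there are not enough
+ * ymm registers to unroll two 32 byte halves per iteration as the
+ * GEN3/GEN4 AVX2 versions do, so each pass handles a single 32
+ * byte block.
+ */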
+ for (i = 0; i < size; i += 32) {
+ /* last disk, without the multiplication by two */
+ asm volatile ("vmovdqa %0,%%ymm10" : : "m" (v[l][i]));
+
+ asm volatile ("vmovdqa %ymm10,%ymm0");
+ asm volatile ("vmovdqa %ymm10,%ymm1");
+
+ asm volatile ("vpsrlw $4,%ymm10,%ymm11");
+ asm volatile ("vpand %ymm15,%ymm10,%ymm10");
+ asm volatile ("vpand %ymm15,%ymm11,%ymm11");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm2" : : "m" (gfgenpshufb[l][0][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[l][0][1][0]));
+ asm volatile ("vpshufb %ymm10,%ymm2,%ymm2");
+ asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
+ asm volatile ("vpxor %ymm13,%ymm2,%ymm2");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm3" : : "m" (gfgenpshufb[l][1][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[l][1][1][0]));
+ asm volatile ("vpshufb %ymm10,%ymm3,%ymm3");
+ asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
+ asm volatile ("vpxor %ymm13,%ymm3,%ymm3");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm4" : : "m" (gfgenpshufb[l][2][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[l][2][1][0]));
+ asm volatile ("vpshufb %ymm10,%ymm4,%ymm4");
+ asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
+ asm volatile ("vpxor %ymm13,%ymm4,%ymm4");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm5" : : "m" (gfgenpshufb[l][3][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[l][3][1][0]));
+ asm volatile ("vpshufb %ymm10,%ymm5,%ymm5");
+ asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
+ asm volatile ("vpxor %ymm13,%ymm5,%ymm5");
+
+ /* intermediate disks */
+ for (d = l - 1; d > 0; --d) {
+ asm volatile ("vmovdqa %0,%%ymm10" : : "m" (v[d][i]));
+
+ asm volatile ("vpcmpgtb %ymm1,%ymm8,%ymm11");
+ asm volatile ("vpaddb %ymm1,%ymm1,%ymm1");
+ asm volatile ("vpand %ymm14,%ymm11,%ymm11");
+ asm volatile ("vpxor %ymm11,%ymm1,%ymm1");
+
+ asm volatile ("vpxor %ymm10,%ymm0,%ymm0");
+ asm volatile ("vpxor %ymm10,%ymm1,%ymm1");
+
+ asm volatile ("vpsrlw $4,%ymm10,%ymm11");
+ asm volatile ("vpand %ymm15,%ymm10,%ymm10");
+ asm volatile ("vpand %ymm15,%ymm11,%ymm11");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm12" : : "m" (gfgenpshufb[d][0][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[d][0][1][0]));
+ asm volatile ("vpshufb %ymm10,%ymm12,%ymm12");
+ asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
+ asm volatile ("vpxor %ymm12,%ymm2,%ymm2");
+ asm volatile ("vpxor %ymm13,%ymm2,%ymm2");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm12" : : "m" (gfgenpshufb[d][1][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[d][1][1][0]));
+ asm volatile ("vpshufb %ymm10,%ymm12,%ymm12");
+ asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
+ asm volatile ("vpxor %ymm12,%ymm3,%ymm3");
+ asm volatile ("vpxor %ymm13,%ymm3,%ymm3");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm12" : : "m" (gfgenpshufb[d][2][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[d][2][1][0]));
+ asm volatile ("vpshufb %ymm10,%ymm12,%ymm12");
+ asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
+ asm volatile ("vpxor %ymm12,%ymm4,%ymm4");
+ asm volatile ("vpxor %ymm13,%ymm4,%ymm4");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm12" : : "m" (gfgenpshufb[d][3][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm13" : : "m" (gfgenpshufb[d][3][1][0]));
+ asm volatile ("vpshufb %ymm10,%ymm12,%ymm12");
+ asm volatile ("vpshufb %ymm11,%ymm13,%ymm13");
+ asm volatile ("vpxor %ymm12,%ymm5,%ymm5");
+ asm volatile ("vpxor %ymm13,%ymm5,%ymm5");
+ }
+
+ /* first disk with all coefficients at 1 */
+ asm volatile ("vmovdqa %0,%%ymm10" : : "m" (v[0][i]));
+
+ asm volatile ("vpcmpgtb %ymm1,%ymm8,%ymm11");
+ asm volatile ("vpaddb %ymm1,%ymm1,%ymm1");
+ asm volatile ("vpand %ymm14,%ymm11,%ymm11");
+ asm volatile ("vpxor %ymm11,%ymm1,%ymm1");
+
+ asm volatile ("vpxor %ymm10,%ymm0,%ymm0");
+ asm volatile ("vpxor %ymm10,%ymm1,%ymm1");
+ asm volatile ("vpxor %ymm10,%ymm2,%ymm2");
+ asm volatile ("vpxor %ymm10,%ymm3,%ymm3");
+ asm volatile ("vpxor %ymm10,%ymm4,%ymm4");
+ asm volatile ("vpxor %ymm10,%ymm5,%ymm5");
+
+ asm volatile ("vmovntdq %%ymm0,%0" : "=m" (p[i]));
+ asm volatile ("vmovntdq %%ymm1,%0" : "=m" (q[i]));
+ asm volatile ("vmovntdq %%ymm2,%0" : "=m" (r[i]));
+ asm volatile ("vmovntdq %%ymm3,%0" : "=m" (s[i]));
+ asm volatile ("vmovntdq %%ymm4,%0" : "=m" (t[i]));
+ asm volatile ("vmovntdq %%ymm5,%0" : "=m" (u[i]));
+ }
+
+ raid_avx_end();
+}
+#endif
+
+#if defined(CONFIG_X86) && defined(CONFIG_SSSE3)
+/*
+ * RAID recovery for one disk, SSSE3 implementation
+ */
+void raid_rec1_ssse3(int nr, int *id, int *ip, int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *pa;
+ uint8_t G;
+ uint8_t V;
+ size_t i;
+
+ (void)nr; /* unused, it's always 1 */
+
+ /* if it's RAID5, use the faster function */
+ if (ip[0] == 0) {
+ raid_rec1of1(id, nd, size, vv);
+ return;
+ }
+
+ /* set up the coefficient matrix */
+ G = A(ip[0], id[0]);
+
+ /* invert it to solve the system of linear equations */
+ V = inv(G);
+
+ /* compute delta parity */
+ raid_delta_gen(1, id, ip, nd, size, vv);
+
+ p = v[nd + ip[0]];
+ pa = v[id[0]];
+
+ raid_sse_begin();
+
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.low4[0]));
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (gfmulpshufb[V][0][0]));
+ asm volatile ("movdqa %0,%%xmm5" : : "m" (gfmulpshufb[V][1][0]));
+
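+ /*
+ * pa[] holds the parity recomputed without the failed block (the
+ * output of raid_delta_gen() above), so p[i] ^ pa[i] is the missing
+ * data scaled by the single coefficient G; multiplying by V = inv(G)
+ * with the nibble pshufb tables rebuilds the data in place.
+ */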
+ for (i = 0; i < size; i += 16) {
+ asm volatile ("movdqa %0,%%xmm0" : : "m" (p[i]));
+ asm volatile ("movdqa %0,%%xmm1" : : "m" (pa[i]));
+ asm volatile ("movdqa %xmm4,%xmm2");
+ asm volatile ("movdqa %xmm5,%xmm3");
+ asm volatile ("pxor %xmm0,%xmm1");
+ asm volatile ("movdqa %xmm1,%xmm0");
+ asm volatile ("psrlw $4,%xmm1");
+ asm volatile ("pand %xmm7,%xmm0");
+ asm volatile ("pand %xmm7,%xmm1");
+ asm volatile ("pshufb %xmm0,%xmm2");
+ asm volatile ("pshufb %xmm1,%xmm3");
+ asm volatile ("pxor %xmm3,%xmm2");
+ asm volatile ("movdqa %%xmm2,%0" : "=m" (pa[i]));
+ }
+
+ raid_sse_end();
+}
+#endif
+
+#if defined(CONFIG_X86) && defined(CONFIG_SSSE3)
+/*
+ * RAID recovery for two disks, SSSE3 implementation
+ */
+void raid_rec2_ssse3(int nr, int *id, int *ip, int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ const int N = 2;
+ uint8_t *p[N];
+ uint8_t *pa[N];
+ uint8_t G[N * N];
+ uint8_t V[N * N];
+ size_t i;
+ int j, k;
+
+ (void)nr; /* unused, it's always 2 */
+
+ /* set up the coefficient matrix */
+ for (j = 0; j < N; ++j)
+ for (k = 0; k < N; ++k)
+ G[j * N + k] = A(ip[j], id[k]);
+
+ /* invert it to solve the system of linear equations */
+ raid_invert(G, V, N);
+
+ /* compute delta parity */
+ raid_delta_gen(N, id, ip, nd, size, vv);
+
+ for (j = 0; j < N; ++j) {
+ p[j] = v[nd + ip[j]];
+ pa[j] = v[id[j]];
+ }
+
+ raid_sse_begin();
+
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.low4[0]));
+
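+ /*
+ * V[] is the inverted 2x2 coefficient matrix stored row by row:
+ * the first failed block is rebuilt as V[0]*delta0 ^ V[1]*delta1
+ * and the second as V[2]*delta0 ^ V[3]*delta1, each product using
+ * the low/high nibble pshufb tables.
+ */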
+ for (i = 0; i < size; i += 16) {
+ asm volatile ("movdqa %0,%%xmm0" : : "m" (p[0][i]));
+ asm volatile ("movdqa %0,%%xmm2" : : "m" (pa[0][i]));
+ asm volatile ("movdqa %0,%%xmm1" : : "m" (p[1][i]));
+ asm volatile ("movdqa %0,%%xmm3" : : "m" (pa[1][i]));
+ asm volatile ("pxor %xmm2,%xmm0");
+ asm volatile ("pxor %xmm3,%xmm1");
+
+ asm volatile ("pxor %xmm6,%xmm6");
+
+ asm volatile ("movdqa %0,%%xmm2" : : "m" (gfmulpshufb[V[0]][0][0]));
+ asm volatile ("movdqa %0,%%xmm3" : : "m" (gfmulpshufb[V[0]][1][0]));
+ asm volatile ("movdqa %xmm0,%xmm4");
+ asm volatile ("movdqa %xmm0,%xmm5");
+ asm volatile ("psrlw $4,%xmm5");
+ asm volatile ("pand %xmm7,%xmm4");
+ asm volatile ("pand %xmm7,%xmm5");
+ asm volatile ("pshufb %xmm4,%xmm2");
+ asm volatile ("pshufb %xmm5,%xmm3");
+ asm volatile ("pxor %xmm2,%xmm6");
+ asm volatile ("pxor %xmm3,%xmm6");
+
+ asm volatile ("movdqa %0,%%xmm2" : : "m" (gfmulpshufb[V[1]][0][0]));
+ asm volatile ("movdqa %0,%%xmm3" : : "m" (gfmulpshufb[V[1]][1][0]));
+ asm volatile ("movdqa %xmm1,%xmm4");
+ asm volatile ("movdqa %xmm1,%xmm5");
+ asm volatile ("psrlw $4,%xmm5");
+ asm volatile ("pand %xmm7,%xmm4");
+ asm volatile ("pand %xmm7,%xmm5");
+ asm volatile ("pshufb %xmm4,%xmm2");
+ asm volatile ("pshufb %xmm5,%xmm3");
+ asm volatile ("pxor %xmm2,%xmm6");
+ asm volatile ("pxor %xmm3,%xmm6");
+
+ asm volatile ("movdqa %%xmm6,%0" : "=m" (pa[0][i]));
+
+ asm volatile ("pxor %xmm6,%xmm6");
+
+ asm volatile ("movdqa %0,%%xmm2" : : "m" (gfmulpshufb[V[2]][0][0]));
+ asm volatile ("movdqa %0,%%xmm3" : : "m" (gfmulpshufb[V[2]][1][0]));
+ asm volatile ("movdqa %xmm0,%xmm4");
+ asm volatile ("movdqa %xmm0,%xmm5");
+ asm volatile ("psrlw $4,%xmm5");
+ asm volatile ("pand %xmm7,%xmm4");
+ asm volatile ("pand %xmm7,%xmm5");
+ asm volatile ("pshufb %xmm4,%xmm2");
+ asm volatile ("pshufb %xmm5,%xmm3");
+ asm volatile ("pxor %xmm2,%xmm6");
+ asm volatile ("pxor %xmm3,%xmm6");
+
+ asm volatile ("movdqa %0,%%xmm2" : : "m" (gfmulpshufb[V[3]][0][0]));
+ asm volatile ("movdqa %0,%%xmm3" : : "m" (gfmulpshufb[V[3]][1][0]));
+ asm volatile ("movdqa %xmm1,%xmm4");
+ asm volatile ("movdqa %xmm1,%xmm5");
+ asm volatile ("psrlw $4,%xmm5");
+ asm volatile ("pand %xmm7,%xmm4");
+ asm volatile ("pand %xmm7,%xmm5");
+ asm volatile ("pshufb %xmm4,%xmm2");
+ asm volatile ("pshufb %xmm5,%xmm3");
+ asm volatile ("pxor %xmm2,%xmm6");
+ asm volatile ("pxor %xmm3,%xmm6");
+
+ asm volatile ("movdqa %%xmm6,%0" : "=m" (pa[1][i]));
+ }
+
+ raid_sse_end();
+}
+#endif
+
+#if defined(CONFIG_X86) && defined(CONFIG_SSSE3)
+/*
+ * RAID recovery, SSSE3 implementation
+ */
+void raid_recX_ssse3(int nr, int *id, int *ip, int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ int N = nr;
+ uint8_t *p[RAID_PARITY_MAX];
+ uint8_t *pa[RAID_PARITY_MAX];
+ uint8_t G[RAID_PARITY_MAX * RAID_PARITY_MAX];
+ uint8_t V[RAID_PARITY_MAX * RAID_PARITY_MAX];
+ uint8_t buffer[RAID_PARITY_MAX*16+16];
+ uint8_t *pd = __align_ptr(buffer, 16);
+ size_t i;
+ int j, k;
+
+ /* set up the coefficient matrix */
+ for (j = 0; j < N; ++j)
+ for (k = 0; k < N; ++k)
+ G[j * N + k] = A(ip[j], id[k]);
+
+ /* invert it to solve the system of linear equations */
+ raid_invert(G, V, N);
+
+ /* compute delta parity */
+ raid_delta_gen(N, id, ip, nd, size, vv);
+
+ for (j = 0; j < N; ++j) {
+ p[j] = v[nd + ip[j]];
+ pa[j] = v[id[j]];
+ }
+
+ raid_sse_begin();
+
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfconst16.low4[0]));
+
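+ /*
+ * The N deltas (p[j] ^ pa[j]) are first staged in the aligned pd[]
+ * buffer, then each failed block is rebuilt as the product of one
+ * row of the inverted matrix V with that delta vector.
+ */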
+ for (i = 0; i < size; i += 16) {
+ /* delta */
+ for (j = 0; j < N; ++j) {
+ asm volatile ("movdqa %0,%%xmm0" : : "m" (p[j][i]));
+ asm volatile ("movdqa %0,%%xmm1" : : "m" (pa[j][i]));
+ asm volatile ("pxor %xmm1,%xmm0");
+ asm volatile ("movdqa %%xmm0,%0" : "=m" (pd[j*16]));
+ }
+
+ /* reconstruct */
+ for (j = 0; j < N; ++j) {
+ asm volatile ("pxor %xmm0,%xmm0");
+ asm volatile ("pxor %xmm1,%xmm1");
+
+ for (k = 0; k < N; ++k) {
+ uint8_t m = V[j * N + k];
+
+ asm volatile ("movdqa %0,%%xmm2" : : "m" (gfmulpshufb[m][0][0]));
+ asm volatile ("movdqa %0,%%xmm3" : : "m" (gfmulpshufb[m][1][0]));
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (pd[k*16]));
+ asm volatile ("movdqa %xmm4,%xmm5");
+ asm volatile ("psrlw $4,%xmm5");
+ asm volatile ("pand %xmm7,%xmm4");
+ asm volatile ("pand %xmm7,%xmm5");
+ asm volatile ("pshufb %xmm4,%xmm2");
+ asm volatile ("pshufb %xmm5,%xmm3");
+ asm volatile ("pxor %xmm2,%xmm0");
+ asm volatile ("pxor %xmm3,%xmm1");
+ }
+
+ asm volatile ("pxor %xmm1,%xmm0");
+ asm volatile ("movdqa %%xmm0,%0" : "=m" (pa[j][i]));
+ }
+ }
+
+ raid_sse_end();
+}
+#endif
+
+#if defined(CONFIG_X86) && defined(CONFIG_AVX2)
+/*
+ * RAID recovery for one disk, AVX2 implementation
+ */
+void raid_rec1_avx2(int nr, int *id, int *ip, int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ uint8_t *p;
+ uint8_t *pa;
+ uint8_t G;
+ uint8_t V;
+ size_t i;
+
+ (void)nr; /* unused, it's always 1 */
+
+ /* if it's RAID5, use the faster function */
+ if (ip[0] == 0) {
+ raid_rec1of1(id, nd, size, vv);
+ return;
+ }
+
+ /* set up the coefficient matrix */
+ G = A(ip[0], id[0]);
+
+ /* invert it to solve the system of linear equations */
+ V = inv(G);
+
+ /* compute delta parity */
+ raid_delta_gen(1, id, ip, nd, size, vv);
+
+ p = v[nd + ip[0]];
+ pa = v[id[0]];
+
+ raid_avx_begin();
+
+ asm volatile ("vbroadcasti128 %0,%%ymm7" : : "m" (gfconst16.low4[0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm4" : : "m" (gfmulpshufb[V][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm5" : : "m" (gfmulpshufb[V][1][0]));
+
+ for (i = 0; i < size; i += 32) {
+ asm volatile ("vmovdqa %0,%%ymm0" : : "m" (p[i]));
+ asm volatile ("vmovdqa %0,%%ymm1" : : "m" (pa[i]));
+ asm volatile ("vpxor %ymm1,%ymm0,%ymm0");
+ asm volatile ("vpsrlw $4,%ymm0,%ymm1");
+ asm volatile ("vpand %ymm7,%ymm0,%ymm0");
+ asm volatile ("vpand %ymm7,%ymm1,%ymm1");
+ asm volatile ("vpshufb %ymm0,%ymm4,%ymm2");
+ asm volatile ("vpshufb %ymm1,%ymm5,%ymm3");
+ asm volatile ("vpxor %ymm3,%ymm2,%ymm2");
+ asm volatile ("vmovdqa %%ymm2,%0" : "=m" (pa[i]));
+ }
+
+ raid_avx_end();
+}
+#endif
+
+#if defined(CONFIG_X86) && defined(CONFIG_AVX2)
+/*
+ * RAID recovery for two disks, AVX2 implementation
+ */
+void raid_rec2_avx2(int nr, int *id, int *ip, int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ const int N = 2;
+ uint8_t *p[N];
+ uint8_t *pa[N];
+ uint8_t G[N * N];
+ uint8_t V[N * N];
+ size_t i;
+ int j, k;
+
+ (void)nr; /* unused, it's always 2 */
+
+ /* setup the coefficients matrix */
+ for (j = 0; j < N; ++j)
+ for (k = 0; k < N; ++k)
+ G[j * N + k] = A(ip[j], id[k]);
+
+ /* invert it to solve the system of linear equations */
+ raid_invert(G, V, N);
+
+ /* compute delta parity */
+ raid_delta_gen(N, id, ip, nd, size, vv);
+
+ for (j = 0; j < N; ++j) {
+ p[j] = v[nd + ip[j]];
+ pa[j] = v[id[j]];
+ }
+
+ raid_avx_begin();
+
+ asm volatile ("vbroadcasti128 %0,%%ymm7" : : "m" (gfconst16.low4[0]));
+
+ for (i = 0; i < size; i += 32) {
+ asm volatile ("vmovdqa %0,%%ymm0" : : "m" (p[0][i]));
+ asm volatile ("vmovdqa %0,%%ymm2" : : "m" (pa[0][i]));
+ asm volatile ("vmovdqa %0,%%ymm1" : : "m" (p[1][i]));
+ asm volatile ("vmovdqa %0,%%ymm3" : : "m" (pa[1][i]));
+ asm volatile ("vpxor %ymm2,%ymm0,%ymm0");
+ asm volatile ("vpxor %ymm3,%ymm1,%ymm1");
+
+ asm volatile ("vpxor %ymm6,%ymm6,%ymm6");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm2" : : "m" (gfmulpshufb[V[0]][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm3" : : "m" (gfmulpshufb[V[0]][1][0]));
+ asm volatile ("vpsrlw $4,%ymm0,%ymm5");
+ asm volatile ("vpand %ymm7,%ymm0,%ymm4");
+ asm volatile ("vpand %ymm7,%ymm5,%ymm5");
+ asm volatile ("vpshufb %ymm4,%ymm2,%ymm2");
+ asm volatile ("vpshufb %ymm5,%ymm3,%ymm3");
+ asm volatile ("vpxor %ymm2,%ymm6,%ymm6");
+ asm volatile ("vpxor %ymm3,%ymm6,%ymm6");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm2" : : "m" (gfmulpshufb[V[1]][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm3" : : "m" (gfmulpshufb[V[1]][1][0]));
+ asm volatile ("vpsrlw $4,%ymm1,%ymm5");
+ asm volatile ("vpand %ymm7,%ymm1,%ymm4");
+ asm volatile ("vpand %ymm7,%ymm5,%ymm5");
+ asm volatile ("vpshufb %ymm4,%ymm2,%ymm2");
+ asm volatile ("vpshufb %ymm5,%ymm3,%ymm3");
+ asm volatile ("vpxor %ymm2,%ymm6,%ymm6");
+ asm volatile ("vpxor %ymm3,%ymm6,%ymm6");
+
+ asm volatile ("vmovdqa %%ymm6,%0" : "=m" (pa[0][i]));
+
+ asm volatile ("vpxor %ymm6,%ymm6,%ymm6");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm2" : : "m" (gfmulpshufb[V[2]][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm3" : : "m" (gfmulpshufb[V[2]][1][0]));
+ asm volatile ("vpsrlw $4,%ymm0,%ymm5");
+ asm volatile ("vpand %ymm7,%ymm0,%ymm4");
+ asm volatile ("vpand %ymm7,%ymm5,%ymm5");
+ asm volatile ("vpshufb %ymm4,%ymm2,%ymm2");
+ asm volatile ("vpshufb %ymm5,%ymm3,%ymm3");
+ asm volatile ("vpxor %ymm2,%ymm6,%ymm6");
+ asm volatile ("vpxor %ymm3,%ymm6,%ymm6");
+
+ asm volatile ("vbroadcasti128 %0,%%ymm2" : : "m" (gfmulpshufb[V[3]][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm3" : : "m" (gfmulpshufb[V[3]][1][0]));
+ asm volatile ("vpsrlw $4,%ymm1,%ymm5");
+ asm volatile ("vpand %ymm7,%ymm1,%ymm4");
+ asm volatile ("vpand %ymm7,%ymm5,%ymm5");
+ asm volatile ("vpshufb %ymm4,%ymm2,%ymm2");
+ asm volatile ("vpshufb %ymm5,%ymm3,%ymm3");
+ asm volatile ("vpxor %ymm2,%ymm6,%ymm6");
+ asm volatile ("vpxor %ymm3,%ymm6,%ymm6");
+
+ asm volatile ("vmovdqa %%ymm6,%0" : "=m" (pa[1][i]));
+ }
+
+ raid_avx_end();
+}
+#endif
+
+#if defined(CONFIG_X86) && defined(CONFIG_AVX2)
+/*
+ * RAID recovering AVX2 implementation
+ */
+void raid_recX_avx2(int nr, int *id, int *ip, int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t **)vv;
+ int N = nr;
+ uint8_t *p[RAID_PARITY_MAX];
+ uint8_t *pa[RAID_PARITY_MAX];
+ uint8_t G[RAID_PARITY_MAX * RAID_PARITY_MAX];
+ uint8_t V[RAID_PARITY_MAX * RAID_PARITY_MAX];
+ uint8_t buffer[RAID_PARITY_MAX*32+32];
+ uint8_t *pd = __align_ptr(buffer, 32);
+ size_t i;
+ int j, k;
+
+ /* setup the coefficients matrix */
+ for (j = 0; j < N; ++j)
+ for (k = 0; k < N; ++k)
+ G[j * N + k] = A(ip[j], id[k]);
+
+ /* invert it to solve the system of linear equations */
+ raid_invert(G, V, N);
+
+ /* compute delta parity */
+ raid_delta_gen(N, id, ip, nd, size, vv);
+
+ for (j = 0; j < N; ++j) {
+ p[j] = v[nd + ip[j]];
+ pa[j] = v[id[j]];
+ }
+
+ raid_avx_begin();
+
+ asm volatile ("vbroadcasti128 %0,%%ymm7" : : "m" (gfconst16.low4[0]));
+
+ for (i = 0; i < size; i += 32) {
+ /* delta */
+ for (j = 0; j < N; ++j) {
+ asm volatile ("vmovdqa %0,%%ymm0" : : "m" (p[j][i]));
+ asm volatile ("vmovdqa %0,%%ymm1" : : "m" (pa[j][i]));
+ asm volatile ("vpxor %ymm1,%ymm0,%ymm0");
+ asm volatile ("vmovdqa %%ymm0,%0" : "=m" (pd[j*32]));
+ }
+
+ /* reconstruct */
+ for (j = 0; j < N; ++j) {
+ asm volatile ("vpxor %ymm0,%ymm0,%ymm0");
+ asm volatile ("vpxor %ymm1,%ymm1,%ymm1");
+
+ for (k = 0; k < N; ++k) {
+ uint8_t m = V[j * N + k];
+
+ asm volatile ("vbroadcasti128 %0,%%ymm2" : : "m" (gfmulpshufb[m][0][0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm3" : : "m" (gfmulpshufb[m][1][0]));
+ asm volatile ("vmovdqa %0,%%ymm4" : : "m" (pd[k*32]));
+ asm volatile ("vpsrlw $4,%ymm4,%ymm5");
+ asm volatile ("vpand %ymm7,%ymm4,%ymm4");
+ asm volatile ("vpand %ymm7,%ymm5,%ymm5");
+ asm volatile ("vpshufb %ymm4,%ymm2,%ymm2");
+ asm volatile ("vpshufb %ymm5,%ymm3,%ymm3");
+ asm volatile ("vpxor %ymm2,%ymm0,%ymm0");
+ asm volatile ("vpxor %ymm3,%ymm1,%ymm1");
+ }
+
+ asm volatile ("vpxor %ymm1,%ymm0,%ymm0");
+ asm volatile ("vmovdqa %%ymm0,%0" : "=m" (pa[j][i]));
+ }
+ }
+
+ raid_avx_end();
+}
+#endif
+
diff --git a/raid/x86z.c b/raid/x86z.c
new file mode 100644
index 00000000..1e3fe89a
--- /dev/null
+++ b/raid/x86z.c
@@ -0,0 +1,255 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "internal.h"
+
+#if defined(CONFIG_X86) && defined(CONFIG_SSE2)
+static const struct gfzconst16 {
+ uint8_t poly[16];
+ uint8_t half[16];
+ uint8_t low7[16];
+} gfzconst16 __aligned(64) =
+{
+ {
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d
+ },
+ {
+ 0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e,
+ 0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e
+ },
+ {
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f
+ }
+};
+#endif
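+
+/*
+ * The constants above drive the GENz kernels below: poly[] is the GF(2^8)
+ * reduction polynomial 0x1d, half[] is 0x8e = 2^-1 in GF(2^8) (since
+ * 2 * 0x8e mod 0x11d = 1), and low7[] masks the low seven bits after the
+ * byte-wise right shift.  A rough scalar sketch of what each vector lane
+ * computes:
+ *
+ *	p = q = r = D[l];
+ *	for (d = l - 1; d >= 0; --d) {
+ *		q = gf_mul2(q) ^ D[d];	// (q << 1) ^ (msb set ? 0x1d : 0)
+ *		r = gf_div2(r) ^ D[d];	// (r >> 1) ^ (lsb set ? 0x8e : 0)
+ *		p = p ^ D[d];
+ *	}
+ *
+ * giving the plain XOR parity P, the power-of-2 parity Q and the
+ * power-of-2^-1 parity R.
+ */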
+
+#if defined(CONFIG_X86) && defined(CONFIG_SSE2)
+/*
+ * GENz (triple parity with powers of 2^-1) SSE2 implementation
+ */
+void raid_genz_sse2(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t**)vv;
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *r;
+ int d, l;
+ size_t i;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+ r = v[nd + 2];
+
+ raid_sse_begin();
+
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfzconst16.poly[0]));
+ asm volatile ("movdqa %0,%%xmm3" : : "m" (gfzconst16.half[0]));
+ asm volatile ("movdqa %0,%%xmm6" : : "m" (gfzconst16.low7[0]));
+
+ for (i = 0; i < size; i += 16) {
+ asm volatile ("movdqa %0,%%xmm0" : : "m" (v[l][i]));
+ asm volatile ("movdqa %xmm0,%xmm1");
+ asm volatile ("movdqa %xmm0,%xmm2");
+ for (d = l - 1; d >= 0; --d) {
+ asm volatile ("pxor %xmm4,%xmm4");
+ asm volatile ("pcmpgtb %xmm1,%xmm4");
+ asm volatile ("paddb %xmm1,%xmm1");
+ asm volatile ("pand %xmm7,%xmm4");
+ asm volatile ("pxor %xmm4,%xmm1");
+
+ asm volatile ("movdqa %xmm2,%xmm4");
+ asm volatile ("pxor %xmm5,%xmm5");
+ asm volatile ("psllw $7,%xmm4");
+ asm volatile ("psrlw $1,%xmm2");
+ asm volatile ("pcmpgtb %xmm4,%xmm5");
+ asm volatile ("pand %xmm6,%xmm2");
+ asm volatile ("pand %xmm3,%xmm5");
+ asm volatile ("pxor %xmm5,%xmm2");
+
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));
+ asm volatile ("pxor %xmm4,%xmm0");
+ asm volatile ("pxor %xmm4,%xmm1");
+ asm volatile ("pxor %xmm4,%xmm2");
+ }
+ asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
+ asm volatile ("movntdq %%xmm1,%0" : "=m" (q[i]));
+ asm volatile ("movntdq %%xmm2,%0" : "=m" (r[i]));
+ }
+
+ raid_sse_end();
+}
+#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_SSE2)
+/*
+ * GENz (triple parity with powers of 2^-1) SSE2 implementation
+ *
+ * Note that it uses 16 registers, meaning that x64 is required.
+ */
+void raid_genz_sse2ext(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t**)vv;
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *r;
+ int d, l;
+ size_t i;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+ r = v[nd + 2];
+
+ raid_sse_begin();
+
+ asm volatile ("movdqa %0,%%xmm7" : : "m" (gfzconst16.poly[0]));
+ asm volatile ("movdqa %0,%%xmm3" : : "m" (gfzconst16.half[0]));
+ asm volatile ("movdqa %0,%%xmm11" : : "m" (gfzconst16.low7[0]));
+
+ for (i = 0; i < size; i += 32) {
+ asm volatile ("movdqa %0,%%xmm0" : : "m" (v[l][i]));
+ asm volatile ("movdqa %0,%%xmm8" : : "m" (v[l][i + 16]));
+ asm volatile ("movdqa %xmm0,%xmm1");
+ asm volatile ("movdqa %xmm8,%xmm9");
+ asm volatile ("movdqa %xmm0,%xmm2");
+ asm volatile ("movdqa %xmm8,%xmm10");
+ for (d = l - 1; d >= 0; --d) {
+ asm volatile ("movdqa %xmm2,%xmm6");
+ asm volatile ("movdqa %xmm10,%xmm14");
+ asm volatile ("pxor %xmm4,%xmm4");
+ asm volatile ("pxor %xmm12,%xmm12");
+ asm volatile ("pxor %xmm5,%xmm5");
+ asm volatile ("pxor %xmm13,%xmm13");
+ asm volatile ("psllw $7,%xmm6");
+ asm volatile ("psllw $7,%xmm14");
+ asm volatile ("psrlw $1,%xmm2");
+ asm volatile ("psrlw $1,%xmm10");
+ asm volatile ("pcmpgtb %xmm1,%xmm4");
+ asm volatile ("pcmpgtb %xmm9,%xmm12");
+ asm volatile ("pcmpgtb %xmm6,%xmm5");
+ asm volatile ("pcmpgtb %xmm14,%xmm13");
+ asm volatile ("paddb %xmm1,%xmm1");
+ asm volatile ("paddb %xmm9,%xmm9");
+ asm volatile ("pand %xmm11,%xmm2");
+ asm volatile ("pand %xmm11,%xmm10");
+ asm volatile ("pand %xmm7,%xmm4");
+ asm volatile ("pand %xmm7,%xmm12");
+ asm volatile ("pand %xmm3,%xmm5");
+ asm volatile ("pand %xmm3,%xmm13");
+ asm volatile ("pxor %xmm4,%xmm1");
+ asm volatile ("pxor %xmm12,%xmm9");
+ asm volatile ("pxor %xmm5,%xmm2");
+ asm volatile ("pxor %xmm13,%xmm10");
+
+ asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));
+ asm volatile ("movdqa %0,%%xmm12" : : "m" (v[d][i + 16]));
+ asm volatile ("pxor %xmm4,%xmm0");
+ asm volatile ("pxor %xmm4,%xmm1");
+ asm volatile ("pxor %xmm4,%xmm2");
+ asm volatile ("pxor %xmm12,%xmm8");
+ asm volatile ("pxor %xmm12,%xmm9");
+ asm volatile ("pxor %xmm12,%xmm10");
+ }
+ asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
+ asm volatile ("movntdq %%xmm8,%0" : "=m" (p[i + 16]));
+ asm volatile ("movntdq %%xmm1,%0" : "=m" (q[i]));
+ asm volatile ("movntdq %%xmm9,%0" : "=m" (q[i + 16]));
+ asm volatile ("movntdq %%xmm2,%0" : "=m" (r[i]));
+ asm volatile ("movntdq %%xmm10,%0" : "=m" (r[i + 16]));
+ }
+
+ raid_sse_end();
+}
+#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_AVX2)
+/*
+ * GENz (triple parity with powers of 2^-1) AVX2 implementation
+ *
+ * Note that it uses 16 registers, meaning that x64 is required.
+ */
+void raid_genz_avx2ext(int nd, size_t size, void **vv)
+{
+ uint8_t **v = (uint8_t**)vv;
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *r;
+ int d, l;
+ size_t i;
+
+ l = nd - 1;
+ p = v[nd];
+ q = v[nd + 1];
+ r = v[nd + 2];
+
+ raid_avx_begin();
+
+ asm volatile ("vbroadcasti128 %0,%%ymm7" : : "m" (gfzconst16.poly[0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm3" : : "m" (gfzconst16.half[0]));
+ asm volatile ("vbroadcasti128 %0,%%ymm11" : : "m" (gfzconst16.low7[0]));
+ asm volatile ("vpxor %ymm15,%ymm15,%ymm15");
+
+ for (i = 0; i < size; i += 64) {
+ asm volatile ("vmovdqa %0,%%ymm0" : : "m" (v[l][i]));
+ asm volatile ("vmovdqa %0,%%ymm8" : : "m" (v[l][i + 32]));
+ asm volatile ("vmovdqa %ymm0,%ymm1");
+ asm volatile ("vmovdqa %ymm8,%ymm9");
+ asm volatile ("vmovdqa %ymm0,%ymm2");
+ asm volatile ("vmovdqa %ymm8,%ymm10");
+ for (d = l - 1; d >= 0; --d) {
+ asm volatile ("vpsllw $7,%ymm2,%ymm6");
+ asm volatile ("vpsllw $7,%ymm10,%ymm14");
+ asm volatile ("vpsrlw $1,%ymm2,%ymm2");
+ asm volatile ("vpsrlw $1,%ymm10,%ymm10");
+ asm volatile ("vpcmpgtb %ymm1,%ymm15,%ymm4");
+ asm volatile ("vpcmpgtb %ymm9,%ymm15,%ymm12");
+ asm volatile ("vpcmpgtb %ymm6,%ymm15,%ymm5");
+ asm volatile ("vpcmpgtb %ymm14,%ymm15,%ymm13");
+ asm volatile ("vpaddb %ymm1,%ymm1,%ymm1");
+ asm volatile ("vpaddb %ymm9,%ymm9,%ymm9");
+ asm volatile ("vpand %ymm11,%ymm2,%ymm2");
+ asm volatile ("vpand %ymm11,%ymm10,%ymm10");
+ asm volatile ("vpand %ymm7,%ymm4,%ymm4");
+ asm volatile ("vpand %ymm7,%ymm12,%ymm12");
+ asm volatile ("vpand %ymm3,%ymm5,%ymm5");
+ asm volatile ("vpand %ymm3,%ymm13,%ymm13");
+ asm volatile ("vpxor %ymm4,%ymm1,%ymm1");
+ asm volatile ("vpxor %ymm12,%ymm9,%ymm9");
+ asm volatile ("vpxor %ymm5,%ymm2,%ymm2");
+ asm volatile ("vpxor %ymm13,%ymm10,%ymm10");
+
+ asm volatile ("vmovdqa %0,%%ymm4" : : "m" (v[d][i]));
+ asm volatile ("vmovdqa %0,%%ymm12" : : "m" (v[d][i + 32]));
+ asm volatile ("vpxor %ymm4,%ymm0,%ymm0");
+ asm volatile ("vpxor %ymm4,%ymm1,%ymm1");
+ asm volatile ("vpxor %ymm4,%ymm2,%ymm2");
+ asm volatile ("vpxor %ymm12,%ymm8,%ymm8");
+ asm volatile ("vpxor %ymm12,%ymm9,%ymm9");
+ asm volatile ("vpxor %ymm12,%ymm10,%ymm10");
+ }
+ asm volatile ("vmovntdq %%ymm0,%0" : "=m" (p[i]));
+ asm volatile ("vmovntdq %%ymm8,%0" : "=m" (p[i + 32]));
+ asm volatile ("vmovntdq %%ymm1,%0" : "=m" (q[i]));
+ asm volatile ("vmovntdq %%ymm9,%0" : "=m" (q[i + 32]));
+ asm volatile ("vmovntdq %%ymm2,%0" : "=m" (r[i]));
+ asm volatile ("vmovntdq %%ymm10,%0" : "=m" (r[i + 32]));
+ }
+
+ raid_avx_end();
+}
+#endif
+
diff --git a/rustfmt.toml b/rustfmt.toml
new file mode 100644
index 00000000..83b9248a
--- /dev/null
+++ b/rustfmt.toml
@@ -0,0 +1,5 @@
+# Mostly default settings, i.e. idiomatic Rust, plus alignment thresholds for enums and structs
+edition = "2021"
+newline_style = "Unix"
+enum_discrim_align_threshold = 20
+struct_field_align_threshold = 20
diff --git a/src/bcachefs.rs b/src/bcachefs.rs
new file mode 100644
index 00000000..417cb01e
--- /dev/null
+++ b/src/bcachefs.rs
@@ -0,0 +1,121 @@
+mod commands;
+mod key;
+mod logging;
+mod wrappers;
+
+use std::{
+ ffi::{c_char, CString},
+ process::{ExitCode, Termination},
+};
+
+use bch_bindgen::c;
+use log::debug;
+
+#[derive(Debug)]
+pub struct ErrnoError(pub errno::Errno);
+impl std::fmt::Display for ErrnoError {
+ fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> {
+ self.0.fmt(f)
+ }
+}
+
+impl std::error::Error for ErrnoError {}
+
+fn handle_c_command(mut argv: Vec<String>, symlink_cmd: Option<&str>) -> i32 {
+ let cmd = match symlink_cmd {
+ Some(s) => s.to_string(),
+ None => argv.remove(1),
+ };
+
+ let argc: i32 = argv.len().try_into().unwrap();
+
+ let argv: Vec<_> = argv.into_iter().map(|s| CString::new(s).unwrap()).collect();
+ let mut argv = argv
+ .into_iter()
+ .map(|s| Box::into_raw(s.into_boxed_c_str()).cast::<c_char>())
+ .collect::<Box<[*mut c_char]>>();
+ let argv = argv.as_mut_ptr();
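+
+ // Note: the boxed CStrings converted with Box::into_raw above are never
+ // freed; leaking them keeps the argv pointers valid for however long the
+ // C commands hold on to them.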
+
+ // The C functions will mutate argv. It shouldn't be used after this block.
+ unsafe {
+ match cmd.as_str() {
+ "--help" => {
+ c::bcachefs_usage();
+ 0
+ }
+ "data" => c::data_cmds(argc, argv),
+ "device" => c::device_cmds(argc, argv),
+ "dump" => c::cmd_dump(argc, argv),
+ "format" => c::cmd_format(argc, argv),
+ "fs" => c::fs_cmds(argc, argv),
+ "fsck" => c::cmd_fsck(argc, argv),
+ "list_journal" => c::cmd_list_journal(argc, argv),
+ "kill_btree_node" => c::cmd_kill_btree_node(argc, argv),
+ "migrate" => c::cmd_migrate(argc, argv),
+ "migrate-superblock" => c::cmd_migrate_superblock(argc, argv),
+ "mkfs" => c::cmd_format(argc, argv),
+ "remove-passphrase" => c::cmd_remove_passphrase(argc, argv),
+ "reset-counters" => c::cmd_reset_counters(argc, argv),
+ "set-fs-option" => c::cmd_set_option(argc, argv),
+ "set-passphrase" => c::cmd_set_passphrase(argc, argv),
+ "set-file-option" => c::cmd_setattr(argc, argv),
+ "show-super" => c::cmd_show_super(argc, argv),
+ "unlock" => c::cmd_unlock(argc, argv),
+ "version" => c::cmd_version(argc, argv),
+
+ #[cfg(feature = "fuse")]
+ "fusemount" => c::cmd_fusemount(argc, argv),
+
+ _ => {
+ println!("Unknown command {cmd}");
+ c::bcachefs_usage();
+ 1
+ }
+ }
+ }
+}
+
+fn main() -> ExitCode {
+ let args: Vec<String> = std::env::args().collect();
+
+ let symlink_cmd: Option<&str> = if args[0].contains("mkfs") {
+ Some("mkfs")
+ } else if args[0].contains("fsck") {
+ Some("fsck")
+ } else if args[0].contains("mount.fuse") {
+ Some("fusemount")
+ } else if args[0].contains("mount") {
+ Some("mount")
+ } else {
+ None
+ };
+
+ if symlink_cmd.is_none() && args.len() < 2 {
+ println!("missing command");
+ unsafe { c::bcachefs_usage() };
+ return ExitCode::from(1);
+ }
+
+ unsafe { c::raid_init() };
+
+ let cmd = match symlink_cmd {
+ Some(s) => s,
+ None => args[1].as_str(),
+ };
+
+ match cmd {
+ "completions" => {
+ commands::completions(args[1..].to_vec());
+ ExitCode::SUCCESS
+ }
+ "list" => commands::list(args[1..].to_vec()).report(),
+ "mount" => commands::mount(args, symlink_cmd),
+ "subvolume" => commands::subvolume(args[1..].to_vec()).report(),
+ _ => {
+ let r = handle_c_command(args, symlink_cmd);
+
+ debug!("return code from C command: {r}");
+ ExitCode::from(r as u8)
+ }
+ }
+}
diff --git a/src/commands/completions.rs b/src/commands/completions.rs
new file mode 100644
index 00000000..e05934ca
--- /dev/null
+++ b/src/commands/completions.rs
@@ -0,0 +1,18 @@
+use clap::{Command, CommandFactory, Parser};
+use clap_complete::{generate, Generator, Shell};
+use std::io;
+
+/// Generate shell completions
+#[derive(Parser, Debug)]
+pub struct Cli {
+ shell: Shell,
+}
+
+fn print_completions<G: Generator>(gen: G, cmd: &mut Command) {
+ generate(gen, cmd, cmd.get_name().to_string(), &mut io::stdout());
+}
+
+pub fn completions(argv: Vec<String>) {
+ let cli = Cli::parse_from(argv);
+ print_completions(cli.shell, &mut super::Cli::command());
+}
diff --git a/src/commands/list.rs b/src/commands/list.rs
new file mode 100644
index 00000000..757d6063
--- /dev/null
+++ b/src/commands/list.rs
@@ -0,0 +1,208 @@
+use anyhow::Result;
+use bch_bindgen::bcachefs;
+use bch_bindgen::bkey::BkeySC;
+use bch_bindgen::btree::BtreeIter;
+use bch_bindgen::btree::BtreeIterFlags;
+use bch_bindgen::btree::BtreeNodeIter;
+use bch_bindgen::btree::BtreeTrans;
+use bch_bindgen::fs::Fs;
+use bch_bindgen::opt_set;
+use clap::Parser;
+use std::io::{stdout, IsTerminal};
+
+use crate::logging;
+
+fn list_keys(fs: &Fs, opt: &Cli) -> anyhow::Result<()> {
+ let trans = BtreeTrans::new(fs);
+ let mut iter = BtreeIter::new(
+ &trans,
+ opt.btree,
+ opt.start,
+ BtreeIterFlags::ALL_SNAPSHOTS | BtreeIterFlags::PREFETCH,
+ );
+
+ while let Some(k) = iter.peek_and_restart()? {
+ if k.k.p > opt.end {
+ break;
+ }
+
+ if let Some(ty) = opt.bkey_type {
+ if k.k.type_ != ty as u8 {
+ iter.advance();
+ continue;
+ }
+ }
+
+ println!("{}", k.to_text(fs));
+ iter.advance();
+ }
+
+ Ok(())
+}
+
+fn list_btree_formats(fs: &Fs, opt: &Cli) -> anyhow::Result<()> {
+ let trans = BtreeTrans::new(fs);
+ let mut iter = BtreeNodeIter::new(
+ &trans,
+ opt.btree,
+ opt.start,
+ 0,
+ opt.level,
+ BtreeIterFlags::PREFETCH,
+ );
+
+ while let Some(b) = iter.peek_and_restart()? {
+ if b.key.k.p > opt.end {
+ break;
+ }
+
+ println!("{}", b.to_text(fs));
+ iter.advance();
+ }
+
+ Ok(())
+}
+
+fn list_btree_nodes(fs: &Fs, opt: &Cli) -> anyhow::Result<()> {
+ let trans = BtreeTrans::new(fs);
+ let mut iter = BtreeNodeIter::new(
+ &trans,
+ opt.btree,
+ opt.start,
+ 0,
+ opt.level,
+ BtreeIterFlags::PREFETCH,
+ );
+
+ while let Some(b) = iter.peek_and_restart()? {
+ if b.key.k.p > opt.end {
+ break;
+ }
+
+ println!("{}", BkeySC::from(&b.key).to_text(fs));
+ iter.advance();
+ }
+
+ Ok(())
+}
+
+fn list_nodes_ondisk(fs: &Fs, opt: &Cli) -> anyhow::Result<()> {
+ let trans = BtreeTrans::new(fs);
+ let mut iter = BtreeNodeIter::new(
+ &trans,
+ opt.btree,
+ opt.start,
+ 0,
+ opt.level,
+ BtreeIterFlags::PREFETCH,
+ );
+
+ while let Some(b) = iter.peek_and_restart()? {
+ if b.key.k.p > opt.end {
+ break;
+ }
+
+ println!("{}", b.ondisk_to_text(fs));
+ iter.advance();
+ }
+
+ Ok(())
+}
+
+#[derive(Clone, clap::ValueEnum, Debug)]
+enum Mode {
+ Keys,
+ Formats,
+ Nodes,
+ NodesOndisk,
+}
+
+/// List filesystem metadata in textual form
+#[derive(Parser, Debug)]
+pub struct Cli {
+ /// Btree to list from
+ #[arg(short, long, default_value_t=bcachefs::btree_id::BTREE_ID_extents)]
+ btree: bcachefs::btree_id,
+
+ /// Bkey type to list
+ #[arg(short = 'k', long)]
+ bkey_type: Option<bcachefs::bch_bkey_type>,
+
+ /// Btree depth to descend to (0 == leaves)
+ #[arg(short, long, default_value_t = 0)]
+ level: u32,
+
+ /// Start position to list from
+ #[arg(short, long, default_value = "POS_MIN")]
+ start: bcachefs::bpos,
+
+ /// End position
+ #[arg(short, long, default_value = "SPOS_MAX")]
+ end: bcachefs::bpos,
+
+ #[arg(short, long, default_value = "keys")]
+ mode: Mode,
+
+ /// Check (fsck) the filesystem first
+ #[arg(short, long)]
+ fsck: bool,
+
+ // FIXME: would be nicer to have `--color[=WHEN]` like diff or ls?
+ /// Force color on/off. Default: autodetect tty
+ #[arg(short, long, action = clap::ArgAction::Set, default_value_t=stdout().is_terminal())]
+ colorize: bool,
+
+ /// Verbose mode
+ #[arg(short, long, action = clap::ArgAction::Count)]
+ verbose: u8,
+
+ #[arg(required(true))]
+ devices: Vec<std::path::PathBuf>,
+}
+
+fn cmd_list_inner(opt: &Cli) -> anyhow::Result<()> {
+ let mut fs_opts = bcachefs::bch_opts::default();
+
+ opt_set!(fs_opts, noexcl, 1);
+ opt_set!(fs_opts, nochanges, 1);
+ opt_set!(fs_opts, read_only, 1);
+ opt_set!(fs_opts, norecovery, 1);
+ opt_set!(fs_opts, degraded, 1);
+ opt_set!(fs_opts, very_degraded, 1);
+ opt_set!(
+ fs_opts,
+ errors,
+ bcachefs::bch_error_actions::BCH_ON_ERROR_continue as u8
+ );
+
+ if opt.fsck {
+ opt_set!(
+ fs_opts,
+ fix_errors,
+ bcachefs::fsck_err_opts::FSCK_FIX_yes as u8
+ );
+ opt_set!(fs_opts, norecovery, 0);
+ }
+
+ if opt.verbose > 0 {
+ opt_set!(fs_opts, verbose, 1);
+ }
+
+ let fs = Fs::open(&opt.devices, fs_opts)?;
+
+ match opt.mode {
+ Mode::Keys => list_keys(&fs, opt),
+ Mode::Formats => list_btree_formats(&fs, opt),
+ Mode::Nodes => list_btree_nodes(&fs, opt),
+ Mode::NodesOndisk => list_nodes_ondisk(&fs, opt),
+ }
+}
+
+pub fn list(argv: Vec<String>) -> Result<()> {
+ let opt = Cli::parse_from(argv);
+
+ // TODO: centralize this on the top level CLI
+ logging::setup(opt.verbose, opt.colorize);
+
+ cmd_list_inner(&opt)
+}
diff --git a/src/commands/mod.rs b/src/commands/mod.rs
new file mode 100644
index 00000000..7f466f92
--- /dev/null
+++ b/src/commands/mod.rs
@@ -0,0 +1,52 @@
+use clap::Subcommand;
+
+pub mod completions;
+pub mod list;
+pub mod mount;
+pub mod subvolume;
+
+pub use completions::completions;
+pub use list::list;
+pub use mount::mount;
+pub use subvolume::subvolume;
+
+#[derive(clap::Parser, Debug)]
+#[command(name = "bcachefs")]
+pub struct Cli {
+ #[command(subcommand)]
+ subcommands: Subcommands,
+}
+
+#[derive(Subcommand, Debug)]
+enum Subcommands {
+ List(list::Cli),
+ Mount(mount::Cli),
+ Completions(completions::Cli),
+ #[command(visible_aliases = ["subvol"])]
+ Subvolume(subvolume::Cli),
+}
+
+// FIXME: Can be removed after bumping MSRV >= 1.77 in favor of `c""` literals
+#[macro_export]
+macro_rules! c_str {
+ ($lit:expr) => {
+ ::std::ffi::CStr::from_bytes_with_nul(concat!($lit, "\0").as_bytes())
+ .unwrap()
+ .as_ptr()
+ };
+}
+
+#[cfg(test)]
+mod tests {
+ use std::ffi::CStr;
+
+ #[test]
+ fn check_cstr_macro() {
+ let literal = c_str!("hello");
+
+ assert_eq!(
+ literal,
+ CStr::from_bytes_with_nul(b"hello\0").unwrap().as_ptr()
+ );
+ }
+}
diff --git a/src/commands/mount.rs b/src/commands/mount.rs
new file mode 100644
index 00000000..fe69bf57
--- /dev/null
+++ b/src/commands/mount.rs
@@ -0,0 +1,390 @@
+use std::{
+ collections::HashMap,
+ env,
+ ffi::CString,
+ io::{stdout, IsTerminal},
+ path::{Path, PathBuf},
+ ptr, str,
+};
+
+use anyhow::{ensure, Result};
+use bch_bindgen::{bcachefs, bcachefs::bch_sb_handle, opt_set, path_to_cstr};
+use clap::Parser;
+use log::{debug, info};
+use uuid::Uuid;
+
+use crate::{
+ key::{KeyHandle, Passphrase, UnlockPolicy},
+ logging,
+};
+
+fn mount_inner(
+ src: String,
+ target: &std::path::Path,
+ fstype: &str,
+ mut mountflags: libc::c_ulong,
+ data: Option<String>,
+) -> anyhow::Result<()> {
+ // bind the CStrings to keep them alive
+ let c_src = CString::new(src.clone())?;
+ let c_target = path_to_cstr(target);
+ let data = data.map(CString::new).transpose()?;
+ let fstype = CString::new(fstype)?;
+
+ // convert to pointers for ffi
+ let c_src = c_src.as_ptr();
+ let c_target = c_target.as_ptr();
+ let data_ptr = data.as_ref().map_or(ptr::null(), |data| data.as_ptr().cast());
+ let fstype = fstype.as_ptr();
+
+ let mut ret;
+ loop {
+ ret = {
+ info!("mounting filesystem");
+ // REQUIRES: CAP_SYS_ADMIN
+ unsafe { libc::mount(c_src, c_target, fstype, mountflags, data_ptr) }
+ };
+
+ let err = errno::errno().0;
+
+ if ret == 0
+ || (err != libc::EACCES && err != libc::EROFS)
+ || (mountflags & libc::MS_RDONLY) != 0
+ {
+ break;
+ }
+
+ println!("mount: device write-protected, mounting read-only");
+ mountflags |= libc::MS_RDONLY;
+ }
+
+ drop(data);
+
+ if ret != 0 {
+ let err = errno::errno();
+ let e = crate::ErrnoError(err);
+
+ if err.0 == libc::EBUSY {
+ eprintln!("mount: {}: {} already mounted or mount point busy", target.to_string_lossy(), src);
+ } else {
+ eprintln!("mount: {}: {}", src, e);
+ }
+
+ Err(e.into())
+ } else {
+ Ok(())
+ }
+}
+
+/// Parse comma-separated mount options and split them into mountflags and
+/// filesystem-specific options.
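+///
+/// As an illustrative example, an input such as "noatime,ro,verbose" would
+/// yield `(Some("verbose".to_string()), libc::MS_NOATIME | libc::MS_RDONLY)`:
+/// recognized flags are folded into the mountflags word and anything else is
+/// passed through as filesystem-specific option data.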
+fn parse_mount_options(options: impl AsRef<str>) -> (Option<String>, libc::c_ulong) {
+ use either::Either::{Left, Right};
+
+ debug!("parsing mount options: {}", options.as_ref());
+ let (opts, flags) = options
+ .as_ref()
+ .split(',')
+ .map(|o| match o {
+ "dirsync" => Left(libc::MS_DIRSYNC),
+ "lazytime" => Left(1 << 25), // MS_LAZYTIME
+ "mand" => Left(libc::MS_MANDLOCK),
+ "noatime" => Left(libc::MS_NOATIME),
+ "nodev" => Left(libc::MS_NODEV),
+ "nodiratime" => Left(libc::MS_NODIRATIME),
+ "noexec" => Left(libc::MS_NOEXEC),
+ "nosuid" => Left(libc::MS_NOSUID),
+ "relatime" => Left(libc::MS_RELATIME),
+ "remount" => Left(libc::MS_REMOUNT),
+ "ro" => Left(libc::MS_RDONLY),
+ "rw" | "" => Left(0),
+ "strictatime" => Left(libc::MS_STRICTATIME),
+ "sync" => Left(libc::MS_SYNCHRONOUS),
+ o => Right(o),
+ })
+ .fold((Vec::new(), 0), |(mut opts, flags), next| match next {
+ Left(f) => (opts, flags | f),
+ Right(o) => {
+ opts.push(o);
+ (opts, flags)
+ }
+ });
+
+ (
+ if opts.is_empty() {
+ None
+ } else {
+ Some(opts.join(","))
+ },
+ flags,
+ )
+}
+
+fn read_super_silent(path: impl AsRef<Path>) -> anyhow::Result<bch_sb_handle> {
+ let mut opts = bcachefs::bch_opts::default();
+ opt_set!(opts, noexcl, 1);
+
+ bch_bindgen::sb_io::read_super_silent(path.as_ref(), opts)
+}
+
+fn device_property_map(dev: &udev::Device) -> HashMap<String, String> {
+ let rc: HashMap<_, _> = dev
+ .properties()
+ .map(|i| {
+ (
+ String::from(i.name().to_string_lossy()),
+ String::from(i.value().to_string_lossy()),
+ )
+ })
+ .collect();
+ rc
+}
+
+fn udev_bcachefs_info() -> anyhow::Result<HashMap<String, Vec<String>>> {
+ let mut info = HashMap::new();
+
+ if env::var("BCACHEFS_BLOCK_SCAN").is_ok() {
+ debug!("Checking all block devices for bcachefs super block!");
+ return Ok(info);
+ }
+
+ let mut udev = udev::Enumerator::new()?;
+
+ debug!("Walking udev db!");
+
+ udev.match_subsystem("block")?;
+ udev.match_property("ID_FS_TYPE", "bcachefs")?;
+
+ for m in udev
+ .scan_devices()?
+ .filter(udev::Device::is_initialized)
+ .map(|dev| device_property_map(&dev))
+ .filter(|m| m.contains_key("ID_FS_UUID") && m.contains_key("DEVNAME"))
+ {
+ let fs_uuid = m["ID_FS_UUID"].clone();
+ let dev_node = m["DEVNAME"].clone();
+ info.insert(dev_node.clone(), vec![fs_uuid.clone()]);
+ info.entry(fs_uuid).or_insert(vec![]).push(dev_node.clone());
+ }
+
+ Ok(info)
+}
+
+fn get_super_blocks(uuid: Uuid, devices: &[String]) -> Vec<(PathBuf, bch_sb_handle)> {
+ devices
+ .iter()
+ .filter_map(|dev| {
+ read_super_silent(PathBuf::from(dev))
+ .ok()
+ .map(|sb| (PathBuf::from(dev), sb))
+ })
+ .filter(|(_, sb)| sb.sb().uuid() == uuid)
+ .collect::<Vec<_>>()
+}
+
+fn get_all_block_devnodes() -> anyhow::Result<Vec<String>> {
+ let mut udev = udev::Enumerator::new()?;
+ udev.match_subsystem("block")?;
+
+ let devices = udev
+ .scan_devices()?
+ .filter_map(|dev| {
+ if dev.is_initialized() {
+ dev.devnode().map(|dn| dn.to_string_lossy().into_owned())
+ } else {
+ None
+ }
+ })
+ .collect::<Vec<_>>();
+ Ok(devices)
+}
+
+fn get_devices_by_uuid(
+ udev_bcachefs: &HashMap<String, Vec<String>>,
+ uuid: Uuid,
+) -> anyhow::Result<Vec<(PathBuf, bch_sb_handle)>> {
+ let devices = {
+ if !udev_bcachefs.is_empty() {
+ let uuid_string = uuid.hyphenated().to_string();
+ if let Some(devices) = udev_bcachefs.get(&uuid_string) {
+ devices.clone()
+ } else {
+ Vec::new()
+ }
+ } else {
+ get_all_block_devnodes()?
+ }
+ };
+
+ Ok(get_super_blocks(uuid, &devices))
+}
+
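+/// Build the colon-separated device list (e.g. "/dev/sda:/dev/sdb") and
+/// collect the matching superblock handles for every member device of the
+/// filesystem with the given UUID.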
+fn devs_str_sbs_from_uuid(
+ udev_info: &HashMap<String, Vec<String>>,
+ uuid: &str,
+) -> anyhow::Result<(String, Vec<bch_sb_handle>)> {
+ debug!("enumerating devices with UUID {}", uuid);
+
+ let devs_sbs = Uuid::parse_str(uuid).map(|uuid| get_devices_by_uuid(udev_info, uuid))??;
+
+ let devs_str = devs_sbs
+ .iter()
+ .map(|(dev, _)| dev.to_str().unwrap())
+ .collect::<Vec<_>>()
+ .join(":");
+
+ let sbs: Vec<bch_sb_handle> = devs_sbs.iter().map(|(_, sb)| *sb).collect();
+
+ Ok((devs_str, sbs))
+}
+
+fn devs_str_sbs_from_device(
+ udev_info: &HashMap<String, Vec<String>>,
+ device: &Path,
+) -> anyhow::Result<(String, Vec<bch_sb_handle>)> {
+ let dev_sb = read_super_silent(device)?;
+
+ if dev_sb.sb().number_of_devices() == 1 {
+ Ok((device.as_os_str().to_str().unwrap().to_string(), vec![dev_sb]))
+ } else {
+ let uuid = dev_sb.sb().uuid();
+
+ devs_str_sbs_from_uuid(udev_info, &uuid.to_string())
+ }
+}
+
+/// If a user explicitly specifies `unlock_policy` or `passphrase_file` then use
+/// that without falling back to other mechanisms. If these options are not
+/// used, then search for the key or ask for it.
+fn handle_unlock(cli: &Cli, sb: &bch_sb_handle) -> Result<KeyHandle> {
+ if let Some(policy) = cli.unlock_policy.as_ref() {
+ return policy.apply(sb);
+ }
+
+ if let Some(path) = cli.passphrase_file.as_deref() {
+ return Passphrase::new_from_file(path).and_then(|p| KeyHandle::new(sb, &p));
+ }
+
+ let uuid = sb.sb().uuid();
+ KeyHandle::new_from_search(&uuid)
+ .or_else(|_| Passphrase::new(&uuid).and_then(|p| KeyHandle::new(sb, &p)))
+}
+
+fn cmd_mount_inner(cli: &Cli) -> Result<()> {
+ // Grab the udev information once
+ let udev_info = udev_bcachefs_info()?;
+
+ let (devices, mut sbs) =
+ if let Some(("UUID" | "OLD_BLKID_UUID", uuid)) = cli.dev.split_once('=') {
+ devs_str_sbs_from_uuid(&udev_info, uuid)?
+ } else if cli.dev.contains(':') {
+ // If the device string contains ":" we assume the user has supplied the
+ // entire list. A single device is ambiguous: the FS might consist of just
+ // that one device, or the device might be only one of several that make up
+ // the FS. The latter typically happens when we are called during fstab
+ // mount processing and the fstab specifies a UUID.
+
+ let sbs = cli
+ .dev
+ .split(':')
+ .map(read_super_silent)
+ .collect::<Result<Vec<_>>>()?;
+
+ (cli.dev.clone(), sbs)
+ } else {
+ devs_str_sbs_from_device(&udev_info, Path::new(&cli.dev))?
+ };
+
+ ensure!(!sbs.is_empty(), "No device(s) to mount specified");
+
+ let first_sb = &sbs[0];
+ if unsafe { bcachefs::bch2_sb_is_encrypted(first_sb.sb) } {
+ handle_unlock(cli, first_sb)?;
+ }
+
+ for sb in &mut sbs {
+ unsafe {
+ bch_bindgen::sb_io::bch2_free_super(sb);
+ }
+ }
+ drop(sbs);
+
+ if let Some(mountpoint) = cli.mountpoint.as_deref() {
+ info!(
+ "mounting with params: device: {}, target: {}, options: {}",
+ devices,
+ mountpoint.to_string_lossy(),
+ &cli.options
+ );
+
+ let (data, mountflags) = parse_mount_options(&cli.options);
+ mount_inner(devices, mountpoint, "bcachefs", mountflags, data)
+ } else {
+ info!(
+ "would mount with params: device: {}, options: {}",
+ devices, &cli.options
+ );
+
+ Ok(())
+ }
+}
+
+/// Mount a bcachefs filesystem by its UUID.
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+pub struct Cli {
+ /// Path to passphrase file
+ ///
+ /// This can be used to optionally specify a file to read the passphrase
+ /// from. An explicitly specified key_location/unlock_policy overrides this
+ /// argument.
+ #[arg(short = 'f', long)]
+ passphrase_file: Option<PathBuf>,
+
+ /// Passphrase policy to use in case of an encrypted filesystem. If not
+ /// specified, the password will be searched for in the keyring. If not
+ /// found, the password will be prompted or read from stdin, depending on
+ /// whether stdin is connected to a terminal or not.
+ #[arg(short = 'k', long = "key_location", value_enum)]
+ unlock_policy: Option<UnlockPolicy>,
+
+ /// Device, or UUID=\<UUID\>
+ dev: String,
+
+ /// Where the filesystem should be mounted. If not set, the filesystem
+ /// won't actually be mounted, but all steps preceding the mount
+ /// (e.g. asking for the passphrase) will still be performed.
+ mountpoint: Option<PathBuf>,
+
+ /// Mount options
+ #[arg(short, default_value = "")]
+ options: String,
+
+ // FIXME: would be nicer to have `--color[=WHEN]` like diff or ls?
+ /// Force color on/off. Default: autodetect tty
+ #[arg(short, long, action = clap::ArgAction::Set, default_value_t=stdout().is_terminal())]
+ colorize: bool,
+
+ /// Verbose mode
+ #[arg(short, long, action = clap::ArgAction::Count)]
+ verbose: u8,
+}
+
+pub fn mount(mut argv: Vec<String>, symlink_cmd: Option<&str>) -> std::process::ExitCode {
+ // If the bcachefs tool is being called as "bcachefs mount dev ..." (as opposed to via a
+ // symlink like "/usr/sbin/mount.bcachefs dev ..."), then we need to pop the 0th argument
+ // ("bcachefs") since the CLI parser here expects the device at position 1.
+ if symlink_cmd.is_none() {
+ argv.remove(0);
+ }
+
+ let cli = Cli::parse_from(argv);
+
+ // TODO: centralize this on the top level CLI
+ logging::setup(cli.verbose, cli.colorize);
+
+ match cmd_mount_inner(&cli) {
+ Ok(_) => std::process::ExitCode::SUCCESS,
+ Err(_) => std::process::ExitCode::FAILURE,
+ }
+}
diff --git a/src/commands/subvolume.rs b/src/commands/subvolume.rs
new file mode 100644
index 00000000..7df20819
--- /dev/null
+++ b/src/commands/subvolume.rs
@@ -0,0 +1,103 @@
+use std::{env, path::PathBuf};
+
+use anyhow::{Context, Result};
+use bch_bindgen::c::BCH_SUBVOL_SNAPSHOT_RO;
+use clap::{Parser, Subcommand};
+
+use crate::wrappers::handle::BcachefsHandle;
+
+#[derive(Parser, Debug)]
+pub struct Cli {
+ #[command(subcommand)]
+ subcommands: Subcommands,
+}
+
+/// Subvolumes-related commands
+#[derive(Subcommand, Debug)]
+enum Subcommands {
+ #[command(visible_aliases = ["new"])]
+ Create {
+ /// Paths
+ targets: Vec<PathBuf>,
+ },
+
+ #[command(visible_aliases = ["del"])]
+ Delete {
+ /// Path
+ targets: Vec<PathBuf>,
+ },
+
+ #[command(allow_missing_positional = true, visible_aliases = ["snap"])]
+ Snapshot {
+ /// Make snapshot read only
+ #[arg(long, short)]
+ read_only: bool,
+ source: Option<PathBuf>,
+ dest: PathBuf,
+ },
+}
+
+pub fn subvolume(argv: Vec<String>) -> Result<()> {
+ let cli = Cli::parse_from(argv);
+
+ match cli.subcommands {
+ Subcommands::Create { targets } => {
+ for target in targets {
+ let target = if target.is_absolute() {
+ target
+ } else {
+ env::current_dir()
+ .map(|p| p.join(target))
+ .context("unable to get current directory")?
+ };
+
+ if let Some(dirname) = target.parent() {
+ let fs = unsafe { BcachefsHandle::open(dirname) };
+ fs.create_subvolume(target)
+ .context("Failed to create the subvolume")?;
+ }
+ }
+ }
+ Subcommands::Delete { targets } => {
+ for target in targets {
+ let target = target
+ .canonicalize()
+ .context("subvolume path does not exist or can not be canonicalized")?;
+
+ if let Some(dirname) = target.parent() {
+ let fs = unsafe { BcachefsHandle::open(dirname) };
+ fs.delete_subvolume(target)
+ .context("Failed to delete the subvolume")?;
+ }
+ }
+ }
+ Subcommands::Snapshot {
+ read_only,
+ source,
+ dest,
+ } => {
+ if let Some(dirname) = dest.parent() {
+ let dot = PathBuf::from(".");
+ let dir = if dirname.as_os_str().is_empty() {
+ &dot
+ } else {
+ dirname
+ };
+ let fs = unsafe { BcachefsHandle::open(dir) };
+
+ fs.snapshot_subvolume(
+ if read_only {
+ BCH_SUBVOL_SNAPSHOT_RO
+ } else {
+ 0x0
+ },
+ source,
+ dest,
+ )
+ .context("Failed to snapshot the subvolume")?;
+ }
+ }
+ }
+
+ Ok(())
+}
diff --git a/src/key.rs b/src/key.rs
new file mode 100644
index 00000000..0043095c
--- /dev/null
+++ b/src/key.rs
@@ -0,0 +1,255 @@
+use std::{
+ ffi::{c_long, CStr, CString},
+ fs,
+ io::{stdin, IsTerminal},
+ mem,
+ path::Path,
+ ptr, thread,
+ time::Duration,
+};
+use std::process::{Command, Stdio};
+use anyhow::{anyhow, ensure, Result};
+use bch_bindgen::{
+ bcachefs::{self, bch_key, bch_sb_handle},
+ c::{bch2_chacha_encrypt_key, bch_encrypted_key, bch_sb_field_crypt},
+ keyutils::{self, keyctl_search},
+};
+use log::{debug, info};
+use rustix::termios;
+use uuid::Uuid;
+use zeroize::{ZeroizeOnDrop, Zeroizing};
+
+use crate::{c_str, ErrnoError};
+
+const BCH_KEY_MAGIC: &[u8; 8] = b"bch**key";
+
+#[derive(Clone, Debug, clap::ValueEnum, strum::Display)]
+pub enum UnlockPolicy {
+ /// Don't ask for a passphrase; if the key cannot be found in the keyring,
+ /// just fail
+ Fail,
+ /// Wait for passphrase to become available before mounting
+ Wait,
+ /// Interactively prompt the user for a passphrase
+ Ask,
+ /// Try to read the passphrase from `stdin` without prompting
+ Stdin,
+}
+
+impl UnlockPolicy {
+ pub fn apply(&self, sb: &bch_sb_handle) -> Result<KeyHandle> {
+ let uuid = sb.sb().uuid();
+
+ info!("Using filesystem unlock policy '{self}' on {uuid}");
+
+ match self {
+ Self::Fail => KeyHandle::new_from_search(&uuid),
+ Self::Wait => Ok(KeyHandle::wait_for_unlock(&uuid)?),
+ Self::Ask => Passphrase::new_from_prompt(&uuid).and_then(|p| KeyHandle::new(sb, &p)),
+ Self::Stdin => Passphrase::new_from_stdin().and_then(|p| KeyHandle::new(sb, &p)),
+ }
+ }
+}
+
+impl Default for UnlockPolicy {
+ fn default() -> Self {
+ Self::Ask
+ }
+}
+
+/// A handle to an existing bcachefs key in the kernel keyring
+pub struct KeyHandle {
+ // FIXME: Either these come in useful for something or we remove them
+ _uuid: Uuid,
+ _id: c_long,
+}
+
+impl KeyHandle {
+ pub fn format_key_name(uuid: &Uuid) -> CString {
+ CString::new(format!("bcachefs:{uuid}")).unwrap()
+ }
+
+ pub fn new(sb: &bch_sb_handle, passphrase: &Passphrase) -> Result<Self> {
+ let key_name = Self::format_key_name(&sb.sb().uuid());
+ let key_name = CStr::as_ptr(&key_name);
+ let key_type = c_str!("user");
+
+ let (passphrase_key, _sb_key) = passphrase.check(sb)?;
+
+ let key_id = unsafe {
+ keyutils::add_key(
+ key_type,
+ key_name,
+ ptr::addr_of!(passphrase_key).cast(),
+ mem::size_of_val(&passphrase_key),
+ keyutils::KEY_SPEC_USER_KEYRING,
+ )
+ };
+
+ if key_id > 0 {
+ info!("Added key to keyring");
+ Ok(KeyHandle {
+ _uuid: sb.sb().uuid(),
+ _id: c_long::from(key_id),
+ })
+ } else {
+ Err(anyhow!("failed to add key to keyring: {}", errno::errno()))
+ }
+ }
+
+ fn search_keyring(keyring: i32, key_name: &CStr) -> Result<c_long> {
+ let key_name = CStr::as_ptr(key_name);
+ let key_type = c_str!("user");
+
+ let key_id = unsafe { keyctl_search(keyring, key_type, key_name, 0) };
+
+ if key_id > 0 {
+ info!("Found key in keyring");
+ Ok(key_id)
+ } else {
+ Err(ErrnoError(errno::errno()).into())
+ }
+ }
+
+ pub fn new_from_search(uuid: &Uuid) -> Result<Self> {
+ let key_name = Self::format_key_name(uuid);
+
+ Self::search_keyring(keyutils::KEY_SPEC_SESSION_KEYRING, &key_name)
+ .or_else(|_| Self::search_keyring(keyutils::KEY_SPEC_USER_KEYRING, &key_name))
+ .or_else(|_| Self::search_keyring(keyutils::KEY_SPEC_USER_SESSION_KEYRING, &key_name))
+ .map(|id| Self {
+ _uuid: *uuid,
+ _id: id,
+ })
+ }
+
+ fn wait_for_unlock(uuid: &Uuid) -> Result<Self> {
+ loop {
+ match Self::new_from_search(uuid) {
+ Err(_) => thread::sleep(Duration::from_secs(1)),
+ r => break r,
+ }
+ }
+ }
+}
+
+#[derive(ZeroizeOnDrop)]
+pub struct Passphrase(CString);
+
+impl Passphrase {
+ fn get(&self) -> &CStr {
+ &self.0
+ }
+
+ pub fn new(uuid: &Uuid) -> Result<Self> {
+ if stdin().is_terminal() {
+ Self::new_from_prompt(uuid)
+ } else {
+ Self::new_from_stdin()
+ }
+ }
+
+ // The outer Result represents a failure to run systemd-ask-password; this
+ // is non-critical and makes us fall back to prompting for the passphrase
+ // ourselves. The inner Result represents a request that ran but returned
+ // an error, which is propagated to the caller.
+ fn new_from_askpassword(uuid: &Uuid) -> Result<Result<Self>> {
+ let output = Command::new("systemd-ask-password")
+ .arg("--icon=drive-harddisk")
+ .arg(format!("--id=bcachefs:{}", uuid.as_hyphenated()))
+ .arg("-n")
+ .arg("Enter passphrase: ")
+ .stdin(Stdio::inherit())
+ .stderr(Stdio::inherit())
+ .output()?;
+ Ok(if output.status.success() {
+ match CString::new(output.stdout) {
+ Ok(cstr) => Ok(Self(cstr)),
+ Err(e) => Err(e.into())
+ }
+ } else {
+ Err(anyhow!("systemd-ask-password returned an error"))
+ })
+ }
+
+ // blocks indefinitely if no input is available on stdin
+ pub fn new_from_prompt(uuid: &Uuid) -> Result<Self> {
+ match Self::new_from_askpassword(uuid) {
+ Ok(phrase) => return phrase,
+ Err(_) => debug!("Failed to start systemd-ask-password, doing the prompt ourselves"),
+ }
+ let old = termios::tcgetattr(stdin())?;
+ let mut new = old.clone();
+ new.local_modes.remove(termios::LocalModes::ECHO);
+ termios::tcsetattr(stdin(), termios::OptionalActions::Flush, &new)?;
+
+ eprint!("Enter passphrase: ");
+
+ let mut line = Zeroizing::new(String::new());
+ let res = stdin().read_line(&mut line);
+ termios::tcsetattr(stdin(), termios::OptionalActions::Flush, &old)?;
+ eprintln!();
+ res?;
+
+ Ok(Self(CString::new(line.trim_end_matches('\n'))?))
+ }
+
+ // blocks indefinitely if no input is available on stdin
+ pub fn new_from_stdin() -> Result<Self> {
+ info!("Trying to read passphrase from stdin...");
+
+ let mut line = Zeroizing::new(String::new());
+ stdin().read_line(&mut line)?;
+
+ Ok(Self(CString::new(line.trim_end_matches('\n'))?))
+ }
+
+ pub fn new_from_file(passphrase_file: impl AsRef<Path>) -> Result<Self> {
+ let passphrase_file = passphrase_file.as_ref();
+
+ info!(
+ "Attempting to unlock key with passphrase from file {}",
+ passphrase_file.display()
+ );
+
+ let passphrase = Zeroizing::new(fs::read_to_string(passphrase_file)?);
+
+ Ok(Self(CString::new(passphrase.trim_end_matches('\n'))?))
+ }
+
+ fn derive(&self, crypt: &bch_sb_field_crypt) -> bch_key {
+ let crypt_ptr = (crypt as *const bch_sb_field_crypt).cast_mut();
+
+ unsafe { bcachefs::derive_passphrase(crypt_ptr, self.get().as_ptr()) }
+ }
+
+ pub fn check(&self, sb: &bch_sb_handle) -> Result<(bch_key, bch_encrypted_key)> {
+ let bch_key_magic = u64::from_le_bytes(*BCH_KEY_MAGIC);
+
+ let crypt = sb
+ .sb()
+ .crypt()
+ .ok_or_else(|| anyhow!("filesystem is not encrypted"))?;
+ let mut sb_key = *crypt.key();
+
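+ // If the stored key still begins with the plaintext magic, it was never
+ // encrypted with a passphrase and there is nothing to unlock against.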
+ ensure!(
+ sb_key.magic != bch_key_magic,
+ "filesystem does not have encryption key"
+ );
+
+ let mut passphrase_key: bch_key = self.derive(crypt);
+
+ let ret = unsafe {
+ bch2_chacha_encrypt_key(
+ ptr::addr_of_mut!(passphrase_key),
+ sb.sb().nonce(),
+ ptr::addr_of_mut!(sb_key).cast(),
+ mem::size_of_val(&sb_key),
+ )
+ };
+ ensure!(ret == 0, "error encrypting key");
+ ensure!(sb_key.magic == bch_key_magic, "incorrect passphrase");
+
+ Ok((passphrase_key, sb_key))
+ }
+}
diff --git a/src/logging.rs b/src/logging.rs
new file mode 100644
index 00000000..7b113b6a
--- /dev/null
+++ b/src/logging.rs
@@ -0,0 +1,48 @@
+use std::io::Write;
+
+use env_logger::WriteStyle;
+use log::{Level, LevelFilter};
+use owo_colors::{OwoColorize, Style};
+
+pub fn setup(verbose: u8, color: bool) {
+ let level_filter = match verbose {
+ 0 => LevelFilter::Warn,
+ 1 => LevelFilter::Info,
+ 2 => LevelFilter::Debug,
+ _ => LevelFilter::Trace,
+ };
+
+ let style = if color {
+ WriteStyle::Always
+ } else {
+ WriteStyle::Never
+ };
+
+ env_logger::Builder::new()
+ .filter_level(level_filter)
+ .write_style(style)
+ .parse_env("BCACHEFS_LOG")
+ .format(move |buf, record| {
+ let style = if style == WriteStyle::Never {
+ Style::new()
+ } else {
+ match record.level() {
+ Level::Trace => Style::new().cyan(),
+ Level::Debug => Style::new().blue(),
+ Level::Info => Style::new().green(),
+ Level::Warn => Style::new().yellow(),
+ Level::Error => Style::new().red().bold(),
+ }
+ };
+
+ writeln!(
+ buf,
+ "[{:<5} {}:{}] {}",
+ record.level().style(style),
+ record.file().unwrap(),
+ record.line().unwrap(),
+ record.args()
+ )
+ })
+ .init();
+}
diff --git a/src/wrappers/handle.rs b/src/wrappers/handle.rs
new file mode 100644
index 00000000..b9a4a63d
--- /dev/null
+++ b/src/wrappers/handle.rs
@@ -0,0 +1,131 @@
+use std::path::Path;
+
+use bch_bindgen::c::{
+ bcache_fs_close, bcache_fs_open, bch_ioctl_subvolume, bchfs_handle, BCH_IOCTL_SUBVOLUME_CREATE,
+ BCH_IOCTL_SUBVOLUME_DESTROY, BCH_SUBVOL_SNAPSHOT_CREATE,
+};
+use bch_bindgen::path_to_cstr;
+use errno::Errno;
+
+/// A handle to a bcachefs filesystem
+/// This can be used to send [`libc::ioctl`] to the underlying filesystem.
+pub(crate) struct BcachefsHandle {
+ inner: bchfs_handle,
+}
+
+impl BcachefsHandle {
+ /// Opens a bcachefs filesystem and returns its handle
+ /// TODO(raitobezarius): how can this not be fallible?
+ pub(crate) unsafe fn open<P: AsRef<Path>>(path: P) -> Self {
+ let path = path_to_cstr(path);
+ Self {
+ inner: bcache_fs_open(path.as_ptr()),
+ }
+ }
+}
+
+/// I/O control commands that can be sent to a bcachefs filesystem.
+/// This list is non-exhaustive.
+#[repr(u32)]
+#[non_exhaustive]
+pub enum BcachefsIoctl {
+ SubvolumeCreate = BCH_IOCTL_SUBVOLUME_CREATE,
+ SubvolumeDestroy = BCH_IOCTL_SUBVOLUME_DESTROY,
+}
+
+/// I/O control command payloads
+#[non_exhaustive]
+pub enum BcachefsIoctlPayload {
+ Subvolume(bch_ioctl_subvolume),
+}
+
+impl From<&BcachefsIoctlPayload> for *const libc::c_void {
+ fn from(value: &BcachefsIoctlPayload) -> Self {
+ match value {
+ BcachefsIoctlPayload::Subvolume(p) => (p as *const bch_ioctl_subvolume).cast(),
+ }
+ }
+}
+
+impl BcachefsHandle {
+ /// Type-safe [`libc::ioctl`] for bcachefs filesystems
+ pub fn ioctl(
+ &self,
+ request: BcachefsIoctl,
+ payload: &BcachefsIoctlPayload,
+ ) -> Result<(), Errno> {
+ let payload_ptr: *const libc::c_void = payload.into();
+ let ret = unsafe { libc::ioctl(self.inner.ioctl_fd, request as libc::Ioctl, payload_ptr) };
+
+ if ret == -1 {
+ Err(errno::errno())
+ } else {
+ Ok(())
+ }
+ }
+
+ /// Create a subvolume for this bcachefs filesystem
+ /// at the given path
+ pub fn create_subvolume<P: AsRef<Path>>(&self, dst: P) -> Result<(), Errno> {
+ let dst = path_to_cstr(dst);
+ self.ioctl(
+ BcachefsIoctl::SubvolumeCreate,
+ &BcachefsIoctlPayload::Subvolume(bch_ioctl_subvolume {
+ dirfd: libc::AT_FDCWD as u32,
+ mode: 0o777,
+ dst_ptr: dst.as_ptr() as u64,
+ ..Default::default()
+ }),
+ )
+ }
+
+ /// Delete the subvolume at the given path
+ /// for this bcachefs filesystem
+ pub fn delete_subvolume<P: AsRef<Path>>(&self, dst: P) -> Result<(), Errno> {
+ let dst = path_to_cstr(dst);
+ self.ioctl(
+ BcachefsIoctl::SubvolumeDestroy,
+ &BcachefsIoctlPayload::Subvolume(bch_ioctl_subvolume {
+ dirfd: libc::AT_FDCWD as u32,
+ mode: 0o777,
+ dst_ptr: dst.as_ptr() as u64,
+ ..Default::default()
+ }),
+ )
+ }
+
+ /// Snapshot a subvolume for this bcachefs filesystem
+ /// at the given path
+ pub fn snapshot_subvolume<P: AsRef<Path>>(
+ &self,
+ extra_flags: u32,
+ src: Option<P>,
+ dst: P,
+ ) -> Result<(), Errno> {
+ let src = src.map(|src| path_to_cstr(src));
+ let dst = path_to_cstr(dst);
+
+ let res = self.ioctl(
+ BcachefsIoctl::SubvolumeCreate,
+ &BcachefsIoctlPayload::Subvolume(bch_ioctl_subvolume {
+ flags: BCH_SUBVOL_SNAPSHOT_CREATE | extra_flags,
+ dirfd: libc::AT_FDCWD as u32,
+ mode: 0o777,
+ src_ptr: src.as_ref().map_or(0, |x| x.as_ptr() as u64),
+ //src_ptr: if let Some(src) = src { src.as_ptr() } else { std::ptr::null() } as u64,
+ dst_ptr: dst.as_ptr() as u64,
+ ..Default::default()
+ }),
+ );
+
+ drop(src);
+ drop(dst);
+ res
+ }
+}
+
+impl Drop for BcachefsHandle {
+ fn drop(&mut self) {
+ unsafe { bcache_fs_close(self.inner) };
+ }
+}
diff --git a/src/wrappers/mod.rs b/src/wrappers/mod.rs
new file mode 100644
index 00000000..b2679605
--- /dev/null
+++ b/src/wrappers/mod.rs
@@ -0,0 +1 @@
+pub mod handle;
diff --git a/udev/64-bcachefs.rules b/udev/64-bcachefs.rules
new file mode 100644
index 00000000..ca1f3818
--- /dev/null
+++ b/udev/64-bcachefs.rules
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# Copyright (C) 2024 Oracle. All rights reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+#
+# Don't let udisks automount bcachefs filesystems without even asking a user.
+# This doesn't eliminate filesystems as an attack surface; it only prevents
+# evil maid attacks when all sessions are locked.
+#
+# According to http://storaged.org/doc/udisks2-api/latest/udisks.8.html,
+# supplying UDISKS_AUTO=0 here changes the HintAuto property of the block
+# device abstraction to mean "do not automatically start" (e.g. mount).
+SUBSYSTEM=="block", ENV{ID_FS_TYPE}=="bcachefs", ENV{UDISKS_AUTO}="0"